diff options
-rw-r--r-- | .gitignore | 28 | ||||
-rw-r--r-- | Makefile.am | 2 | ||||
-rwxr-xr-x | compound-split/compound-split.pl | 2 | ||||
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | decoder/JSON_parser.c (renamed from src/JSON_parser.c) | 0 | ||||
-rw-r--r-- | decoder/JSON_parser.h (renamed from src/JSON_parser.h) | 0 | ||||
-rw-r--r-- | decoder/Makefile.am (renamed from src/Makefile.am) | 22 | ||||
-rw-r--r-- | decoder/aligner.cc (renamed from src/aligner.cc) | 0 | ||||
-rw-r--r-- | decoder/aligner.h (renamed from src/aligner.h) | 0 | ||||
-rw-r--r-- | decoder/apply_models.cc (renamed from src/apply_models.cc) | 0 | ||||
-rw-r--r-- | decoder/apply_models.h (renamed from src/apply_models.h) | 0 | ||||
-rw-r--r-- | decoder/array2d.h (renamed from src/array2d.h) | 0 | ||||
-rw-r--r-- | decoder/bottom_up_parser.cc (renamed from src/bottom_up_parser.cc) | 0 | ||||
-rw-r--r-- | decoder/bottom_up_parser.h (renamed from src/bottom_up_parser.h) | 0 | ||||
-rw-r--r-- | decoder/cdec.cc (renamed from src/cdec.cc) | 0 | ||||
-rw-r--r-- | decoder/cdec_ff.cc (renamed from src/cdec_ff.cc) | 0 | ||||
-rw-r--r-- | decoder/csplit.cc (renamed from src/csplit.cc) | 0 | ||||
-rw-r--r-- | decoder/csplit.h (renamed from src/csplit.h) | 0 | ||||
-rw-r--r-- | decoder/dict.h (renamed from src/dict.h) | 0 | ||||
-rw-r--r-- | decoder/dict_test.cc (renamed from src/dict_test.cc) | 0 | ||||
-rw-r--r-- | decoder/earley_composer.cc (renamed from src/earley_composer.cc) | 0 | ||||
-rw-r--r-- | decoder/earley_composer.h (renamed from src/earley_composer.h) | 0 | ||||
-rw-r--r-- | decoder/exp_semiring.h (renamed from src/exp_semiring.h) | 0 | ||||
-rw-r--r-- | decoder/fdict.cc (renamed from src/fdict.cc) | 0 | ||||
-rw-r--r-- | decoder/fdict.h (renamed from src/fdict.h) | 0 | ||||
-rw-r--r-- | decoder/ff.cc (renamed from src/ff.cc) | 0 | ||||
-rw-r--r-- | decoder/ff.h (renamed from src/ff.h) | 0 | ||||
-rw-r--r-- | decoder/ff_csplit.cc (renamed from src/ff_csplit.cc) | 0 | ||||
-rw-r--r-- | decoder/ff_csplit.h (renamed from src/ff_csplit.h) | 0 | ||||
-rw-r--r-- | decoder/ff_factory.cc (renamed from src/ff_factory.cc) | 0 | ||||
-rw-r--r-- | decoder/ff_factory.h (renamed from src/ff_factory.h) | 0 | ||||
-rw-r--r-- | decoder/ff_lm.cc (renamed from src/ff_lm.cc) | 0 | ||||
-rw-r--r-- | decoder/ff_lm.h (renamed from src/ff_lm.h) | 0 | ||||
-rw-r--r-- | decoder/ff_test.cc (renamed from src/ff_test.cc) | 0 | ||||
-rw-r--r-- | decoder/ff_wordalign.cc (renamed from src/ff_wordalign.cc) | 23 | ||||
-rw-r--r-- | decoder/ff_wordalign.h (renamed from src/ff_wordalign.h) | 3 | ||||
-rw-r--r-- | decoder/filelib.cc (renamed from src/filelib.cc) | 0 | ||||
-rw-r--r-- | decoder/filelib.h (renamed from src/filelib.h) | 0 | ||||
-rw-r--r-- | decoder/forest_writer.cc (renamed from src/forest_writer.cc) | 0 | ||||
-rw-r--r-- | decoder/forest_writer.h (renamed from src/forest_writer.h) | 0 | ||||
-rw-r--r-- | decoder/freqdict.cc (renamed from src/freqdict.cc) | 0 | ||||
-rw-r--r-- | decoder/freqdict.h (renamed from src/freqdict.h) | 0 | ||||
-rw-r--r-- | decoder/fst_translator.cc (renamed from src/fst_translator.cc) | 0 | ||||
-rw-r--r-- | decoder/grammar.cc (renamed from src/grammar.cc) | 0 | ||||
-rw-r--r-- | decoder/grammar.h (renamed from src/grammar.h) | 0 | ||||
-rw-r--r-- | decoder/grammar_test.cc (renamed from src/grammar_test.cc) | 0 | ||||
-rw-r--r-- | decoder/gzstream.cc (renamed from src/gzstream.cc) | 0 | ||||
-rw-r--r-- | decoder/gzstream.h (renamed from src/gzstream.h) | 0 | ||||
-rw-r--r-- | decoder/hg.cc (renamed from src/hg.cc) | 0 | ||||
-rw-r--r-- | decoder/hg.h (renamed from src/hg.h) | 0 | ||||
-rw-r--r-- | decoder/hg_intersect.cc (renamed from src/hg_intersect.cc) | 0 | ||||
-rw-r--r-- | decoder/hg_intersect.h (renamed from src/hg_intersect.h) | 0 | ||||
-rw-r--r-- | decoder/hg_io.cc (renamed from src/hg_io.cc) | 1 | ||||
-rw-r--r-- | decoder/hg_io.h (renamed from src/hg_io.h) | 0 | ||||
-rw-r--r-- | decoder/hg_test.cc (renamed from src/hg_test.cc) | 0 | ||||
-rw-r--r-- | decoder/inside_outside.h (renamed from src/inside_outside.h) | 0 | ||||
-rw-r--r-- | decoder/json_parse.cc (renamed from src/json_parse.cc) | 0 | ||||
-rw-r--r-- | decoder/json_parse.h (renamed from src/json_parse.h) | 0 | ||||
-rw-r--r-- | decoder/kbest.h (renamed from src/kbest.h) | 0 | ||||
-rw-r--r-- | decoder/lattice.cc (renamed from src/lattice.cc) | 0 | ||||
-rw-r--r-- | decoder/lattice.h (renamed from src/lattice.h) | 0 | ||||
-rw-r--r-- | decoder/lexcrf.cc (renamed from src/lexcrf.cc) | 0 | ||||
-rw-r--r-- | decoder/lexcrf.h (renamed from src/lexcrf.h) | 0 | ||||
-rw-r--r-- | decoder/logval.h (renamed from src/logval.h) | 0 | ||||
-rw-r--r-- | decoder/maxtrans_blunsom.cc (renamed from src/maxtrans_blunsom.cc) | 0 | ||||
-rw-r--r-- | decoder/parser_test.cc (renamed from src/parser_test.cc) | 0 | ||||
-rw-r--r-- | decoder/phrasebased_translator.cc (renamed from src/phrasebased_translator.cc) | 0 | ||||
-rw-r--r-- | decoder/phrasebased_translator.h (renamed from src/phrasebased_translator.h) | 0 | ||||
-rw-r--r-- | decoder/phrasetable_fst.cc (renamed from src/phrasetable_fst.cc) | 0 | ||||
-rw-r--r-- | decoder/phrasetable_fst.h (renamed from src/phrasetable_fst.h) | 0 | ||||
-rw-r--r-- | decoder/prob.h (renamed from src/prob.h) | 0 | ||||
-rw-r--r-- | decoder/sampler.h (renamed from src/sampler.h) | 0 | ||||
-rw-r--r-- | decoder/scfg_translator.cc (renamed from src/scfg_translator.cc) | 0 | ||||
-rw-r--r-- | decoder/sentence_metadata.h (renamed from src/sentence_metadata.h) | 0 | ||||
-rw-r--r-- | decoder/small_vector.h (renamed from src/small_vector.h) | 0 | ||||
-rw-r--r-- | decoder/small_vector_test.cc (renamed from src/small_vector_test.cc) | 0 | ||||
-rw-r--r-- | decoder/sparse_vector.cc (renamed from src/sparse_vector.cc) | 0 | ||||
-rw-r--r-- | decoder/sparse_vector.h (renamed from src/sparse_vector.h) | 0 | ||||
-rw-r--r-- | decoder/stringlib.cc (renamed from src/stringlib.cc) | 0 | ||||
-rw-r--r-- | decoder/stringlib.h (renamed from src/stringlib.h) | 0 | ||||
-rw-r--r-- | decoder/tdict.cc (renamed from src/tdict.cc) | 0 | ||||
-rw-r--r-- | decoder/tdict.h (renamed from src/tdict.h) | 0 | ||||
-rw-r--r-- | decoder/test_data/dummy.3gram.lm (renamed from src/test_data/dummy.3gram.lm) | 0 | ||||
-rw-r--r-- | decoder/test_data/grammar.prune (renamed from src/test_data/grammar.prune) | 0 | ||||
-rw-r--r-- | decoder/test_data/small.json.gz (renamed from src/test_data/small.json.gz) | bin | 1561 -> 1561 bytes | |||
-rw-r--r-- | decoder/test_data/test_2gram.lm.gz (renamed from src/test_data/test_2gram.lm.gz) | bin | 587 -> 587 bytes | |||
-rw-r--r-- | decoder/test_data/weights (renamed from src/test_data/weights) | 0 | ||||
-rw-r--r-- | decoder/test_data/weights.gt (renamed from src/test_data/weights.gt) | 0 | ||||
-rw-r--r-- | decoder/timing_stats.cc (renamed from src/timing_stats.cc) | 0 | ||||
-rw-r--r-- | decoder/timing_stats.h (renamed from src/timing_stats.h) | 0 | ||||
-rw-r--r-- | decoder/translator.h (renamed from src/translator.h) | 0 | ||||
-rw-r--r-- | decoder/trule.cc (renamed from src/trule.cc) | 0 | ||||
-rw-r--r-- | decoder/trule.h (renamed from src/trule.h) | 0 | ||||
-rw-r--r-- | decoder/trule_test.cc (renamed from src/trule_test.cc) | 0 | ||||
-rw-r--r-- | decoder/ttables.cc (renamed from src/ttables.cc) | 0 | ||||
-rw-r--r-- | decoder/ttables.h (renamed from src/ttables.h) | 0 | ||||
-rw-r--r-- | decoder/viterbi.cc (renamed from src/viterbi.cc) | 0 | ||||
-rw-r--r-- | decoder/viterbi.h (renamed from src/viterbi.h) | 0 | ||||
-rw-r--r-- | decoder/weights.cc (renamed from src/weights.cc) | 0 | ||||
-rw-r--r-- | decoder/weights.h (renamed from src/weights.h) | 0 | ||||
-rw-r--r-- | decoder/weights_test.cc (renamed from src/weights_test.cc) | 0 | ||||
-rw-r--r-- | decoder/wordid.h (renamed from src/wordid.h) | 0 | ||||
-rwxr-xr-x | tests/run-system-tests.pl | 2 | ||||
-rw-r--r-- | training/Makefile.am | 20 | ||||
-rw-r--r-- | training/atools.cc | 96 | ||||
-rwxr-xr-x | training/cluster-ptrain.pl | 52 | ||||
-rwxr-xr-x | training/make-lexcrf-grammar.pl | 73 | ||||
-rw-r--r-- | vest/Makefile.am | 16 | ||||
-rwxr-xr-x | vest/dist-vest.pl | 65 |
109 files changed, 290 insertions, 117 deletions
@@ -1,9 +1,9 @@ config.h.in~ -src/ff_test -src/grammar_test -src/hg_test -src/parser_test -src/small_vector_test +decoder/ff_test +decoder/grammar_test +decoder/hg_test +decoder/parser_test +decoder/small_vector_test training/atools training/collapse_weights training/lbfgs_test @@ -29,15 +29,15 @@ configure depcomp install-sh missing -src/.deps/ -src/*.o -src/Makefile -src/Makefile.in -src/cdec -src/dict_test -src/libhg.a -src/trule_test -src/weights_test +decoder/.deps/ +decoder/*.o +decoder/Makefile +decoder/Makefile.in +decoder/cdec +decoder/dict_test +decoder/libcdec.a +decoder/trule_test +decoder/weights_test stamp-h1 training/.deps/ training/Makefile diff --git a/Makefile.am b/Makefile.am index c3780d88..b0e750f6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = src training vest +SUBDIRS = decoder training vest AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl index beca4dc0..490a5bc5 100755 --- a/compound-split/compound-split.pl +++ b/compound-split/compound-split.pl @@ -5,7 +5,7 @@ my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir use Getopt::Long; use IPC::Open2; -my $CDEC = "$script_dir/../src/cdec"; +my $CDEC = "$script_dir/../decoder/cdec"; my $LANG = 'de'; my $BEAM = 2.1; diff --git a/configure.ac b/configure.ac index c18342b3..0fd43e08 100644 --- a/configure.ac +++ b/configure.ac @@ -42,5 +42,5 @@ then AM_CONDITIONAL([SRI_LM], true) fi -AC_OUTPUT(Makefile src/Makefile training/Makefile vest/Makefile) +AC_OUTPUT(Makefile decoder/Makefile training/Makefile vest/Makefile) diff --git a/src/JSON_parser.c b/decoder/JSON_parser.c index 175b7cc9..175b7cc9 100644 --- a/src/JSON_parser.c +++ b/decoder/JSON_parser.c diff --git a/src/JSON_parser.h b/decoder/JSON_parser.h index ceb5b24b..ceb5b24b 100644 --- a/src/JSON_parser.h +++ b/decoder/JSON_parser.h diff --git a/src/Makefile.am b/decoder/Makefile.am index 4d0459ef..f3843102 100644 --- a/src/Makefile.am +++ b/decoder/Makefile.am @@ -11,30 +11,30 @@ bin_PROGRAMS = \ cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc small_vector_test_SOURCES = small_vector_test.cc -small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a parser_test_SOURCES = parser_test.cc -parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a dict_test_SOURCES = dict_test.cc -dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ff_test_SOURCES = ff_test.cc -ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a grammar_test_SOURCES = grammar_test.cc -grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a hg_test_SOURCES = hg_test.cc -hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a trule_test_SOURCES = trule_test.cc -trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a weights_test_SOURCES = weights_test.cc -weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a +weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a -LDADD = libhg.a +LDADD = libcdec.a AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) -lz -noinst_LIBRARIES = libhg.a +noinst_LIBRARIES = libcdec.a -libhg_a_SOURCES = \ +libcdec_a_SOURCES = \ fst_translator.cc \ csplit.cc \ scfg_translator.cc \ diff --git a/src/aligner.cc b/decoder/aligner.cc index d9d067e5..d9d067e5 100644 --- a/src/aligner.cc +++ b/decoder/aligner.cc diff --git a/src/aligner.h b/decoder/aligner.h index 970c72f2..970c72f2 100644 --- a/src/aligner.h +++ b/decoder/aligner.h diff --git a/src/apply_models.cc b/decoder/apply_models.cc index b1d002f4..b1d002f4 100644 --- a/src/apply_models.cc +++ b/decoder/apply_models.cc diff --git a/src/apply_models.h b/decoder/apply_models.h index 08fce037..08fce037 100644 --- a/src/apply_models.h +++ b/decoder/apply_models.h diff --git a/src/array2d.h b/decoder/array2d.h index e63eda0d..e63eda0d 100644 --- a/src/array2d.h +++ b/decoder/array2d.h diff --git a/src/bottom_up_parser.cc b/decoder/bottom_up_parser.cc index b3315b8a..b3315b8a 100644 --- a/src/bottom_up_parser.cc +++ b/decoder/bottom_up_parser.cc diff --git a/src/bottom_up_parser.h b/decoder/bottom_up_parser.h index 546bfb54..546bfb54 100644 --- a/src/bottom_up_parser.h +++ b/decoder/bottom_up_parser.h diff --git a/src/cdec.cc b/decoder/cdec.cc index 6185c79b..6185c79b 100644 --- a/src/cdec.cc +++ b/decoder/cdec.cc diff --git a/src/cdec_ff.cc b/decoder/cdec_ff.cc index 0a4f3d5e..0a4f3d5e 100644 --- a/src/cdec_ff.cc +++ b/decoder/cdec_ff.cc diff --git a/src/csplit.cc b/decoder/csplit.cc index 47197782..47197782 100644 --- a/src/csplit.cc +++ b/decoder/csplit.cc diff --git a/src/csplit.h b/decoder/csplit.h index ce6295c1..ce6295c1 100644 --- a/src/csplit.h +++ b/decoder/csplit.h diff --git a/src/dict.h b/decoder/dict.h index bae9debe..bae9debe 100644 --- a/src/dict.h +++ b/decoder/dict.h diff --git a/src/dict_test.cc b/decoder/dict_test.cc index 5c5d84f0..5c5d84f0 100644 --- a/src/dict_test.cc +++ b/decoder/dict_test.cc diff --git a/src/earley_composer.cc b/decoder/earley_composer.cc index a59686e0..a59686e0 100644 --- a/src/earley_composer.cc +++ b/decoder/earley_composer.cc diff --git a/src/earley_composer.h b/decoder/earley_composer.h index 9f786bf6..9f786bf6 100644 --- a/src/earley_composer.h +++ b/decoder/earley_composer.h diff --git a/src/exp_semiring.h b/decoder/exp_semiring.h index f91beee4..f91beee4 100644 --- a/src/exp_semiring.h +++ b/decoder/exp_semiring.h diff --git a/src/fdict.cc b/decoder/fdict.cc index 83aa7cea..83aa7cea 100644 --- a/src/fdict.cc +++ b/decoder/fdict.cc diff --git a/src/fdict.h b/decoder/fdict.h index ff491cfb..ff491cfb 100644 --- a/src/fdict.h +++ b/decoder/fdict.h diff --git a/src/ff.cc b/decoder/ff.cc index 2ae5b9eb..2ae5b9eb 100644 --- a/src/ff.cc +++ b/decoder/ff.cc diff --git a/src/ff_csplit.cc b/decoder/ff_csplit.cc index cac4bb8e..cac4bb8e 100644 --- a/src/ff_csplit.cc +++ b/decoder/ff_csplit.cc diff --git a/src/ff_csplit.h b/decoder/ff_csplit.h index c1cfb64b..c1cfb64b 100644 --- a/src/ff_csplit.h +++ b/decoder/ff_csplit.h diff --git a/src/ff_factory.cc b/decoder/ff_factory.cc index 1854e0bb..1854e0bb 100644 --- a/src/ff_factory.cc +++ b/decoder/ff_factory.cc diff --git a/src/ff_factory.h b/decoder/ff_factory.h index bc586567..bc586567 100644 --- a/src/ff_factory.h +++ b/decoder/ff_factory.h diff --git a/src/ff_lm.cc b/decoder/ff_lm.cc index 354787ec..354787ec 100644 --- a/src/ff_lm.cc +++ b/decoder/ff_lm.cc diff --git a/src/ff_lm.h b/decoder/ff_lm.h index cd717360..cd717360 100644 --- a/src/ff_lm.h +++ b/decoder/ff_lm.h diff --git a/src/ff_test.cc b/decoder/ff_test.cc index babaf985..babaf985 100644 --- a/src/ff_test.cc +++ b/decoder/ff_test.cc diff --git a/src/ff_wordalign.cc b/decoder/ff_wordalign.cc index e605ac8d..a00b2c76 100644 --- a/src/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -14,7 +14,16 @@ using namespace std; RelativeSentencePosition::RelativeSentencePosition(const string& param) : - fid_(FD::Convert("RelativeSentencePosition")) {} + fid_(FD::Convert("RelativeSentencePosition")) { + if (!param.empty()) { + cerr << " Loading word classes from " << param << endl; + condition_on_fclass_ = true; + template_ = "RSP:FC000"; + assert(!"not implemented"); + } else { + condition_on_fclass_ = false; + } +} void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -31,6 +40,9 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme const double val = fabs(static_cast<double>(edge.i_) / smeta.GetSourceLength() - static_cast<double>(edge.prev_i_) / smeta.GetTargetLength()); features->set_value(fid_, val); + if (condition_on_fclass_) { + assert(!"not implemented"); + } // cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl; } @@ -39,10 +51,15 @@ MarkovJump::MarkovJump(const string& param) : fid_(FD::Convert("MarkovJump")), individual_params_per_jumpsize_(false), condition_on_flen_(false) { - cerr << " MarkovJump: Blunsom&Cohn feature"; + cerr << " MarkovJump"; vector<string> argv; int argc = SplitOnWhitespace(param, &argv); if (argc > 0) { + if (argv[0] == "--fclasses") { + argc--; + assert(argc > 0); + const string f_class_file = argv[1]; + } if (argc != 1 || !(argv[0] == "-f" || argv[0] == "-i" || argv[0] == "-if")) { cerr << "MarkovJump: expected parameters to be -f, -i, or -if\n"; exit(1); @@ -57,6 +74,8 @@ MarkovJump::MarkovJump(const string& param) : cerr << " (split by f-length)"; } } + } else { + cerr << " (Blunsom & Cohn definition)"; } cerr << endl; } diff --git a/src/ff_wordalign.h b/decoder/ff_wordalign.h index 1581641c..4a8b59c7 100644 --- a/src/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -16,6 +16,8 @@ class RelativeSentencePosition : public FeatureFunction { void* out_context) const; private: const int fid_; + bool condition_on_fclass_; + std::string template_; }; class MarkovJump : public FeatureFunction { @@ -32,6 +34,7 @@ class MarkovJump : public FeatureFunction { const int fid_; bool individual_params_per_jumpsize_; bool condition_on_flen_; + bool condition_on_fclass_; std::string template_; }; diff --git a/src/filelib.cc b/decoder/filelib.cc index 79ad2847..79ad2847 100644 --- a/src/filelib.cc +++ b/decoder/filelib.cc diff --git a/src/filelib.h b/decoder/filelib.h index 62cb9427..62cb9427 100644 --- a/src/filelib.h +++ b/decoder/filelib.h diff --git a/src/forest_writer.cc b/decoder/forest_writer.cc index a9117d18..a9117d18 100644 --- a/src/forest_writer.cc +++ b/decoder/forest_writer.cc diff --git a/src/forest_writer.h b/decoder/forest_writer.h index 819a8940..819a8940 100644 --- a/src/forest_writer.h +++ b/decoder/forest_writer.h diff --git a/src/freqdict.cc b/decoder/freqdict.cc index 9e25d346..9e25d346 100644 --- a/src/freqdict.cc +++ b/decoder/freqdict.cc diff --git a/src/freqdict.h b/decoder/freqdict.h index 9acf0c33..9acf0c33 100644 --- a/src/freqdict.h +++ b/decoder/freqdict.h diff --git a/src/fst_translator.cc b/decoder/fst_translator.cc index 57feb227..57feb227 100644 --- a/src/fst_translator.cc +++ b/decoder/fst_translator.cc diff --git a/src/grammar.cc b/decoder/grammar.cc index e19bd344..e19bd344 100644 --- a/src/grammar.cc +++ b/decoder/grammar.cc diff --git a/src/grammar.h b/decoder/grammar.h index 3471e3f1..3471e3f1 100644 --- a/src/grammar.h +++ b/decoder/grammar.h diff --git a/src/grammar_test.cc b/decoder/grammar_test.cc index 62b8f958..62b8f958 100644 --- a/src/grammar_test.cc +++ b/decoder/grammar_test.cc diff --git a/src/gzstream.cc b/decoder/gzstream.cc index 9703e6ad..9703e6ad 100644 --- a/src/gzstream.cc +++ b/decoder/gzstream.cc diff --git a/src/gzstream.h b/decoder/gzstream.h index ad9785fd..ad9785fd 100644 --- a/src/gzstream.h +++ b/decoder/gzstream.h diff --git a/src/hg.cc b/decoder/hg.cc index 7bd79394..7bd79394 100644 --- a/src/hg.cc +++ b/decoder/hg.cc diff --git a/src/hg_intersect.cc b/decoder/hg_intersect.cc index a5e8913a..a5e8913a 100644 --- a/src/hg_intersect.cc +++ b/decoder/hg_intersect.cc diff --git a/src/hg_intersect.h b/decoder/hg_intersect.h index 826bdaae..826bdaae 100644 --- a/src/hg_intersect.h +++ b/decoder/hg_intersect.h diff --git a/src/hg_io.cc b/decoder/hg_io.cc index e21b1714..243106b8 100644 --- a/src/hg_io.cc +++ b/decoder/hg_io.cc @@ -443,6 +443,7 @@ void ReadPLFEdge(const std::string& in, int &c, int cur_node, Hypergraph* hg) { vector<WordID> ewords(2, 0); ewords[1] = TD::Convert(getEscapedString(in,c)); TRulePtr r(new TRule(ewords)); + r->ComputeArity(); // cerr << "RULE: " << r->AsString() << endl; if (get(in,c++) != ',') { assert(!"PCN/PLF parse error: expected , after string\n"); } size_t cnNext = 1; diff --git a/src/hg_io.h b/decoder/hg_io.h index 69a516c1..69a516c1 100644 --- a/src/hg_io.h +++ b/decoder/hg_io.h diff --git a/src/hg_test.cc b/decoder/hg_test.cc index ecd97508..ecd97508 100644 --- a/src/hg_test.cc +++ b/decoder/hg_test.cc diff --git a/src/inside_outside.h b/decoder/inside_outside.h index 9114c9d7..9114c9d7 100644 --- a/src/inside_outside.h +++ b/decoder/inside_outside.h diff --git a/src/json_parse.cc b/decoder/json_parse.cc index f6fdfea8..f6fdfea8 100644 --- a/src/json_parse.cc +++ b/decoder/json_parse.cc diff --git a/src/json_parse.h b/decoder/json_parse.h index c3cba954..c3cba954 100644 --- a/src/json_parse.h +++ b/decoder/json_parse.h diff --git a/src/kbest.h b/decoder/kbest.h index cd9b6c2b..cd9b6c2b 100644 --- a/src/kbest.h +++ b/decoder/kbest.h diff --git a/src/lattice.cc b/decoder/lattice.cc index 56bc9551..56bc9551 100644 --- a/src/lattice.cc +++ b/decoder/lattice.cc diff --git a/src/lattice.h b/decoder/lattice.h index 71589b92..71589b92 100644 --- a/src/lattice.h +++ b/decoder/lattice.h diff --git a/src/lexcrf.cc b/decoder/lexcrf.cc index 33455a3d..33455a3d 100644 --- a/src/lexcrf.cc +++ b/decoder/lexcrf.cc diff --git a/src/lexcrf.h b/decoder/lexcrf.h index 99362c81..99362c81 100644 --- a/src/lexcrf.h +++ b/decoder/lexcrf.h diff --git a/src/logval.h b/decoder/logval.h index a8ca620c..a8ca620c 100644 --- a/src/logval.h +++ b/decoder/logval.h diff --git a/src/maxtrans_blunsom.cc b/decoder/maxtrans_blunsom.cc index 4a6680e0..4a6680e0 100644 --- a/src/maxtrans_blunsom.cc +++ b/decoder/maxtrans_blunsom.cc diff --git a/src/parser_test.cc b/decoder/parser_test.cc index da1fbd89..da1fbd89 100644 --- a/src/parser_test.cc +++ b/decoder/parser_test.cc diff --git a/src/phrasebased_translator.cc b/decoder/phrasebased_translator.cc index 5eb70876..5eb70876 100644 --- a/src/phrasebased_translator.cc +++ b/decoder/phrasebased_translator.cc diff --git a/src/phrasebased_translator.h b/decoder/phrasebased_translator.h index d42ce79c..d42ce79c 100644 --- a/src/phrasebased_translator.h +++ b/decoder/phrasebased_translator.h diff --git a/src/phrasetable_fst.cc b/decoder/phrasetable_fst.cc index f421e941..f421e941 100644 --- a/src/phrasetable_fst.cc +++ b/decoder/phrasetable_fst.cc diff --git a/src/phrasetable_fst.h b/decoder/phrasetable_fst.h index 477de1f7..477de1f7 100644 --- a/src/phrasetable_fst.h +++ b/decoder/phrasetable_fst.h diff --git a/src/prob.h b/decoder/prob.h index bc297870..bc297870 100644 --- a/src/prob.h +++ b/decoder/prob.h diff --git a/src/sampler.h b/decoder/sampler.h index e5840f41..e5840f41 100644 --- a/src/sampler.h +++ b/decoder/sampler.h diff --git a/src/scfg_translator.cc b/decoder/scfg_translator.cc index 03602c6b..03602c6b 100644 --- a/src/scfg_translator.cc +++ b/decoder/scfg_translator.cc diff --git a/src/sentence_metadata.h b/decoder/sentence_metadata.h index ef9eb388..ef9eb388 100644 --- a/src/sentence_metadata.h +++ b/decoder/sentence_metadata.h diff --git a/src/small_vector.h b/decoder/small_vector.h index 800c1df1..800c1df1 100644 --- a/src/small_vector.h +++ b/decoder/small_vector.h diff --git a/src/small_vector_test.cc b/decoder/small_vector_test.cc index 84237791..84237791 100644 --- a/src/small_vector_test.cc +++ b/decoder/small_vector_test.cc diff --git a/src/sparse_vector.cc b/decoder/sparse_vector.cc index 4035b9ef..4035b9ef 100644 --- a/src/sparse_vector.cc +++ b/decoder/sparse_vector.cc diff --git a/src/sparse_vector.h b/decoder/sparse_vector.h index 6a8c9bf4..6a8c9bf4 100644 --- a/src/sparse_vector.h +++ b/decoder/sparse_vector.h diff --git a/src/stringlib.cc b/decoder/stringlib.cc index 3ed74bef..3ed74bef 100644 --- a/src/stringlib.cc +++ b/decoder/stringlib.cc diff --git a/src/stringlib.h b/decoder/stringlib.h index 76efee8f..76efee8f 100644 --- a/src/stringlib.h +++ b/decoder/stringlib.h diff --git a/src/tdict.cc b/decoder/tdict.cc index c00d20b8..c00d20b8 100644 --- a/src/tdict.cc +++ b/decoder/tdict.cc diff --git a/src/tdict.h b/decoder/tdict.h index 9d4318fe..9d4318fe 100644 --- a/src/tdict.h +++ b/decoder/tdict.h diff --git a/src/test_data/dummy.3gram.lm b/decoder/test_data/dummy.3gram.lm index ae665284..ae665284 100644 --- a/src/test_data/dummy.3gram.lm +++ b/decoder/test_data/dummy.3gram.lm diff --git a/src/test_data/grammar.prune b/decoder/test_data/grammar.prune index 4ebcb509..4ebcb509 100644 --- a/src/test_data/grammar.prune +++ b/decoder/test_data/grammar.prune diff --git a/src/test_data/small.json.gz b/decoder/test_data/small.json.gz Binary files differindex 892ba360..892ba360 100644 --- a/src/test_data/small.json.gz +++ b/decoder/test_data/small.json.gz diff --git a/src/test_data/test_2gram.lm.gz b/decoder/test_data/test_2gram.lm.gz Binary files differindex aafa7274..aafa7274 100644 --- a/src/test_data/test_2gram.lm.gz +++ b/decoder/test_data/test_2gram.lm.gz diff --git a/src/test_data/weights b/decoder/test_data/weights index ea70229c..ea70229c 100644 --- a/src/test_data/weights +++ b/decoder/test_data/weights diff --git a/src/test_data/weights.gt b/decoder/test_data/weights.gt index 08931049..08931049 100644 --- a/src/test_data/weights.gt +++ b/decoder/test_data/weights.gt diff --git a/src/timing_stats.cc b/decoder/timing_stats.cc index 85b95de5..85b95de5 100644 --- a/src/timing_stats.cc +++ b/decoder/timing_stats.cc diff --git a/src/timing_stats.h b/decoder/timing_stats.h index 0a9f7656..0a9f7656 100644 --- a/src/timing_stats.h +++ b/decoder/timing_stats.h diff --git a/src/translator.h b/decoder/translator.h index 194efbaa..194efbaa 100644 --- a/src/translator.h +++ b/decoder/translator.h diff --git a/src/trule.cc b/decoder/trule.cc index b8f6995e..b8f6995e 100644 --- a/src/trule.cc +++ b/decoder/trule.cc diff --git a/src/trule.h b/decoder/trule.h index d2b1babe..d2b1babe 100644 --- a/src/trule.h +++ b/decoder/trule.h diff --git a/src/trule_test.cc b/decoder/trule_test.cc index 02a70764..02a70764 100644 --- a/src/trule_test.cc +++ b/decoder/trule_test.cc diff --git a/src/ttables.cc b/decoder/ttables.cc index 2ea960f0..2ea960f0 100644 --- a/src/ttables.cc +++ b/decoder/ttables.cc diff --git a/src/ttables.h b/decoder/ttables.h index 3ffc238a..3ffc238a 100644 --- a/src/ttables.h +++ b/decoder/ttables.h diff --git a/src/viterbi.cc b/decoder/viterbi.cc index 82b2ce6d..82b2ce6d 100644 --- a/src/viterbi.cc +++ b/decoder/viterbi.cc diff --git a/src/viterbi.h b/decoder/viterbi.h index 46a4f528..46a4f528 100644 --- a/src/viterbi.h +++ b/decoder/viterbi.h diff --git a/src/weights.cc b/decoder/weights.cc index bb0a878f..bb0a878f 100644 --- a/src/weights.cc +++ b/decoder/weights.cc diff --git a/src/weights.h b/decoder/weights.h index f19aa3ce..f19aa3ce 100644 --- a/src/weights.h +++ b/decoder/weights.h diff --git a/src/weights_test.cc b/decoder/weights_test.cc index aa6b3db2..aa6b3db2 100644 --- a/src/weights_test.cc +++ b/decoder/weights_test.cc diff --git a/src/wordid.h b/decoder/wordid.h index fb50bcc1..fb50bcc1 100644 --- a/src/wordid.h +++ b/decoder/wordid.h diff --git a/tests/run-system-tests.pl b/tests/run-system-tests.pl index 738000dc..8555ef78 100755 --- a/tests/run-system-tests.pl +++ b/tests/run-system-tests.pl @@ -8,7 +8,7 @@ my $TEMP_DIR = tempdir( CLEANUP => 1 ); #my $cwd = cwd(); #die "Sanity failed: $cwd" unless -d $cwd; -my $DECODER = "$script_dir/../src/cdec"; +my $DECODER = "$script_dir/../decoder/cdec"; my $FILTER = "$script_dir/tools/filter-stderr.pl"; my $COMPARE_STATS = "$script_dir/tools/compare-statistics.pl"; diff --git a/training/Makefile.am b/training/Makefile.am index c4c22fa2..944c75f7 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -10,32 +10,32 @@ bin_PROGRAMS = \ optimize_test atools_SOURCES = atools.cc -atools_LDADD = $(top_srcdir)/src/libhg.a -lz +atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz model1_SOURCES = model1.cc -model1_LDADD = $(top_srcdir)/src/libhg.a -lz +model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/src/libhg.a -lz +grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz optimize_test_SOURCES = optimize_test.cc optimize.cc -optimize_test_LDADD = $(top_srcdir)/src/libhg.a -lz +optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/src/libhg.a -lz +collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz lbfgs_test_SOURCES = lbfgs_test.cc -lbfgs_test_LDADD = $(top_srcdir)/src/libhg.a -lz +lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc -mr_optimize_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_em_train_SOURCES = mr_em_train.cc -mr_em_train_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/src/libhg.a -lz +plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) diff --git a/training/atools.cc b/training/atools.cc index bac73859..a18250f7 100644 --- a/training/atools.cc +++ b/training/atools.cc @@ -2,6 +2,7 @@ #include <sstream> #include <vector> +#include <queue> #include <map> #include <boost/program_options.hpp> #include <boost/shared_ptr.hpp> @@ -105,6 +106,99 @@ struct IntersectCommand : public Command { } }; +struct UnionCommand : public Command { + string Name() const { return "union"; } + bool RequiresTwoOperands() const { return true; } + void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) { + EnsureSize(a, b, x); + Array2D<bool>& res = *x; + for (int i = 0; i < res.width(); ++i) + for (int j = 0; j < res.height(); ++j) + res(i, j) = Safe(a, i, j) || Safe(b, i, j); + } +}; + +struct RefineCommand : public Command { + RefineCommand() { + neighbors_.push_back(make_pair(1,0)); + neighbors_.push_back(make_pair(-1,0)); + neighbors_.push_back(make_pair(0,1)); + neighbors_.push_back(make_pair(0,-1)); + } + bool RequiresTwoOperands() const { return true; } + protected: + void InitRefine( + const Array2D<bool>& a, + const Array2D<bool>& b, + Array2D<bool>* x) { + EnsureSize(a, b, x); + in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear(); + EnsureSize(a, b, &in_); + EnsureSize(a, b, &un_); + is_i_aligned_.resize(x->width(), false); + is_j_aligned_.resize(x->height(), false); + for (int i = 0; i < in_.width(); ++i) + for (int j = 0; j < in_.height(); ++j) { + un_(i, j) = Safe(a, i, j) || Safe(b, i, j); + in_(i, j) = Safe(a, i, j) && Safe(b, i, j); + } + } + // "grow" the intersection alignment with neighboring points + // from the union alignment + void Grow(Array2D<bool>* x) { + Array2D<bool>& res = *x; + queue<pair<int, int> > q; + for (int i = 0; i < in_.width(); ++i) + for (int j = 0; j < in_.height(); ++j) + if (in_(i, j)) { + Align(i, j, x); + q.push(make_pair(i, j)); + } + while(!q.empty()) { + const pair<int,int> point = q.front(); + q.pop(); + for (int k = 0; k < neighbors_.size(); ++k) { + const int test_i = neighbors_[k].first + point.first; + const int test_j = neighbors_[k].second + point.second; + if (Safe(un_, test_i, test_j) && !res(test_i, test_j)) { + Align(test_i, test_j, x); + q.push(make_pair(test_i, test_j)); + } + } + } + } + void Final(bool do_and, Array2D<bool>* x) { + } + void Align(int i, int j, Array2D<bool>* x) { + (*x)(i, j) = true; + is_i_aligned_[i] = true; + is_j_aligned_[j] = true; + } + Array2D<bool> in_; // intersection alignment + Array2D<bool> un_; // union alignment + vector<bool> is_i_aligned_; + vector<bool> is_j_aligned_; + vector<pair<int,int> > neighbors_; +}; + +struct DiagCommand : public RefineCommand { + DiagCommand() { + neighbors_.push_back(make_pair(1,1)); + neighbors_.push_back(make_pair(-1,1)); + neighbors_.push_back(make_pair(1,-1)); + neighbors_.push_back(make_pair(-1,-1)); + } +}; + +struct GDFCommand : public DiagCommand { + string Name() const { return "gdf"; } + void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) { + InitRefine(a, b, x); + Grow(x); + Final(false, x); + } +}; + map<string, boost::shared_ptr<Command> > commands; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -163,6 +257,8 @@ int main(int argc, char **argv) { AddCommand<ConvertCommand>(); AddCommand<InvertCommand>(); AddCommand<IntersectCommand>(); + AddCommand<UnionCommand>(); + AddCommand<GDFCommand>(); AddCommand<FMeasureCommand>(); po::variables_map conf; InitCommandLine(argc, argv, &conf); diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl index 9f7c1569..8b06f162 100755 --- a/training/cluster-ptrain.pl +++ b/training/cluster-ptrain.pl @@ -8,7 +8,7 @@ my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluati my $CWD=`pwd`; chomp $CWD; my $BIN_DIR = $SCRIPT_DIR; my $OPTIMIZER = "$BIN_DIR/mr_optimize_reduce"; -my $DECODER = "$BIN_DIR/../src/cdec"; +my $DECODER = "$BIN_DIR/../decoder/cdec"; my $COMBINER_CACHE_SIZE = 150; # This is a hack to run this on a weird cluster, # eventually, I'll provide Hadoop scripts. @@ -19,32 +19,35 @@ my $restart = ''; if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; } my $pmem="2500mb"; -my $nodes = 36; +my $nodes = 1; my $max_iteration = 1000; my $PRIOR_FLAG = ""; my $parallel = 1; my $CFLAG = "-C 1"; my $LOCAL; +my $DISTRIBUTED; my $PRIOR; my $OALG = "lbfgs"; my $sigsq = 1; my $means_file; -GetOptions("decoder=s" => \$DECODER, +GetOptions("cdec=s" => \$DECODER, "run_locally" => \$LOCAL, - "gaussian_prior" => \$PRIOR, + "distributed" => \$DISTRIBUTED, "sigma_squared=f" => \$sigsq, "means=s" => \$means_file, "optimizer=s" => \$OALG, + "jobs=i" => \$nodes, "pmem=s" => \$pmem ) or usage(); usage() unless scalar @ARGV==3; my $config_file = shift @ARGV; my $training_corpus = shift @ARGV; my $initial_weights = shift @ARGV; +unless ($DISTRIBUTED) { $LOCAL = 1; } die "Can't find $config_file" unless -f $config_file; die "Can't find $DECODER" unless -f $DECODER; die "Can't execute $DECODER" unless -x $DECODER; -if ($LOCAL) { print STDERR "Will running LOCALLY.\n"; $parallel = 0; } +if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; } if ($PRIOR) { $PRIOR_FLAG="-p --sigma_squared $sigsq"; if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; } @@ -56,20 +59,23 @@ if ($parallel) { } unless ($parallel) { $CFLAG = "-C 500"; } unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; } +my $clines = num_lines($training_corpus); print STDERR <<EOT; PTRAIN CONFIGURATION INFORMATION Config file: $config_file Training corpus: $training_corpus + Corpus size: $clines Initial weights: $initial_weights Decoder memory: $pmem - Nodes requested: $nodes Max iterations: $max_iteration Optimizer: $OALG - PRIOR: $PRIOR_FLAG - restart: $restart + Jobs requested: $nodes + prior?: $PRIOR_FLAG + restart?: $restart EOT + if ($OALG) { $OALG="-m $OALG"; } my $nodelist="1"; @@ -142,5 +148,33 @@ while ($iter < $max_iteration) { print "FINAL WEIGHTS: $dir/weights.$iter\n"; sub usage { - die "Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init\n"; + die <<EOT; + +Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init + + Options: + + --distributed Parallelize function evaluation + --cdec PATH Path to cdec binary + --optimize OPT lbfgs, rprop, sgd + --gaussian_prior add Gaussian prior + --means FILE if you want means other than 0 + --sigma_squared S variance on prior + --pmem MEM Memory required for decoder + +EOT +} + +sub num_lines { + my $file = shift; + my $fh; + if ($file=~ /\.gz$/) { + open $fh, "zcat $file|" or die "Couldn't fork zcat $file: $!"; + } else { + open $fh, "<$file" or die "Couldn't read $file: $!"; + } + my $lines = 0; + while(<$fh>) { $lines++; } + close $fh; + return $lines; } diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl index 0e290492..8cdf7718 100755 --- a/training/make-lexcrf-grammar.pl +++ b/training/make-lexcrf-grammar.pl @@ -17,23 +17,27 @@ while(<M1>) { } my $ADD_MODEL1 = 0; # found that model1 hurts performance -my $IS_FRENCH_F = 0; # indicates that the f language is french -my $IS_ARABIC_F = 1; # indicates that the f language is arabic +my $IS_FRENCH_F = 1; # indicates that the f language is french +my $IS_ARABIC_F = 0; # indicates that the f language is arabic +my $IS_URDU_F = 0; # indicates that the f language is arabic my $ADD_PREFIX_ID = 0; my $ADD_LEN = 1; -my $ADD_LD = 0; +my $ADD_SIM = 1; my $ADD_DICE = 1; my $ADD_111 = 1; my $ADD_ID = 1; my $ADD_PUNC = 1; my $ADD_NUM_MM = 1; my $ADD_NULL = 1; +my $ADD_STEM_ID = 1; my $BEAM_RATIO = 50; my %fdict; my %fcounts; my %ecounts; +my %sdict; + while(<EF>) { chomp; my ($f, $e) = split /\s*\|\|\|\s*/; @@ -56,10 +60,11 @@ print STDERR "PuncMiss 0\n" if $ADD_PUNC; print STDERR "IsNull 0\n" if $ADD_NULL; print STDERR "Model1 0\n" if $ADD_MODEL1; print STDERR "DLen 0\n" if $ADD_LEN; -print STDERR "NumMM 0\n" if $ADD_NUM_MM; -print STDERR "Level 0\n" if $ADD_LD; +print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM; +print STDERR "OrthoSim 0\n" if $ADD_SIM; print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID); my $fc = 1000000; +my $sids = 1000000; for my $f (sort keys %fdict) { my $re = $fdict{$f}; my $max; @@ -72,7 +77,6 @@ for my $f (sort keys %fdict) { my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); my $feats = "F$fc=1"; my $oe = $e; - my $len_e = length($oe); my $of = $f; # normalized form if ($IS_FRENCH_F) { # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French @@ -85,7 +89,27 @@ for my $f (sort keys %fdict) { if (length($of) > 1 && !($of =~ /\d/)) { $of =~ s/\$/sh/g; } + } elsif ($IS_URDU_F) { + if (length($of) > 1 && !($of =~ /\d/)) { + $of =~ s/\$/sh/g; + } + $oe =~ s/^-e-//; + $oe =~ s/^al-/al/; + $of =~ s/([a-z])\~/$1$1/g; + $of =~ s/E/'/g; + $of =~ s/^Aw/o/g; + $of =~ s/\|/a/g; + $of =~ s/@/h/g; + $of =~ s/c/ch/g; + $of =~ s/x/kh/g; + $of =~ s/\*/dh/g; + $of =~ s/w/o/g; + $of =~ s/Z/dh/g; + $of =~ s/y/i/g; + $of =~ s/Y/a/g; + $of = lc $of; } + my $len_e = length($oe); my $len_f = length($of); $feats .= " Model1=$m1" if ($ADD_MODEL1); $feats .= " Dice=$dice" if $ADD_DICE; @@ -100,12 +124,35 @@ for my $f (sort keys %fdict) { $feats .= " DLen=$dlen"; } } - my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/); # this matches *two digit* and more numbers - my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/); + my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); + my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); my $both_non_numeric = (!$e_num && !$f_num); if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) { $feats .= " NumMM=1"; } + if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) { + $feats .= " NumMatch=1"; + } + if ($ADD_STEM_ID) { + my $el = 4; + my $fl = 4; + if ($oe =~ /^al|re|co/) { $el++; } + if ($of =~ /^al|re|co/) { $fl++; } + if ($oe =~ /^trans|inter/) { $el+=2; } + if ($of =~ /^trans|inter/) { $fl+=2; } + if ($fl > length($of)) { $fl = length($of); } + if ($el > length($oe)) { $el = length($oe); } + my $sf = substr $of, 0, $fl; + my $se = substr $oe, 0, $el; + my $id = $sdict{$sf}->{$se}; + if (!$id) { + $sids++; + $sdict{$sf}->{$se} = $sids; + $id = $sids; + print STDERR "S$sids 0\n" + } + $feats .= " S$id=1"; + } if ($ADD_PREFIX_ID) { if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { my $pe = substr $oe, 0, 3; @@ -113,12 +160,14 @@ for my $f (sort keys %fdict) { if ($pe eq $pf) { $feats .= " PfxIdentical=1"; } } } - if ($ADD_LD) { + if ($ADD_SIM) { my $ld = 0; - if ($is_null) { $ld = length($e); } else { - $ld = levenshtein($e, $f); + my $eff = $len_e; + if ($eff < $len_f) { $eff = $len_f; } + if (!$is_null) { + $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); } - $feats .= " Leven=$ld"; + $feats .= " OrthoSim=$ld"; } my $ident = ($e eq $f); if ($ident && $ADD_ID) { $feats .= " Identical=1"; } diff --git a/vest/Makefile.am b/vest/Makefile.am index 87c2383a..d7d08133 100644 --- a/vest/Makefile.am +++ b/vest/Makefile.am @@ -8,25 +8,25 @@ bin_PROGRAMS = \ union_forests union_forests_SOURCES = union_forests.cc -union_forests_LDADD = $(top_srcdir)/src/libhg.a -lz +union_forests_LDADD = $(top_srcdir)/decoder/libcdec.a -lz fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc scorer.cc viterbi_envelope.cc -fast_score_LDADD = $(top_srcdir)/src/libhg.a -lz +fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz mr_vest_reduce_SOURCES = error_surface.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz +mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz scorer_test_SOURCES = scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc -scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz +scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz lo_test_SOURCES = lo_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 5528838c..31dbc61f 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -1,17 +1,16 @@ #!/usr/bin/env perl +use strict; +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } use Getopt::Long; use IPC::Open2; use strict; use POSIX ":sys_wait_h"; -my $mydir = `dirname $0`; -chomp $mydir; # Default settings -my $srcFile = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt"; -my $refFiles = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.*"; -my $bin_dir = "/fs/clip-software/cdec/bin"; -$bin_dir = "/Users/redpony/cdyer-svn-root/cdec/vest/bin_dir"; +my $srcFile; +my $refFiles; +my $bin_dir = $SCRIPT_DIR; die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; my $FAST_SCORE="$bin_dir/fast_score"; die "Can't find $FAST_SCORE" unless -x $FAST_SCORE; @@ -22,7 +21,7 @@ my $SCORER = $FAST_SCORE; die "Can't find $MAPPER" unless -x $MAPPER; my $forestUnion = "$bin_dir/union_forests"; die "Can't find $forestUnion" unless -x $forestUnion; -my $cdec = "$bin_dir/cdec"; +my $cdec = "$bin_dir/../decoder/cdec"; die "Can't find decoder in $cdec" unless -x $cdec; my $decoder = $cdec; my $lines_per_mapper = 440; @@ -153,7 +152,7 @@ $SIG{HUP} = "cleanup"; my $decoderBase = `basename $decoder`; chomp $decoderBase; my $newIniFile = "$dir/$decoderBase.ini"; -my $parallelize = "$mydir/parallelize.pl"; +my $parallelize = '/chomes/redpony/svn-trunk/sa-utils/parallelize.pl'; my $inputFileName = "$dir/input"; my $user = $ENV{"USER"}; @@ -254,15 +253,18 @@ while (1){ print LOGFILE "\nUNION FORESTS\n"; print LOGFILE `date`; my $mergeLog="$logdir/prune-merge.log.$iteration"; - $cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize"; - print LOGFILE "COMMAND:\n$cmd\n"; - $result = system($cmd); + `rm -rf $dir/hgs`; + `mv $dir/hgs-current $dir/hgs`; + #$cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize"; + #print LOGFILE "COMMAND:\n$cmd\n"; + #$result = system($cmd); unless ($result == 0){ cleanup(); print LOGFILE "ERROR: merge command returned non-zero exit code $result\n"; die; } `rm -f $dir/hgs-current/*.json.gz`; # clean up old HGs, they've been moved to the repository + `mkdir -p $dir/hgs-current`; my $score = 0; my $icc = 0; @@ -303,7 +305,7 @@ while (1){ my $mapoutput = $shard; my $client_name = $shard; $client_name =~ s/mapinput.//; - $client_name = "fmert.$client_name"; + $client_name = "vest.$client_name"; $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; @@ -548,36 +550,9 @@ Options: --decoder <decoder path> Decoder binary to use. - --decode-nodes <nodelist> - A list of nodes used for parallel decoding. If specific nodes - are not desired, use "1" for each node requested. Defaults to - "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", which indicates a request for - 15 nodes. - - --dont-clean - If present, this flag prevents intermediate files, including - run files and cumulative files, from being automatically removed - after a successful optimization run (these files are left if the - run fails for any reason). If used, a makefile containing - cleanup commands is written to the directory. To clean up - the intermediate files, invoke make without any arguments. - - --dry-run - Prints out the settings and exits without doing anything. - - --epsilon <epsilon> - Require that the dev set BLEU score improve by at least <epsilon> - within <interval> iterations (controlled by parameter --interval). - If not specified, defaults to .002. - --help Print this message and exit. - --interval <i> - Require that the dev set BLEU score improve by at least <epsilon> - (controlled by parameter --epsilon) within <interval> iterations. - If not specified, defaults to 5. - --iteration <I> Starting iteration number. If not specified, defaults to 1. @@ -586,18 +561,15 @@ Options: to 10. --pmem <N> - Amount of physical memory requested for parallel decoding jobs, - in the format expected by qsub. If not specified, defaults to - 2g. + Amount of physical memory requested for parallel decoding jobs. --ref-files <files> Dev set ref files. This option takes only a single string argument. To use multiple files (including file globbing), this argument should - be quoted. If not specified, defaults to - /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.* + be quoted. --metric <method> - Metric to optimize. See fmert's --metric option for values. + Metric to optimize. Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi --normalize <feature-name> @@ -609,8 +581,7 @@ Options: set this parameter to explore other directions. Defaults to 5. --source-file <file> - Dev set source file. If not specified, defaults to - /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt + Dev set source file. --weights <file> A file specifying initial feature weights. The format is |