summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Chris Dyer <redpony@gmail.com>, 2009-12-14 20:35:11 -0500
committer: Chris Dyer <redpony@gmail.com>, 2009-12-14 20:35:11 -0500
commit: 851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree: 8c68ee77205badc056b8ab5b332e67e3e98017df
parent: dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
-rw-r--r-- .gitignore | 28
-rw-r--r-- Makefile.am | 2
-rwxr-xr-x compound-split/compound-split.pl | 2
-rw-r--r-- configure.ac | 2
-rw-r--r-- decoder/JSON_parser.c (renamed from src/JSON_parser.c) | 0
-rw-r--r-- decoder/JSON_parser.h (renamed from src/JSON_parser.h) | 0
-rw-r--r-- decoder/Makefile.am (renamed from src/Makefile.am) | 22
-rw-r--r-- decoder/aligner.cc (renamed from src/aligner.cc) | 0
-rw-r--r-- decoder/aligner.h (renamed from src/aligner.h) | 0
-rw-r--r-- decoder/apply_models.cc (renamed from src/apply_models.cc) | 0
-rw-r--r-- decoder/apply_models.h (renamed from src/apply_models.h) | 0
-rw-r--r-- decoder/array2d.h (renamed from src/array2d.h) | 0
-rw-r--r-- decoder/bottom_up_parser.cc (renamed from src/bottom_up_parser.cc) | 0
-rw-r--r-- decoder/bottom_up_parser.h (renamed from src/bottom_up_parser.h) | 0
-rw-r--r-- decoder/cdec.cc (renamed from src/cdec.cc) | 0
-rw-r--r-- decoder/cdec_ff.cc (renamed from src/cdec_ff.cc) | 0
-rw-r--r-- decoder/csplit.cc (renamed from src/csplit.cc) | 0
-rw-r--r-- decoder/csplit.h (renamed from src/csplit.h) | 0
-rw-r--r-- decoder/dict.h (renamed from src/dict.h) | 0
-rw-r--r-- decoder/dict_test.cc (renamed from src/dict_test.cc) | 0
-rw-r--r-- decoder/earley_composer.cc (renamed from src/earley_composer.cc) | 0
-rw-r--r-- decoder/earley_composer.h (renamed from src/earley_composer.h) | 0
-rw-r--r-- decoder/exp_semiring.h (renamed from src/exp_semiring.h) | 0
-rw-r--r-- decoder/fdict.cc (renamed from src/fdict.cc) | 0
-rw-r--r-- decoder/fdict.h (renamed from src/fdict.h) | 0
-rw-r--r-- decoder/ff.cc (renamed from src/ff.cc) | 0
-rw-r--r-- decoder/ff.h (renamed from src/ff.h) | 0
-rw-r--r-- decoder/ff_csplit.cc (renamed from src/ff_csplit.cc) | 0
-rw-r--r-- decoder/ff_csplit.h (renamed from src/ff_csplit.h) | 0
-rw-r--r-- decoder/ff_factory.cc (renamed from src/ff_factory.cc) | 0
-rw-r--r-- decoder/ff_factory.h (renamed from src/ff_factory.h) | 0
-rw-r--r-- decoder/ff_lm.cc (renamed from src/ff_lm.cc) | 0
-rw-r--r-- decoder/ff_lm.h (renamed from src/ff_lm.h) | 0
-rw-r--r-- decoder/ff_test.cc (renamed from src/ff_test.cc) | 0
-rw-r--r-- decoder/ff_wordalign.cc (renamed from src/ff_wordalign.cc) | 23
-rw-r--r-- decoder/ff_wordalign.h (renamed from src/ff_wordalign.h) | 3
-rw-r--r-- decoder/filelib.cc (renamed from src/filelib.cc) | 0
-rw-r--r-- decoder/filelib.h (renamed from src/filelib.h) | 0
-rw-r--r-- decoder/forest_writer.cc (renamed from src/forest_writer.cc) | 0
-rw-r--r-- decoder/forest_writer.h (renamed from src/forest_writer.h) | 0
-rw-r--r-- decoder/freqdict.cc (renamed from src/freqdict.cc) | 0
-rw-r--r-- decoder/freqdict.h (renamed from src/freqdict.h) | 0
-rw-r--r-- decoder/fst_translator.cc (renamed from src/fst_translator.cc) | 0
-rw-r--r-- decoder/grammar.cc (renamed from src/grammar.cc) | 0
-rw-r--r-- decoder/grammar.h (renamed from src/grammar.h) | 0
-rw-r--r-- decoder/grammar_test.cc (renamed from src/grammar_test.cc) | 0
-rw-r--r-- decoder/gzstream.cc (renamed from src/gzstream.cc) | 0
-rw-r--r-- decoder/gzstream.h (renamed from src/gzstream.h) | 0
-rw-r--r-- decoder/hg.cc (renamed from src/hg.cc) | 0
-rw-r--r-- decoder/hg.h (renamed from src/hg.h) | 0
-rw-r--r-- decoder/hg_intersect.cc (renamed from src/hg_intersect.cc) | 0
-rw-r--r-- decoder/hg_intersect.h (renamed from src/hg_intersect.h) | 0
-rw-r--r-- decoder/hg_io.cc (renamed from src/hg_io.cc) | 1
-rw-r--r-- decoder/hg_io.h (renamed from src/hg_io.h) | 0
-rw-r--r-- decoder/hg_test.cc (renamed from src/hg_test.cc) | 0
-rw-r--r-- decoder/inside_outside.h (renamed from src/inside_outside.h) | 0
-rw-r--r-- decoder/json_parse.cc (renamed from src/json_parse.cc) | 0
-rw-r--r-- decoder/json_parse.h (renamed from src/json_parse.h) | 0
-rw-r--r-- decoder/kbest.h (renamed from src/kbest.h) | 0
-rw-r--r-- decoder/lattice.cc (renamed from src/lattice.cc) | 0
-rw-r--r-- decoder/lattice.h (renamed from src/lattice.h) | 0
-rw-r--r-- decoder/lexcrf.cc (renamed from src/lexcrf.cc) | 0
-rw-r--r-- decoder/lexcrf.h (renamed from src/lexcrf.h) | 0
-rw-r--r-- decoder/logval.h (renamed from src/logval.h) | 0
-rw-r--r-- decoder/maxtrans_blunsom.cc (renamed from src/maxtrans_blunsom.cc) | 0
-rw-r--r-- decoder/parser_test.cc (renamed from src/parser_test.cc) | 0
-rw-r--r-- decoder/phrasebased_translator.cc (renamed from src/phrasebased_translator.cc) | 0
-rw-r--r-- decoder/phrasebased_translator.h (renamed from src/phrasebased_translator.h) | 0
-rw-r--r-- decoder/phrasetable_fst.cc (renamed from src/phrasetable_fst.cc) | 0
-rw-r--r-- decoder/phrasetable_fst.h (renamed from src/phrasetable_fst.h) | 0
-rw-r--r-- decoder/prob.h (renamed from src/prob.h) | 0
-rw-r--r-- decoder/sampler.h (renamed from src/sampler.h) | 0
-rw-r--r-- decoder/scfg_translator.cc (renamed from src/scfg_translator.cc) | 0
-rw-r--r-- decoder/sentence_metadata.h (renamed from src/sentence_metadata.h) | 0
-rw-r--r-- decoder/small_vector.h (renamed from src/small_vector.h) | 0
-rw-r--r-- decoder/small_vector_test.cc (renamed from src/small_vector_test.cc) | 0
-rw-r--r-- decoder/sparse_vector.cc (renamed from src/sparse_vector.cc) | 0
-rw-r--r-- decoder/sparse_vector.h (renamed from src/sparse_vector.h) | 0
-rw-r--r-- decoder/stringlib.cc (renamed from src/stringlib.cc) | 0
-rw-r--r-- decoder/stringlib.h (renamed from src/stringlib.h) | 0
-rw-r--r-- decoder/tdict.cc (renamed from src/tdict.cc) | 0
-rw-r--r-- decoder/tdict.h (renamed from src/tdict.h) | 0
-rw-r--r-- decoder/test_data/dummy.3gram.lm (renamed from src/test_data/dummy.3gram.lm) | 0
-rw-r--r-- decoder/test_data/grammar.prune (renamed from src/test_data/grammar.prune) | 0
-rw-r--r-- decoder/test_data/small.json.gz (renamed from src/test_data/small.json.gz) | bin 1561 -> 1561 bytes
-rw-r--r-- decoder/test_data/test_2gram.lm.gz (renamed from src/test_data/test_2gram.lm.gz) | bin 587 -> 587 bytes
-rw-r--r-- decoder/test_data/weights (renamed from src/test_data/weights) | 0
-rw-r--r-- decoder/test_data/weights.gt (renamed from src/test_data/weights.gt) | 0
-rw-r--r-- decoder/timing_stats.cc (renamed from src/timing_stats.cc) | 0
-rw-r--r-- decoder/timing_stats.h (renamed from src/timing_stats.h) | 0
-rw-r--r-- decoder/translator.h (renamed from src/translator.h) | 0
-rw-r--r-- decoder/trule.cc (renamed from src/trule.cc) | 0
-rw-r--r-- decoder/trule.h (renamed from src/trule.h) | 0
-rw-r--r-- decoder/trule_test.cc (renamed from src/trule_test.cc) | 0
-rw-r--r-- decoder/ttables.cc (renamed from src/ttables.cc) | 0
-rw-r--r-- decoder/ttables.h (renamed from src/ttables.h) | 0
-rw-r--r-- decoder/viterbi.cc (renamed from src/viterbi.cc) | 0
-rw-r--r-- decoder/viterbi.h (renamed from src/viterbi.h) | 0
-rw-r--r-- decoder/weights.cc (renamed from src/weights.cc) | 0
-rw-r--r-- decoder/weights.h (renamed from src/weights.h) | 0
-rw-r--r-- decoder/weights_test.cc (renamed from src/weights_test.cc) | 0
-rw-r--r-- decoder/wordid.h (renamed from src/wordid.h) | 0
-rwxr-xr-x tests/run-system-tests.pl | 2
-rw-r--r-- training/Makefile.am | 20
-rw-r--r-- training/atools.cc | 96
-rwxr-xr-x training/cluster-ptrain.pl | 52
-rwxr-xr-x training/make-lexcrf-grammar.pl | 73
-rw-r--r-- vest/Makefile.am | 16
-rwxr-xr-x vest/dist-vest.pl | 65
109 files changed, 290 insertions, 117 deletions
diff --git a/.gitignore b/.gitignore
index 76e8610f..d2fb0f82 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,9 @@
config.h.in~
-src/ff_test
-src/grammar_test
-src/hg_test
-src/parser_test
-src/small_vector_test
+decoder/ff_test
+decoder/grammar_test
+decoder/hg_test
+decoder/parser_test
+decoder/small_vector_test
training/atools
training/collapse_weights
training/lbfgs_test
@@ -29,15 +29,15 @@ configure
depcomp
install-sh
missing
-src/.deps/
-src/*.o
-src/Makefile
-src/Makefile.in
-src/cdec
-src/dict_test
-src/libhg.a
-src/trule_test
-src/weights_test
+decoder/.deps/
+decoder/*.o
+decoder/Makefile
+decoder/Makefile.in
+decoder/cdec
+decoder/dict_test
+decoder/libcdec.a
+decoder/trule_test
+decoder/weights_test
stamp-h1
training/.deps/
training/Makefile
diff --git a/Makefile.am b/Makefile.am
index c3780d88..b0e750f6 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,4 @@
-SUBDIRS = src training vest
+SUBDIRS = decoder training vest
AUTOMAKE_OPTIONS = foreign
ACLOCAL_AMFLAGS = -I m4
diff --git a/compound-split/compound-split.pl b/compound-split/compound-split.pl
index beca4dc0..490a5bc5 100755
--- a/compound-split/compound-split.pl
+++ b/compound-split/compound-split.pl
@@ -5,7 +5,7 @@ my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir
use Getopt::Long;
use IPC::Open2;
-my $CDEC = "$script_dir/../src/cdec";
+my $CDEC = "$script_dir/../decoder/cdec";
my $LANG = 'de';
my $BEAM = 2.1;
diff --git a/configure.ac b/configure.ac
index c18342b3..0fd43e08 100644
--- a/configure.ac
+++ b/configure.ac
@@ -42,5 +42,5 @@ then
AM_CONDITIONAL([SRI_LM], true)
fi
-AC_OUTPUT(Makefile src/Makefile training/Makefile vest/Makefile)
+AC_OUTPUT(Makefile decoder/Makefile training/Makefile vest/Makefile)
diff --git a/src/JSON_parser.c b/decoder/JSON_parser.c
index 175b7cc9..175b7cc9 100644
--- a/src/JSON_parser.c
+++ b/decoder/JSON_parser.c
diff --git a/src/JSON_parser.h b/decoder/JSON_parser.h
index ceb5b24b..ceb5b24b 100644
--- a/src/JSON_parser.h
+++ b/decoder/JSON_parser.h
diff --git a/src/Makefile.am b/decoder/Makefile.am
index 4d0459ef..f3843102 100644
--- a/src/Makefile.am
+++ b/decoder/Makefile.am
@@ -11,30 +11,30 @@ bin_PROGRAMS = \
cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc
small_vector_test_SOURCES = small_vector_test.cc
-small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
parser_test_SOURCES = parser_test.cc
-parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
dict_test_SOURCES = dict_test.cc
-dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
ff_test_SOURCES = ff_test.cc
-ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
grammar_test_SOURCES = grammar_test.cc
-grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+grammar_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
hg_test_SOURCES = hg_test.cc
-hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
trule_test_SOURCES = trule_test.cc
-trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
weights_test_SOURCES = weights_test.cc
-weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libhg.a
+weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
-LDADD = libhg.a
+LDADD = libcdec.a
AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS)
AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB) -lz
-noinst_LIBRARIES = libhg.a
+noinst_LIBRARIES = libcdec.a
-libhg_a_SOURCES = \
+libcdec_a_SOURCES = \
fst_translator.cc \
csplit.cc \
scfg_translator.cc \
diff --git a/src/aligner.cc b/decoder/aligner.cc
index d9d067e5..d9d067e5 100644
--- a/src/aligner.cc
+++ b/decoder/aligner.cc
diff --git a/src/aligner.h b/decoder/aligner.h
index 970c72f2..970c72f2 100644
--- a/src/aligner.h
+++ b/decoder/aligner.h
diff --git a/src/apply_models.cc b/decoder/apply_models.cc
index b1d002f4..b1d002f4 100644
--- a/src/apply_models.cc
+++ b/decoder/apply_models.cc
diff --git a/src/apply_models.h b/decoder/apply_models.h
index 08fce037..08fce037 100644
--- a/src/apply_models.h
+++ b/decoder/apply_models.h
diff --git a/src/array2d.h b/decoder/array2d.h
index e63eda0d..e63eda0d 100644
--- a/src/array2d.h
+++ b/decoder/array2d.h
diff --git a/src/bottom_up_parser.cc b/decoder/bottom_up_parser.cc
index b3315b8a..b3315b8a 100644
--- a/src/bottom_up_parser.cc
+++ b/decoder/bottom_up_parser.cc
diff --git a/src/bottom_up_parser.h b/decoder/bottom_up_parser.h
index 546bfb54..546bfb54 100644
--- a/src/bottom_up_parser.h
+++ b/decoder/bottom_up_parser.h
diff --git a/src/cdec.cc b/decoder/cdec.cc
index 6185c79b..6185c79b 100644
--- a/src/cdec.cc
+++ b/decoder/cdec.cc
diff --git a/src/cdec_ff.cc b/decoder/cdec_ff.cc
index 0a4f3d5e..0a4f3d5e 100644
--- a/src/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
diff --git a/src/csplit.cc b/decoder/csplit.cc
index 47197782..47197782 100644
--- a/src/csplit.cc
+++ b/decoder/csplit.cc
diff --git a/src/csplit.h b/decoder/csplit.h
index ce6295c1..ce6295c1 100644
--- a/src/csplit.h
+++ b/decoder/csplit.h
diff --git a/src/dict.h b/decoder/dict.h
index bae9debe..bae9debe 100644
--- a/src/dict.h
+++ b/decoder/dict.h
diff --git a/src/dict_test.cc b/decoder/dict_test.cc
index 5c5d84f0..5c5d84f0 100644
--- a/src/dict_test.cc
+++ b/decoder/dict_test.cc
diff --git a/src/earley_composer.cc b/decoder/earley_composer.cc
index a59686e0..a59686e0 100644
--- a/src/earley_composer.cc
+++ b/decoder/earley_composer.cc
diff --git a/src/earley_composer.h b/decoder/earley_composer.h
index 9f786bf6..9f786bf6 100644
--- a/src/earley_composer.h
+++ b/decoder/earley_composer.h
diff --git a/src/exp_semiring.h b/decoder/exp_semiring.h
index f91beee4..f91beee4 100644
--- a/src/exp_semiring.h
+++ b/decoder/exp_semiring.h
diff --git a/src/fdict.cc b/decoder/fdict.cc
index 83aa7cea..83aa7cea 100644
--- a/src/fdict.cc
+++ b/decoder/fdict.cc
diff --git a/src/fdict.h b/decoder/fdict.h
index ff491cfb..ff491cfb 100644
--- a/src/fdict.h
+++ b/decoder/fdict.h
diff --git a/src/ff.cc b/decoder/ff.cc
index 2ae5b9eb..2ae5b9eb 100644
--- a/src/ff.cc
+++ b/decoder/ff.cc
diff --git a/src/ff.h b/decoder/ff.h
index e962b4ba..e962b4ba 100644
--- a/src/ff.h
+++ b/decoder/ff.h
diff --git a/src/ff_csplit.cc b/decoder/ff_csplit.cc
index cac4bb8e..cac4bb8e 100644
--- a/src/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
diff --git a/src/ff_csplit.h b/decoder/ff_csplit.h
index c1cfb64b..c1cfb64b 100644
--- a/src/ff_csplit.h
+++ b/decoder/ff_csplit.h
diff --git a/src/ff_factory.cc b/decoder/ff_factory.cc
index 1854e0bb..1854e0bb 100644
--- a/src/ff_factory.cc
+++ b/decoder/ff_factory.cc
diff --git a/src/ff_factory.h b/decoder/ff_factory.h
index bc586567..bc586567 100644
--- a/src/ff_factory.h
+++ b/decoder/ff_factory.h
diff --git a/src/ff_lm.cc b/decoder/ff_lm.cc
index 354787ec..354787ec 100644
--- a/src/ff_lm.cc
+++ b/decoder/ff_lm.cc
diff --git a/src/ff_lm.h b/decoder/ff_lm.h
index cd717360..cd717360 100644
--- a/src/ff_lm.h
+++ b/decoder/ff_lm.h
diff --git a/src/ff_test.cc b/decoder/ff_test.cc
index babaf985..babaf985 100644
--- a/src/ff_test.cc
+++ b/decoder/ff_test.cc
diff --git a/src/ff_wordalign.cc b/decoder/ff_wordalign.cc
index e605ac8d..a00b2c76 100644
--- a/src/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -14,7 +14,16 @@
using namespace std;
RelativeSentencePosition::RelativeSentencePosition(const string& param) :
- fid_(FD::Convert("RelativeSentencePosition")) {}
+ fid_(FD::Convert("RelativeSentencePosition")) {
+ if (!param.empty()) {
+ cerr << " Loading word classes from " << param << endl;
+ condition_on_fclass_ = true;
+ template_ = "RSP:FC000";
+ assert(!"not implemented");
+ } else {
+ condition_on_fclass_ = false;
+ }
+}
void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& smeta,
const Hypergraph::Edge& edge,
@@ -31,6 +40,9 @@ void RelativeSentencePosition::TraversalFeaturesImpl(const SentenceMetadata& sme
const double val = fabs(static_cast<double>(edge.i_) / smeta.GetSourceLength() -
static_cast<double>(edge.prev_i_) / smeta.GetTargetLength());
features->set_value(fid_, val);
+ if (condition_on_fclass_) {
+ assert(!"not implemented");
+ }
// cerr << f_len_ << " " << e_len_ << " [" << edge.i_ << "," << edge.j_ << "|" << edge.prev_i_ << "," << edge.prev_j_ << "]\t" << edge.rule_->AsString() << "\tVAL=" << val << endl;
}
@@ -39,10 +51,15 @@ MarkovJump::MarkovJump(const string& param) :
fid_(FD::Convert("MarkovJump")),
individual_params_per_jumpsize_(false),
condition_on_flen_(false) {
- cerr << " MarkovJump: Blunsom&Cohn feature";
+ cerr << " MarkovJump";
vector<string> argv;
int argc = SplitOnWhitespace(param, &argv);
if (argc > 0) {
+ if (argv[0] == "--fclasses") {
+ argc--;
+ assert(argc > 0);
+ const string f_class_file = argv[1];
+ }
if (argc != 1 || !(argv[0] == "-f" || argv[0] == "-i" || argv[0] == "-if")) {
cerr << "MarkovJump: expected parameters to be -f, -i, or -if\n";
exit(1);
@@ -57,6 +74,8 @@ MarkovJump::MarkovJump(const string& param) :
cerr << " (split by f-length)";
}
}
+ } else {
+ cerr << " (Blunsom & Cohn definition)";
}
cerr << endl;
}
diff --git a/src/ff_wordalign.h b/decoder/ff_wordalign.h
index 1581641c..4a8b59c7 100644
--- a/src/ff_wordalign.h
+++ b/decoder/ff_wordalign.h
@@ -16,6 +16,8 @@ class RelativeSentencePosition : public FeatureFunction {
void* out_context) const;
private:
const int fid_;
+ bool condition_on_fclass_;
+ std::string template_;
};
class MarkovJump : public FeatureFunction {
@@ -32,6 +34,7 @@ class MarkovJump : public FeatureFunction {
const int fid_;
bool individual_params_per_jumpsize_;
bool condition_on_flen_;
+ bool condition_on_fclass_;
std::string template_;
};
diff --git a/src/filelib.cc b/decoder/filelib.cc
index 79ad2847..79ad2847 100644
--- a/src/filelib.cc
+++ b/decoder/filelib.cc
diff --git a/src/filelib.h b/decoder/filelib.h
index 62cb9427..62cb9427 100644
--- a/src/filelib.h
+++ b/decoder/filelib.h
diff --git a/src/forest_writer.cc b/decoder/forest_writer.cc
index a9117d18..a9117d18 100644
--- a/src/forest_writer.cc
+++ b/decoder/forest_writer.cc
diff --git a/src/forest_writer.h b/decoder/forest_writer.h
index 819a8940..819a8940 100644
--- a/src/forest_writer.h
+++ b/decoder/forest_writer.h
diff --git a/src/freqdict.cc b/decoder/freqdict.cc
index 9e25d346..9e25d346 100644
--- a/src/freqdict.cc
+++ b/decoder/freqdict.cc
diff --git a/src/freqdict.h b/decoder/freqdict.h
index 9acf0c33..9acf0c33 100644
--- a/src/freqdict.h
+++ b/decoder/freqdict.h
diff --git a/src/fst_translator.cc b/decoder/fst_translator.cc
index 57feb227..57feb227 100644
--- a/src/fst_translator.cc
+++ b/decoder/fst_translator.cc
diff --git a/src/grammar.cc b/decoder/grammar.cc
index e19bd344..e19bd344 100644
--- a/src/grammar.cc
+++ b/decoder/grammar.cc
diff --git a/src/grammar.h b/decoder/grammar.h
index 3471e3f1..3471e3f1 100644
--- a/src/grammar.h
+++ b/decoder/grammar.h
diff --git a/src/grammar_test.cc b/decoder/grammar_test.cc
index 62b8f958..62b8f958 100644
--- a/src/grammar_test.cc
+++ b/decoder/grammar_test.cc
diff --git a/src/gzstream.cc b/decoder/gzstream.cc
index 9703e6ad..9703e6ad 100644
--- a/src/gzstream.cc
+++ b/decoder/gzstream.cc
diff --git a/src/gzstream.h b/decoder/gzstream.h
index ad9785fd..ad9785fd 100644
--- a/src/gzstream.h
+++ b/decoder/gzstream.h
diff --git a/src/hg.cc b/decoder/hg.cc
index 7bd79394..7bd79394 100644
--- a/src/hg.cc
+++ b/decoder/hg.cc
diff --git a/src/hg.h b/decoder/hg.h
index 7a2658b8..7a2658b8 100644
--- a/src/hg.h
+++ b/decoder/hg.h
diff --git a/src/hg_intersect.cc b/decoder/hg_intersect.cc
index a5e8913a..a5e8913a 100644
--- a/src/hg_intersect.cc
+++ b/decoder/hg_intersect.cc
diff --git a/src/hg_intersect.h b/decoder/hg_intersect.h
index 826bdaae..826bdaae 100644
--- a/src/hg_intersect.h
+++ b/decoder/hg_intersect.h
diff --git a/src/hg_io.cc b/decoder/hg_io.cc
index e21b1714..243106b8 100644
--- a/src/hg_io.cc
+++ b/decoder/hg_io.cc
@@ -443,6 +443,7 @@ void ReadPLFEdge(const std::string& in, int &c, int cur_node, Hypergraph* hg) {
vector<WordID> ewords(2, 0);
ewords[1] = TD::Convert(getEscapedString(in,c));
TRulePtr r(new TRule(ewords));
+ r->ComputeArity();
// cerr << "RULE: " << r->AsString() << endl;
if (get(in,c++) != ',') { assert(!"PCN/PLF parse error: expected , after string\n"); }
size_t cnNext = 1;
diff --git a/src/hg_io.h b/decoder/hg_io.h
index 69a516c1..69a516c1 100644
--- a/src/hg_io.h
+++ b/decoder/hg_io.h
diff --git a/src/hg_test.cc b/decoder/hg_test.cc
index ecd97508..ecd97508 100644
--- a/src/hg_test.cc
+++ b/decoder/hg_test.cc
diff --git a/src/inside_outside.h b/decoder/inside_outside.h
index 9114c9d7..9114c9d7 100644
--- a/src/inside_outside.h
+++ b/decoder/inside_outside.h
diff --git a/src/json_parse.cc b/decoder/json_parse.cc
index f6fdfea8..f6fdfea8 100644
--- a/src/json_parse.cc
+++ b/decoder/json_parse.cc
diff --git a/src/json_parse.h b/decoder/json_parse.h
index c3cba954..c3cba954 100644
--- a/src/json_parse.h
+++ b/decoder/json_parse.h
diff --git a/src/kbest.h b/decoder/kbest.h
index cd9b6c2b..cd9b6c2b 100644
--- a/src/kbest.h
+++ b/decoder/kbest.h
diff --git a/src/lattice.cc b/decoder/lattice.cc
index 56bc9551..56bc9551 100644
--- a/src/lattice.cc
+++ b/decoder/lattice.cc
diff --git a/src/lattice.h b/decoder/lattice.h
index 71589b92..71589b92 100644
--- a/src/lattice.h
+++ b/decoder/lattice.h
diff --git a/src/lexcrf.cc b/decoder/lexcrf.cc
index 33455a3d..33455a3d 100644
--- a/src/lexcrf.cc
+++ b/decoder/lexcrf.cc
diff --git a/src/lexcrf.h b/decoder/lexcrf.h
index 99362c81..99362c81 100644
--- a/src/lexcrf.h
+++ b/decoder/lexcrf.h
diff --git a/src/logval.h b/decoder/logval.h
index a8ca620c..a8ca620c 100644
--- a/src/logval.h
+++ b/decoder/logval.h
diff --git a/src/maxtrans_blunsom.cc b/decoder/maxtrans_blunsom.cc
index 4a6680e0..4a6680e0 100644
--- a/src/maxtrans_blunsom.cc
+++ b/decoder/maxtrans_blunsom.cc
diff --git a/src/parser_test.cc b/decoder/parser_test.cc
index da1fbd89..da1fbd89 100644
--- a/src/parser_test.cc
+++ b/decoder/parser_test.cc
diff --git a/src/phrasebased_translator.cc b/decoder/phrasebased_translator.cc
index 5eb70876..5eb70876 100644
--- a/src/phrasebased_translator.cc
+++ b/decoder/phrasebased_translator.cc
diff --git a/src/phrasebased_translator.h b/decoder/phrasebased_translator.h
index d42ce79c..d42ce79c 100644
--- a/src/phrasebased_translator.h
+++ b/decoder/phrasebased_translator.h
diff --git a/src/phrasetable_fst.cc b/decoder/phrasetable_fst.cc
index f421e941..f421e941 100644
--- a/src/phrasetable_fst.cc
+++ b/decoder/phrasetable_fst.cc
diff --git a/src/phrasetable_fst.h b/decoder/phrasetable_fst.h
index 477de1f7..477de1f7 100644
--- a/src/phrasetable_fst.h
+++ b/decoder/phrasetable_fst.h
diff --git a/src/prob.h b/decoder/prob.h
index bc297870..bc297870 100644
--- a/src/prob.h
+++ b/decoder/prob.h
diff --git a/src/sampler.h b/decoder/sampler.h
index e5840f41..e5840f41 100644
--- a/src/sampler.h
+++ b/decoder/sampler.h
diff --git a/src/scfg_translator.cc b/decoder/scfg_translator.cc
index 03602c6b..03602c6b 100644
--- a/src/scfg_translator.cc
+++ b/decoder/scfg_translator.cc
diff --git a/src/sentence_metadata.h b/decoder/sentence_metadata.h
index ef9eb388..ef9eb388 100644
--- a/src/sentence_metadata.h
+++ b/decoder/sentence_metadata.h
diff --git a/src/small_vector.h b/decoder/small_vector.h
index 800c1df1..800c1df1 100644
--- a/src/small_vector.h
+++ b/decoder/small_vector.h
diff --git a/src/small_vector_test.cc b/decoder/small_vector_test.cc
index 84237791..84237791 100644
--- a/src/small_vector_test.cc
+++ b/decoder/small_vector_test.cc
diff --git a/src/sparse_vector.cc b/decoder/sparse_vector.cc
index 4035b9ef..4035b9ef 100644
--- a/src/sparse_vector.cc
+++ b/decoder/sparse_vector.cc
diff --git a/src/sparse_vector.h b/decoder/sparse_vector.h
index 6a8c9bf4..6a8c9bf4 100644
--- a/src/sparse_vector.h
+++ b/decoder/sparse_vector.h
diff --git a/src/stringlib.cc b/decoder/stringlib.cc
index 3ed74bef..3ed74bef 100644
--- a/src/stringlib.cc
+++ b/decoder/stringlib.cc
diff --git a/src/stringlib.h b/decoder/stringlib.h
index 76efee8f..76efee8f 100644
--- a/src/stringlib.h
+++ b/decoder/stringlib.h
diff --git a/src/tdict.cc b/decoder/tdict.cc
index c00d20b8..c00d20b8 100644
--- a/src/tdict.cc
+++ b/decoder/tdict.cc
diff --git a/src/tdict.h b/decoder/tdict.h
index 9d4318fe..9d4318fe 100644
--- a/src/tdict.h
+++ b/decoder/tdict.h
diff --git a/src/test_data/dummy.3gram.lm b/decoder/test_data/dummy.3gram.lm
index ae665284..ae665284 100644
--- a/src/test_data/dummy.3gram.lm
+++ b/decoder/test_data/dummy.3gram.lm
diff --git a/src/test_data/grammar.prune b/decoder/test_data/grammar.prune
index 4ebcb509..4ebcb509 100644
--- a/src/test_data/grammar.prune
+++ b/decoder/test_data/grammar.prune
diff --git a/src/test_data/small.json.gz b/decoder/test_data/small.json.gz
index 892ba360..892ba360 100644
--- a/src/test_data/small.json.gz
+++ b/decoder/test_data/small.json.gz
Binary files differ
diff --git a/src/test_data/test_2gram.lm.gz b/decoder/test_data/test_2gram.lm.gz
index aafa7274..aafa7274 100644
--- a/src/test_data/test_2gram.lm.gz
+++ b/decoder/test_data/test_2gram.lm.gz
Binary files differ
diff --git a/src/test_data/weights b/decoder/test_data/weights
index ea70229c..ea70229c 100644
--- a/src/test_data/weights
+++ b/decoder/test_data/weights
diff --git a/src/test_data/weights.gt b/decoder/test_data/weights.gt
index 08931049..08931049 100644
--- a/src/test_data/weights.gt
+++ b/decoder/test_data/weights.gt
diff --git a/src/timing_stats.cc b/decoder/timing_stats.cc
index 85b95de5..85b95de5 100644
--- a/src/timing_stats.cc
+++ b/decoder/timing_stats.cc
diff --git a/src/timing_stats.h b/decoder/timing_stats.h
index 0a9f7656..0a9f7656 100644
--- a/src/timing_stats.h
+++ b/decoder/timing_stats.h
diff --git a/src/translator.h b/decoder/translator.h
index 194efbaa..194efbaa 100644
--- a/src/translator.h
+++ b/decoder/translator.h
diff --git a/src/trule.cc b/decoder/trule.cc
index b8f6995e..b8f6995e 100644
--- a/src/trule.cc
+++ b/decoder/trule.cc
diff --git a/src/trule.h b/decoder/trule.h
index d2b1babe..d2b1babe 100644
--- a/src/trule.h
+++ b/decoder/trule.h
diff --git a/src/trule_test.cc b/decoder/trule_test.cc
index 02a70764..02a70764 100644
--- a/src/trule_test.cc
+++ b/decoder/trule_test.cc
diff --git a/src/ttables.cc b/decoder/ttables.cc
index 2ea960f0..2ea960f0 100644
--- a/src/ttables.cc
+++ b/decoder/ttables.cc
diff --git a/src/ttables.h b/decoder/ttables.h
index 3ffc238a..3ffc238a 100644
--- a/src/ttables.h
+++ b/decoder/ttables.h
diff --git a/src/viterbi.cc b/decoder/viterbi.cc
index 82b2ce6d..82b2ce6d 100644
--- a/src/viterbi.cc
+++ b/decoder/viterbi.cc
diff --git a/src/viterbi.h b/decoder/viterbi.h
index 46a4f528..46a4f528 100644
--- a/src/viterbi.h
+++ b/decoder/viterbi.h
diff --git a/src/weights.cc b/decoder/weights.cc
index bb0a878f..bb0a878f 100644
--- a/src/weights.cc
+++ b/decoder/weights.cc
diff --git a/src/weights.h b/decoder/weights.h
index f19aa3ce..f19aa3ce 100644
--- a/src/weights.h
+++ b/decoder/weights.h
diff --git a/src/weights_test.cc b/decoder/weights_test.cc
index aa6b3db2..aa6b3db2 100644
--- a/src/weights_test.cc
+++ b/decoder/weights_test.cc
diff --git a/src/wordid.h b/decoder/wordid.h
index fb50bcc1..fb50bcc1 100644
--- a/src/wordid.h
+++ b/decoder/wordid.h
diff --git a/tests/run-system-tests.pl b/tests/run-system-tests.pl
index 738000dc..8555ef78 100755
--- a/tests/run-system-tests.pl
+++ b/tests/run-system-tests.pl
@@ -8,7 +8,7 @@ my $TEMP_DIR = tempdir( CLEANUP => 1 );
#my $cwd = cwd();
#die "Sanity failed: $cwd" unless -d $cwd;
-my $DECODER = "$script_dir/../src/cdec";
+my $DECODER = "$script_dir/../decoder/cdec";
my $FILTER = "$script_dir/tools/filter-stderr.pl";
my $COMPARE_STATS = "$script_dir/tools/compare-statistics.pl";
diff --git a/training/Makefile.am b/training/Makefile.am
index c4c22fa2..944c75f7 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -10,32 +10,32 @@ bin_PROGRAMS = \
optimize_test
atools_SOURCES = atools.cc
-atools_LDADD = $(top_srcdir)/src/libhg.a -lz
+atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
model1_SOURCES = model1.cc
-model1_LDADD = $(top_srcdir)/src/libhg.a -lz
+model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
grammar_convert_SOURCES = grammar_convert.cc
-grammar_convert_LDADD = $(top_srcdir)/src/libhg.a -lz
+grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
optimize_test_SOURCES = optimize_test.cc optimize.cc
-optimize_test_LDADD = $(top_srcdir)/src/libhg.a -lz
+optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
collapse_weights_SOURCES = collapse_weights.cc
-collapse_weights_LDADD = $(top_srcdir)/src/libhg.a -lz
+collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/src/libhg.a -lz
+lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc
-mr_optimize_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz
+mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_em_train_SOURCES = mr_em_train.cc
-mr_em_train_LDADD = $(top_srcdir)/src/libhg.a -lz
+mr_em_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
plftools_SOURCES = plftools.cc
-plftools_LDADD = $(top_srcdir)/src/libhg.a -lz
+plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB)
diff --git a/training/atools.cc b/training/atools.cc
index bac73859..a18250f7 100644
--- a/training/atools.cc
+++ b/training/atools.cc
@@ -2,6 +2,7 @@
#include <sstream>
#include <vector>
+#include <queue>
#include <map>
#include <boost/program_options.hpp>
#include <boost/shared_ptr.hpp>
@@ -105,6 +106,99 @@ struct IntersectCommand : public Command {
}
};
+struct UnionCommand : public Command {
+ string Name() const { return "union"; }
+ bool RequiresTwoOperands() const { return true; }
+ void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+ EnsureSize(a, b, x);
+ Array2D<bool>& res = *x;
+ for (int i = 0; i < res.width(); ++i)
+ for (int j = 0; j < res.height(); ++j)
+ res(i, j) = Safe(a, i, j) || Safe(b, i, j);
+ }
+};
+
+struct RefineCommand : public Command {
+ RefineCommand() {
+ neighbors_.push_back(make_pair(1,0));
+ neighbors_.push_back(make_pair(-1,0));
+ neighbors_.push_back(make_pair(0,1));
+ neighbors_.push_back(make_pair(0,-1));
+ }
+ bool RequiresTwoOperands() const { return true; }
+ protected:
+ void InitRefine(
+ const Array2D<bool>& a,
+ const Array2D<bool>& b,
+ Array2D<bool>* x) {
+ EnsureSize(a, b, x);
+ in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear();
+ EnsureSize(a, b, &in_);
+ EnsureSize(a, b, &un_);
+ is_i_aligned_.resize(x->width(), false);
+ is_j_aligned_.resize(x->height(), false);
+ for (int i = 0; i < in_.width(); ++i)
+ for (int j = 0; j < in_.height(); ++j) {
+ un_(i, j) = Safe(a, i, j) || Safe(b, i, j);
+ in_(i, j) = Safe(a, i, j) && Safe(b, i, j);
+ }
+ }
+ // "grow" the intersection alignment with neighboring points
+ // from the union alignment
+ void Grow(Array2D<bool>* x) {
+ Array2D<bool>& res = *x;
+ queue<pair<int, int> > q;
+ for (int i = 0; i < in_.width(); ++i)
+ for (int j = 0; j < in_.height(); ++j)
+ if (in_(i, j)) {
+ Align(i, j, x);
+ q.push(make_pair(i, j));
+ }
+ while(!q.empty()) {
+ const pair<int,int> point = q.front();
+ q.pop();
+ for (int k = 0; k < neighbors_.size(); ++k) {
+ const int test_i = neighbors_[k].first + point.first;
+ const int test_j = neighbors_[k].second + point.second;
+ if (Safe(un_, test_i, test_j) && !res(test_i, test_j)) {
+ Align(test_i, test_j, x);
+ q.push(make_pair(test_i, test_j));
+ }
+ }
+ }
+ }
+ void Final(bool do_and, Array2D<bool>* x) {
+ }
+ void Align(int i, int j, Array2D<bool>* x) {
+ (*x)(i, j) = true;
+ is_i_aligned_[i] = true;
+ is_j_aligned_[j] = true;
+ }
+ Array2D<bool> in_; // intersection alignment
+ Array2D<bool> un_; // union alignment
+ vector<bool> is_i_aligned_;
+ vector<bool> is_j_aligned_;
+ vector<pair<int,int> > neighbors_;
+};
+
+struct DiagCommand : public RefineCommand {
+ DiagCommand() {
+ neighbors_.push_back(make_pair(1,1));
+ neighbors_.push_back(make_pair(-1,1));
+ neighbors_.push_back(make_pair(1,-1));
+ neighbors_.push_back(make_pair(-1,-1));
+ }
+};
+
+struct GDFCommand : public DiagCommand {
+ string Name() const { return "gdf"; }
+ void Apply(const Array2D<bool>& a, const Array2D<bool>& b, Array2D<bool>* x) {
+ InitRefine(a, b, x);
+ Grow(x);
+ Final(false, x);
+ }
+};
+
map<string, boost::shared_ptr<Command> > commands;
void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
@@ -163,6 +257,8 @@ int main(int argc, char **argv) {
AddCommand<ConvertCommand>();
AddCommand<InvertCommand>();
AddCommand<IntersectCommand>();
+ AddCommand<UnionCommand>();
+ AddCommand<GDFCommand>();
AddCommand<FMeasureCommand>();
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
diff --git a/training/cluster-ptrain.pl b/training/cluster-ptrain.pl
index 9f7c1569..8b06f162 100755
--- a/training/cluster-ptrain.pl
+++ b/training/cluster-ptrain.pl
@@ -8,7 +8,7 @@ my $MAX_ITER_ATTEMPTS = 5; # number of times to retry a failed function evaluati
my $CWD=`pwd`; chomp $CWD;
my $BIN_DIR = $SCRIPT_DIR;
my $OPTIMIZER = "$BIN_DIR/mr_optimize_reduce";
-my $DECODER = "$BIN_DIR/../src/cdec";
+my $DECODER = "$BIN_DIR/../decoder/cdec";
my $COMBINER_CACHE_SIZE = 150;
# This is a hack to run this on a weird cluster,
# eventually, I'll provide Hadoop scripts.
@@ -19,32 +19,35 @@ my $restart = '';
if ($ARGV[0] && $ARGV[0] eq '--restart') { shift @ARGV; $restart = 1; }
my $pmem="2500mb";
-my $nodes = 36;
+my $nodes = 1;
my $max_iteration = 1000;
my $PRIOR_FLAG = "";
my $parallel = 1;
my $CFLAG = "-C 1";
my $LOCAL;
+my $DISTRIBUTED;
my $PRIOR;
my $OALG = "lbfgs";
my $sigsq = 1;
my $means_file;
-GetOptions("decoder=s" => \$DECODER,
+GetOptions("cdec=s" => \$DECODER,
"run_locally" => \$LOCAL,
- "gaussian_prior" => \$PRIOR,
+ "distributed" => \$DISTRIBUTED, "gaussian_prior" => \$PRIOR, # keep --gaussian_prior: $PRIOR still gates $PRIOR_FLAG and usage() advertises the flag
"sigma_squared=f" => \$sigsq,
"means=s" => \$means_file,
"optimizer=s" => \$OALG,
+ "jobs=i" => \$nodes,
"pmem=s" => \$pmem
) or usage();
usage() unless scalar @ARGV==3;
my $config_file = shift @ARGV;
my $training_corpus = shift @ARGV;
my $initial_weights = shift @ARGV;
+unless ($DISTRIBUTED) { $LOCAL = 1; }
die "Can't find $config_file" unless -f $config_file;
die "Can't find $DECODER" unless -f $DECODER;
die "Can't execute $DECODER" unless -x $DECODER;
-if ($LOCAL) { print STDERR "Will running LOCALLY.\n"; $parallel = 0; }
+if ($LOCAL) { print STDERR "Will run LOCALLY.\n"; $parallel = 0; }
if ($PRIOR) {
$PRIOR_FLAG="-p --sigma_squared $sigsq";
if ($means_file) { $PRIOR_FLAG .= " -u $means_file"; }
@@ -56,20 +59,23 @@ if ($parallel) {
}
unless ($parallel) { $CFLAG = "-C 500"; }
unless ($config_file =~ /^\//) { $config_file = $CWD . '/' . $config_file; }
+my $clines = num_lines($training_corpus);
print STDERR <<EOT;
PTRAIN CONFIGURATION INFORMATION
Config file: $config_file
Training corpus: $training_corpus
+ Corpus size: $clines
Initial weights: $initial_weights
Decoder memory: $pmem
- Nodes requested: $nodes
Max iterations: $max_iteration
Optimizer: $OALG
- PRIOR: $PRIOR_FLAG
- restart: $restart
+ Jobs requested: $nodes
+ prior?: $PRIOR_FLAG
+ restart?: $restart
EOT
+
if ($OALG) { $OALG="-m $OALG"; }
my $nodelist="1";
@@ -142,5 +148,33 @@ while ($iter < $max_iteration) {
print "FINAL WEIGHTS: $dir/weights.$iter\n";
sub usage {
- die "Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init\n";
+ die <<EOT; # print the option summary to STDERR and exit with a non-zero status
+
+Usage: $0 [OPTIONS] cdec.ini training.corpus weights.init
+
+ Options:
+
+ --distributed Parallelize function evaluation
+ --cdec PATH Path to cdec binary
+ --optimizer OPT lbfgs, rprop, sgd
+ --gaussian_prior add Gaussian prior
+ --means FILE if you want means other than 0
+ --sigma_squared S variance on prior
+ --pmem MEM Memory required for decoder
+
+EOT
+}
+
+sub num_lines { # returns the number of lines in $file; names ending in .gz are read through zcat
+ my $file = shift;
+ my $fh;
+ if ($file=~ /\.gz$/) {
+ open $fh, '-|', 'zcat', $file or die "Couldn't fork zcat $file: $!"; # list form: no shell, so spaces/metacharacters in the name are safe
+ } else {
+ open $fh, '<', $file or die "Couldn't read $file: $!"; # 3-arg open: the name is never parsed as a mode or a pipe
+ }
+ my $lines = 0;
+ while(<$fh>) { $lines++; }
+ close $fh;
+ return $lines;
}
diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl
index 0e290492..8cdf7718 100755
--- a/training/make-lexcrf-grammar.pl
+++ b/training/make-lexcrf-grammar.pl
@@ -17,23 +17,27 @@ while(<M1>) {
}
my $ADD_MODEL1 = 0; # found that model1 hurts performance
-my $IS_FRENCH_F = 0; # indicates that the f language is french
-my $IS_ARABIC_F = 1; # indicates that the f language is arabic
+my $IS_FRENCH_F = 1; # indicates that the f language is french
+my $IS_ARABIC_F = 0; # indicates that the f language is arabic
+my $IS_URDU_F = 0; # indicates that the f language is urdu
my $ADD_PREFIX_ID = 0;
my $ADD_LEN = 1;
-my $ADD_LD = 0;
+my $ADD_SIM = 1;
my $ADD_DICE = 1;
my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NUM_MM = 1;
my $ADD_NULL = 1;
+my $ADD_STEM_ID = 1;
my $BEAM_RATIO = 50;
my %fdict;
my %fcounts;
my %ecounts;
+my %sdict;
+
while(<EF>) {
chomp;
my ($f, $e) = split /\s*\|\|\|\s*/;
@@ -56,10 +60,11 @@ print STDERR "PuncMiss 0\n" if $ADD_PUNC;
print STDERR "IsNull 0\n" if $ADD_NULL;
print STDERR "Model1 0\n" if $ADD_MODEL1;
print STDERR "DLen 0\n" if $ADD_LEN;
-print STDERR "NumMM 0\n" if $ADD_NUM_MM;
-print STDERR "Level 0\n" if $ADD_LD;
+print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM;
+print STDERR "OrthoSim 0\n" if $ADD_SIM;
print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID);
my $fc = 1000000;
+my $sids = 1000000;
for my $f (sort keys %fdict) {
my $re = $fdict{$f};
my $max;
@@ -72,7 +77,6 @@ for my $f (sort keys %fdict) {
my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
my $feats = "F$fc=1";
my $oe = $e;
- my $len_e = length($oe);
my $of = $f; # normalized form
if ($IS_FRENCH_F) {
# see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
@@ -85,7 +89,27 @@ for my $f (sort keys %fdict) {
if (length($of) > 1 && !($of =~ /\d/)) {
$of =~ s/\$/sh/g;
}
+ } elsif ($IS_URDU_F) {
+ if (length($of) > 1 && !($of =~ /\d/)) {
+ $of =~ s/\$/sh/g;
+ }
+ $oe =~ s/^-e-//;
+ $oe =~ s/^al-/al/;
+ $of =~ s/([a-z])\~/$1$1/g;
+ $of =~ s/E/'/g;
+ $of =~ s/^Aw/o/g;
+ $of =~ s/\|/a/g;
+ $of =~ s/@/h/g;
+ $of =~ s/c/ch/g;
+ $of =~ s/x/kh/g;
+ $of =~ s/\*/dh/g;
+ $of =~ s/w/o/g;
+ $of =~ s/Z/dh/g;
+ $of =~ s/y/i/g;
+ $of =~ s/Y/a/g;
+ $of = lc $of;
}
+ my $len_e = length($oe);
my $len_f = length($of);
$feats .= " Model1=$m1" if ($ADD_MODEL1);
$feats .= " Dice=$dice" if $ADD_DICE;
@@ -100,12 +124,35 @@ for my $f (sort keys %fdict) {
$feats .= " DLen=$dlen";
}
}
- my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/); # this matches *two digit* and more numbers
- my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/);
+ my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
+ my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
my $both_non_numeric = (!$e_num && !$f_num);
if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) {
$feats .= " NumMM=1";
}
+ if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) {
+ $feats .= " NumMatch=1";
+ }
+ if ($ADD_STEM_ID) {
+ my $el = 4; # default stem length (characters) taken from the e word
+ my $fl = 4; # default stem length (characters) taken from the f word
+ if ($oe =~ /^(?:al|re|co)/) { $el++; } # alternation is grouped so ^ anchors every prefix, not only "al"
+ if ($of =~ /^(?:al|re|co)/) { $fl++; }
+ if ($oe =~ /^(?:trans|inter)/) { $el+=2; } # same grouping: "inter" must be a prefix, not a substring
+ if ($of =~ /^(?:trans|inter)/) { $fl+=2; }
+ if ($fl > length($of)) { $fl = length($of); } # clamp the stem length to the word length
+ if ($el > length($oe)) { $el = length($oe); }
+ my $sf = substr $of, 0, $fl;
+ my $se = substr $oe, 0, $el;
+ my $id = $sdict{$sf}->{$se}; # one feature id per (f stem, e stem) pair
+ if (!$id) {
+ $sids++;
+ $sdict{$sf}->{$se} = $sids;
+ $id = $sids;
+ print STDERR "S$sids 0\n"
+ }
+ $feats .= " S$id=1";
+ }
if ($ADD_PREFIX_ID) {
if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {
my $pe = substr $oe, 0, 3;
@@ -113,12 +160,14 @@ for my $f (sort keys %fdict) {
if ($pe eq $pf) { $feats .= " PfxIdentical=1"; }
}
}
- if ($ADD_LD) {
+ if ($ADD_SIM) {
my $ld = 0;
- if ($is_null) { $ld = length($e); } else {
- $ld = levenshtein($e, $f);
+ my $eff = $len_e;
+ if ($eff < $len_f) { $eff = $len_f; }
+ if (!$is_null) {
+ $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
}
- $feats .= " Leven=$ld";
+ $feats .= " OrthoSim=$ld";
}
my $ident = ($e eq $f);
if ($ident && $ADD_ID) { $feats .= " Identical=1"; }
diff --git a/vest/Makefile.am b/vest/Makefile.am
index 87c2383a..d7d08133 100644
--- a/vest/Makefile.am
+++ b/vest/Makefile.am
@@ -8,25 +8,25 @@ bin_PROGRAMS = \
union_forests
union_forests_SOURCES = union_forests.cc
-union_forests_LDADD = $(top_srcdir)/src/libhg.a -lz
+union_forests_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc scorer.cc viterbi_envelope.cc
-fast_score_LDADD = $(top_srcdir)/src/libhg.a -lz
+fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc
-mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/src/libhg.a -lz
+mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc
-mr_vest_map_LDADD = $(top_srcdir)/src/libhg.a -lz
+mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
mr_vest_reduce_SOURCES = error_surface.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc
-mr_vest_reduce_LDADD = $(top_srcdir)/src/libhg.a -lz
+mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
scorer_test_SOURCES = scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc
-scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz
+scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz
lo_test_SOURCES = lo_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc
-lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/src/libhg.a -lz
+lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/src
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(BOOST_CPPFLAGS) $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
AM_LDFLAGS = $(BOOST_LDFLAGS) $(BOOST_PROGRAM_OPTIONS_LIB)
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 5528838c..31dbc61f 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -1,17 +1,16 @@
#!/usr/bin/env perl
+use strict;
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
use Getopt::Long;
use IPC::Open2;
use strict;
use POSIX ":sys_wait_h";
-my $mydir = `dirname $0`;
-chomp $mydir;
# Default settings
-my $srcFile = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt";
-my $refFiles = "/fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.*";
-my $bin_dir = "/fs/clip-software/cdec/bin";
-$bin_dir = "/Users/redpony/cdyer-svn-root/cdec/vest/bin_dir";
+my $srcFile;
+my $refFiles;
+my $bin_dir = $SCRIPT_DIR;
die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir;
my $FAST_SCORE="$bin_dir/fast_score";
die "Can't find $FAST_SCORE" unless -x $FAST_SCORE;
@@ -22,7 +21,7 @@ my $SCORER = $FAST_SCORE;
die "Can't find $MAPPER" unless -x $MAPPER;
my $forestUnion = "$bin_dir/union_forests";
die "Can't find $forestUnion" unless -x $forestUnion;
-my $cdec = "$bin_dir/cdec";
+my $cdec = "$bin_dir/../decoder/cdec";
die "Can't find decoder in $cdec" unless -x $cdec;
my $decoder = $cdec;
my $lines_per_mapper = 440;
@@ -153,7 +152,7 @@ $SIG{HUP} = "cleanup";
my $decoderBase = `basename $decoder`; chomp $decoderBase;
my $newIniFile = "$dir/$decoderBase.ini";
-my $parallelize = "$mydir/parallelize.pl";
+my $parallelize = "$SCRIPT_DIR/parallelize.pl"; # resolve next to this script rather than a hard-coded home directory
my $inputFileName = "$dir/input";
my $user = $ENV{"USER"};
@@ -254,15 +253,18 @@ while (1){
print LOGFILE "\nUNION FORESTS\n";
print LOGFILE `date`;
my $mergeLog="$logdir/prune-merge.log.$iteration";
- $cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize";
- print LOGFILE "COMMAND:\n$cmd\n";
- $result = system($cmd);
+ `rm -rf $dir/hgs`;
+ `mv $dir/hgs-current $dir/hgs`;
+ #$cmd = "$forestUnion -r $dir/hgs -n $dir/hgs-current -s $devSize";
+ #print LOGFILE "COMMAND:\n$cmd\n";
+ #$result = system($cmd);
unless ($result == 0){
cleanup();
print LOGFILE "ERROR: merge command returned non-zero exit code $result\n";
die;
}
`rm -f $dir/hgs-current/*.json.gz`; # clean up old HGs, they've been moved to the repository
+ `mkdir -p $dir/hgs-current`;
my $score = 0;
my $icc = 0;
@@ -303,7 +305,7 @@ while (1){
my $mapoutput = $shard;
my $client_name = $shard;
$client_name =~ s/mapinput.//;
- $client_name = "fmert.$client_name";
+ $client_name = "vest.$client_name";
$mapoutput =~ s/mapinput/mapoutput/;
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
@@ -548,36 +550,9 @@ Options:
--decoder <decoder path>
Decoder binary to use.
- --decode-nodes <nodelist>
- A list of nodes used for parallel decoding. If specific nodes
- are not desired, use "1" for each node requested. Defaults to
- "1 1 1 1 1 1 1 1 1 1 1 1 1 1 1", which indicates a request for
- 15 nodes.
-
- --dont-clean
- If present, this flag prevents intermediate files, including
- run files and cumulative files, from being automatically removed
- after a successful optimization run (these files are left if the
- run fails for any reason). If used, a makefile containing
- cleanup commands is written to the directory. To clean up
- the intermediate files, invoke make without any arguments.
-
- --dry-run
- Prints out the settings and exits without doing anything.
-
- --epsilon <epsilon>
- Require that the dev set BLEU score improve by at least <epsilon>
- within <interval> iterations (controlled by parameter --interval).
- If not specified, defaults to .002.
-
--help
Print this message and exit.
- --interval <i>
- Require that the dev set BLEU score improve by at least <epsilon>
- (controlled by parameter --epsilon) within <interval> iterations.
- If not specified, defaults to 5.
-
--iteration <I>
Starting iteration number. If not specified, defaults to 1.
@@ -586,18 +561,15 @@ Options:
to 10.
--pmem <N>
- Amount of physical memory requested for parallel decoding jobs,
- in the format expected by qsub. If not specified, defaults to
- 2g.
+ Amount of physical memory requested for parallel decoding jobs.
--ref-files <files>
Dev set ref files. This option takes only a single string argument.
To use multiple files (including file globbing), this argument should
- be quoted. If not specified, defaults to
- /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.ref.txt.*
+ be quoted.
--metric <method>
- Metric to optimize. See fmert's --metric option for values.
+ Metric to optimize.
Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi
--normalize <feature-name>
@@ -609,8 +581,7 @@ Options:
set this parameter to explore other directions. Defaults to 5.
--source-file <file>
- Dev set source file. If not specified, defaults to
- /fs/cliplab/mteval/Evaluation/Chinese-English/mt03.src.txt
+ Dev set source file.
--weights <file>
A file specifying initial feature weights. The format is