diff options
-rw-r--r-- | Makefile.am | 2 | ||||
-rw-r--r-- | configure.ac | 2 | ||||
-rw-r--r-- | decoder/Makefile.am | 37 | ||||
-rw-r--r-- | decoder/aligner.cc | 74 | ||||
-rw-r--r-- | decoder/aligner.h | 2 | ||||
-rw-r--r-- | decoder/cdec.cc | 11 | ||||
-rw-r--r-- | decoder/ff_bleu.cc | 2 | ||||
-rw-r--r-- | decoder/ff_lm.cc | 2 | ||||
-rw-r--r-- | decoder/ff_wordalign.cc | 3 | ||||
-rw-r--r-- | decoder/hg.h | 4 | ||||
-rw-r--r-- | decoder/hg_io.cc | 53 | ||||
-rw-r--r-- | decoder/hg_io.h | 5 | ||||
-rwxr-xr-x | decoder/oracle_bleu.h | 2 | ||||
-rw-r--r-- | decoder/phrasebased_translator.cc | 4 | ||||
-rw-r--r-- | decoder/sentence_metadata.h | 2 | ||||
-rw-r--r-- | extools/Makefile.am | 12 | ||||
-rw-r--r-- | extools/sg_lexer.l | 3 | ||||
-rw-r--r-- | gi/clda/src/Makefile.am | 4 | ||||
-rw-r--r-- | gi/pyp-topics/src/Makefile.am | 8 | ||||
-rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 2 | ||||
-rw-r--r-- | gi/pyp-topics/src/contexts_lexer.h | 2 | ||||
-rw-r--r-- | gi/pyp-topics/src/contexts_lexer.l | 2 | ||||
-rw-r--r-- | mteval/Makefile.am | 23 | ||||
-rw-r--r-- | mteval/aer_scorer.cc (renamed from vest/aer_scorer.cc) | 6 | ||||
-rw-r--r-- | mteval/aer_scorer.h (renamed from vest/aer_scorer.h) | 0 | ||||
-rw-r--r-- | mteval/comb_scorer.cc (renamed from vest/comb_scorer.cc) | 0 | ||||
-rw-r--r-- | mteval/comb_scorer.h (renamed from vest/comb_scorer.h) | 0 | ||||
-rw-r--r-- | mteval/fast_score.cc (renamed from vest/fast_score.cc) | 0 | ||||
-rw-r--r-- | mteval/mbr_kbest.cc | 138 | ||||
-rw-r--r-- | mteval/scorer.cc (renamed from vest/scorer.cc) | 78 | ||||
-rw-r--r-- | mteval/scorer.h (renamed from vest/scorer.h) | 3 | ||||
-rw-r--r-- | mteval/scorer_test.cc | 182 | ||||
-rw-r--r-- | mteval/ter.cc (renamed from vest/ter.cc) | 0 | ||||
-rw-r--r-- | mteval/ter.h (renamed from vest/ter.h) | 0 | ||||
-rw-r--r-- | mteval/test_data/re.txt.0 | 5 | ||||
-rw-r--r-- | mteval/test_data/re.txt.1 | 5 | ||||
-rw-r--r-- | mteval/test_data/re.txt.2 | 5 | ||||
-rw-r--r-- | mteval/test_data/re.txt.3 | 5 | ||||
-rw-r--r-- | training/Makefile.am | 25 | ||||
-rw-r--r-- | training/atools.cc | 7 | ||||
-rw-r--r-- | utils/Makefile.am | 38 | ||||
-rw-r--r-- | utils/alignment_pharaoh.cc | 77 | ||||
-rw-r--r-- | utils/alignment_pharaoh.h | 14 | ||||
-rw-r--r-- | utils/array2d.h (renamed from decoder/array2d.h) | 0 | ||||
-rw-r--r-- | utils/b64tools.cc | 59 | ||||
-rw-r--r-- | utils/b64tools.h | 9 | ||||
-rw-r--r-- | utils/dict.cc (renamed from decoder/dict.cc) | 0 | ||||
-rw-r--r-- | utils/dict.h (renamed from decoder/dict.h) | 0 | ||||
-rw-r--r-- | utils/dict_test.cc (renamed from decoder/dict_test.cc) | 3 | ||||
-rw-r--r-- | utils/fdict.cc (renamed from decoder/fdict.cc) | 0 | ||||
-rw-r--r-- | utils/fdict.h (renamed from decoder/fdict.h) | 0 | ||||
-rwxr-xr-x | utils/feature_accum.h | 129 | ||||
-rwxr-xr-x | utils/feature_vector.h (renamed from decoder/feature_vector.h) | 0 | ||||
-rw-r--r-- | utils/filelib.cc (renamed from decoder/filelib.cc) | 0 | ||||
-rw-r--r-- | utils/filelib.h (renamed from decoder/filelib.h) | 0 | ||||
-rw-r--r-- | utils/gzstream.cc (renamed from decoder/gzstream.cc) | 0 | ||||
-rw-r--r-- | utils/gzstream.h (renamed from decoder/gzstream.h) | 0 | ||||
-rwxr-xr-x | utils/hash.h (renamed from decoder/hash.h) | 0 | ||||
-rwxr-xr-x | utils/have_64_bits.h (renamed from decoder/have_64_bits.h) | 0 | ||||
-rwxr-xr-x | utils/int_or_pointer.h (renamed from decoder/int_or_pointer.h) | 0 | ||||
-rwxr-xr-x | utils/intrusive_refcount.hpp (renamed from decoder/intrusive_refcount.hpp) | 0 | ||||
-rw-r--r-- | utils/logval.h (renamed from decoder/logval.h) | 0 | ||||
-rw-r--r-- | utils/logval_test.cc (renamed from decoder/logval_test.cc) | 0 | ||||
-rwxr-xr-x | utils/murmur_hash.h (renamed from decoder/murmur_hash.h) | 0 | ||||
-rwxr-xr-x | utils/null_deleter.h (renamed from decoder/null_deleter.h) | 0 | ||||
-rw-r--r-- | utils/prob.h (renamed from decoder/prob.h) | 0 | ||||
-rw-r--r-- | utils/sampler.h (renamed from decoder/sampler.h) | 0 | ||||
-rw-r--r-- | utils/small_vector.h (renamed from decoder/small_vector.h) | 0 | ||||
-rw-r--r-- | utils/small_vector_test.cc (renamed from decoder/small_vector_test.cc) | 0 | ||||
-rw-r--r-- | utils/sparse_vector.cc (renamed from decoder/sparse_vector.cc) | 2 | ||||
-rw-r--r-- | utils/sparse_vector.h (renamed from decoder/sparse_vector.h) | 0 | ||||
-rwxr-xr-x | utils/static_utoa.h (renamed from decoder/static_utoa.h) | 0 | ||||
-rw-r--r-- | utils/stringlib.cc (renamed from decoder/stringlib.cc) | 11 | ||||
-rw-r--r-- | utils/stringlib.h (renamed from decoder/stringlib.h) | 0 | ||||
-rwxr-xr-x | utils/stringlib_test.cc (renamed from decoder/stringlib_test.cc) | 0 | ||||
-rw-r--r-- | utils/tdict.cc (renamed from decoder/tdict.cc) | 0 | ||||
-rw-r--r-- | utils/tdict.h (renamed from decoder/tdict.h) | 0 | ||||
-rw-r--r-- | utils/test_data/weights (renamed from decoder/test_data/weights) | 0 | ||||
-rwxr-xr-x | utils/threadlocal.h (renamed from decoder/threadlocal.h) | 0 | ||||
-rw-r--r-- | utils/timing_stats.cc (renamed from decoder/timing_stats.cc) | 0 | ||||
-rw-r--r-- | utils/timing_stats.h (renamed from decoder/timing_stats.h) | 0 | ||||
-rw-r--r-- | utils/weights.cc (renamed from decoder/weights.cc) | 0 | ||||
-rw-r--r-- | utils/weights.h (renamed from decoder/weights.h) | 0 | ||||
-rw-r--r-- | utils/weights_test.cc (renamed from decoder/weights_test.cc) | 1 | ||||
-rw-r--r-- | utils/wordid.h (renamed from decoder/wordid.h) | 0 | ||||
-rw-r--r-- | vest/Makefile.am | 30 | ||||
-rw-r--r-- | vest/lo_test.cc | 5 | ||||
-rw-r--r-- | vest/mr_vest_map.cc | 5 | ||||
-rw-r--r-- | vest/mr_vest_reduce.cc | 2 |
89 files changed, 772 insertions, 333 deletions
diff --git a/Makefile.am b/Makefile.am index e82e2352..98c2561e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = decoder training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava +SUBDIRS = utils mteval decoder training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 diff --git a/configure.ac b/configure.ac index e627c1cc..302eebed 100644 --- a/configure.ac +++ b/configure.ac @@ -76,4 +76,4 @@ then AM_CONDITIONAL([RAND_LM], true) fi -AC_OUTPUT(Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile) +AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile) diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 68a7d765..f514b340 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -2,24 +2,16 @@ bin_PROGRAMS = cdec if HAVE_GTEST noinst_PROGRAMS = \ - dict_test \ - weights_test \ trule_test \ hg_test \ ff_test \ - logval_test \ parser_test \ - grammar_test \ - small_vector_test + grammar_test endif -cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc timing_stats.cc -small_vector_test_SOURCES = small_vector_test.cc -small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc parser_test_SOURCES = parser_test.cc parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a -dict_test_SOURCES = dict_test.cc -dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ff_test_SOURCES = ff_test.cc ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a grammar_test_SOURCES = grammar_test.cc @@ -28,15 +20,12 @@ hg_test_SOURCES = hg_test.cc hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a trule_test_SOURCES = trule_test.cc trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a -weights_test_SOURCES = weights_test.cc -weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a -logval_test_SOURCES = logval_test.cc -logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) -LDADD = libcdec.a +LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -AM_LDFLAGS = -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils + +AM_LDFLAGS = ../utils/libutils.a -lz rule_lexer.cc: rule_lexer.l $(LEX) -s -CF -8 -o$@ $< @@ -49,7 +38,6 @@ libcdec_a_SOURCES = \ rule_lexer.cc \ fst_translator.cc \ csplit.cc \ - dict.cc \ translator.cc \ scfg_translator.cc \ hg.cc \ @@ -58,17 +46,10 @@ libcdec_a_SOURCES = \ viterbi.cc \ lattice.cc \ aligner.cc \ - gzstream.cc \ apply_models.cc \ earley_composer.cc \ phrasetable_fst.cc \ - sparse_vector.cc \ trule.cc \ - filelib.cc \ - stringlib.cc \ - fdict.cc \ - tdict.cc \ - weights.cc \ ttables.cc \ ff.cc \ ff_lm.cc \ @@ -78,12 +59,6 @@ libcdec_a_SOURCES = \ ff_tagger.cc \ ff_bleu.cc \ ff_factory.cc \ - ../vest/scorer.cc \ - ../vest/ter.cc \ - ../vest/aer_scorer.cc \ - ../vest/comb_scorer.cc \ - ../vest/error_surface.cc \ - ../vest/viterbi_envelope.cc \ freqdict.cc \ lexalign.cc \ lextrans.cc \ diff --git a/decoder/aligner.cc b/decoder/aligner.cc index b089f52e..92431be4 100644 --- a/decoder/aligner.cc +++ b/decoder/aligner.cc @@ -5,81 +5,11 @@ #include "sentence_metadata.h" #include "inside_outside.h" #include "viterbi.h" +#include "alignment_pharaoh.h" #include <set> using namespace std; -static bool is_digit(char x) { return x >= '0' && x <= '9'; } - -boost::shared_ptr<Array2D<bool> > AlignerTools::ReadPharaohAlignmentGrid(const string& al) { - int max_x = 0; - int max_y = 0; - int i = 0; - size_t pos = al.rfind(" ||| "); - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - if (x > max_x) max_x = x; - assert(i < al.size()); - if(al[i] != '-') { - cerr << "BAD ALIGNMENT: " << al << endl; - abort(); - } - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - if (y > max_y) max_y = y; - while(i < al.size() && al[i] == ' ') { ++i; } - } - - boost::shared_ptr<Array2D<bool> > grid(new Array2D<bool>(max_x + 1, max_y + 1)); - i = 0; - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - (*grid)(x, y) = true; - while(i < al.size() && al[i] == ' ') { ++i; } - } - // cerr << *grid << endl; - return grid; -} - -void AlignerTools::SerializePharaohFormat(const Array2D<bool>& alignment, ostream* out) { - bool need_space = false; - for (int i = 0; i < alignment.width(); ++i) - for (int j = 0; j < alignment.height(); ++j) - if (alignment(i,j)) { - if (need_space) (*out) << ' '; else need_space = true; - (*out) << i << '-' << j; - } - (*out) << endl; -} - // used with lexical models since they may not fully generate the // source string void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g, @@ -317,6 +247,6 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice, cerr << grid << endl; } (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| "; - SerializePharaohFormat(grid, out); + AlignmentPharaoh::SerializePharaohFormat(grid, out); }; diff --git a/decoder/aligner.h b/decoder/aligner.h index cd159119..a088ba6c 100644 --- a/decoder/aligner.h +++ b/decoder/aligner.h @@ -10,8 +10,6 @@ class Hypergraph; class SentenceMetadata; struct AlignerTools { - static boost::shared_ptr<Array2D<bool> > ReadPharaohAlignmentGrid(const std::string& al); - static void SerializePharaohFormat(const Array2D<bool>& alignment, std::ostream* out); // assumption: g contains derivations of input/ref and // ONLY input/ref. diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 8c4a25e0..3633febd 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -34,7 +34,7 @@ #include "inside_outside.h" #include "exp_semiring.h" #include "sentence_metadata.h" -#include "../vest/scorer.h" +#include "scorer.h" #include "apply_fsa_models.h" #include "program_options.h" #include "cfg_options.h" @@ -59,6 +59,15 @@ void ShowBanner() { cerr << "cdec v1.0 (c) 2009-2010 by Chris Dyer\n"; } +void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { + string sref; + ParseTranslatorInput(line, input, &sref); + if (sref.size() > 0) { + assert(ref); + LatticeTools::ConvertTextOrPLF(sref, ref); + } +} + void ConvertSV(const SparseVector<prob_t>& src, SparseVector<double>* trg) { for (SparseVector<prob_t>::const_iterator it = src.begin(); it != src.end(); ++it) trg->set_value(it->first, it->second); diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc index 77989331..aa4e6d85 100644 --- a/decoder/ff_bleu.cc +++ b/decoder/ff_bleu.cc @@ -18,7 +18,7 @@ char const* bleu_usage_verbose="Uses feature id 0! Make sure there are no other #include "hg.h" #include "stringlib.h" #include "sentence_metadata.h" -#include "../vest/scorer.h" +#include "scorer.h" using namespace std; diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index f3e65cb7..a9929253 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -728,7 +728,7 @@ LanguageModelRandLM::LanguageModelRandLM(const string& param) : filename = argv[0]; } } - set_order(order); +// set_order(order); int cache_MB = 200; // increase cache size randlm::RandLM* rlm = randlm::RandLM::initRandLM(filename, order, cache_MB); assert(rlm != NULL); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 0ba2bf92..087bff0c 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -5,6 +5,7 @@ #include <string> #include <cmath> +#include "alignment_pharaoh.h" #include "stringlib.h" #include "sentence_metadata.h" #include "hg.h" @@ -354,7 +355,7 @@ AlignerResults::AlignerResults(const std::string& param) : getline(in, line); if (!in) break; ++lc; - is_aligned_.push_back(AlignerTools::ReadPharaohAlignmentGrid(line)); + is_aligned_.push_back(AlignmentPharaoh::ReadPharaohAlignmentGrid(line)); } cerr << " Loaded " << lc << " refs\n"; } diff --git a/decoder/hg.h b/decoder/hg.h index d5c8e197..e9510997 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -102,6 +102,8 @@ public: void copy_info(Edge const& o) { #if USE_INFO_EDGE set_info(o.info_.str()); // by convention, each person putting info here starts with a separator (e.g. space). it's empty if nobody put any info there. +#else + (void) o; #endif } void copy_pod(Edge const& o) { @@ -142,7 +144,7 @@ public: #else std::string info() const { return std::string(); } void reset_info() { } - void set_info(std::string const& s) { } + void set_info(std::string const& ) { } #endif void show(std::ostream &o,unsigned mask=SPAN|RULE) const { o<<'{'; diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 52a8565a..1af8261e 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -622,56 +622,3 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) { } } -namespace B64 { - -static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; - -static void encodeblock(const unsigned char* in, ostream* os, int len) { - char out[4]; - out[0] = cb64[ in[0] >> 2 ]; - out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; - out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); - out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); - os->write(out, 4); -} - -void b64encode(const char* data, const size_t size, ostream* out) { - size_t cur = 0; - while(cur < size) { - int len = min(static_cast<size_t>(3), size - cur); - encodeblock(reinterpret_cast<const unsigned char*>(&data[cur]), out, len); - cur += len; - } -} - -static void decodeblock(const unsigned char* in, unsigned char* out) { - out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); - out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); - out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); -} - -bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { - size_t cur = 0; - size_t ocur = 0; - unsigned char in[4]; - while(cur < insize) { - assert(ocur < outsize); - for (int i = 0; i < 4; ++i) { - unsigned char v = data[cur]; - v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]); - if (!v) { - cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; - return false; - } - v = (unsigned char) ((v == '$') ? '\0' : v - 61); - if (v) in[i] = v - 1; else in[i] = 0; - ++cur; - } - decodeblock(in, reinterpret_cast<unsigned char*>(&out[ocur])); - ocur += 3; - } - return true; -} -} - diff --git a/decoder/hg_io.h b/decoder/hg_io.h index b6a176ab..082489d8 100644 --- a/decoder/hg_io.h +++ b/decoder/hg_io.h @@ -31,9 +31,4 @@ struct HypergraphIO { static std::string Escape(const std::string& s); // PLF helper }; -namespace B64 { - bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); - void b64encode(const char* data, const size_t size, std::ostream* out); -} - #endif diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h index 81a584a7..145c84d1 100755 --- a/decoder/oracle_bleu.h +++ b/decoder/oracle_bleu.h @@ -9,7 +9,7 @@ #include <vector> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> -#include "../vest/scorer.h" +#include "scorer.h" #include "hg.h" #include "ff_factory.h" #include "ff_bleu.h" diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc index 726b3f9a..d65e44d1 100644 --- a/decoder/phrasebased_translator.cc +++ b/decoder/phrasebased_translator.cc @@ -68,7 +68,6 @@ struct PhraseBasedTranslatorImpl { PhraseBasedTranslatorImpl(const boost::program_options::variables_map& conf) : add_pass_through_rules(conf.count("add_pass_through_rules")), max_distortion(conf["pb_max_distortion"].as<int>()), - kSOURCE_RULE(new TRule("[X] ||| [X,1] ||| [X,1]", true)), kCONCAT_RULE(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]", true)), kNT_TYPE(TD::Convert("X") * -1) { assert(max_distortion >= 0); @@ -141,6 +140,8 @@ struct PhraseBasedTranslatorImpl { for (int i = 0; i < phrases.size(); ++i) { Hypergraph::Edge* edge = minus_lm_forest->AddEdge(phrases[i], Hypergraph::TailNodeVector()); edge->feature_values_ = edge->rule_->scores_; + edge->i_ = s.i; + edge->j_ = s.j; minus_lm_forest->ConnectEdgeToHeadNode(edge->id_, phrase_head_index); } CoverageNodeMap::iterator cit = c.find(s.coverage); @@ -189,7 +190,6 @@ struct PhraseBasedTranslatorImpl { const bool add_pass_through_rules; const int max_distortion; - TRulePtr kSOURCE_RULE; const TRulePtr kCONCAT_RULE; const WordID kNT_TYPE; boost::shared_ptr<FSTNode> fst; diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h index 21be9b21..593019c8 100644 --- a/decoder/sentence_metadata.h +++ b/decoder/sentence_metadata.h @@ -3,7 +3,7 @@ #include <cassert> #include "lattice.h" -#include "../vest/scorer.h" +#include "scorer.h" struct SentenceMetadata { SentenceMetadata(int id, const Lattice& ref) : diff --git a/extools/Makefile.am b/extools/Makefile.am index 1e82287d..ee363264 100644 --- a/extools/Makefile.am +++ b/extools/Makefile.am @@ -11,20 +11,20 @@ sg_lexer.cc: sg_lexer.l $(LEX) -s -CF -8 -o$@ $< filter_grammar_SOURCES = filter_grammar.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc -filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +filter_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz #filter_grammar_LDFLAGS = -all-static featurize_grammar_SOURCES = featurize_grammar.cc extract.cc sentence_pair.cc sg_lexer.cc striped_grammar.cc -featurize_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +featurize_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc -mr_stripe_rule_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_stripe_rule_reduce_LDADD = $(top_srcdir)/utils/libutils.a -lz extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc striped_grammar.cc -extractor_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +extractor_LDADD = $(top_srcdir)/utils/libutils.a -lz extractor_monolingual_SOURCES = extractor_monolingual.cc -extractor_monolingual_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +extractor_monolingual_LDADD = $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l index 168b835a..d60bd0fc 100644 --- a/extools/sg_lexer.l +++ b/extools/sg_lexer.l @@ -1,6 +1,4 @@ %{ -#include "rule_lexer.h" - #include <string> #include <iostream> #include <sstream> @@ -8,7 +6,6 @@ #include <cassert> #include "tdict.h" #include "fdict.h" -#include "trule.h" #include "striped_grammar.h" int lex_line = 0; diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am index 688746bb..2b1393ac 100644 --- a/gi/clda/src/Makefile.am +++ b/gi/clda/src/Makefile.am @@ -2,5 +2,5 @@ bin_PROGRAMS = clda clda_SOURCES = clda.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/decoder -AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils +AM_LDFLAGS = $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am index c22819db..d3f95d0b 100644 --- a/gi/pyp-topics/src/Makefile.am +++ b/gi/pyp-topics/src/Makefile.am @@ -4,13 +4,13 @@ contexts_lexer.cc: contexts_lexer.l $(LEX) -s -CF -8 -o$@ $< pyp_topics_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc -pyp_topics_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +pyp_topics_train_LDADD = $(top_srcdir)/utils/libutils.a -lz pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc pyp-topics.cc contexts_lexer.cc contexts_corpus.cc train-contexts.cc -pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +pyp_contexts_train_LDADD = $(top_srcdir)/utils/libutils.a -lz #mpi_pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc mpi-pyp-topics.cc contexts_lexer.cc contexts_corpus.cc mpi-train-contexts.cc -#mpi_pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +#mpi_pyp_contexts_train_LDADD = $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I../../../utils diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index dd721361..b2d235cb 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -10,7 +10,7 @@ #include "corpus.hh" #include "contexts_lexer.h" -#include "../../../decoder/dict.h" +#include "dict.h" class BackoffGenerator { diff --git a/gi/pyp-topics/src/contexts_lexer.h b/gi/pyp-topics/src/contexts_lexer.h index 1b79c6fd..66004990 100644 --- a/gi/pyp-topics/src/contexts_lexer.h +++ b/gi/pyp-topics/src/contexts_lexer.h @@ -5,7 +5,7 @@ #include <vector> #include <string> -#include "../../../decoder/dict.h" +#include "dict.h" struct ContextsLexer { typedef std::vector<std::string> Context; diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l index 7a5d9460..64cd7ca3 100644 --- a/gi/pyp-topics/src/contexts_lexer.l +++ b/gi/pyp-topics/src/contexts_lexer.l @@ -101,7 +101,7 @@ INT [\-+]?[0-9]+|inf|[\-+]inf %% -#include "../../../decoder/filelib.h" +#include "filelib.h" void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) { lex_line = 1; diff --git a/mteval/Makefile.am b/mteval/Makefile.am new file mode 100644 index 00000000..7ae14045 --- /dev/null +++ b/mteval/Makefile.am @@ -0,0 +1,23 @@ +bin_PROGRAMS = \ + fast_score \ + mbr_kbest + +if HAVE_GTEST +noinst_PROGRAMS = \ + scorer_test +endif + +noinst_LIBRARIES = libmteval.a + +libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc + +fast_score_SOURCES = fast_score.cc +fast_score_LDADD = $(top_srcdir)/utils/libutils.a libmteval.a -lz + +mbr_kbest_SOURCES = mbr_kbest.cc +mbr_kbest_LDADD = $(top_srcdir)/utils/libutils.a libmteval.a -lz + +scorer_test_SOURCES = scorer_test.cc +scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/utils/libutils.a libmteval.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils diff --git a/vest/aer_scorer.cc b/mteval/aer_scorer.cc index 25b58b5e..edd4390f 100644 --- a/vest/aer_scorer.cc +++ b/mteval/aer_scorer.cc @@ -5,7 +5,7 @@ #include <sstream> #include "tdict.h" -#include "aligner.h" +#include "alignment_pharaoh.h" using namespace std; @@ -85,7 +85,7 @@ AERScorer::AERScorer(const vector<vector<WordID> >& refs, const string& src) : s cerr << "AERScorer can only take a single reference!\n"; abort(); } - ref_ = AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); + ref_ = AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); } static inline bool Safe(const Array2D<bool>& a, int i, int j) { @@ -101,7 +101,7 @@ ScoreP AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const { ScoreP AERScorer::ScoreCandidate(const vector<WordID>& shyp) const { boost::shared_ptr<Array2D<bool> > hyp = - AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp)); + AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(shyp)); int m = 0; int r = 0; diff --git a/vest/aer_scorer.h b/mteval/aer_scorer.h index 6d53d359..6d53d359 100644 --- a/vest/aer_scorer.h +++ b/mteval/aer_scorer.h diff --git a/vest/comb_scorer.cc b/mteval/comb_scorer.cc index 9fc37868..9fc37868 100644 --- a/vest/comb_scorer.cc +++ b/mteval/comb_scorer.cc diff --git a/vest/comb_scorer.h b/mteval/comb_scorer.h index 346be576..346be576 100644 --- a/vest/comb_scorer.h +++ b/mteval/comb_scorer.h diff --git a/vest/fast_score.cc b/mteval/fast_score.cc index 5ee264a6..5ee264a6 100644 --- a/vest/fast_score.cc +++ b/mteval/fast_score.cc diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc new file mode 100644 index 00000000..2867b36b --- /dev/null +++ b/mteval/mbr_kbest.cc @@ -0,0 +1,138 @@ +#include <iostream> +#include <vector> + +#include <boost/program_options.hpp> + +#include "prob.h" +#include "tdict.h" +#include "scorer.h" +#include "filelib.h" +#include "stringlib.h" + +using namespace std; + +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)") + ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function") + ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from") + ("output_list,L", "Show reranked list as output") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct LossComparer { + bool operator()(const pair<vector<WordID>, double>& a, const pair<vector<WordID>, double>& b) const { + return a.second < b.second; + } +}; + +bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) { + static string cache_id; + static pair<vector<WordID>, prob_t> cache_pair; + list->clear(); + string cur_id; + if (cache_pair.first.size() > 0) { + list->push_back(cache_pair); + cur_id = cache_id; + cache_pair.first.clear(); + } + string line; + string tstr; + while(*in) { + getline(*in, line); + if (line.empty()) continue; + size_t p1 = line.find(" ||| "); + if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } + size_t p2 = line.find(" ||| ", p1 + 4); + if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } + size_t p3 = line.rfind(" ||| "); + cache_id = line.substr(0, p1); + tstr = line.substr(p1 + 5, p2 - p1 - 5); + double val = strtod(line.substr(p3 + 5).c_str(), NULL); + TD::ConvertSentence(tstr, &cache_pair.first); + cache_pair.second.logeq(val); + if (cur_id.empty()) cur_id = cache_id; + if (cur_id == cache_id) { + list->push_back(cache_pair); + *sent_id = cur_id; + cache_pair.first.clear(); + } else { break; } + } + return !list->empty(); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string metric = conf["loss_function"].as<string>(); + const bool output_list = conf.count("output_list") > 0; + const string file = conf["input"].as<string>(); + const double mbr_scale = conf["scale"].as<double>(); + cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; + + ScoreType type = ScoreTypeFromString(metric); + vector<pair<vector<WordID>, prob_t> > list; + ReadFile rf(file); + string sent_id; + while(ReadKBestList(rf.stream(), &sent_id, &list)) { + vector<prob_t> joints(list.size()); + const prob_t max_score = pow(list.front().second, mbr_scale); + prob_t marginal = prob_t::Zero(); + for (int i = 0 ; i < list.size(); ++i) { + const prob_t joint = pow(list[i].second, mbr_scale) / max_score; + joints[i] = joint; + // cerr << "list[" << i << "] joint=" << log(joint) << endl; + marginal += joint; + } + int mbr_idx = -1; + vector<double> mbr_scores(output_list ? list.size() : 0); + double mbr_loss = numeric_limits<double>::max(); + for (int i = 0 ; i < list.size(); ++i) { + vector<vector<WordID> > refs(1, list[i].first); + //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; + ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); + double wl_acc = 0; + for (int j = 0; j < list.size(); ++j) { + if (i != j) { + ScoreP s = scorer->ScoreCandidate(list[j].first); + double loss = 1.0 - s->ComputeScore(); + if (type == TER || type == AER) loss = 1.0 - loss; + double weighted_loss = loss * (joints[j] / marginal); + wl_acc += weighted_loss; + if ((!output_list) && wl_acc > mbr_loss) break; + } + } + if (output_list) mbr_scores[i] = wl_acc; + if (wl_acc < mbr_loss) { + mbr_loss = wl_acc; + mbr_idx = i; + } + } + // cerr << "ML translation: " << TD::GetString(list[0].first) << endl; + cerr << "MBR Best idx: " << mbr_idx << endl; + if (output_list) { + for (int i = 0; i < list.size(); ++i) + list[i].second.logeq(mbr_scores[i]); + sort(list.begin(), list.end(), LossComparer()); + for (int i = 0; i < list.size(); ++i) + cout << sent_id << " ||| " + << TD::GetString(list[i].first) << " ||| " + << log(list[i].second) << endl; + } else { + cout << TD::GetString(list[mbr_idx].first) << endl; + } + } + return 0; +} + diff --git a/vest/scorer.cc b/mteval/scorer.cc index 70fdef34..04eeaa93 100644 --- a/vest/scorer.cc +++ b/mteval/scorer.cc @@ -12,22 +12,15 @@ #include <boost/shared_ptr.hpp> #include "filelib.h" -#include "aligner.h" -#include "viterbi_envelope.h" -#include "error_surface.h" #include "ter.h" #include "aer_scorer.h" #include "comb_scorer.h" #include "tdict.h" #include "stringlib.h" -#include "lattice.h" - using boost::shared_ptr; using namespace std; -const bool minimize_segments = true; // if adjacent segments have equal scores, merge them - void Score::TimesEquals(float scale) { cerr<<"UNIMPLEMENTED except for BLEU (for MIRA): Score::TimesEquals"<<endl;abort(); } @@ -410,77 +403,6 @@ ScoreP SentenceScorer::CreateScoreFromString(const ScoreType type, const string& } } -void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) const { - vector<WordID> prev_trans; - const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs(); - env->resize(ienv.size()); - ScoreP prev_score; - int j = 0; - for (int i = 0; i < ienv.size(); ++i) { - const Segment& seg = *ienv[i]; - vector<WordID> trans; - if (type == AER) { - vector<bool> edges(hg.edges_.size(), false); - seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi - // alignment - ostringstream os; - const string* psrc = this->GetSource(); - if (psrc == NULL) { - cerr << "AER scoring in VEST requires source, but it is missing!\n"; - abort(); - } - size_t pos = psrc->rfind(" ||| "); - if (pos == string::npos) { - cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; - abort(); - } - Lattice src; - Lattice ref; - LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); - LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); - AlignerTools::WriteAlignment(src, ref, hg, &os, true, &edges); - string tstr = os.str(); - TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); - } else { - seg.ConstructTranslation(&trans); - } - // cerr << "Scoring: " << TD::GetString(trans) << endl; - if (trans == prev_trans) { - if (!minimize_segments) { - assert(prev_score); // if this fails, it means - // the decoder can generate null translations - ErrorSegment& out = (*env)[j]; - out.delta = prev_score->GetZero(); - out.x = seg.x; - ++j; - } - // cerr << "Identical translation, skipping scoring\n"; - } else { - ScoreP score = ScoreCandidate(trans); - // cerr << "score= " << score->ComputeScore() << "\n"; - ScoreP cur_delta_p = score->GetZero(); - Score* cur_delta = cur_delta_p.get(); - // just record the score diffs - if (!prev_score) - prev_score = score->GetZero(); - - score->Subtract(*prev_score, cur_delta); - prev_trans.swap(trans); - prev_score = score; - if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { - ErrorSegment& out = (*env)[j]; - out.delta = cur_delta_p; - out.x = seg.x; - ++j; - } - } - } - // cerr << " In segments: " << ienv.size() << endl; - // cerr << "Out segments: " << j << endl; - assert(j > 0); - env->resize(j); -} - void BLEUScore::ScoreDetails(string* details) const { char buf[2000]; vector<float> precs(max(N(),4)); diff --git a/vest/scorer.h b/mteval/scorer.h index 0c8b380f..f18c8c7f 100644 --- a/vest/scorer.h +++ b/mteval/scorer.h @@ -49,7 +49,7 @@ class Score : public boost::intrusive_refcount<Score> { virtual ScoreP Clone() const = 0; protected: Score() { } // we define these explicitly because refcount is noncopyable - Score(Score const& o) { } + Score(Score const&) { } }; //TODO: make sure default copy ctors for score types do what we want. @@ -72,7 +72,6 @@ class SentenceScorer { virtual ~SentenceScorer(); virtual ScoreP GetOne() const; virtual ScoreP GetZero() const; - void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const; virtual ScoreP ScoreCandidate(const Sentence& hyp) const = 0; virtual ScoreP ScoreCCandidate(const Sentence& hyp) const =0; virtual const std::string* GetSource() const; diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc new file mode 100644 index 00000000..a07a8c4b --- /dev/null +++ b/mteval/scorer_test.cc @@ -0,0 +1,182 @@ +#include <iostream> +#include <fstream> +#include <valarray> +#include <gtest/gtest.h> + +#include "tdict.h" +#include "scorer.h" +#include "aer_scorer.h" + +using namespace std; + +class ScorerTest : public testing::Test { + protected: + virtual void SetUp() { + refs0.resize(4); + refs1.resize(4); + TD::ConvertSentence("export of high-tech products in guangdong in first two months this year reached 3.76 billion us dollars", &refs0[0]); + TD::ConvertSentence("guangdong's export of new high technology products amounts to us $ 3.76 billion in first two months of this year", &refs0[1]); + TD::ConvertSentence("guangdong exports us $ 3.76 billion worth of high technology products in the first two months of this year", &refs0[2]); + TD::ConvertSentence("in the first 2 months this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars .", &refs0[3]); + TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter chen ji ) the latest statistics show that from january through february this year , the export of high-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% over the same period last year and accounted for 25.5 \% of the total export in the province .", &refs1[0]); + TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter : chen ji ) -- latest statistic indicates that guangdong's export of new high technology products amounts to us $ 3.76 billion , up 34.8 \% over corresponding period and accounts for 25.5 \% of the total exports of the province .", &refs1[1]); + TD::ConvertSentence("xinhua news agency report of march 16 from guangzhou ( by staff reporter chen ji ) - latest statistics indicate guangdong province exported us $ 3.76 billion worth of high technology products , up 34.8 percent from the same period last year , which account for 25.5 percent of the total exports of the province .", &refs1[2]); + TD::ConvertSentence("guangdong , march 16 , ( xinhua ) -- ( chen ji reports ) as the newest statistics shows , in january and feberuary this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% than last year , making up 25.5 \% of the province's total .", &refs1[3]); + TD::ConvertSentence("one guangdong province will next export us $ 3.76 high-tech product two months first this year 3.76 billion us dollars", &hyp1); + TD::ConvertSentence("xinhua news agency , guangzhou , 16th of march ( reporter chen ) -- latest statistics suggest that guangdong exports new advanced technology product totals $ 3.76 million , 34.8 percent last corresponding period and accounts for 25.5 percent of the total export province .", &hyp2); + } + + virtual void TearDown() { } + + vector<vector<WordID> > refs0; + vector<vector<WordID> > refs1; + vector<WordID> hyp1; + vector<WordID> hyp2; +}; + +TEST_F(ScorerTest, TestCreateFromFiles) { + vector<string> files; + files.push_back("test_data/re.txt.0"); + files.push_back("test_data/re.txt.1"); + files.push_back("test_data/re.txt.2"); + files.push_back("test_data/re.txt.3"); + DocScorer ds(IBM_BLEU, files); +} + +TEST_F(ScorerTest, TestBLEUScorer) { + ScorerP s1 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs0); + ScorerP s2 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs1); + ScoreP b1 = s1->ScoreCandidate(hyp1); + EXPECT_FLOAT_EQ(0.23185077, b1->ComputeScore()); + ScoreP b2 = s2->ScoreCandidate(hyp2); + EXPECT_FLOAT_EQ(0.38101241, b2->ComputeScore()); + b1->PlusEquals(*b2); + EXPECT_FLOAT_EQ(0.348854, b1->ComputeScore()); + EXPECT_FALSE(b1->IsAdditiveIdentity()); + string details; + b1->ScoreDetails(&details); + EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); + cerr << details << endl; + string enc; + b1->Encode(&enc); + ScoreP b3 = SentenceScorer::CreateScoreFromString(IBM_BLEU, enc); + details.clear(); + cerr << "Encoded BLEU score size: " << enc.size() << endl; + b3->ScoreDetails(&details); + cerr << details << endl; + EXPECT_FALSE(b3->IsAdditiveIdentity()); + EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); + ScoreP bz = b3->GetZero(); + EXPECT_TRUE(bz->IsAdditiveIdentity()); +} + +TEST_F(ScorerTest, TestTERScorer) { + ScorerP s1 = SentenceScorer::CreateSentenceScorer(TER, refs0); + ScorerP s2 = SentenceScorer::CreateSentenceScorer(TER, refs1); + string details; + ScoreP t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t1->ComputeScore() << endl; + ScoreP t2 = s2->ScoreCandidate(hyp2); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t2->ComputeScore() << endl; + t1->PlusEquals(*t2); + cerr << t1->ComputeScore() << endl; + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details); + string enc; + t1->Encode(&enc); + ScoreP t3 = SentenceScorer::CreateScoreFromString(TER, enc); + details.clear(); + t3->ScoreDetails(&details); + EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details); + EXPECT_FALSE(t3->IsAdditiveIdentity()); + ScoreP tz = t3->GetZero(); + EXPECT_TRUE(tz->IsAdditiveIdentity()); +} + +TEST_F(ScorerTest, TestTERScorerSimple) { + vector<vector<WordID> > ref(1); + TD::ConvertSentence("1 2 3 A B", &ref[0]); + vector<WordID> hyp; + TD::ConvertSentence("A B 1 2 3", &hyp); + ScorerP s1 = SentenceScorer::CreateSentenceScorer(TER, ref); + string details; + ScoreP t1 = s1->ScoreCandidate(hyp); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; +} + +TEST_F(ScorerTest, TestSERScorerSimple) { + vector<vector<WordID> > ref(1); + TD::ConvertSentence("A B C D", &ref[0]); + vector<WordID> hyp1; + TD::ConvertSentence("A B C", &hyp1); + vector<WordID> hyp2; + TD::ConvertSentence("A B C D", &hyp2); + ScorerP s1 = SentenceScorer::CreateSentenceScorer(SER, ref); + string details; + ScoreP t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + ScoreP t2 = s1->ScoreCandidate(hyp2); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + t2->PlusEquals(*t1); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; +} + +TEST_F(ScorerTest, TestCombiScorer) { + ScorerP s1 = SentenceScorer::CreateSentenceScorer(BLEU_minus_TER_over_2, refs0); + string details; + ScoreP t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t1->ComputeScore() << endl; + string enc; + t1->Encode(&enc); + ScoreP t2 = SentenceScorer::CreateScoreFromString(BLEU_minus_TER_over_2, enc); + details.clear(); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + ScoreP cz = t2->GetZero(); + EXPECT_FALSE(t2->IsAdditiveIdentity()); + EXPECT_TRUE(cz->IsAdditiveIdentity()); + cz->PlusEquals(*t2); + EXPECT_FALSE(cz->IsAdditiveIdentity()); + string d2; + cz->ScoreDetails(&d2); + EXPECT_EQ(d2, details); +} + +TEST_F(ScorerTest, AERTest) { + vector<vector<WordID> > refs0(1); + TD::ConvertSentence("0-0 2-1 1-2 3-3", &refs0[0]); + + vector<WordID> hyp; + TD::ConvertSentence("0-0 1-1", &hyp); + AERScorer* as = new AERScorer(refs0); + ScoreP x = as->ScoreCandidate(hyp); + string details; + x->ScoreDetails(&details); + cerr << details << endl; + string enc; + x->Encode(&enc); + delete as; + cerr << "ENC size: " << enc.size() << endl; + ScoreP y = SentenceScorer::CreateScoreFromString(AER, enc); + string d2; + y->ScoreDetails(&d2); + cerr << d2 << endl; + EXPECT_EQ(d2, details); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/vest/ter.cc b/mteval/ter.cc index cacc5b00..cacc5b00 100644 --- a/vest/ter.cc +++ b/mteval/ter.cc diff --git a/vest/ter.h b/mteval/ter.h index 43314791..43314791 100644 --- a/vest/ter.h +++ b/mteval/ter.h diff --git a/mteval/test_data/re.txt.0 b/mteval/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/mteval/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/mteval/test_data/re.txt.1 b/mteval/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/mteval/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/mteval/test_data/re.txt.2 b/mteval/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/mteval/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/mteval/test_data/re.txt.3 b/mteval/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/mteval/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/training/Makefile.am b/training/Makefile.am index 490de774..48b19932 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -14,37 +14,36 @@ noinst_PROGRAMS = \ optimize_test atools_SOURCES = atools.cc -atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz model1_SOURCES = model1.cc -model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz optimize_test_SOURCES = optimize_test.cc optimize.cc -optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz lbfgs_test_SOURCES = lbfgs_test.cc -lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc -mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc -mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc -mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc -mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval diff --git a/training/atools.cc b/training/atools.cc index af62804d..805e3c1d 100644 --- a/training/atools.cc +++ b/training/atools.cc @@ -9,6 +9,7 @@ #include "filelib.h" #include "aligner.h" +#include "alignment_pharaoh.h" namespace po = boost::program_options; using namespace std; @@ -349,9 +350,9 @@ int main(int argc, char **argv) { } if (line1.empty() && !*in1) break; shared_ptr<Array2D<bool> > out(new Array2D<bool>); - shared_ptr<Array2D<bool> > a1 = AlignerTools::ReadPharaohAlignmentGrid(line1); + shared_ptr<Array2D<bool> > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1); if (in2) { - shared_ptr<Array2D<bool> > a2 = AlignerTools::ReadPharaohAlignmentGrid(line2); + shared_ptr<Array2D<bool> > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2); cmd.Apply(*a1, *a2, out.get()); } else { Array2D<bool> dummy; @@ -359,7 +360,7 @@ int main(int argc, char **argv) { } if (cmd.Result() == 1) { - AlignerTools::SerializePharaohFormat(*out, &cout); + AlignmentPharaoh::SerializePharaohFormat(*out, &cout); } } if (cmd.Result() == 2) diff --git a/utils/Makefile.am b/utils/Makefile.am new file mode 100644 index 00000000..e513febd --- /dev/null +++ b/utils/Makefile.am @@ -0,0 +1,38 @@ +if HAVE_GTEST +noinst_PROGRAMS = \ + dict_test \ + weights_test \ + logval_test \ + small_vector_test +endif + +noinst_LIBRARIES = libutils.a + +libutils_a_SOURCES = \ + alignment_pharaoh.cc \ + b64tools.cc \ + dict.cc \ + tdict.cc \ + fdict.cc \ + gzstream.cc \ + filelib.cc \ + stringlib.cc \ + sparse_vector.cc \ + timing_stats.cc \ + weights.cc + +dict_test_SOURCES = dict_test.cc +dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +weights_test_SOURCES = weights_test.cc +weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +logval_test_SOURCES = logval_test.cc +logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +small_vector_test_SOURCES = small_vector_test.cc +small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) + +AM_LDFLAGS = libutils.a -lz + +################################################################ +# do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I. +################################################################ diff --git a/utils/alignment_pharaoh.cc b/utils/alignment_pharaoh.cc new file mode 100644 index 00000000..890ff565 --- /dev/null +++ b/utils/alignment_pharaoh.cc @@ -0,0 +1,77 @@ +#include "utils/alignment_pharaoh.h" + +#include <set> + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr<Array2D<bool> > AlignmentPharaoh::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + size_t pos = al.rfind(" ||| "); + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + if(al[i] != '-') { + cerr << "BAD ALIGNMENT: " << al << endl; + abort(); + } + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr<Array2D<bool> > grid(new Array2D<bool>(max_x + 1, max_y + 1)); + i = 0; + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignmentPharaoh::SerializePharaohFormat(const Array2D<bool>& alignment, ostream* out) { + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) (*out) << ' '; else need_space = true; + (*out) << i << '-' << j; + } + (*out) << endl; +} + diff --git a/utils/alignment_pharaoh.h b/utils/alignment_pharaoh.h new file mode 100644 index 00000000..d111c8bf --- /dev/null +++ b/utils/alignment_pharaoh.h @@ -0,0 +1,14 @@ +#ifndef _PHARAOH_ALIGNMENT_H_ +#define _PHARAOH_ALIGNMENT_H_ + +#include <string> +#include <iostream> +#include <boost/shared_ptr.hpp> +#include "array2d.h" + +struct AlignmentPharaoh { + static boost::shared_ptr<Array2D<bool> > ReadPharaohAlignmentGrid(const std::string& al); + static void SerializePharaohFormat(const Array2D<bool>& alignment, std::ostream* out); +}; + +#endif diff --git a/decoder/array2d.h b/utils/array2d.h index e63eda0d..e63eda0d 100644 --- a/decoder/array2d.h +++ b/utils/array2d.h diff --git a/utils/b64tools.cc b/utils/b64tools.cc new file mode 100644 index 00000000..5512f975 --- /dev/null +++ b/utils/b64tools.cc @@ -0,0 +1,59 @@ +#include <iostream> +#include <cassert> + +using namespace std; + +namespace B64 { + +static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; + +static void encodeblock(const unsigned char* in, ostream* os, int len) { + char out[4]; + out[0] = cb64[ in[0] >> 2 ]; + out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; + out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); + out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); + os->write(out, 4); +} + +void b64encode(const char* data, const size_t size, ostream* out) { + size_t cur = 0; + while(cur < size) { + int len = min(static_cast<size_t>(3), size - cur); + encodeblock(reinterpret_cast<const unsigned char*>(&data[cur]), out, len); + cur += len; + } +} + +static void decodeblock(const unsigned char* in, unsigned char* out) { + out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); + out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); + out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); +} + +bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { + size_t cur = 0; + size_t ocur = 0; + unsigned char in[4]; + while(cur < insize) { + assert(ocur < outsize); + for (int i = 0; i < 4; ++i) { + unsigned char v = data[cur]; + v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]); + if (!v) { + cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; + return false; + } + v = (unsigned char) ((v == '$') ? '\0' : v - 61); + if (v) in[i] = v - 1; else in[i] = 0; + ++cur; + } + decodeblock(in, reinterpret_cast<unsigned char*>(&out[ocur])); + ocur += 3; + } + return true; +} + +} + diff --git a/utils/b64tools.h b/utils/b64tools.h new file mode 100644 index 00000000..c821fc8f --- /dev/null +++ b/utils/b64tools.h @@ -0,0 +1,9 @@ +#ifndef _B64_TOOLS_H_ +#define _B64_TOOLS_H_ + +namespace B64 { + bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); + void b64encode(const char* data, const size_t size, std::ostream* out); +} + +#endif diff --git a/decoder/dict.cc b/utils/dict.cc index 2d6986c8..2d6986c8 100644 --- a/decoder/dict.cc +++ b/utils/dict.cc diff --git a/decoder/dict.h b/utils/dict.h index 348a97e3..348a97e3 100644 --- a/decoder/dict.h +++ b/utils/dict.h diff --git a/decoder/dict_test.cc b/utils/dict_test.cc index 694877fa..2049ec27 100644 --- a/decoder/dict_test.cc +++ b/utils/dict_test.cc @@ -5,9 +5,6 @@ #include <iostream> #include <gtest/gtest.h> #include <cassert> -#include "filelib.h" - -#include "tdict.h" using namespace std; diff --git a/decoder/fdict.cc b/utils/fdict.cc index baa0b552..baa0b552 100644 --- a/decoder/fdict.cc +++ b/utils/fdict.cc diff --git a/decoder/fdict.h b/utils/fdict.h index f9673023..f9673023 100644 --- a/decoder/fdict.h +++ b/utils/fdict.h diff --git a/utils/feature_accum.h b/utils/feature_accum.h new file mode 100755 index 00000000..851b29db --- /dev/null +++ b/utils/feature_accum.h @@ -0,0 +1,129 @@ +#ifndef FEATURE_ACCUM_H +#define FEATURE_ACCUM_H + +#include "ff.h" +#include "sparse_vector.h" +#include "value_array.h" + +struct SparseFeatureAccumulator : public FeatureVector { + typedef FeatureVector State; + SparseFeatureAccumulator() { } + template <class FF> + FeatureVector const& describe(FF const& ) { return *this; } + void Store(FeatureVector *fv) const { + fv->set_from(*this); + } + template <class FF> + void Store(FF const& /* ff */,FeatureVector *fv) const { + fv->set_from(*this); + } + template <class FF> + void Add(FF const& /* ff */,FeatureVector const& fv) { + (*this)+=fv; + } + void Add(FeatureVector const& fv) { + (*this)+=fv; + } + /* + SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {} + FeatureAccumulator(Features const& fids) {} + FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {} + void Add(Features const& fids,FeatureVector const& fv) { + *this += fv; + } + */ + void Add(int i,Featval v) { + (*this)[i]+=v; + } + void Add(Features const& fids,int i,Featval v) { + (*this)[i]+=v; + } +}; + +struct SingleFeatureAccumulator { + typedef Featval State; + typedef SingleFeatureAccumulator Self; + State v; + /* + void operator +=(State const& o) { + v+=o; + } + */ + void operator +=(Self const& s) { + v+=s.v; + } + SingleFeatureAccumulator() : v() {} + template <class FF> + State const& describe(FF const& ) const { return v; } + + template <class FF> + void Store(FF const& ff,FeatureVector *fv) const { + fv->set_value(ff.fid_,v); + } + void Store(Features const& fids,FeatureVector *fv) const { + assert(fids.size()==1); + fv->set_value(fids[0],v); + } + /* + SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); } + SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv) + { + assert(fids.size()==1); + v=fv.get_singleton(); + } + */ + + template <class FF> + void Add(FF const& ff,FeatureVector const& fv) { + v+=fv.get(ff.fid_); + } + void Add(FeatureVector const& fv) { + v+=fv.get_singleton(); + } + + void Add(Features const& fids,FeatureVector const& fv) { + v += fv.get(fids[0]); + } + void Add(Featval dv) { + v+=dv; + } + void Add(int,Featval dv) { + v+=dv; + } + void Add(FeatureVector const& fids,int i,Featval dv) { + assert(fids.size()==1 && i==0); + v+=dv; + } +}; + + +#if 0 +// omitting this so we can default construct an accum. might be worth resurrecting in the future +struct ArrayFeatureAccumulator : public ValueArray<Featval> { + typedef ValueArray<Featval> State; + template <class Fsa> + ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) { + for (int i=0,e=i<fids.size();i<e;++i) + (*this)[i]=fv.get(i); + } + State const& describe(Features const& fids) const { return *this; } + void Store(Features const& fids,FeatureVector *fv) const { + assert(fids.size()==size()); + for (int i=0,e=i<fids.size();i<e;++i) + fv->set_value(fids[i],(*this)[i]); + } + void Add(Features const& fids,FeatureVector const& fv) { + for (int i=0,e=i<fids.size();i<e;++i) + (*this)[i]+=fv.get(i); + } + void Add(FeatureVector const& fids,int i,Featval v) { + (*this)[i]+=v; + } +}; +#endif + + +#endif diff --git a/decoder/feature_vector.h b/utils/feature_vector.h index be378a6a..be378a6a 100755 --- a/decoder/feature_vector.h +++ b/utils/feature_vector.h diff --git a/decoder/filelib.cc b/utils/filelib.cc index 79ad2847..79ad2847 100644 --- a/decoder/filelib.cc +++ b/utils/filelib.cc diff --git a/decoder/filelib.h b/utils/filelib.h index b9fef9a7..b9fef9a7 100644 --- a/decoder/filelib.h +++ b/utils/filelib.h diff --git a/decoder/gzstream.cc b/utils/gzstream.cc index 88cd1bd2..88cd1bd2 100644 --- a/decoder/gzstream.cc +++ b/utils/gzstream.cc diff --git a/decoder/gzstream.h b/utils/gzstream.h index a7effd90..a7effd90 100644 --- a/decoder/gzstream.h +++ b/utils/gzstream.h diff --git a/decoder/hash.h b/utils/hash.h index 3a60a429..3a60a429 100755 --- a/decoder/hash.h +++ b/utils/hash.h diff --git a/decoder/have_64_bits.h b/utils/have_64_bits.h index d1e6064f..d1e6064f 100755 --- a/decoder/have_64_bits.h +++ b/utils/have_64_bits.h diff --git a/decoder/int_or_pointer.h b/utils/int_or_pointer.h index 4b6a9e4a..4b6a9e4a 100755 --- a/decoder/int_or_pointer.h +++ b/utils/int_or_pointer.h diff --git a/decoder/intrusive_refcount.hpp b/utils/intrusive_refcount.hpp index 4a4b0187..4a4b0187 100755 --- a/decoder/intrusive_refcount.hpp +++ b/utils/intrusive_refcount.hpp diff --git a/decoder/logval.h b/utils/logval.h index 37f14ae5..37f14ae5 100644 --- a/decoder/logval.h +++ b/utils/logval.h diff --git a/decoder/logval_test.cc b/utils/logval_test.cc index 1a23177d..1a23177d 100644 --- a/decoder/logval_test.cc +++ b/utils/logval_test.cc diff --git a/decoder/murmur_hash.h b/utils/murmur_hash.h index 8dbd7807..8dbd7807 100755 --- a/decoder/murmur_hash.h +++ b/utils/murmur_hash.h diff --git a/decoder/null_deleter.h b/utils/null_deleter.h index 082ab453..082ab453 100755 --- a/decoder/null_deleter.h +++ b/utils/null_deleter.h diff --git a/decoder/prob.h b/utils/prob.h index bc297870..bc297870 100644 --- a/decoder/prob.h +++ b/utils/prob.h diff --git a/decoder/sampler.h b/utils/sampler.h index 5fef45d0..5fef45d0 100644 --- a/decoder/sampler.h +++ b/utils/sampler.h diff --git a/decoder/small_vector.h b/utils/small_vector.h index 25c52359..25c52359 100644 --- a/decoder/small_vector.h +++ b/utils/small_vector.h diff --git a/decoder/small_vector_test.cc b/utils/small_vector_test.cc index d1d8dcab..d1d8dcab 100644 --- a/decoder/small_vector_test.cc +++ b/utils/small_vector_test.cc diff --git a/decoder/sparse_vector.cc b/utils/sparse_vector.cc index 4035b9ef..6e42a216 100644 --- a/decoder/sparse_vector.cc +++ b/utils/sparse_vector.cc @@ -3,7 +3,7 @@ #include <iostream> #include <cstring> -#include "hg_io.h" +#include "b64tools.h" using namespace std; diff --git a/decoder/sparse_vector.h b/utils/sparse_vector.h index 207489c5..207489c5 100644 --- a/decoder/sparse_vector.h +++ b/utils/sparse_vector.h diff --git a/decoder/static_utoa.h b/utils/static_utoa.h index fe5f6d92..fe5f6d92 100755 --- a/decoder/static_utoa.h +++ b/utils/static_utoa.h diff --git a/decoder/stringlib.cc b/utils/stringlib.cc index 3e52ae87..7aaee9f0 100644 --- a/decoder/stringlib.cc +++ b/utils/stringlib.cc @@ -6,8 +6,6 @@ #include <iostream> #include <map> -#include "lattice.h" - using namespace std; void ParseTranslatorInput(const string& line, string* input, string* ref) { @@ -31,15 +29,6 @@ void ParseTranslatorInput(const string& line, string* input, string* ref) { } } -void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { - string sref; - ParseTranslatorInput(line, input, &sref); - if (sref.size() > 0) { - assert(ref); - LatticeTools::ConvertTextOrPLF(sref, ref); - } -} - void ProcessAndStripSGML(string* pline, map<string, string>* out) { map<string, string>& meta = *out; string& line = *pline; diff --git a/decoder/stringlib.h b/utils/stringlib.h index 84e95d44..84e95d44 100644 --- a/decoder/stringlib.h +++ b/utils/stringlib.h diff --git a/decoder/stringlib_test.cc b/utils/stringlib_test.cc index f66cdbeb..f66cdbeb 100755 --- a/decoder/stringlib_test.cc +++ b/utils/stringlib_test.cc diff --git a/decoder/tdict.cc b/utils/tdict.cc index 1f68feae..1f68feae 100644 --- a/decoder/tdict.cc +++ b/utils/tdict.cc diff --git a/decoder/tdict.h b/utils/tdict.h index a7b3ee1c..a7b3ee1c 100644 --- a/decoder/tdict.h +++ b/utils/tdict.h diff --git a/decoder/test_data/weights b/utils/test_data/weights index ea70229c..ea70229c 100644 --- a/decoder/test_data/weights +++ b/utils/test_data/weights diff --git a/decoder/threadlocal.h b/utils/threadlocal.h index d79f5d9d..d79f5d9d 100755 --- a/decoder/threadlocal.h +++ b/utils/threadlocal.h diff --git a/decoder/timing_stats.cc b/utils/timing_stats.cc index fc8e9df1..fc8e9df1 100644 --- a/decoder/timing_stats.cc +++ b/utils/timing_stats.cc diff --git a/decoder/timing_stats.h b/utils/timing_stats.h index 0a9f7656..0a9f7656 100644 --- a/decoder/timing_stats.h +++ b/utils/timing_stats.h diff --git a/decoder/weights.cc b/utils/weights.cc index 84647585..84647585 100644 --- a/decoder/weights.cc +++ b/utils/weights.cc diff --git a/decoder/weights.h b/utils/weights.h index f19aa3ce..f19aa3ce 100644 --- a/decoder/weights.h +++ b/utils/weights.h diff --git a/decoder/weights_test.cc b/utils/weights_test.cc index aa6b3db2..8a4c26ef 100644 --- a/decoder/weights_test.cc +++ b/utils/weights_test.cc @@ -5,7 +5,6 @@ #include <gtest/gtest.h> #include "weights.h" #include "tdict.h" -#include "hg.h" using namespace std; diff --git a/decoder/wordid.h b/utils/wordid.h index fb50bcc1..fb50bcc1 100644 --- a/decoder/wordid.h +++ b/utils/wordid.h diff --git a/vest/Makefile.am b/vest/Makefile.am index abdc8146..b869672b 100644 --- a/vest/Makefile.am +++ b/vest/Makefile.am @@ -1,15 +1,12 @@ bin_PROGRAMS = \ - mbr_kbest \ mr_vest_map \ mr_vest_reduce \ mr_vest_generate_mapper_input \ - fast_score \ sentserver \ sentclient if HAVE_GTEST noinst_PROGRAMS = \ - scorer_test \ lo_test endif @@ -17,25 +14,16 @@ sentserver_SOURCES = sentserver.c sentclient_SOURCES = sentclient.c -mbr_kbest_SOURCES = mbr_kbest.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc -mbr_kbest_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc +mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc -fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc +mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc $(top_srcdir)/decoder/timing_stats.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc +mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc aer_scorer.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_reduce_SOURCES = error_surface.cc aer_scorer.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz - -scorer_test_SOURCES = aer_scorer.cc scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc -scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz - -lo_test_SOURCES = lo_test.cc scorer.cc ter.cc aer_scorer.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/vest/lo_test.cc b/vest/lo_test.cc index 577113bb..9200eb34 100644 --- a/vest/lo_test.cc +++ b/vest/lo_test.cc @@ -5,6 +5,7 @@ #include <boost/shared_ptr.hpp> #include <gtest/gtest.h> +#include "ces.h" #include "fdict.h" #include "hg.h" #include "kbest.h" @@ -166,8 +167,8 @@ TEST_F(OptTest, TestS1) { envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf); vector<ErrorSurface> es(2); - scorer1->ComputeErrorSurface(envs[0], &es[0], IBM_BLEU, hg); - scorer2->ComputeErrorSurface(envs[1], &es[1], IBM_BLEU, hg2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); cerr << envs[0].size() << " " << envs[1].size() << endl; cerr << es[0].size() << " " << es[1].size() << endl; envs.clear(); diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index b3acc5dd..1506a99f 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -6,6 +6,7 @@ #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include "ces.h" #include "filelib.h" #include "stringlib.h" #include "sparse_vector.h" @@ -13,7 +14,7 @@ #include "viterbi_envelope.h" #include "inside_outside.h" #include "error_surface.h" -#include "hg.h" +#include "b64tools.h" #include "hg_io.h" using namespace std; @@ -90,7 +91,7 @@ int main(int argc, char** argv) { ViterbiEnvelopeWeightFunction wf(origin, axis); ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf); ErrorSurface es; - ds[sent_id]->ComputeErrorSurface(ve, &es, type, hg); + ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; // cerr << "Error surface has " << es.size() << " segments\n"; string val; diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc index 5efcc19a..3df52020 100644 --- a/vest/mr_vest_reduce.cc +++ b/vest/mr_vest_reduce.cc @@ -9,7 +9,7 @@ #include "sparse_vector.h" #include "error_surface.h" #include "line_optimizer.h" -#include "hg_io.h" +#include "b64tools.h" using namespace std; namespace po = boost::program_options; |