summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile.am2
-rw-r--r--configure.ac2
-rw-r--r--decoder/Makefile.am37
-rw-r--r--decoder/aligner.cc74
-rw-r--r--decoder/aligner.h2
-rw-r--r--decoder/cdec.cc11
-rw-r--r--decoder/ff_bleu.cc2
-rw-r--r--decoder/ff_lm.cc2
-rw-r--r--decoder/ff_wordalign.cc3
-rw-r--r--decoder/hg.h4
-rw-r--r--decoder/hg_io.cc53
-rw-r--r--decoder/hg_io.h5
-rwxr-xr-xdecoder/oracle_bleu.h2
-rw-r--r--decoder/phrasebased_translator.cc4
-rw-r--r--decoder/sentence_metadata.h2
-rw-r--r--extools/Makefile.am12
-rw-r--r--extools/sg_lexer.l3
-rw-r--r--gi/clda/src/Makefile.am4
-rw-r--r--gi/pyp-topics/src/Makefile.am8
-rw-r--r--gi/pyp-topics/src/contexts_corpus.hh2
-rw-r--r--gi/pyp-topics/src/contexts_lexer.h2
-rw-r--r--gi/pyp-topics/src/contexts_lexer.l2
-rw-r--r--mteval/Makefile.am23
-rw-r--r--mteval/aer_scorer.cc (renamed from vest/aer_scorer.cc)6
-rw-r--r--mteval/aer_scorer.h (renamed from vest/aer_scorer.h)0
-rw-r--r--mteval/comb_scorer.cc (renamed from vest/comb_scorer.cc)0
-rw-r--r--mteval/comb_scorer.h (renamed from vest/comb_scorer.h)0
-rw-r--r--mteval/fast_score.cc (renamed from vest/fast_score.cc)0
-rw-r--r--mteval/mbr_kbest.cc138
-rw-r--r--mteval/scorer.cc (renamed from vest/scorer.cc)78
-rw-r--r--mteval/scorer.h (renamed from vest/scorer.h)3
-rw-r--r--mteval/scorer_test.cc182
-rw-r--r--mteval/ter.cc (renamed from vest/ter.cc)0
-rw-r--r--mteval/ter.h (renamed from vest/ter.h)0
-rw-r--r--mteval/test_data/re.txt.05
-rw-r--r--mteval/test_data/re.txt.15
-rw-r--r--mteval/test_data/re.txt.25
-rw-r--r--mteval/test_data/re.txt.35
-rw-r--r--training/Makefile.am25
-rw-r--r--training/atools.cc7
-rw-r--r--utils/Makefile.am38
-rw-r--r--utils/alignment_pharaoh.cc77
-rw-r--r--utils/alignment_pharaoh.h14
-rw-r--r--utils/array2d.h (renamed from decoder/array2d.h)0
-rw-r--r--utils/b64tools.cc59
-rw-r--r--utils/b64tools.h9
-rw-r--r--utils/dict.cc (renamed from decoder/dict.cc)0
-rw-r--r--utils/dict.h (renamed from decoder/dict.h)0
-rw-r--r--utils/dict_test.cc (renamed from decoder/dict_test.cc)3
-rw-r--r--utils/fdict.cc (renamed from decoder/fdict.cc)0
-rw-r--r--utils/fdict.h (renamed from decoder/fdict.h)0
-rwxr-xr-xutils/feature_accum.h129
-rwxr-xr-xutils/feature_vector.h (renamed from decoder/feature_vector.h)0
-rw-r--r--utils/filelib.cc (renamed from decoder/filelib.cc)0
-rw-r--r--utils/filelib.h (renamed from decoder/filelib.h)0
-rw-r--r--utils/gzstream.cc (renamed from decoder/gzstream.cc)0
-rw-r--r--utils/gzstream.h (renamed from decoder/gzstream.h)0
-rwxr-xr-xutils/hash.h (renamed from decoder/hash.h)0
-rwxr-xr-xutils/have_64_bits.h (renamed from decoder/have_64_bits.h)0
-rwxr-xr-xutils/int_or_pointer.h (renamed from decoder/int_or_pointer.h)0
-rwxr-xr-xutils/intrusive_refcount.hpp (renamed from decoder/intrusive_refcount.hpp)0
-rw-r--r--utils/logval.h (renamed from decoder/logval.h)0
-rw-r--r--utils/logval_test.cc (renamed from decoder/logval_test.cc)0
-rwxr-xr-xutils/murmur_hash.h (renamed from decoder/murmur_hash.h)0
-rwxr-xr-xutils/null_deleter.h (renamed from decoder/null_deleter.h)0
-rw-r--r--utils/prob.h (renamed from decoder/prob.h)0
-rw-r--r--utils/sampler.h (renamed from decoder/sampler.h)0
-rw-r--r--utils/small_vector.h (renamed from decoder/small_vector.h)0
-rw-r--r--utils/small_vector_test.cc (renamed from decoder/small_vector_test.cc)0
-rw-r--r--utils/sparse_vector.cc (renamed from decoder/sparse_vector.cc)2
-rw-r--r--utils/sparse_vector.h (renamed from decoder/sparse_vector.h)0
-rwxr-xr-xutils/static_utoa.h (renamed from decoder/static_utoa.h)0
-rw-r--r--utils/stringlib.cc (renamed from decoder/stringlib.cc)11
-rw-r--r--utils/stringlib.h (renamed from decoder/stringlib.h)0
-rwxr-xr-xutils/stringlib_test.cc (renamed from decoder/stringlib_test.cc)0
-rw-r--r--utils/tdict.cc (renamed from decoder/tdict.cc)0
-rw-r--r--utils/tdict.h (renamed from decoder/tdict.h)0
-rw-r--r--utils/test_data/weights (renamed from decoder/test_data/weights)0
-rwxr-xr-xutils/threadlocal.h (renamed from decoder/threadlocal.h)0
-rw-r--r--utils/timing_stats.cc (renamed from decoder/timing_stats.cc)0
-rw-r--r--utils/timing_stats.h (renamed from decoder/timing_stats.h)0
-rw-r--r--utils/weights.cc (renamed from decoder/weights.cc)0
-rw-r--r--utils/weights.h (renamed from decoder/weights.h)0
-rw-r--r--utils/weights_test.cc (renamed from decoder/weights_test.cc)1
-rw-r--r--utils/wordid.h (renamed from decoder/wordid.h)0
-rw-r--r--vest/Makefile.am30
-rw-r--r--vest/lo_test.cc5
-rw-r--r--vest/mr_vest_map.cc5
-rw-r--r--vest/mr_vest_reduce.cc2
89 files changed, 772 insertions, 333 deletions
diff --git a/Makefile.am b/Makefile.am
index e82e2352..98c2561e 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -1,4 +1,4 @@
-SUBDIRS = decoder training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
+SUBDIRS = utils mteval decoder training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
AUTOMAKE_OPTIONS = foreign
ACLOCAL_AMFLAGS = -I m4
diff --git a/configure.ac b/configure.ac
index e627c1cc..302eebed 100644
--- a/configure.ac
+++ b/configure.ac
@@ -76,4 +76,4 @@ then
AM_CONDITIONAL([RAND_LM], true)
fi
-AC_OUTPUT(Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile)
+AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile)
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 68a7d765..f514b340 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -2,24 +2,16 @@ bin_PROGRAMS = cdec
if HAVE_GTEST
noinst_PROGRAMS = \
- dict_test \
- weights_test \
trule_test \
hg_test \
ff_test \
- logval_test \
parser_test \
- grammar_test \
- small_vector_test
+ grammar_test
endif
-cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc timing_stats.cc
-small_vector_test_SOURCES = small_vector_test.cc
-small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
+cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc
parser_test_SOURCES = parser_test.cc
parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
-dict_test_SOURCES = dict_test.cc
-dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
ff_test_SOURCES = ff_test.cc
ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
grammar_test_SOURCES = grammar_test.cc
@@ -28,15 +20,12 @@ hg_test_SOURCES = hg_test.cc
hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
trule_test_SOURCES = trule_test.cc
trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
-weights_test_SOURCES = weights_test.cc
-weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
-logval_test_SOURCES = logval_test.cc
-logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
-LDADD = libcdec.a
+LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I..
-AM_LDFLAGS = -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -I../mteval -I../utils
+
+AM_LDFLAGS = ../utils/libutils.a -lz
rule_lexer.cc: rule_lexer.l
$(LEX) -s -CF -8 -o$@ $<
@@ -49,7 +38,6 @@ libcdec_a_SOURCES = \
rule_lexer.cc \
fst_translator.cc \
csplit.cc \
- dict.cc \
translator.cc \
scfg_translator.cc \
hg.cc \
@@ -58,17 +46,10 @@ libcdec_a_SOURCES = \
viterbi.cc \
lattice.cc \
aligner.cc \
- gzstream.cc \
apply_models.cc \
earley_composer.cc \
phrasetable_fst.cc \
- sparse_vector.cc \
trule.cc \
- filelib.cc \
- stringlib.cc \
- fdict.cc \
- tdict.cc \
- weights.cc \
ttables.cc \
ff.cc \
ff_lm.cc \
@@ -78,12 +59,6 @@ libcdec_a_SOURCES = \
ff_tagger.cc \
ff_bleu.cc \
ff_factory.cc \
- ../vest/scorer.cc \
- ../vest/ter.cc \
- ../vest/aer_scorer.cc \
- ../vest/comb_scorer.cc \
- ../vest/error_surface.cc \
- ../vest/viterbi_envelope.cc \
freqdict.cc \
lexalign.cc \
lextrans.cc \
diff --git a/decoder/aligner.cc b/decoder/aligner.cc
index b089f52e..92431be4 100644
--- a/decoder/aligner.cc
+++ b/decoder/aligner.cc
@@ -5,81 +5,11 @@
#include "sentence_metadata.h"
#include "inside_outside.h"
#include "viterbi.h"
+#include "alignment_pharaoh.h"
#include <set>
using namespace std;
-static bool is_digit(char x) { return x >= '0' && x <= '9'; }
-
-boost::shared_ptr<Array2D<bool> > AlignerTools::ReadPharaohAlignmentGrid(const string& al) {
- int max_x = 0;
- int max_y = 0;
- int i = 0;
- size_t pos = al.rfind(" ||| ");
- if (pos != string::npos) { i = pos + 5; }
- while (i < al.size()) {
- if (al[i] == '\n' || al[i] == '\r') break;
- int x = 0;
- while(i < al.size() && is_digit(al[i])) {
- x *= 10;
- x += al[i] - '0';
- ++i;
- }
- if (x > max_x) max_x = x;
- assert(i < al.size());
- if(al[i] != '-') {
- cerr << "BAD ALIGNMENT: " << al << endl;
- abort();
- }
- ++i;
- int y = 0;
- while(i < al.size() && is_digit(al[i])) {
- y *= 10;
- y += al[i] - '0';
- ++i;
- }
- if (y > max_y) max_y = y;
- while(i < al.size() && al[i] == ' ') { ++i; }
- }
-
- boost::shared_ptr<Array2D<bool> > grid(new Array2D<bool>(max_x + 1, max_y + 1));
- i = 0;
- if (pos != string::npos) { i = pos + 5; }
- while (i < al.size()) {
- if (al[i] == '\n' || al[i] == '\r') break;
- int x = 0;
- while(i < al.size() && is_digit(al[i])) {
- x *= 10;
- x += al[i] - '0';
- ++i;
- }
- assert(i < al.size());
- assert(al[i] == '-');
- ++i;
- int y = 0;
- while(i < al.size() && is_digit(al[i])) {
- y *= 10;
- y += al[i] - '0';
- ++i;
- }
- (*grid)(x, y) = true;
- while(i < al.size() && al[i] == ' ') { ++i; }
- }
- // cerr << *grid << endl;
- return grid;
-}
-
-void AlignerTools::SerializePharaohFormat(const Array2D<bool>& alignment, ostream* out) {
- bool need_space = false;
- for (int i = 0; i < alignment.width(); ++i)
- for (int j = 0; j < alignment.height(); ++j)
- if (alignment(i,j)) {
- if (need_space) (*out) << ' '; else need_space = true;
- (*out) << i << '-' << j;
- }
- (*out) << endl;
-}
-
// used with lexical models since they may not fully generate the
// source string
void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g,
@@ -317,6 +247,6 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice,
cerr << grid << endl;
}
(*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| ";
- SerializePharaohFormat(grid, out);
+ AlignmentPharaoh::SerializePharaohFormat(grid, out);
};
diff --git a/decoder/aligner.h b/decoder/aligner.h
index cd159119..a088ba6c 100644
--- a/decoder/aligner.h
+++ b/decoder/aligner.h
@@ -10,8 +10,6 @@ class Hypergraph;
class SentenceMetadata;
struct AlignerTools {
- static boost::shared_ptr<Array2D<bool> > ReadPharaohAlignmentGrid(const std::string& al);
- static void SerializePharaohFormat(const Array2D<bool>& alignment, std::ostream* out);
// assumption: g contains derivations of input/ref and
// ONLY input/ref.
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index 8c4a25e0..3633febd 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -34,7 +34,7 @@
#include "inside_outside.h"
#include "exp_semiring.h"
#include "sentence_metadata.h"
-#include "../vest/scorer.h"
+#include "scorer.h"
#include "apply_fsa_models.h"
#include "program_options.h"
#include "cfg_options.h"
@@ -59,6 +59,15 @@ void ShowBanner() {
cerr << "cdec v1.0 (c) 2009-2010 by Chris Dyer\n";
}
+// Runs ParseTranslatorInput on `line`, filling *input and a temporary
+// reference string. When that reference string is non-empty it is converted
+// into *ref with LatticeTools::ConvertTextOrPLF; otherwise *ref is untouched.
+void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) {
+  string ref_text;
+  ParseTranslatorInput(line, input, &ref_text);
+  if (ref_text.empty()) return;
+  // A reference is present, so the caller must have supplied storage for it.
+  assert(ref);
+  LatticeTools::ConvertTextOrPLF(ref_text, ref);
+}
+
void ConvertSV(const SparseVector<prob_t>& src, SparseVector<double>* trg) {
for (SparseVector<prob_t>::const_iterator it = src.begin(); it != src.end(); ++it)
trg->set_value(it->first, it->second);
diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc
index 77989331..aa4e6d85 100644
--- a/decoder/ff_bleu.cc
+++ b/decoder/ff_bleu.cc
@@ -18,7 +18,7 @@ char const* bleu_usage_verbose="Uses feature id 0! Make sure there are no other
#include "hg.h"
#include "stringlib.h"
#include "sentence_metadata.h"
-#include "../vest/scorer.h"
+#include "scorer.h"
using namespace std;
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index f3e65cb7..a9929253 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -728,7 +728,7 @@ LanguageModelRandLM::LanguageModelRandLM(const string& param) :
filename = argv[0];
}
}
- set_order(order);
+// set_order(order);
int cache_MB = 200; // increase cache size
randlm::RandLM* rlm = randlm::RandLM::initRandLM(filename, order, cache_MB);
assert(rlm != NULL);
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 0ba2bf92..087bff0c 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -5,6 +5,7 @@
#include <string>
#include <cmath>
+#include "alignment_pharaoh.h"
#include "stringlib.h"
#include "sentence_metadata.h"
#include "hg.h"
@@ -354,7 +355,7 @@ AlignerResults::AlignerResults(const std::string& param) :
getline(in, line);
if (!in) break;
++lc;
- is_aligned_.push_back(AlignerTools::ReadPharaohAlignmentGrid(line));
+ is_aligned_.push_back(AlignmentPharaoh::ReadPharaohAlignmentGrid(line));
}
cerr << " Loaded " << lc << " refs\n";
}
diff --git a/decoder/hg.h b/decoder/hg.h
index d5c8e197..e9510997 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -102,6 +102,8 @@ public:
void copy_info(Edge const& o) {
#if USE_INFO_EDGE
set_info(o.info_.str()); // by convention, each person putting info here starts with a separator (e.g. space). it's empty if nobody put any info there.
+#else
+ (void) o;
#endif
}
void copy_pod(Edge const& o) {
@@ -142,7 +144,7 @@ public:
#else
std::string info() const { return std::string(); }
void reset_info() { }
- void set_info(std::string const& s) { }
+ void set_info(std::string const& ) { }
#endif
void show(std::ostream &o,unsigned mask=SPAN|RULE) const {
o<<'{';
diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc
index 52a8565a..1af8261e 100644
--- a/decoder/hg_io.cc
+++ b/decoder/hg_io.cc
@@ -622,56 +622,3 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) {
}
}
-namespace B64 {
-
-static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
-static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq";
-
-static void encodeblock(const unsigned char* in, ostream* os, int len) {
- char out[4];
- out[0] = cb64[ in[0] >> 2 ];
- out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ];
- out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '=');
- out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '=');
- os->write(out, 4);
-}
-
-void b64encode(const char* data, const size_t size, ostream* out) {
- size_t cur = 0;
- while(cur < size) {
- int len = min(static_cast<size_t>(3), size - cur);
- encodeblock(reinterpret_cast<const unsigned char*>(&data[cur]), out, len);
- cur += len;
- }
-}
-
-static void decodeblock(const unsigned char* in, unsigned char* out) {
- out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4);
- out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2);
- out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]);
-}
-
-bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) {
- size_t cur = 0;
- size_t ocur = 0;
- unsigned char in[4];
- while(cur < insize) {
- assert(ocur < outsize);
- for (int i = 0; i < 4; ++i) {
- unsigned char v = data[cur];
- v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]);
- if (!v) {
- cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl;
- return false;
- }
- v = (unsigned char) ((v == '$') ? '\0' : v - 61);
- if (v) in[i] = v - 1; else in[i] = 0;
- ++cur;
- }
- decodeblock(in, reinterpret_cast<unsigned char*>(&out[ocur]));
- ocur += 3;
- }
- return true;
-}
-}
-
diff --git a/decoder/hg_io.h b/decoder/hg_io.h
index b6a176ab..082489d8 100644
--- a/decoder/hg_io.h
+++ b/decoder/hg_io.h
@@ -31,9 +31,4 @@ struct HypergraphIO {
static std::string Escape(const std::string& s); // PLF helper
};
-namespace B64 {
- bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize);
- void b64encode(const char* data, const size_t size, std::ostream* out);
-}
-
#endif
diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h
index 81a584a7..145c84d1 100755
--- a/decoder/oracle_bleu.h
+++ b/decoder/oracle_bleu.h
@@ -9,7 +9,7 @@
#include <vector>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
-#include "../vest/scorer.h"
+#include "scorer.h"
#include "hg.h"
#include "ff_factory.h"
#include "ff_bleu.h"
diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc
index 726b3f9a..d65e44d1 100644
--- a/decoder/phrasebased_translator.cc
+++ b/decoder/phrasebased_translator.cc
@@ -68,7 +68,6 @@ struct PhraseBasedTranslatorImpl {
PhraseBasedTranslatorImpl(const boost::program_options::variables_map& conf) :
add_pass_through_rules(conf.count("add_pass_through_rules")),
max_distortion(conf["pb_max_distortion"].as<int>()),
- kSOURCE_RULE(new TRule("[X] ||| [X,1] ||| [X,1]", true)),
kCONCAT_RULE(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]", true)),
kNT_TYPE(TD::Convert("X") * -1) {
assert(max_distortion >= 0);
@@ -141,6 +140,8 @@ struct PhraseBasedTranslatorImpl {
for (int i = 0; i < phrases.size(); ++i) {
Hypergraph::Edge* edge = minus_lm_forest->AddEdge(phrases[i], Hypergraph::TailNodeVector());
edge->feature_values_ = edge->rule_->scores_;
+ edge->i_ = s.i;
+ edge->j_ = s.j;
minus_lm_forest->ConnectEdgeToHeadNode(edge->id_, phrase_head_index);
}
CoverageNodeMap::iterator cit = c.find(s.coverage);
@@ -189,7 +190,6 @@ struct PhraseBasedTranslatorImpl {
const bool add_pass_through_rules;
const int max_distortion;
- TRulePtr kSOURCE_RULE;
const TRulePtr kCONCAT_RULE;
const WordID kNT_TYPE;
boost::shared_ptr<FSTNode> fst;
diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h
index 21be9b21..593019c8 100644
--- a/decoder/sentence_metadata.h
+++ b/decoder/sentence_metadata.h
@@ -3,7 +3,7 @@
#include <cassert>
#include "lattice.h"
-#include "../vest/scorer.h"
+#include "scorer.h"
struct SentenceMetadata {
SentenceMetadata(int id, const Lattice& ref) :
diff --git a/extools/Makefile.am b/extools/Makefile.am
index 1e82287d..ee363264 100644
--- a/extools/Makefile.am
+++ b/extools/Makefile.am
@@ -11,20 +11,20 @@ sg_lexer.cc: sg_lexer.l
$(LEX) -s -CF -8 -o$@ $<
filter_grammar_SOURCES = filter_grammar.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc
-filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+filter_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz
#filter_grammar_LDFLAGS = -all-static
featurize_grammar_SOURCES = featurize_grammar.cc extract.cc sentence_pair.cc sg_lexer.cc striped_grammar.cc
-featurize_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+featurize_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz
mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc
-mr_stripe_rule_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_stripe_rule_reduce_LDADD = $(top_srcdir)/utils/libutils.a -lz
extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc striped_grammar.cc
-extractor_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+extractor_LDADD = $(top_srcdir)/utils/libutils.a -lz
extractor_monolingual_SOURCES = extractor_monolingual.cc
-extractor_monolingual_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+extractor_monolingual_LDADD = $(top_srcdir)/utils/libutils.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils
diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l
index 168b835a..d60bd0fc 100644
--- a/extools/sg_lexer.l
+++ b/extools/sg_lexer.l
@@ -1,6 +1,4 @@
%{
-#include "rule_lexer.h"
-
#include <string>
#include <iostream>
#include <sstream>
@@ -8,7 +6,6 @@
#include <cassert>
#include "tdict.h"
#include "fdict.h"
-#include "trule.h"
#include "striped_grammar.h"
int lex_line = 0;
diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am
index 688746bb..2b1393ac 100644
--- a/gi/clda/src/Makefile.am
+++ b/gi/clda/src/Makefile.am
@@ -2,5 +2,5 @@ bin_PROGRAMS = clda
clda_SOURCES = clda.cc
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/decoder
-AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils
+AM_LDFLAGS = $(top_srcdir)/utils/libutils.a -lz
diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am
index c22819db..d3f95d0b 100644
--- a/gi/pyp-topics/src/Makefile.am
+++ b/gi/pyp-topics/src/Makefile.am
@@ -4,13 +4,13 @@ contexts_lexer.cc: contexts_lexer.l
$(LEX) -s -CF -8 -o$@ $<
pyp_topics_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc
-pyp_topics_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+pyp_topics_train_LDADD = $(top_srcdir)/utils/libutils.a -lz
pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc pyp-topics.cc contexts_lexer.cc contexts_corpus.cc train-contexts.cc
-pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+pyp_contexts_train_LDADD = $(top_srcdir)/utils/libutils.a -lz
#mpi_pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc mpi-pyp-topics.cc contexts_lexer.cc contexts_corpus.cc mpi-train-contexts.cc
-#mpi_pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+#mpi_pyp_contexts_train_LDADD = $(top_srcdir)/utils/libutils.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I../../../utils
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index dd721361..b2d235cb 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -10,7 +10,7 @@
#include "corpus.hh"
#include "contexts_lexer.h"
-#include "../../../decoder/dict.h"
+#include "dict.h"
class BackoffGenerator {
diff --git a/gi/pyp-topics/src/contexts_lexer.h b/gi/pyp-topics/src/contexts_lexer.h
index 1b79c6fd..66004990 100644
--- a/gi/pyp-topics/src/contexts_lexer.h
+++ b/gi/pyp-topics/src/contexts_lexer.h
@@ -5,7 +5,7 @@
#include <vector>
#include <string>
-#include "../../../decoder/dict.h"
+#include "dict.h"
struct ContextsLexer {
typedef std::vector<std::string> Context;
diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l
index 7a5d9460..64cd7ca3 100644
--- a/gi/pyp-topics/src/contexts_lexer.l
+++ b/gi/pyp-topics/src/contexts_lexer.l
@@ -101,7 +101,7 @@ INT [\-+]?[0-9]+|inf|[\-+]inf
%%
-#include "../../../decoder/filelib.h"
+#include "filelib.h"
void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) {
lex_line = 1;
diff --git a/mteval/Makefile.am b/mteval/Makefile.am
new file mode 100644
index 00000000..7ae14045
--- /dev/null
+++ b/mteval/Makefile.am
@@ -0,0 +1,23 @@
+bin_PROGRAMS = \
+ fast_score \
+ mbr_kbest
+
+if HAVE_GTEST
+noinst_PROGRAMS = \
+ scorer_test
+endif
+
+noinst_LIBRARIES = libmteval.a
+
+libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc
+
+fast_score_SOURCES = fast_score.cc
+fast_score_LDADD = $(top_srcdir)/utils/libutils.a libmteval.a -lz
+
+mbr_kbest_SOURCES = mbr_kbest.cc
+mbr_kbest_LDADD = $(top_srcdir)/utils/libutils.a libmteval.a -lz
+
+scorer_test_SOURCES = scorer_test.cc
+scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/utils/libutils.a libmteval.a -lz
+
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils
diff --git a/vest/aer_scorer.cc b/mteval/aer_scorer.cc
index 25b58b5e..edd4390f 100644
--- a/vest/aer_scorer.cc
+++ b/mteval/aer_scorer.cc
@@ -5,7 +5,7 @@
#include <sstream>
#include "tdict.h"
-#include "aligner.h"
+#include "alignment_pharaoh.h"
using namespace std;
@@ -85,7 +85,7 @@ AERScorer::AERScorer(const vector<vector<WordID> >& refs, const string& src) : s
cerr << "AERScorer can only take a single reference!\n";
abort();
}
- ref_ = AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(refs.front()));
+ ref_ = AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(refs.front()));
}
static inline bool Safe(const Array2D<bool>& a, int i, int j) {
@@ -101,7 +101,7 @@ ScoreP AERScorer::ScoreCCandidate(const vector<WordID>& shyp) const {
ScoreP AERScorer::ScoreCandidate(const vector<WordID>& shyp) const {
boost::shared_ptr<Array2D<bool> > hyp =
- AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp));
+ AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(shyp));
int m = 0;
int r = 0;
diff --git a/vest/aer_scorer.h b/mteval/aer_scorer.h
index 6d53d359..6d53d359 100644
--- a/vest/aer_scorer.h
+++ b/mteval/aer_scorer.h
diff --git a/vest/comb_scorer.cc b/mteval/comb_scorer.cc
index 9fc37868..9fc37868 100644
--- a/vest/comb_scorer.cc
+++ b/mteval/comb_scorer.cc
diff --git a/vest/comb_scorer.h b/mteval/comb_scorer.h
index 346be576..346be576 100644
--- a/vest/comb_scorer.h
+++ b/mteval/comb_scorer.h
diff --git a/vest/fast_score.cc b/mteval/fast_score.cc
index 5ee264a6..5ee264a6 100644
--- a/vest/fast_score.cc
+++ b/mteval/fast_score.cc
diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc
new file mode 100644
index 00000000..2867b36b
--- /dev/null
+++ b/mteval/mbr_kbest.cc
@@ -0,0 +1,138 @@
+#include <iostream>
+#include <vector>
+
+#include <boost/program_options.hpp>
+
+#include "prob.h"
+#include "tdict.h"
+#include "scorer.h"
+#include "filelib.h"
+#include "stringlib.h"
+
+using namespace std;
+
+namespace po = boost::program_options;
+
+// Parses argv into *conf using the option table below. If the help option is
+// present, the option summary is written to stderr and the process exits
+// with status 1.
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)")
+        ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function")
+        ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from")
+        ("output_list,L", "Show reranked list as output")
+        ("help,h", "Help");
+  po::options_description cmdline;
+  cmdline.add(opts);
+  po::store(parse_command_line(argc, argv, cmdline), *conf);
+
+  const bool help_requested = conf->count("help") > 0;
+  if (!help_requested) return;
+  cerr << cmdline << endl;
+  exit(1);
+}
+
+// Orders (hypothesis, value) pairs by ascending value; the hypothesis itself
+// does not take part in the comparison.
+struct LossComparer {
+  bool operator()(const pair<vector<WordID>, double>& lhs,
+                  const pair<vector<WordID>, double>& rhs) const {
+    return rhs.second > lhs.second;
+  }
+};
+
+// Reads the next run of consecutive lines sharing the same sentence id from
+// *in into *list and stores that id in *sent_id.
+//
+// Each non-empty line must contain at least two " ||| " separators: the text
+// before the first is the id, the text between the first and second is the
+// hypothesis, and the text after the LAST separator is parsed with strtod and
+// stored via prob_t::logeq (presumably a log-domain model score -- confirm
+// against prob.h). A malformed line aborts the program.
+//
+// The first line of the *next* list has to be consumed to detect the id
+// change, so it is kept in function-local statics and handed out by the
+// following call. Because of that state the function is not reentrant and
+// can only serve one input stream at a time.
+//
+// Returns false once no further entries are available.
+bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) {
+  static string cache_id;
+  static pair<vector<WordID>, prob_t> cache_pair;
+  // Explicit flag instead of testing cache_pair.first.size(): a look-ahead
+  // entry with an empty hypothesis must not be dropped.
+  static bool cache_pending = false;
+  list->clear();
+  string cur_id;
+  if (cache_pending) {
+    list->push_back(cache_pair);
+    cur_id = cache_id;
+    // Report the id right away: if this entry turns out to be the whole list
+    // (no further lines), the loop below never assigns *sent_id.
+    *sent_id = cur_id;
+    cache_pair.first.clear();
+    cache_pending = false;
+  }
+  string line;
+  string tstr;
+  while(*in) {
+    getline(*in, line);
+    if (line.empty()) continue;
+    size_t p1 = line.find(" ||| ");
+    if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
+    size_t p2 = line.find(" ||| ", p1 + 4);
+    if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
+    size_t p3 = line.rfind(" ||| ");
+    cache_id = line.substr(0, p1);
+    tstr = line.substr(p1 + 5, p2 - p1 - 5);
+    double val = strtod(line.substr(p3 + 5).c_str(), NULL);
+    TD::ConvertSentence(tstr, &cache_pair.first);
+    cache_pair.second.logeq(val);
+    if (cur_id.empty()) cur_id = cache_id;
+    if (cur_id == cache_id) {
+      list->push_back(cache_pair);
+      *sent_id = cur_id;
+      cache_pair.first.clear();
+    } else {
+      // First line of the next list: keep it for the following call.
+      cache_pending = true;
+      break;
+    }
+  }
+  return !list->empty();
+}
+
+// Minimum-Bayes-risk reranking of k-best lists.
+//
+// For every list read from the input, each hypothesis i is treated in turn as
+// the reference and the posterior-weighted loss of all other hypotheses
+// against it is accumulated. The hypothesis with the smallest accumulated
+// loss is printed, or, with --output_list, the whole list is printed sorted
+// by that loss.
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  const string metric = conf["loss_function"].as<string>();
+  const bool output_list = conf.count("output_list") > 0;
+  const string file = conf["input"].as<string>();
+  const double mbr_scale = conf["scale"].as<double>();
+  cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl;
+
+  ScoreType type = ScoreTypeFromString(metric);
+  vector<pair<vector<WordID>, prob_t> > list;
+  ReadFile rf(file);
+  string sent_id;
+  while(ReadKBestList(rf.stream(), &sent_id, &list)) {
+    // Scaled scores, normalised by the scaled score of the first entry.
+    vector<prob_t> joints(list.size());
+    const prob_t max_score = pow(list.front().second, mbr_scale);
+    prob_t marginal = prob_t::Zero();
+    for (int i = 0 ; i < list.size(); ++i) {
+      const prob_t joint = pow(list[i].second, mbr_scale) / max_score;
+      joints[i] = joint;
+      // cerr << "list[" << i << "] joint=" << log(joint) << endl;
+      marginal += joint;
+    }
+    int mbr_idx = -1;
+    vector<double> mbr_scores(output_list ? list.size() : 0);
+    double mbr_loss = numeric_limits<double>::max();
+    for (int i = 0 ; i < list.size(); ++i) {
+      vector<vector<WordID> > refs(1, list[i].first);
+      //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl;
+      ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs);
+      double wl_acc = 0;
+      for (int j = 0; j < list.size(); ++j) {
+        if (i != j) {
+          ScoreP s = scorer->ScoreCandidate(list[j].first);
+          double loss = 1.0 - s->ComputeScore();
+          // For TER and AER the raw score itself is used as the loss.
+          if (type == TER || type == AER) loss = 1.0 - loss;
+          double weighted_loss = loss * (joints[j] / marginal);
+          wl_acc += weighted_loss;
+          // Already worse than the best so far; exact totals are only needed
+          // when the full reranked list is requested.
+          if ((!output_list) && wl_acc > mbr_loss) break;
+        }
+      }
+      if (output_list) mbr_scores[i] = wl_acc;
+      if (wl_acc < mbr_loss) {
+        mbr_loss = wl_acc;
+        mbr_idx = i;
+      }
+    }
+    // cerr << "ML translation: " << TD::GetString(list[0].first) << endl;
+    cerr << "MBR Best idx: " << mbr_idx << endl;
+    if (output_list) {
+      for (int i = 0; i < list.size(); ++i)
+        list[i].second.logeq(mbr_scores[i]);
+      sort(list.begin(), list.end(), LossComparer());
+      for (int i = 0; i < list.size(); ++i)
+        cout << sent_id << " ||| "
+             << TD::GetString(list[i].first) << " ||| "
+             << log(list[i].second) << endl;
+    } else {
+      if (mbr_idx < 0) {
+        // No expected loss compared below the initial bound (e.g. NaN from a
+        // degenerate posterior). Indexing with -1 would be out of bounds, so
+        // fall back to the first entry of the list.
+        cerr << "WARNING: no valid MBR candidate for sentence " << sent_id
+             << "; using first entry" << endl;
+        mbr_idx = 0;
+      }
+      cout << TD::GetString(list[mbr_idx].first) << endl;
+    }
+  }
+  return 0;
+}
+
+
diff --git a/vest/scorer.cc b/mteval/scorer.cc
index 70fdef34..04eeaa93 100644
--- a/vest/scorer.cc
+++ b/mteval/scorer.cc
@@ -12,22 +12,15 @@
#include <boost/shared_ptr.hpp>
#include "filelib.h"
-#include "aligner.h"
-#include "viterbi_envelope.h"
-#include "error_surface.h"
#include "ter.h"
#include "aer_scorer.h"
#include "comb_scorer.h"
#include "tdict.h"
#include "stringlib.h"
-#include "lattice.h"
-
using boost::shared_ptr;
using namespace std;
-const bool minimize_segments = true; // if adjacent segments have equal scores, merge them
-
void Score::TimesEquals(float scale) {
cerr<<"UNIMPLEMENTED except for BLEU (for MIRA): Score::TimesEquals"<<endl;abort();
}
@@ -410,77 +403,6 @@ ScoreP SentenceScorer::CreateScoreFromString(const ScoreType type, const string&
}
}
-void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) const {
- vector<WordID> prev_trans;
- const vector<shared_ptr<Segment> >& ienv = ve.GetSortedSegs();
- env->resize(ienv.size());
- ScoreP prev_score;
- int j = 0;
- for (int i = 0; i < ienv.size(); ++i) {
- const Segment& seg = *ienv[i];
- vector<WordID> trans;
- if (type == AER) {
- vector<bool> edges(hg.edges_.size(), false);
- seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi
- // alignment
- ostringstream os;
- const string* psrc = this->GetSource();
- if (psrc == NULL) {
- cerr << "AER scoring in VEST requires source, but it is missing!\n";
- abort();
- }
- size_t pos = psrc->rfind(" ||| ");
- if (pos == string::npos) {
- cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl;
- abort();
- }
- Lattice src;
- Lattice ref;
- LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src);
- LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref);
- AlignerTools::WriteAlignment(src, ref, hg, &os, true, &edges);
- string tstr = os.str();
- TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans);
- } else {
- seg.ConstructTranslation(&trans);
- }
- // cerr << "Scoring: " << TD::GetString(trans) << endl;
- if (trans == prev_trans) {
- if (!minimize_segments) {
- assert(prev_score); // if this fails, it means
- // the decoder can generate null translations
- ErrorSegment& out = (*env)[j];
- out.delta = prev_score->GetZero();
- out.x = seg.x;
- ++j;
- }
- // cerr << "Identical translation, skipping scoring\n";
- } else {
- ScoreP score = ScoreCandidate(trans);
- // cerr << "score= " << score->ComputeScore() << "\n";
- ScoreP cur_delta_p = score->GetZero();
- Score* cur_delta = cur_delta_p.get();
- // just record the score diffs
- if (!prev_score)
- prev_score = score->GetZero();
-
- score->Subtract(*prev_score, cur_delta);
- prev_trans.swap(trans);
- prev_score = score;
- if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) {
- ErrorSegment& out = (*env)[j];
- out.delta = cur_delta_p;
- out.x = seg.x;
- ++j;
- }
- }
- }
- // cerr << " In segments: " << ienv.size() << endl;
- // cerr << "Out segments: " << j << endl;
- assert(j > 0);
- env->resize(j);
-}
-
void BLEUScore::ScoreDetails(string* details) const {
char buf[2000];
vector<float> precs(max(N(),4));
diff --git a/vest/scorer.h b/mteval/scorer.h
index 0c8b380f..f18c8c7f 100644
--- a/vest/scorer.h
+++ b/mteval/scorer.h
@@ -49,7 +49,7 @@ class Score : public boost::intrusive_refcount<Score> {
virtual ScoreP Clone() const = 0;
protected:
Score() { } // we define these explicitly because refcount is noncopyable
- Score(Score const& o) { }
+ Score(Score const&) { }
};
//TODO: make sure default copy ctors for score types do what we want.
@@ -72,7 +72,6 @@ class SentenceScorer {
virtual ~SentenceScorer();
virtual ScoreP GetOne() const;
virtual ScoreP GetZero() const;
- void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const;
virtual ScoreP ScoreCandidate(const Sentence& hyp) const = 0;
virtual ScoreP ScoreCCandidate(const Sentence& hyp) const =0;
virtual const std::string* GetSource() const;
diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc
new file mode 100644
index 00000000..a07a8c4b
--- /dev/null
+++ b/mteval/scorer_test.cc
@@ -0,0 +1,182 @@
+#include <iostream>
+#include <fstream>
+#include <valarray>
+#include <gtest/gtest.h>
+
+#include "tdict.h"
+#include "scorer.h"
+#include "aer_scorer.h"
+
+using namespace std;
+
+// Test fixture: SetUp() fills refs0/refs1 with four reference translations
+// each and hyp1/hyp2 with one hypothesis each, all converted to WordID
+// sequences by TD::ConvertSentence.
+class ScorerTest : public testing::Test {
+ protected:
+  virtual void SetUp() {
+    // Percent signs are written as a plain '%': "\%" is not a standard escape
+    // sequence and draws an "unknown escape sequence" warning.
+    refs0.resize(4);
+    refs1.resize(4);
+    TD::ConvertSentence("export of high-tech products in guangdong in first two months this year reached 3.76 billion us dollars", &refs0[0]);
+    TD::ConvertSentence("guangdong's export of new high technology products amounts to us $ 3.76 billion in first two months of this year", &refs0[1]);
+    TD::ConvertSentence("guangdong exports us $ 3.76 billion worth of high technology products in the first two months of this year", &refs0[2]);
+    TD::ConvertSentence("in the first 2 months this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars .", &refs0[3]);
+    TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter chen ji ) the latest statistics show that from january through february this year , the export of high-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 % over the same period last year and accounted for 25.5 % of the total export in the province .", &refs1[0]);
+    TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter : chen ji ) -- latest statistic indicates that guangdong's export of new high technology products amounts to us $ 3.76 billion , up 34.8 % over corresponding period and accounts for 25.5 % of the total exports of the province .", &refs1[1]);
+    TD::ConvertSentence("xinhua news agency report of march 16 from guangzhou ( by staff reporter chen ji ) - latest statistics indicate guangdong province exported us $ 3.76 billion worth of high technology products , up 34.8 percent from the same period last year , which account for 25.5 percent of the total exports of the province .", &refs1[2]);
+    TD::ConvertSentence("guangdong , march 16 , ( xinhua ) -- ( chen ji reports ) as the newest statistics shows , in january and feberuary this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 % than last year , making up 25.5 % of the province's total .", &refs1[3]);
+    TD::ConvertSentence("one guangdong province will next export us $ 3.76 high-tech product two months first this year 3.76 billion us dollars", &hyp1);
+    TD::ConvertSentence("xinhua news agency , guangzhou , 16th of march ( reporter chen ) -- latest statistics suggest that guangdong exports new advanced technology product totals $ 3.76 million , 34.8 percent last corresponding period and accounts for 25.5 percent of the total export province .", &hyp2);
+  }
+
+  virtual void TearDown() { }
+
+  vector<vector<WordID> > refs0;  // four references, scored against hyp1 in the tests
+  vector<vector<WordID> > refs1;  // four references, scored against hyp2 in the tests
+  vector<WordID> hyp1;
+  vector<WordID> hyp2;
+};
+
+// Smoke test: a DocScorer can be constructed from the four reference files
+// under test_data/.
+TEST_F(ScorerTest, TestCreateFromFiles) {
+  static const char* const kRefFiles[] = {
+    "test_data/re.txt.0",
+    "test_data/re.txt.1",
+    "test_data/re.txt.2",
+    "test_data/re.txt.3"
+  };
+  vector<string> files(kRefFiles, kRefFiles + sizeof(kRefFiles) / sizeof(kRefFiles[0]));
+  DocScorer ds(IBM_BLEU, files);
+}
+
+TEST_F(ScorerTest, TestBLEUScorer) {  // IBM_BLEU: sentence scores, their sum, details text, Encode/CreateScoreFromString round trip
+ ScorerP s1 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs0);
+ ScorerP s2 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs1);
+ ScoreP b1 = s1->ScoreCandidate(hyp1);
+ EXPECT_FLOAT_EQ(0.23185077, b1->ComputeScore());
+ ScoreP b2 = s2->ScoreCandidate(hyp2);
+ EXPECT_FLOAT_EQ(0.38101241, b2->ComputeScore());
+ b1->PlusEquals(*b2);  // accumulate b2 into b1; expectations below are for the combined score
+ EXPECT_FLOAT_EQ(0.348854, b1->ComputeScore());
+ EXPECT_FALSE(b1->IsAdditiveIdentity());
+ string details;
+ b1->ScoreDetails(&details);
+ EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details);
+ cerr << details << endl;
+ string enc;
+ b1->Encode(&enc);
+ ScoreP b3 = SentenceScorer::CreateScoreFromString(IBM_BLEU, enc);  // rebuild a score from the encoded string
+ details.clear();
+ cerr << "Encoded BLEU score size: " << enc.size() << endl;
+ b3->ScoreDetails(&details);
+ cerr << details << endl;
+ EXPECT_FALSE(b3->IsAdditiveIdentity());
+ EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details);  // same text as before encoding
+ ScoreP bz = b3->GetZero();
+ EXPECT_TRUE(bz->IsAdditiveIdentity());
+}
+
+TEST_F(ScorerTest, TestTERScorer) {  // TER: combined details text and Encode/CreateScoreFromString round trip
+ ScorerP s1 = SentenceScorer::CreateSentenceScorer(TER, refs0);
+ ScorerP s2 = SentenceScorer::CreateSentenceScorer(TER, refs1);
+ string details;
+ ScoreP t1 = s1->ScoreCandidate(hyp1);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ cerr << t1->ComputeScore() << endl;
+ ScoreP t2 = s2->ScoreCandidate(hyp2);
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ cerr << t2->ComputeScore() << endl;
+ t1->PlusEquals(*t2);  // accumulate t2 into t1; the expectation below is for the combined score
+ cerr << t1->ComputeScore() << endl;
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details);
+ string enc;
+ t1->Encode(&enc);
+ ScoreP t3 = SentenceScorer::CreateScoreFromString(TER, enc);  // rebuild a score from the encoded string
+ details.clear();
+ t3->ScoreDetails(&details);
+ EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details);  // same text as before encoding
+ EXPECT_FALSE(t3->IsAdditiveIdentity());
+ ScoreP tz = t3->GetZero();
+ EXPECT_TRUE(tz->IsAdditiveIdentity());
+}
+
+TEST_F(ScorerTest, TestTERScorerSimple) {  // prints TER details for a reordered hypothesis; makes no assertions
+ vector<vector<WordID> > ref(1);
+ TD::ConvertSentence("1 2 3 A B", &ref[0]);
+ vector<WordID> hyp;
+ TD::ConvertSentence("A B 1 2 3", &hyp);  // same words as the reference, with the two halves swapped
+ ScorerP s1 = SentenceScorer::CreateSentenceScorer(TER, ref);
+ string details;
+ ScoreP t1 = s1->ScoreCandidate(hyp);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+}
+
+TEST_F(ScorerTest, TestSERScorerSimple) {  // prints SER details for a mismatching and a matching hypothesis; makes no assertions
+ vector<vector<WordID> > ref(1);
+ TD::ConvertSentence("A B C D", &ref[0]);
+ vector<WordID> hyp1;  // local; hides the fixture member of the same name
+ TD::ConvertSentence("A B C", &hyp1);
+ vector<WordID> hyp2;  // local; hides the fixture member of the same name
+ TD::ConvertSentence("A B C D", &hyp2);
+ ScorerP s1 = SentenceScorer::CreateSentenceScorer(SER, ref);
+ string details;
+ ScoreP t1 = s1->ScoreCandidate(hyp1);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ ScoreP t2 = s1->ScoreCandidate(hyp2);
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ t2->PlusEquals(*t1);  // accumulate t1 into t2 and print the combined details
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+}
+
+TEST_F(ScorerTest, TestCombiScorer) {  // BLEU_minus_TER_over_2: encode/decode round trip and adding a score to its zero
+ ScorerP s1 = SentenceScorer::CreateSentenceScorer(BLEU_minus_TER_over_2, refs0);
+ string details;
+ ScoreP t1 = s1->ScoreCandidate(hyp1);
+ t1->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ cerr << t1->ComputeScore() << endl;
+ string enc;
+ t1->Encode(&enc);
+ ScoreP t2 = SentenceScorer::CreateScoreFromString(BLEU_minus_TER_over_2, enc);  // rebuild a score from the encoded string
+ details.clear();
+ t2->ScoreDetails(&details);
+ cerr << "DETAILS: " << details << endl;
+ ScoreP cz = t2->GetZero();
+ EXPECT_FALSE(t2->IsAdditiveIdentity());
+ EXPECT_TRUE(cz->IsAdditiveIdentity());
+ cz->PlusEquals(*t2);  // zero + t2 is expected to report the same details as t2
+ EXPECT_FALSE(cz->IsAdditiveIdentity());
+ string d2;
+ cz->ScoreDetails(&d2);
+ EXPECT_EQ(d2, details);
+}
+
+// Scores a two-link hypothesis alignment against a four-link reference with
+// AERScorer, then checks that Encode followed by
+// CreateScoreFromString(AER, ...) yields a score with identical ScoreDetails.
+TEST_F(ScorerTest, AERTest) {
+  // Named "ref" so it does not hide the fixture member refs0.
+  vector<vector<WordID> > ref(1);
+  TD::ConvertSentence("0-0 2-1 1-2 3-3", &ref[0]);
+
+  vector<WordID> hyp;
+  TD::ConvertSentence("0-0 1-1", &hyp);
+  string details;
+  string enc;
+  ScoreP x;  // declared outside the scope below so it outlives the scorer, as before
+  {
+    // Automatic storage instead of new/delete: the scorer is released at the
+    // end of this scope even if one of the calls throws.
+    AERScorer as(ref);
+    x = as.ScoreCandidate(hyp);
+    x->ScoreDetails(&details);
+    cerr << details << endl;
+    x->Encode(&enc);
+  }
+  cerr << "ENC size: " << enc.size() << endl;
+  ScoreP y = SentenceScorer::CreateScoreFromString(AER, enc);
+  string d2;
+  y->ScoreDetails(&d2);
+  cerr << d2 << endl;
+  EXPECT_EQ(d2, details);
+}
+
+// Test driver: hands the command line to Google Test and returns the result
+// of running every registered test.
+int main(int argc, char **argv) {
+  testing::InitGoogleTest(&argc, argv);
+  const int status = RUN_ALL_TESTS();
+  return status;
+}
+
diff --git a/vest/ter.cc b/mteval/ter.cc
index cacc5b00..cacc5b00 100644
--- a/vest/ter.cc
+++ b/mteval/ter.cc
diff --git a/vest/ter.h b/mteval/ter.h
index 43314791..43314791 100644
--- a/vest/ter.h
+++ b/mteval/ter.h
diff --git a/mteval/test_data/re.txt.0 b/mteval/test_data/re.txt.0
new file mode 100644
index 00000000..86eff087
--- /dev/null
+++ b/mteval/test_data/re.txt.0
@@ -0,0 +1,5 @@
+erdogan states turkey to reject any pressures to urge it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened .
+erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus .
+we will discuss this dossier in the course of membership negotiations . "
+he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . "
diff --git a/mteval/test_data/re.txt.1 b/mteval/test_data/re.txt.1
new file mode 100644
index 00000000..2140f198
--- /dev/null
+++ b/mteval/test_data/re.txt.1
@@ -0,0 +1,5 @@
+erdogan confirms turkey will resist any pressure to recognize cyprus
+ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara .
+erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus .
+we shall discuss this issue in the course of the membership negotiations . "
+he added : " let me be clear - i cannot confine turkey . this is something we do not accept . "
diff --git a/mteval/test_data/re.txt.2 b/mteval/test_data/re.txt.2
new file mode 100644
index 00000000..94e46286
--- /dev/null
+++ b/mteval/test_data/re.txt.2
@@ -0,0 +1,5 @@
+erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus
+ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara .
+erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus .
+we shall discuss this dossier during the negotiations on joining . "
+and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . "
diff --git a/mteval/test_data/re.txt.3 b/mteval/test_data/re.txt.3
new file mode 100644
index 00000000..f87c3308
--- /dev/null
+++ b/mteval/test_data/re.txt.3
@@ -0,0 +1,5 @@
+erdogan stresses that turkey will reject all pressures to force it to recognize cyprus
+ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not .
+erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus .
+we will discuss this file during the negotiations on joining . "
+he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . "
diff --git a/training/Makefile.am b/training/Makefile.am
index 490de774..48b19932 100644
--- a/training/Makefile.am
+++ b/training/Makefile.am
@@ -14,37 +14,36 @@ noinst_PROGRAMS = \
optimize_test
atools_SOURCES = atools.cc
-atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
model1_SOURCES = model1.cc
-model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
grammar_convert_SOURCES = grammar_convert.cc
-grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
optimize_test_SOURCES = optimize_test.cc optimize.cc
-optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
collapse_weights_SOURCES = collapse_weights.cc
-collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
lbfgs_test_SOURCES = lbfgs_test.cc
-lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc
-mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc
-mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc
-mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
mr_em_adapted_reduce_SOURCES = mr_em_adapted_reduce.cc
-mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
plftools_SOURCES = plftools.cc
-plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval
diff --git a/training/atools.cc b/training/atools.cc
index af62804d..805e3c1d 100644
--- a/training/atools.cc
+++ b/training/atools.cc
@@ -9,6 +9,7 @@
#include "filelib.h"
#include "aligner.h"
+#include "alignment_pharaoh.h"
namespace po = boost::program_options;
using namespace std;
@@ -349,9 +350,9 @@ int main(int argc, char **argv) {
}
if (line1.empty() && !*in1) break;
shared_ptr<Array2D<bool> > out(new Array2D<bool>);
- shared_ptr<Array2D<bool> > a1 = AlignerTools::ReadPharaohAlignmentGrid(line1);
+ shared_ptr<Array2D<bool> > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1);
if (in2) {
- shared_ptr<Array2D<bool> > a2 = AlignerTools::ReadPharaohAlignmentGrid(line2);
+ shared_ptr<Array2D<bool> > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2);
cmd.Apply(*a1, *a2, out.get());
} else {
Array2D<bool> dummy;
@@ -359,7 +360,7 @@ int main(int argc, char **argv) {
}
if (cmd.Result() == 1) {
- AlignerTools::SerializePharaohFormat(*out, &cout);
+ AlignmentPharaoh::SerializePharaohFormat(*out, &cout);
}
}
if (cmd.Result() == 2)
diff --git a/utils/Makefile.am b/utils/Makefile.am
new file mode 100644
index 00000000..e513febd
--- /dev/null
+++ b/utils/Makefile.am
@@ -0,0 +1,38 @@
+if HAVE_GTEST
+noinst_PROGRAMS = \
+ dict_test \
+ weights_test \
+ logval_test \
+ small_vector_test
+endif
+
+noinst_LIBRARIES = libutils.a
+
+libutils_a_SOURCES = \
+ alignment_pharaoh.cc \
+ b64tools.cc \
+ dict.cc \
+ tdict.cc \
+ fdict.cc \
+ gzstream.cc \
+ filelib.cc \
+ stringlib.cc \
+ sparse_vector.cc \
+ timing_stats.cc \
+ weights.cc
+
+dict_test_SOURCES = dict_test.cc
+dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+weights_test_SOURCES = weights_test.cc
+weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+logval_test_SOURCES = logval_test.cc
+logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+small_vector_test_SOURCES = small_vector_test.cc
+small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS)
+
+AM_LDFLAGS = libutils.a -lz
+
+################################################################
+# do NOT NOT NOT add any other -I includes NO NO NO NO NO ######
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.
+################################################################
diff --git a/utils/alignment_pharaoh.cc b/utils/alignment_pharaoh.cc
new file mode 100644
index 00000000..890ff565
--- /dev/null
+++ b/utils/alignment_pharaoh.cc
@@ -0,0 +1,77 @@
+#include "utils/alignment_pharaoh.h"
+
+#include <set>
+
+using namespace std;
+
+// Returns true iff x is one of the characters '0' through '9'.
+static bool is_digit(char x) {
+  return !(x < '0' || x > '9');
+}
+
+// Consumes a (possibly empty) run of decimal digits of al starting at *i,
+// advancing *i past it. Returns the parsed value, or 0 if no digit was read.
+static int ReadDigits(const string& al, int* i) {
+  int value = 0;
+  for (; *i < al.size() && is_digit(al[*i]); ++*i)
+    value = value * 10 + (al[*i] - '0');
+  return value;
+}
+
+// Parses a line of space-separated "x-y" links into a boolean grid.
+//
+// If al contains " ||| ", only the text after its last occurrence is parsed.
+// Parsing stops at the first '\n' or '\r'. The grid is sized
+// (max x + 1) by (max y + 1) and cell (x, y) is set for every link read.
+// A link whose x is not followed by '-' prints "BAD ALIGNMENT" to cerr and
+// aborts.
+boost::shared_ptr<Array2D<bool> > AlignmentPharaoh::ReadPharaohAlignmentGrid(const string& al) {
+  const size_t sep = al.rfind(" ||| ");
+  const int start = (sep != string::npos) ? sep + 5 : 0;
+
+  // First pass: validate the links and find the largest x and y.
+  int max_x = 0;
+  int max_y = 0;
+  for (int i = start; i < al.size(); ) {
+    if (al[i] == '\n' || al[i] == '\r') break;
+    const int x = ReadDigits(al, &i);
+    if (max_x < x) max_x = x;
+    assert(i < al.size());
+    if (al[i] != '-') {
+      cerr << "BAD ALIGNMENT: " << al << endl;
+      abort();
+    }
+    ++i;
+    const int y = ReadDigits(al, &i);
+    if (max_y < y) max_y = y;
+    while (i < al.size() && al[i] == ' ') ++i;
+  }
+
+  // Second pass: re-read the same links and mark them in the grid.
+  boost::shared_ptr<Array2D<bool> > grid(new Array2D<bool>(max_x + 1, max_y + 1));
+  for (int i = start; i < al.size(); ) {
+    if (al[i] == '\n' || al[i] == '\r') break;
+    const int x = ReadDigits(al, &i);
+    assert(i < al.size());
+    assert(al[i] == '-');
+    ++i;
+    const int y = ReadDigits(al, &i);
+    (*grid)(x, y) = true;
+    while (i < al.size() && al[i] == ' ') ++i;
+  }
+  return grid;
+}
+
+// Writes every set cell (i, j) of alignment to *out as "i-j", separated by
+// single spaces, iterating i over width() and j over height(), and ends the
+// line with endl.
+void AlignmentPharaoh::SerializePharaohFormat(const Array2D<bool>& alignment, ostream* out) {
+  ostream& os = *out;
+  bool first = true;
+  for (int i = 0; i < alignment.width(); ++i) {
+    for (int j = 0; j < alignment.height(); ++j) {
+      if (!alignment(i,j)) continue;
+      if (first) first = false; else os << ' ';
+      os << i << '-' << j;
+    }
+  }
+  os << endl;
+}
+
diff --git a/utils/alignment_pharaoh.h b/utils/alignment_pharaoh.h
new file mode 100644
index 00000000..d111c8bf
--- /dev/null
+++ b/utils/alignment_pharaoh.h
@@ -0,0 +1,14 @@
+#ifndef _PHARAOH_ALIGNMENT_H_
+#define _PHARAOH_ALIGNMENT_H_
+
+#include <string>
+#include <iostream>
+#include <boost/shared_ptr.hpp>
+#include "array2d.h"
+
+struct AlignmentPharaoh {  // static helpers converting between "x-y" link text and Array2D<bool> grids
+ static boost::shared_ptr<Array2D<bool> > ReadPharaohAlignmentGrid(const std::string& al);  // parses "x-y" links from al into a new grid
+ static void SerializePharaohFormat(const Array2D<bool>& alignment, std::ostream* out);  // writes the set cells of alignment to *out as "i-j" links
+};
+
+#endif
diff --git a/decoder/array2d.h b/utils/array2d.h
index e63eda0d..e63eda0d 100644
--- a/decoder/array2d.h
+++ b/utils/array2d.h
diff --git a/utils/b64tools.cc b/utils/b64tools.cc
new file mode 100644
index 00000000..5512f975
--- /dev/null
+++ b/utils/b64tools.cc
@@ -0,0 +1,59 @@
+#include <iostream>
+#include <cassert>
+
+using namespace std;
+
+namespace B64 {
+
+// Encoding alphabet: maps a 6-bit value to its base64 character.
+static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+// Decoding table indexed by (character - 43), covering characters 43..122.
+// '$' marks a character that carries no data (this includes '='); any other
+// entry is the character's 6-bit value plus 62.
+static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq";
+
+// Writes one 4-character base64 group for the first len (1..3) bytes of in,
+// padding with '=' when len < 3.
+static void encodeblock(const unsigned char* in, std::ostream* os, int len) {
+  // Work on a zero-filled copy: bytes at or beyond len must not be read (they
+  // may lie past the end of the caller's buffer) and must contribute zero
+  // bits to the last data character.
+  unsigned char b[3] = { 0, 0, 0 };
+  for (int i = 0; i < len && i < 3; ++i) b[i] = in[i];
+  char out[4];
+  out[0] = cb64[ b[0] >> 2 ];
+  out[1] = cb64[ ((b[0] & 0x03) << 4) | ((b[1] & 0xf0) >> 4) ];
+  out[2] = (len > 1 ? cb64[ ((b[1] & 0x0f) << 2) | ((b[2] & 0xc0) >> 6) ] : '=');
+  out[3] = (len > 2 ? cb64[ b[2] & 0x3f ] : '=');
+  os->write(out, 4);
+}
+
+// Base64-encodes size bytes starting at data and writes the result to *out.
+// The output is padded with '=' to a multiple of four characters.
+void b64encode(const char* data, const size_t size, std::ostream* out) {
+  size_t cur = 0;
+  while(cur < size) {
+    const size_t remaining = size - cur;
+    const int len = remaining < 3 ? static_cast<int>(remaining) : 3;
+    encodeblock(reinterpret_cast<const unsigned char*>(&data[cur]), out, len);
+    cur += len;
+  }
+}
+
+// Packs four 6-bit values from in into three bytes in out.
+static void decodeblock(const unsigned char* in, unsigned char* out) {
+  out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4);
+  out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2);
+  out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]);
+}
+
+// Decodes insize base64 characters from data into out (capacity outsize).
+//
+// Input is consumed in groups of four characters; '=' decodes as zero bits.
+// Returns false, after printing a message to cerr, when a character outside
+// the decode table is met, when the input ends in the middle of a group, or
+// (in builds without assertions) when out is full while input remains.
+// Never writes past out[outsize - 1]: bytes of the last group that do not
+// fit are dropped, so a padded final group can be decoded into a buffer of
+// exactly the original data size.
+bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) {
+  size_t cur = 0;
+  size_t ocur = 0;
+  unsigned char in[4];
+  while(cur < insize) {
+    assert(ocur < outsize);
+    if (ocur >= outsize) {  // reached only when assertions are compiled out
+      std::cerr << "B64 decode error: output buffer too small" << std::endl;
+      return false;
+    }
+    for (int i = 0; i < 4; ++i) {
+      if (cur >= insize) {  // truncated group: do not read past the input
+        std::cerr << "B64 decode error: input truncated at offset " << cur << std::endl;
+        return false;
+      }
+      unsigned char v = data[cur];
+      v = (unsigned char) ((v < 43 || v > 122) ? '\0' : cd64[ v - 43 ]);
+      if (!v) {
+        std::cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << std::endl;
+        return false;
+      }
+      v = (unsigned char) ((v == '$') ? '\0' : v - 61);
+      if (v) in[i] = v - 1; else in[i] = 0;
+      ++cur;
+    }
+    // Decode into a scratch triple and copy only what fits in out.
+    unsigned char block[3];
+    decodeblock(in, block);
+    for (int k = 0; k < 3 && ocur < outsize; ++k)
+      out[ocur++] = static_cast<char>(block[k]);
+  }
+  return true;
+}
+
+}
+
+
diff --git a/utils/b64tools.h b/utils/b64tools.h
new file mode 100644
index 00000000..c821fc8f
--- /dev/null
+++ b/utils/b64tools.h
@@ -0,0 +1,9 @@
+#ifndef _B64_TOOLS_H_
+#define _B64_TOOLS_H_
+
+// Included so this header compiles on its own: it names std::size_t and
+// std::ostream.
+#include <cstddef>
+#include <iosfwd>
+
+// Base64 encoding and decoding helpers.
+namespace B64 {
+  // Decodes insize base64 characters from data into out and returns true on
+  // success. Returns false if a character outside the base64 decode table is
+  // met. Every four input characters produce three output bytes, so out
+  // should have room for three bytes per four input characters.
+  bool b64decode(const unsigned char* data, const std::size_t insize, char* out, const std::size_t outsize);
+  // Writes the base64 encoding of the size bytes at data to *out, padded with
+  // '=' to a multiple of four characters.
+  void b64encode(const char* data, const std::size_t size, std::ostream* out);
+}
+
+#endif
diff --git a/decoder/dict.cc b/utils/dict.cc
index 2d6986c8..2d6986c8 100644
--- a/decoder/dict.cc
+++ b/utils/dict.cc
diff --git a/decoder/dict.h b/utils/dict.h
index 348a97e3..348a97e3 100644
--- a/decoder/dict.h
+++ b/utils/dict.h
diff --git a/decoder/dict_test.cc b/utils/dict_test.cc
index 694877fa..2049ec27 100644
--- a/decoder/dict_test.cc
+++ b/utils/dict_test.cc
@@ -5,9 +5,6 @@
#include <iostream>
#include <gtest/gtest.h>
#include <cassert>
-#include "filelib.h"
-
-#include "tdict.h"
using namespace std;
diff --git a/decoder/fdict.cc b/utils/fdict.cc
index baa0b552..baa0b552 100644
--- a/decoder/fdict.cc
+++ b/utils/fdict.cc
diff --git a/decoder/fdict.h b/utils/fdict.h
index f9673023..f9673023 100644
--- a/decoder/fdict.h
+++ b/utils/fdict.h
diff --git a/utils/feature_accum.h b/utils/feature_accum.h
new file mode 100755
index 00000000..851b29db
--- /dev/null
+++ b/utils/feature_accum.h
@@ -0,0 +1,129 @@
+#ifndef FEATURE_ACCUM_H
+#define FEATURE_ACCUM_H
+
+#include "ff.h"
+#include "sparse_vector.h"
+#include "value_array.h"
+
+// Accumulator that is itself a FeatureVector: Add() sums into it and Store()
+// copies it out with set_from(). The FF / Features arguments exist only so
+// the overloads share a call shape with the other accumulators; they are
+// ignored here.
+struct SparseFeatureAccumulator : public FeatureVector {
+  typedef FeatureVector State;
+  SparseFeatureAccumulator() { }
+  // Returns the accumulated vector itself.
+  template <class FF>
+  FeatureVector const& describe(FF const& ) { return *this; }
+  // Copies the accumulated values into *fv.
+  void Store(FeatureVector *fv) const {
+    fv->set_from(*this);
+  }
+  template <class FF>
+  void Store(FF const& /* ff */,FeatureVector *fv) const {
+    fv->set_from(*this);
+  }
+  // Adds every entry of fv to this accumulator.
+  template <class FF>
+  void Add(FF const& /* ff */,FeatureVector const& fv) {
+    (*this)+=fv;
+  }
+  void Add(FeatureVector const& fv) {
+    (*this)+=fv;
+  }
+  /*
+  SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {}
+  FeatureAccumulator(Features const& fids) {}
+  FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {}
+  void Add(Features const& fids,FeatureVector const& fv) {
+    *this += fv;
+  }
+  */
+  // Adds v to the entry with key i.
+  void Add(int i,Featval v) {
+    (*this)[i]+=v;
+  }
+  // Same as Add(i, v). The parameter is left unnamed because it is unused;
+  // a named one draws an unused-parameter warning under -W.
+  void Add(Features const& /* fids */,int i,Featval v) {
+    (*this)[i]+=v;
+  }
+};
+
+struct SingleFeatureAccumulator {  // accumulates one Featval in v; Store() writes it out under a single feature id
+ typedef Featval State;
+ typedef SingleFeatureAccumulator Self;
+ State v;  // running sum; value-initialized by the constructor
+ /*
+ void operator +=(State const& o) {
+ v+=o;
+ }
+ */
+ void operator +=(Self const& s) {  // adds another accumulator's sum
+ v+=s.v;
+ }
+ SingleFeatureAccumulator() : v() {}
+ template <class FF>
+ State const& describe(FF const& ) const { return v; }  // returns the current sum
+
+ template <class FF>
+ void Store(FF const& ff,FeatureVector *fv) const {  // writes v under ff.fid_
+ fv->set_value(ff.fid_,v);
+ }
+ void Store(Features const& fids,FeatureVector *fv) const {  // writes v under fids[0]
+ assert(fids.size()==1);
+ fv->set_value(fids[0],v);
+ }
+ /*
+ SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); }
+ SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv)
+ {
+ assert(fids.size()==1);
+ v=fv.get_singleton();
+ }
+ */
+
+ template <class FF>
+ void Add(FF const& ff,FeatureVector const& fv) {  // adds fv's value for ff.fid_
+ v+=fv.get(ff.fid_);
+ }
+ void Add(FeatureVector const& fv) {  // adds fv.get_singleton()
+ v+=fv.get_singleton();
+ }
+
+ void Add(Features const& fids,FeatureVector const& fv) {  // adds fv's value for fids[0]; size is not asserted here, unlike Store()
+ v += fv.get(fids[0]);
+ }
+ void Add(Featval dv) {
+ v+=dv;
+ }
+ void Add(int,Featval dv) {  // the index argument is ignored
+ v+=dv;
+ }
+ void Add(FeatureVector const& fids,int i,Featval dv) {  // NOTE(review): first parameter is a FeatureVector, not Features as in the other overloads - confirm intent
+ assert(fids.size()==1 && i==0);
+ v+=dv;
+ }
+};
+
+
+#if 0
+// omitting this so we can default construct an accum. might be worth resurrecting in the future
+// Disabled accumulator holding one Featval per feature id in a ValueArray.
+// Kept compilable in principle: the loop bounds read fids.size() (they used
+// to be initialised from the comparison i<fids.size(), i.e. 0 or 1) and the
+// duplicated Features constructor is declared only once.
+struct ArrayFeatureAccumulator : public ValueArray<Featval> {
+  typedef ValueArray<Featval> State;
+  template <class Fsa>
+  ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { }
+  ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { }
+  ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) {
+    for (int i=0,e=fids.size();i<e;++i)
+      (*this)[i]=fv.get(i); // NOTE(review): Store() writes under fids[i]; should this read fv.get(fids[i])? confirm
+  }
+  State const& describe(Features const& fids) const { return *this; }
+  // Writes slot i of the array to *fv under feature id fids[i].
+  void Store(Features const& fids,FeatureVector *fv) const {
+    assert(fids.size()==size());
+    for (int i=0,e=fids.size();i<e;++i)
+      fv->set_value(fids[i],(*this)[i]);
+  }
+  void Add(Features const& fids,FeatureVector const& fv) {
+    for (int i=0,e=fids.size();i<e;++i)
+      (*this)[i]+=fv.get(i); // NOTE(review): same question as in the constructor above
+  }
+  // Adds v to slot i.
+  void Add(FeatureVector const& fids,int i,Featval v) {
+    (*this)[i]+=v;
+  }
+};
+#endif
+
+
+#endif
diff --git a/decoder/feature_vector.h b/utils/feature_vector.h
index be378a6a..be378a6a 100755
--- a/decoder/feature_vector.h
+++ b/utils/feature_vector.h
diff --git a/decoder/filelib.cc b/utils/filelib.cc
index 79ad2847..79ad2847 100644
--- a/decoder/filelib.cc
+++ b/utils/filelib.cc
diff --git a/decoder/filelib.h b/utils/filelib.h
index b9fef9a7..b9fef9a7 100644
--- a/decoder/filelib.h
+++ b/utils/filelib.h
diff --git a/decoder/gzstream.cc b/utils/gzstream.cc
index 88cd1bd2..88cd1bd2 100644
--- a/decoder/gzstream.cc
+++ b/utils/gzstream.cc
diff --git a/decoder/gzstream.h b/utils/gzstream.h
index a7effd90..a7effd90 100644
--- a/decoder/gzstream.h
+++ b/utils/gzstream.h
diff --git a/decoder/hash.h b/utils/hash.h
index 3a60a429..3a60a429 100755
--- a/decoder/hash.h
+++ b/utils/hash.h
diff --git a/decoder/have_64_bits.h b/utils/have_64_bits.h
index d1e6064f..d1e6064f 100755
--- a/decoder/have_64_bits.h
+++ b/utils/have_64_bits.h
diff --git a/decoder/int_or_pointer.h b/utils/int_or_pointer.h
index 4b6a9e4a..4b6a9e4a 100755
--- a/decoder/int_or_pointer.h
+++ b/utils/int_or_pointer.h
diff --git a/decoder/intrusive_refcount.hpp b/utils/intrusive_refcount.hpp
index 4a4b0187..4a4b0187 100755
--- a/decoder/intrusive_refcount.hpp
+++ b/utils/intrusive_refcount.hpp
diff --git a/decoder/logval.h b/utils/logval.h
index 37f14ae5..37f14ae5 100644
--- a/decoder/logval.h
+++ b/utils/logval.h
diff --git a/decoder/logval_test.cc b/utils/logval_test.cc
index 1a23177d..1a23177d 100644
--- a/decoder/logval_test.cc
+++ b/utils/logval_test.cc
diff --git a/decoder/murmur_hash.h b/utils/murmur_hash.h
index 8dbd7807..8dbd7807 100755
--- a/decoder/murmur_hash.h
+++ b/utils/murmur_hash.h
diff --git a/decoder/null_deleter.h b/utils/null_deleter.h
index 082ab453..082ab453 100755
--- a/decoder/null_deleter.h
+++ b/utils/null_deleter.h
diff --git a/decoder/prob.h b/utils/prob.h
index bc297870..bc297870 100644
--- a/decoder/prob.h
+++ b/utils/prob.h
diff --git a/decoder/sampler.h b/utils/sampler.h
index 5fef45d0..5fef45d0 100644
--- a/decoder/sampler.h
+++ b/utils/sampler.h
diff --git a/decoder/small_vector.h b/utils/small_vector.h
index 25c52359..25c52359 100644
--- a/decoder/small_vector.h
+++ b/utils/small_vector.h
diff --git a/decoder/small_vector_test.cc b/utils/small_vector_test.cc
index d1d8dcab..d1d8dcab 100644
--- a/decoder/small_vector_test.cc
+++ b/utils/small_vector_test.cc
diff --git a/decoder/sparse_vector.cc b/utils/sparse_vector.cc
index 4035b9ef..6e42a216 100644
--- a/decoder/sparse_vector.cc
+++ b/utils/sparse_vector.cc
@@ -3,7 +3,7 @@
#include <iostream>
#include <cstring>
-#include "hg_io.h"
+#include "b64tools.h"
using namespace std;
diff --git a/decoder/sparse_vector.h b/utils/sparse_vector.h
index 207489c5..207489c5 100644
--- a/decoder/sparse_vector.h
+++ b/utils/sparse_vector.h
diff --git a/decoder/static_utoa.h b/utils/static_utoa.h
index fe5f6d92..fe5f6d92 100755
--- a/decoder/static_utoa.h
+++ b/utils/static_utoa.h
diff --git a/decoder/stringlib.cc b/utils/stringlib.cc
index 3e52ae87..7aaee9f0 100644
--- a/decoder/stringlib.cc
+++ b/utils/stringlib.cc
@@ -6,8 +6,6 @@
#include <iostream>
#include <map>
-#include "lattice.h"
-
using namespace std;
void ParseTranslatorInput(const string& line, string* input, string* ref) {
@@ -31,15 +29,6 @@ void ParseTranslatorInput(const string& line, string* input, string* ref) {
}
}
-void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) {
- string sref;
- ParseTranslatorInput(line, input, &sref);
- if (sref.size() > 0) {
- assert(ref);
- LatticeTools::ConvertTextOrPLF(sref, ref);
- }
-}
-
void ProcessAndStripSGML(string* pline, map<string, string>* out) {
map<string, string>& meta = *out;
string& line = *pline;
diff --git a/decoder/stringlib.h b/utils/stringlib.h
index 84e95d44..84e95d44 100644
--- a/decoder/stringlib.h
+++ b/utils/stringlib.h
diff --git a/decoder/stringlib_test.cc b/utils/stringlib_test.cc
index f66cdbeb..f66cdbeb 100755
--- a/decoder/stringlib_test.cc
+++ b/utils/stringlib_test.cc
diff --git a/decoder/tdict.cc b/utils/tdict.cc
index 1f68feae..1f68feae 100644
--- a/decoder/tdict.cc
+++ b/utils/tdict.cc
diff --git a/decoder/tdict.h b/utils/tdict.h
index a7b3ee1c..a7b3ee1c 100644
--- a/decoder/tdict.h
+++ b/utils/tdict.h
diff --git a/decoder/test_data/weights b/utils/test_data/weights
index ea70229c..ea70229c 100644
--- a/decoder/test_data/weights
+++ b/utils/test_data/weights
diff --git a/decoder/threadlocal.h b/utils/threadlocal.h
index d79f5d9d..d79f5d9d 100755
--- a/decoder/threadlocal.h
+++ b/utils/threadlocal.h
diff --git a/decoder/timing_stats.cc b/utils/timing_stats.cc
index fc8e9df1..fc8e9df1 100644
--- a/decoder/timing_stats.cc
+++ b/utils/timing_stats.cc
diff --git a/decoder/timing_stats.h b/utils/timing_stats.h
index 0a9f7656..0a9f7656 100644
--- a/decoder/timing_stats.h
+++ b/utils/timing_stats.h
diff --git a/decoder/weights.cc b/utils/weights.cc
index 84647585..84647585 100644
--- a/decoder/weights.cc
+++ b/utils/weights.cc
diff --git a/decoder/weights.h b/utils/weights.h
index f19aa3ce..f19aa3ce 100644
--- a/decoder/weights.h
+++ b/utils/weights.h
diff --git a/decoder/weights_test.cc b/utils/weights_test.cc
index aa6b3db2..8a4c26ef 100644
--- a/decoder/weights_test.cc
+++ b/utils/weights_test.cc
@@ -5,7 +5,6 @@
#include <gtest/gtest.h>
#include "weights.h"
#include "tdict.h"
-#include "hg.h"
using namespace std;
diff --git a/decoder/wordid.h b/utils/wordid.h
index fb50bcc1..fb50bcc1 100644
--- a/decoder/wordid.h
+++ b/utils/wordid.h
diff --git a/vest/Makefile.am b/vest/Makefile.am
index abdc8146..b869672b 100644
--- a/vest/Makefile.am
+++ b/vest/Makefile.am
@@ -1,15 +1,12 @@
bin_PROGRAMS = \
- mbr_kbest \
mr_vest_map \
mr_vest_reduce \
mr_vest_generate_mapper_input \
- fast_score \
sentserver \
sentclient
if HAVE_GTEST
noinst_PROGRAMS = \
- scorer_test \
lo_test
endif
@@ -17,25 +14,16 @@ sentserver_SOURCES = sentserver.c
sentclient_SOURCES = sentclient.c
-mbr_kbest_SOURCES = mbr_kbest.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc
-mbr_kbest_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc
+mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc
-fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc
+mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc $(top_srcdir)/decoder/timing_stats.cc
-mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc
+mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc aer_scorer.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc
-mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc
+lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz
-mr_vest_reduce_SOURCES = error_surface.cc aer_scorer.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc
-mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
-
-scorer_test_SOURCES = aer_scorer.cc scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc
-scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz
-
-lo_test_SOURCES = lo_test.cc scorer.cc ter.cc aer_scorer.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc
-lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz
-
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/vest/lo_test.cc b/vest/lo_test.cc
index 577113bb..9200eb34 100644
--- a/vest/lo_test.cc
+++ b/vest/lo_test.cc
@@ -5,6 +5,7 @@
#include <boost/shared_ptr.hpp>
#include <gtest/gtest.h>
+#include "ces.h"
#include "fdict.h"
#include "hg.h"
#include "kbest.h"
@@ -166,8 +167,8 @@ TEST_F(OptTest, TestS1) {
envs[1] = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg2, NULL, wf);
vector<ErrorSurface> es(2);
- scorer1->ComputeErrorSurface(envs[0], &es[0], IBM_BLEU, hg);
- scorer2->ComputeErrorSurface(envs[1], &es[1], IBM_BLEU, hg2);
+ ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg);
+ ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2);
cerr << envs[0].size() << " " << envs[1].size() << endl;
cerr << es[0].size() << " " << es[1].size() << endl;
envs.clear();
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
index b3acc5dd..1506a99f 100644
--- a/vest/mr_vest_map.cc
+++ b/vest/mr_vest_map.cc
@@ -6,6 +6,7 @@
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include "ces.h"
#include "filelib.h"
#include "stringlib.h"
#include "sparse_vector.h"
@@ -13,7 +14,7 @@
#include "viterbi_envelope.h"
#include "inside_outside.h"
#include "error_surface.h"
-#include "hg.h"
+#include "b64tools.h"
#include "hg_io.h"
using namespace std;
@@ -90,7 +91,7 @@ int main(int argc, char** argv) {
ViterbiEnvelopeWeightFunction wf(origin, axis);
ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
ErrorSurface es;
- ds[sent_id]->ComputeErrorSurface(ve, &es, type, hg);
+ ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg);
//cerr << "Viterbi envelope has " << ve.size() << " segments\n";
// cerr << "Error surface has " << es.size() << " segments\n";
string val;
diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc
index 5efcc19a..3df52020 100644
--- a/vest/mr_vest_reduce.cc
+++ b/vest/mr_vest_reduce.cc
@@ -9,7 +9,7 @@
#include "sparse_vector.h"
#include "error_surface.h"
#include "line_optimizer.h"
-#include "hg_io.h"
+#include "b64tools.h"
using namespace std;
namespace po = boost::program_options;