From a53461650fbdcd3cfe7543d28af9647ac3e5e47e Mon Sep 17 00:00:00 2001 From: redpony Date: Wed, 11 Aug 2010 02:37:10 +0000 Subject: major refactor, break bad circular deps git-svn-id: https://ws10smt.googlecode.com/svn/trunk@509 ec762483-ff6d-05da-a07a-a48fb63a330f --- Makefile.am | 2 +- configure.ac | 2 +- decoder/Makefile.am | 37 +- decoder/aligner.cc | 74 +--- decoder/aligner.h | 2 - decoder/array2d.h | 172 --------- decoder/cdec.cc | 11 +- decoder/dict.cc | 27 -- decoder/dict.h | 66 ---- decoder/dict_test.cc | 50 --- decoder/fdict.cc | 143 ------- decoder/fdict.h | 34 -- decoder/feature_vector.h | 18 - decoder/ff_bleu.cc | 2 +- decoder/ff_lm.cc | 2 +- decoder/ff_wordalign.cc | 3 +- decoder/filelib.cc | 22 -- decoder/filelib.h | 106 ------ decoder/gzstream.cc | 182 --------- decoder/gzstream.h | 127 ------- decoder/hash.h | 54 --- decoder/have_64_bits.h | 17 - decoder/hg.h | 4 +- decoder/hg_io.cc | 53 --- decoder/hg_io.h | 5 - decoder/int_or_pointer.h | 70 ---- decoder/intrusive_refcount.hpp | 84 ----- decoder/logval.h | 174 --------- decoder/logval_test.cc | 73 ---- decoder/murmur_hash.h | 186 --------- decoder/null_deleter.h | 9 - decoder/oracle_bleu.h | 2 +- decoder/phrasebased_translator.cc | 4 +- decoder/prob.h | 8 - decoder/sampler.h | 147 -------- decoder/sentence_metadata.h | 2 +- decoder/small_vector.h | 265 ------------- decoder/small_vector_test.cc | 129 ------- decoder/sparse_vector.cc | 98 ----- decoder/sparse_vector.h | 512 ------------------------- decoder/static_utoa.h | 115 ------ decoder/stringlib.cc | 98 ----- decoder/stringlib.h | 267 ------------- decoder/stringlib_test.cc | 17 - decoder/tdict.cc | 154 -------- decoder/tdict.h | 50 --- decoder/test_data/weights | 8 - decoder/threadlocal.h | 71 ---- decoder/timing_stats.cc | 24 -- decoder/timing_stats.h | 25 -- decoder/weights.cc | 77 ---- decoder/weights.h | 21 -- decoder/weights_test.cc | 28 -- decoder/wordid.h | 6 - extools/Makefile.am | 12 +- extools/sg_lexer.l | 3 - 
gi/clda/src/Makefile.am | 4 +- gi/pyp-topics/src/Makefile.am | 8 +- gi/pyp-topics/src/contexts_corpus.hh | 2 +- gi/pyp-topics/src/contexts_lexer.h | 2 +- gi/pyp-topics/src/contexts_lexer.l | 2 +- mteval/Makefile.am | 23 ++ mteval/aer_scorer.cc | 135 +++++++ mteval/aer_scorer.h | 23 ++ mteval/comb_scorer.cc | 97 +++++ mteval/comb_scorer.h | 17 + mteval/fast_score.cc | 72 ++++ mteval/mbr_kbest.cc | 138 +++++++ mteval/scorer.cc | 630 +++++++++++++++++++++++++++++++ mteval/scorer.h | 110 ++++++ mteval/scorer_test.cc | 182 +++++++++ mteval/ter.cc | 535 ++++++++++++++++++++++++++ mteval/ter.h | 19 + mteval/test_data/re.txt.0 | 5 + mteval/test_data/re.txt.1 | 5 + mteval/test_data/re.txt.2 | 5 + mteval/test_data/re.txt.3 | 5 + training/Makefile.am | 25 +- training/atools.cc | 7 +- utils/Makefile.am | 38 ++ utils/alignment_pharaoh.cc | 77 ++++ utils/alignment_pharaoh.h | 14 + utils/array2d.h | 172 +++++++++ utils/b64tools.cc | 59 +++ utils/b64tools.h | 9 + utils/dict.cc | 27 ++ utils/dict.h | 66 ++++ utils/dict_test.cc | 47 +++ utils/fdict.cc | 143 +++++++ utils/fdict.h | 34 ++ utils/feature_accum.h | 129 +++++++ utils/feature_vector.h | 18 + utils/filelib.cc | 22 ++ utils/filelib.h | 106 ++++++ utils/gzstream.cc | 182 +++++++++ utils/gzstream.h | 127 +++++++ utils/hash.h | 54 +++ utils/have_64_bits.h | 17 + utils/int_or_pointer.h | 70 ++++ utils/intrusive_refcount.hpp | 84 +++++ utils/logval.h | 174 +++++++++ utils/logval_test.cc | 73 ++++ utils/murmur_hash.h | 186 +++++++++ utils/null_deleter.h | 9 + utils/prob.h | 8 + utils/sampler.h | 147 ++++++++ utils/small_vector.h | 265 +++++++++++++ utils/small_vector_test.cc | 129 +++++++ utils/sparse_vector.cc | 98 +++++ utils/sparse_vector.h | 512 +++++++++++++++++++++++++ utils/static_utoa.h | 115 ++++++ utils/stringlib.cc | 87 +++++ utils/stringlib.h | 267 +++++++++++++ utils/stringlib_test.cc | 17 + utils/tdict.cc | 154 ++++++++ utils/tdict.h | 50 +++ utils/test_data/weights | 8 + utils/threadlocal.h | 71 ++++ 
utils/timing_stats.cc | 24 ++ utils/timing_stats.h | 25 ++ utils/weights.cc | 77 ++++ utils/weights.h | 21 ++ utils/weights_test.cc | 27 ++ utils/wordid.h | 6 + vest/Makefile.am | 30 +- vest/aer_scorer.cc | 135 ------- vest/aer_scorer.h | 23 -- vest/comb_scorer.cc | 97 ----- vest/comb_scorer.h | 17 - vest/fast_score.cc | 72 ---- vest/lo_test.cc | 5 +- vest/mr_vest_map.cc | 5 +- vest/mr_vest_reduce.cc | 2 +- vest/scorer.cc | 708 ----------------------------------- vest/scorer.h | 111 ------ vest/ter.cc | 535 -------------------------- vest/ter.h | 19 - 137 files changed, 6124 insertions(+), 5685 deletions(-) delete mode 100644 decoder/array2d.h delete mode 100644 decoder/dict.cc delete mode 100644 decoder/dict.h delete mode 100644 decoder/dict_test.cc delete mode 100644 decoder/fdict.cc delete mode 100644 decoder/fdict.h delete mode 100755 decoder/feature_vector.h delete mode 100644 decoder/filelib.cc delete mode 100644 decoder/filelib.h delete mode 100644 decoder/gzstream.cc delete mode 100644 decoder/gzstream.h delete mode 100755 decoder/hash.h delete mode 100755 decoder/have_64_bits.h delete mode 100755 decoder/int_or_pointer.h delete mode 100755 decoder/intrusive_refcount.hpp delete mode 100644 decoder/logval.h delete mode 100644 decoder/logval_test.cc delete mode 100755 decoder/murmur_hash.h delete mode 100755 decoder/null_deleter.h delete mode 100644 decoder/prob.h delete mode 100644 decoder/sampler.h delete mode 100644 decoder/small_vector.h delete mode 100644 decoder/small_vector_test.cc delete mode 100644 decoder/sparse_vector.cc delete mode 100644 decoder/sparse_vector.h delete mode 100755 decoder/static_utoa.h delete mode 100644 decoder/stringlib.cc delete mode 100644 decoder/stringlib.h delete mode 100755 decoder/stringlib_test.cc delete mode 100644 decoder/tdict.cc delete mode 100644 decoder/tdict.h delete mode 100644 decoder/test_data/weights delete mode 100755 decoder/threadlocal.h delete mode 100644 decoder/timing_stats.cc delete mode 100644 
decoder/timing_stats.h delete mode 100644 decoder/weights.cc delete mode 100644 decoder/weights.h delete mode 100644 decoder/weights_test.cc delete mode 100644 decoder/wordid.h create mode 100644 mteval/Makefile.am create mode 100644 mteval/aer_scorer.cc create mode 100644 mteval/aer_scorer.h create mode 100644 mteval/comb_scorer.cc create mode 100644 mteval/comb_scorer.h create mode 100644 mteval/fast_score.cc create mode 100644 mteval/mbr_kbest.cc create mode 100644 mteval/scorer.cc create mode 100644 mteval/scorer.h create mode 100644 mteval/scorer_test.cc create mode 100644 mteval/ter.cc create mode 100644 mteval/ter.h create mode 100644 mteval/test_data/re.txt.0 create mode 100644 mteval/test_data/re.txt.1 create mode 100644 mteval/test_data/re.txt.2 create mode 100644 mteval/test_data/re.txt.3 create mode 100644 utils/Makefile.am create mode 100644 utils/alignment_pharaoh.cc create mode 100644 utils/alignment_pharaoh.h create mode 100644 utils/array2d.h create mode 100644 utils/b64tools.cc create mode 100644 utils/b64tools.h create mode 100644 utils/dict.cc create mode 100644 utils/dict.h create mode 100644 utils/dict_test.cc create mode 100644 utils/fdict.cc create mode 100644 utils/fdict.h create mode 100755 utils/feature_accum.h create mode 100755 utils/feature_vector.h create mode 100644 utils/filelib.cc create mode 100644 utils/filelib.h create mode 100644 utils/gzstream.cc create mode 100644 utils/gzstream.h create mode 100755 utils/hash.h create mode 100755 utils/have_64_bits.h create mode 100755 utils/int_or_pointer.h create mode 100755 utils/intrusive_refcount.hpp create mode 100644 utils/logval.h create mode 100644 utils/logval_test.cc create mode 100755 utils/murmur_hash.h create mode 100755 utils/null_deleter.h create mode 100644 utils/prob.h create mode 100644 utils/sampler.h create mode 100644 utils/small_vector.h create mode 100644 utils/small_vector_test.cc create mode 100644 utils/sparse_vector.cc create mode 100644 utils/sparse_vector.h 
create mode 100755 utils/static_utoa.h create mode 100644 utils/stringlib.cc create mode 100644 utils/stringlib.h create mode 100755 utils/stringlib_test.cc create mode 100644 utils/tdict.cc create mode 100644 utils/tdict.h create mode 100644 utils/test_data/weights create mode 100755 utils/threadlocal.h create mode 100644 utils/timing_stats.cc create mode 100644 utils/timing_stats.h create mode 100644 utils/weights.cc create mode 100644 utils/weights.h create mode 100644 utils/weights_test.cc create mode 100644 utils/wordid.h delete mode 100644 vest/aer_scorer.cc delete mode 100644 vest/aer_scorer.h delete mode 100644 vest/comb_scorer.cc delete mode 100644 vest/comb_scorer.h delete mode 100644 vest/fast_score.cc delete mode 100644 vest/scorer.cc delete mode 100644 vest/scorer.h delete mode 100644 vest/ter.cc delete mode 100644 vest/ter.h diff --git a/Makefile.am b/Makefile.am index e82e2352..98c2561e 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,4 +1,4 @@ -SUBDIRS = decoder training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava +SUBDIRS = utils mteval decoder training vest extools gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 diff --git a/configure.ac b/configure.ac index e627c1cc..302eebed 100644 --- a/configure.ac +++ b/configure.ac @@ -76,4 +76,4 @@ then AM_CONDITIONAL([RAND_LM], true) fi -AC_OUTPUT(Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile) +AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile training/Makefile vest/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile) diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 68a7d765..f514b340 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -2,24 +2,16 @@ bin_PROGRAMS = cdec if HAVE_GTEST noinst_PROGRAMS = \ - dict_test \ - weights_test \ trule_test \ hg_test \ ff_test \ - 
logval_test \ parser_test \ - grammar_test \ - small_vector_test + grammar_test endif -cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc timing_stats.cc -small_vector_test_SOURCES = small_vector_test.cc -small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a +cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc parser_test_SOURCES = parser_test.cc parser_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a -dict_test_SOURCES = dict_test.cc -dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a ff_test_SOURCES = ff_test.cc ff_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a grammar_test_SOURCES = grammar_test.cc @@ -28,15 +20,12 @@ hg_test_SOURCES = hg_test.cc hg_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a trule_test_SOURCES = trule_test.cc trule_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a -weights_test_SOURCES = weights_test.cc -weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a -logval_test_SOURCES = logval_test.cc -logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) -LDADD = libcdec.a +LDADD = libcdec.a ../mteval/libmteval.a ../utils/libutils.a -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. -AM_LDFLAGS = -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I.. 
-I../mteval -I../utils + +AM_LDFLAGS = ../utils/libutils.a -lz rule_lexer.cc: rule_lexer.l $(LEX) -s -CF -8 -o$@ $< @@ -49,7 +38,6 @@ libcdec_a_SOURCES = \ rule_lexer.cc \ fst_translator.cc \ csplit.cc \ - dict.cc \ translator.cc \ scfg_translator.cc \ hg.cc \ @@ -58,17 +46,10 @@ libcdec_a_SOURCES = \ viterbi.cc \ lattice.cc \ aligner.cc \ - gzstream.cc \ apply_models.cc \ earley_composer.cc \ phrasetable_fst.cc \ - sparse_vector.cc \ trule.cc \ - filelib.cc \ - stringlib.cc \ - fdict.cc \ - tdict.cc \ - weights.cc \ ttables.cc \ ff.cc \ ff_lm.cc \ @@ -78,12 +59,6 @@ libcdec_a_SOURCES = \ ff_tagger.cc \ ff_bleu.cc \ ff_factory.cc \ - ../vest/scorer.cc \ - ../vest/ter.cc \ - ../vest/aer_scorer.cc \ - ../vest/comb_scorer.cc \ - ../vest/error_surface.cc \ - ../vest/viterbi_envelope.cc \ freqdict.cc \ lexalign.cc \ lextrans.cc \ diff --git a/decoder/aligner.cc b/decoder/aligner.cc index b089f52e..92431be4 100644 --- a/decoder/aligner.cc +++ b/decoder/aligner.cc @@ -5,81 +5,11 @@ #include "sentence_metadata.h" #include "inside_outside.h" #include "viterbi.h" +#include "alignment_pharaoh.h" #include using namespace std; -static bool is_digit(char x) { return x >= '0' && x <= '9'; } - -boost::shared_ptr > AlignerTools::ReadPharaohAlignmentGrid(const string& al) { - int max_x = 0; - int max_y = 0; - int i = 0; - size_t pos = al.rfind(" ||| "); - if (pos != string::npos) { i = pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - if (x > max_x) max_x = x; - assert(i < al.size()); - if(al[i] != '-') { - cerr << "BAD ALIGNMENT: " << al << endl; - abort(); - } - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - if (y > max_y) max_y = y; - while(i < al.size() && al[i] == ' ') { ++i; } - } - - boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); - i = 0; - if (pos != string::npos) { i = 
pos + 5; } - while (i < al.size()) { - if (al[i] == '\n' || al[i] == '\r') break; - int x = 0; - while(i < al.size() && is_digit(al[i])) { - x *= 10; - x += al[i] - '0'; - ++i; - } - assert(i < al.size()); - assert(al[i] == '-'); - ++i; - int y = 0; - while(i < al.size() && is_digit(al[i])) { - y *= 10; - y += al[i] - '0'; - ++i; - } - (*grid)(x, y) = true; - while(i < al.size() && al[i] == ' ') { ++i; } - } - // cerr << *grid << endl; - return grid; -} - -void AlignerTools::SerializePharaohFormat(const Array2D& alignment, ostream* out) { - bool need_space = false; - for (int i = 0; i < alignment.width(); ++i) - for (int j = 0; j < alignment.height(); ++j) - if (alignment(i,j)) { - if (need_space) (*out) << ' '; else need_space = true; - (*out) << i << '-' << j; - } - (*out) << endl; -} - // used with lexical models since they may not fully generate the // source string void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g, @@ -317,6 +247,6 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice, cerr << grid << endl; } (*out) << TD::GetString(src_sent) << " ||| " << TD::GetString(trg_sent) << " ||| "; - SerializePharaohFormat(grid, out); + AlignmentPharaoh::SerializePharaohFormat(grid, out); }; diff --git a/decoder/aligner.h b/decoder/aligner.h index cd159119..a088ba6c 100644 --- a/decoder/aligner.h +++ b/decoder/aligner.h @@ -10,8 +10,6 @@ class Hypergraph; class SentenceMetadata; struct AlignerTools { - static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); - static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); // assumption: g contains derivations of input/ref and // ONLY input/ref. 
diff --git a/decoder/array2d.h b/decoder/array2d.h deleted file mode 100644 index e63eda0d..00000000 --- a/decoder/array2d.h +++ /dev/null @@ -1,172 +0,0 @@ -#ifndef ARRAY2D_H_ -#define ARRAY2D_H_ - -#include -#include -#include -#include -#include - -template -class Array2D { - public: - typedef typename std::vector::reference reference; - typedef typename std::vector::const_reference const_reference; - typedef typename std::vector::iterator iterator; - typedef typename std::vector::const_iterator const_iterator; - Array2D() : width_(0), height_(0) {} - Array2D(int w, int h, const T& d = T()) : - width_(w), height_(h), data_(w*h, d) {} - Array2D(const Array2D& rhs) : - width_(rhs.width_), height_(rhs.height_), data_(rhs.data_) {} - bool empty() const { return data_.empty(); } - void resize(int w, int h, const T& d = T()) { - data_.resize(w * h, d); - width_ = w; - height_ = h; - } - const Array2D& operator=(const Array2D& rhs) { - data_ = rhs.data_; - width_ = rhs.width_; - height_ = rhs.height_; - return *this; - } - void fill(const T& v) { data_.assign(data_.size(), v); } - int width() const { return width_; } - int height() const { return height_; } - reference operator()(int i, int j) { - return data_[offset(i, j)]; - } - void clear() { data_.clear(); width_=0; height_=0; } - const_reference operator()(int i, int j) const { - return data_[offset(i, j)]; - } - iterator begin_col(int j) { - return data_.begin() + offset(0,j); - } - const_iterator begin_col(int j) const { - return data_.begin() + offset(0,j); - } - iterator end_col(int j) { - return data_.begin() + offset(0,j) + width_; - } - const_iterator end_col(int j) const { - return data_.begin() + offset(0,j) + width_; - } - iterator end() { return data_.end(); } - const_iterator end() const { return data_.end(); } - const Array2D& operator*=(const T& x) { - std::transform(data_.begin(), data_.end(), data_.begin(), - std::bind2nd(std::multiplies(), x)); - } - const Array2D& operator/=(const T& x) { - 
std::transform(data_.begin(), data_.end(), data_.begin(), - std::bind2nd(std::divides(), x)); - } - const Array2D& operator+=(const Array2D& m) { - std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::plus()); - } - const Array2D& operator-=(const Array2D& m) { - std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::minus()); - } - - private: - inline int offset(int i, int j) const { - assert(i data_; -}; - -template -Array2D operator*(const Array2D& l, const T& scalar) { - Array2D res(l); - res *= scalar; - return res; -} - -template -Array2D operator*(const T& scalar, const Array2D& l) { - Array2D res(l); - res *= scalar; - return res; -} - -template -Array2D operator/(const Array2D& l, const T& scalar) { - Array2D res(l); - res /= scalar; - return res; -} - -template -Array2D operator+(const Array2D& l, const Array2D& r) { - Array2D res(l); - res += r; - return res; -} - -template -Array2D operator-(const Array2D& l, const Array2D& r) { - Array2D res(l); - res -= r; - return res; -} - -template -inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { - for (int i=0; i& m) { - os << ' '; - for (int j=0; j >& m) { - os << ' '; - for (int j=0; j& ar = m(i,j); - for (int k=0; k 0) { + assert(ref); + LatticeTools::ConvertTextOrPLF(sref, ref); + } +} + void ConvertSV(const SparseVector& src, SparseVector* trg) { for (SparseVector::const_iterator it = src.begin(); it != src.end(); ++it) trg->set_value(it->first, it->second); diff --git a/decoder/dict.cc b/decoder/dict.cc deleted file mode 100644 index 2d6986c8..00000000 --- a/decoder/dict.cc +++ /dev/null @@ -1,27 +0,0 @@ -#include "dict.h" - -#include -#include - -void TokenizeStringSeparator( - const std::string& str, - const std::string& separator, - std::vector* tokens) { - - size_t pos = 0; - std::string::size_type nextPos = str.find(separator, pos); - - while (nextPos != std::string::npos) { - tokens->push_back(str.substr(pos, nextPos - pos)); 
- pos = nextPos + separator.size(); - nextPos = str.find(separator, pos); - } - tokens->push_back(str.substr(pos, nextPos - pos)); -} - - -void Dict::AsVector(const WordID& id, std::vector* results) const { - results->clear(); - TokenizeStringSeparator(Convert(id), " ||| ", results); -} - diff --git a/decoder/dict.h b/decoder/dict.h deleted file mode 100644 index 348a97e3..00000000 --- a/decoder/dict.h +++ /dev/null @@ -1,66 +0,0 @@ -#ifndef DICT_H_ -#define DICT_H_ - - -#include -#include - -#include -#include -#include "hash.h" -#include "wordid.h" - -class Dict { - typedef - HASH_MAP > Map; - public: - Dict() : b0_("") { - HASH_MAP_EMPTY(d_,""); - words_.reserve(1000); - } - - inline int max() const { return words_.size(); } - - inline WordID Convert(const std::string& word, bool frozen = false) { - Map::iterator i = d_.find(word); - if (i == d_.end()) { - if (frozen) - return 0; - words_.push_back(word); - d_[word] = words_.size(); - return words_.size(); - } else { - return i->second; - } - } - - inline WordID Convert(const std::vector& words, bool frozen = false) - { return Convert(toString(words), frozen); } - - static inline std::string toString(const std::vector& words) { - std::string word= ""; - for (std::vector::const_iterator it=words.begin(); - it != words.end(); ++it) { - if (it != words.begin()) word += " ||| "; - word += *it; - } - return word; - } - - inline const std::string& Convert(const WordID& id) const { - if (id == 0) return b0_; - assert(id <= (int)words_.size()); - return words_[id-1]; - } - - void AsVector(const WordID& id, std::vector* results) const; - - void clear() { words_.clear(); d_.clear(); } - - private: - const std::string b0_; - std::vector words_; - Map d_; -}; - -#endif diff --git a/decoder/dict_test.cc b/decoder/dict_test.cc deleted file mode 100644 index 694877fa..00000000 --- a/decoder/dict_test.cc +++ /dev/null @@ -1,50 +0,0 @@ -#include "dict.h" - -#include "fdict.h" - -#include -#include -#include -#include "filelib.h" 
- -#include "tdict.h" - -using namespace std; - -class DTest : public testing::Test { - public: - DTest() {} - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -TEST_F(DTest, Convert) { - Dict d; - WordID a = d.Convert("foo"); - WordID b = d.Convert("bar"); - std::string x = "foo"; - WordID c = d.Convert(x); - EXPECT_NE(a, b); - EXPECT_EQ(a, c); - EXPECT_EQ(d.Convert(a), "foo"); - EXPECT_EQ(d.Convert(b), "bar"); -} - -TEST_F(DTest, FDictTest) { - int fid = FD::Convert("First"); - EXPECT_GT(fid, 0); - EXPECT_EQ(FD::Convert(fid), "First"); - string x = FD::Escape("="); - cerr << x << endl; - EXPECT_NE(x, "="); - x = FD::Escape(";"); - cerr << x << endl; - EXPECT_NE(x, ";"); -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/decoder/fdict.cc b/decoder/fdict.cc deleted file mode 100644 index baa0b552..00000000 --- a/decoder/fdict.cc +++ /dev/null @@ -1,143 +0,0 @@ -#include "fdict.h" -#include "stdlib.h" -//for malloc (need on cygwin); todo and std::malloc -#include -#include - -using namespace std; - -Dict FD::dict_; -bool FD::frozen_ = false; - -std::string FD::Convert(std::vector const& v) { - return Convert(&*v.begin(),&*v.end()); -} - -std::string FD::Convert(WordID const *b,WordID const* e) { - ostringstream o; - for (WordID const* i=b;ib) o << ' '; - o << FD::Convert(*i); - } - return o.str(); -} - -static int HexPairValue(const char * code) { - int value = 0; - const char * pch = code; - for (;;) { - int digit = *pch++; - if (digit >= '0' && digit <= '9') { - value += digit - '0'; - } - else if (digit >= 'A' && digit <= 'F') { - value += digit - 'A' + 10; - } - else if (digit >= 'a' && digit <= 'f') { - value += digit - 'a' + 10; - } - else { - return -1; - } - if (pch == code + 2) - return value; - value <<= 4; - } -} - -int UrlDecode(const char *source, char *dest) -{ - char * start = dest; - - while (*source) { - switch (*source) { - case '+': - *(dest++) = ' '; 
- break; - case '%': - if (source[1] && source[2]) { - int value = HexPairValue(source + 1); - if (value >= 0) { - *(dest++) = value; - source += 2; - } - else { - *dest++ = '?'; - } - } - else { - *dest++ = '?'; - } - break; - default: - *dest++ = *source; - } - source++; - } - - *dest = 0; - return dest - start; -} - -int UrlEncode(const char *source, char *dest, unsigned max) { - static const char *digits = "0123456789ABCDEF"; - unsigned char ch; - unsigned len = 0; - char *start = dest; - - while (len < max - 4 && *source) - { - ch = (unsigned char)*source; - if (*source == ' ') { - *dest++ = '+'; - } - else if (strchr("=:;,_| %", ch)) { - *dest++ = '%'; - *dest++ = digits[(ch >> 4) & 0x0F]; - *dest++ = digits[ ch & 0x0F]; - } - else { - *dest++ = *source; - } - source++; - } - *dest = 0; - return start - dest; -} - -std::string UrlDecodeString(const std::string & encoded) { - const char * sz_encoded = encoded.c_str(); - size_t needed_length = encoded.length(); - for (const char * pch = sz_encoded; *pch; pch++) { - if (*pch == '%') - needed_length += 2; - } - needed_length += 10; - char stackalloc[64]; - char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ? - (char *)malloc(needed_length) : stackalloc; - UrlDecode(encoded.c_str(), buf); - std::string result(buf); - if (buf != stackalloc) { - free(buf); - } - return result; -} - -std::string UrlEncodeString(const std::string & decoded) { - size_t needed_length = decoded.length() * 3 + 3; - char stackalloc[64]; - char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ? 
- (char *)malloc(needed_length) : stackalloc; - UrlEncode(decoded.c_str(), buf, needed_length); - std::string result(buf); - if (buf != stackalloc) { - free(buf); - } - return result; -} - -string FD::Escape(const string& s) { - return UrlEncodeString(s); -} - diff --git a/decoder/fdict.h b/decoder/fdict.h deleted file mode 100644 index f9673023..00000000 --- a/decoder/fdict.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef _FDICT_H_ -#define _FDICT_H_ - -#include -#include -#include "dict.h" - -struct FD { - // once the FD is frozen, new features not already in the - // dictionary will return 0 - static void Freeze() { - frozen_ = true; - } - static inline int NumFeats() { - return dict_.max() + 1; - } - static inline WordID Convert(const std::string& s) { - return dict_.Convert(s, frozen_); - } - static inline const std::string& Convert(const WordID& w) { - return dict_.Convert(w); - } - static std::string Convert(WordID const *i,WordID const* e); - static std::string Convert(std::vector const& v); - - // Escape any string to a form that can be used as the name - // of a weight in a weights file - static std::string Escape(const std::string& s); - static Dict dict_; - private: - static bool frozen_; -}; - -#endif diff --git a/decoder/feature_vector.h b/decoder/feature_vector.h deleted file mode 100755 index be378a6a..00000000 --- a/decoder/feature_vector.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef _FEATURE_VECTOR_H_ -#define _FEATURE_VECTOR_H_ - -#include -#include "sparse_vector.h" -#include "fdict.h" - -typedef double Featval; -typedef SparseVectorList FeatureVectorList; -typedef SparseVector FeatureVector; -typedef SparseVector WeightVector; -typedef std::vector DenseWeightVector; - -inline void sparse_to_dense(WeightVector const& wv,DenseWeightVector *dv) { - wv.init_vector(dv); -} - -#endif diff --git a/decoder/ff_bleu.cc b/decoder/ff_bleu.cc index 77989331..aa4e6d85 100644 --- a/decoder/ff_bleu.cc +++ b/decoder/ff_bleu.cc @@ -18,7 +18,7 @@ char const* 
bleu_usage_verbose="Uses feature id 0! Make sure there are no other #include "hg.h" #include "stringlib.h" #include "sentence_metadata.h" -#include "../vest/scorer.h" +#include "scorer.h" using namespace std; diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index f3e65cb7..a9929253 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -728,7 +728,7 @@ LanguageModelRandLM::LanguageModelRandLM(const string& param) : filename = argv[0]; } } - set_order(order); +// set_order(order); int cache_MB = 200; // increase cache size randlm::RandLM* rlm = randlm::RandLM::initRandLM(filename, order, cache_MB); assert(rlm != NULL); diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 0ba2bf92..087bff0c 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -5,6 +5,7 @@ #include #include +#include "alignment_pharaoh.h" #include "stringlib.h" #include "sentence_metadata.h" #include "hg.h" @@ -354,7 +355,7 @@ AlignerResults::AlignerResults(const std::string& param) : getline(in, line); if (!in) break; ++lc; - is_aligned_.push_back(AlignerTools::ReadPharaohAlignmentGrid(line)); + is_aligned_.push_back(AlignmentPharaoh::ReadPharaohAlignmentGrid(line)); } cerr << " Loaded " << lc << " refs\n"; } diff --git a/decoder/filelib.cc b/decoder/filelib.cc deleted file mode 100644 index 79ad2847..00000000 --- a/decoder/filelib.cc +++ /dev/null @@ -1,22 +0,0 @@ -#include "filelib.h" - -#include -#include - -using namespace std; - -bool FileExists(const std::string& fn) { - struct stat info; - int s = stat(fn.c_str(), &info); - return (s==0); -} - -bool DirectoryExists(const string& dir) { - if (access(dir.c_str(),0) == 0) { - struct stat status; - stat(dir.c_str(), &status); - if (status.st_mode & S_IFDIR) return true; - } - return false; -} - diff --git a/decoder/filelib.h b/decoder/filelib.h deleted file mode 100644 index b9fef9a7..00000000 --- a/decoder/filelib.h +++ /dev/null @@ -1,106 +0,0 @@ -#ifndef _FILELIB_H_ -#define _FILELIB_H_ - -#include -#include 
-#include -#include -#include -#include -#include "gzstream.h" -#include "null_deleter.h" - -bool FileExists(const std::string& file_name); -bool DirectoryExists(const std::string& dir_name); - -// reads from standard in if filename is - -// uncompresses if file ends with .gz -// otherwise, reads from a normal file - -template -struct BaseFile { - typedef Stream S; - typedef boost::shared_ptr PS; - void Reset() { - ps_.reset(); - } - bool is_null() const { return !ps_; } - operator bool() const { - return ps_; - } - S* stream() { return ps_.get(); } - S* operator->() { return ps_.get(); } // compat with old ReadFile * -> new Readfile. remove? - S &operator *() const { return get(); } - S &get() const { return *ps_; } - bool is_std() { - return filename_=="-"; - } - std::string filename_; -protected: - void error(std::string const& reason,std::string const& filename) { - throw std::runtime_error("File "+filename+" - "+reason); - } - - PS ps_; - static bool EndsWith(const std::string& f, const std::string& suf) { - return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size()); - } -}; - -class ReadFile : public BaseFile { - public: - ReadFile() { } - explicit ReadFile(const std::string& filename) { - Init(filename); - } - void Init(const std::string& filename) { - filename_=filename; - if (is_std()) { - ps_=PS(&std::cin,null_deleter()); - } else { - if (!FileExists(filename)) { - std::cerr << "File does not exist: " << filename << std::endl; - error(filename," couldn't read nonexistant file."); - abort(); - } - char const* file=filename_.c_str(); // just in case the gzstream keeps using the filename for longer than the constructor, e.g. inflateReset2. warning in valgrind that I'm hoping will disappear - it makes no sense. - ps_=PS(EndsWith(filename, ".gz") ? 
- static_cast(new igzstream(file)) : - static_cast(new std::ifstream(file))); - if (!*ps_) { - std::cerr << "Failed to open " << filename << std::endl; - error(filename," open for reading failed."); - abort(); - } - } - } - -}; - -class WriteFile : public BaseFile { - public: - WriteFile() {} - explicit WriteFile(std::string const& filename) { Init(filename); } - void Init(const std::string& filename) { - filename_=filename; - if (is_std()) { - ps_=PS(&std::cout,null_deleter()); - } else { - char const* file=filename_.c_str(); // just in case the gzstream keeps using the filename for longer than the constructor, e.g. inflateReset2. warning in valgrind that I'm hoping will disappear - it makes no sense. - ps_=PS(EndsWith(filename, ".gz") ? - static_cast(new ogzstream(file)) : - static_cast(new std::ofstream(file))); - if (!*ps_) { - std::cerr << "Failed to open " << filename << std::endl; - error(filename," open for writing failed."); - abort(); - } - } - } - ~WriteFile() { - if (ps_) - get() << std::flush; - } -}; - -#endif diff --git a/decoder/gzstream.cc b/decoder/gzstream.cc deleted file mode 100644 index 88cd1bd2..00000000 --- a/decoder/gzstream.cc +++ /dev/null @@ -1,182 +0,0 @@ -// ============================================================================ -// gzstream, C++ iostream classes wrapping the zlib compression library. -// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. 
-// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// ============================================================================ -// -// File : gzstream.C -// Revision : $Revision: 1.7 $ -// Revision_date : $Date: 2003/01/08 14:41:27 $ -// Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The -// Standard C++ Library". -// ============================================================================ - -#include -#include -#include // for memcpy -#include - -#ifdef GZSTREAM_NAMESPACE -namespace GZSTREAM_NAMESPACE { -#endif - -// ---------------------------------------------------------------------------- -// Internal classes to implement gzstream. See header file for user classes. -// ---------------------------------------------------------------------------- - -// -------------------------------------- -// class gzstreambuf: -// -------------------------------------- - -gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { - if ( is_open()) - return (gzstreambuf*)0; - mode = open_mode; - // no append nor read/write mode - if ((mode & std::ios::ate) || (mode & std::ios::app) - || ((mode & std::ios::in) && (mode & std::ios::out))) - return (gzstreambuf*)0; - const int Nmode=10; - char fmode[Nmode]; - char* fmodeptr = fmode; - if ( mode & std::ios::in) - *fmodeptr++ = 'r'; - else if ( mode & std::ios::out) - *fmodeptr++ = 'w'; - *fmodeptr++ = 'b'; - while (fmodeptr( gptr()); - - if ( ! (mode & std::ios::in) || ! 
opened) - return EOF; - // Josuttis' implementation of inbuf - int n_putback = gptr() - eback(); - if ( n_putback > 4) - n_putback = 4; - std::memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); - - int num = gzread( file, buffer+4, bufferSize-4); - if (num <= 0) // ERROR or EOF - { - if (gzeof(file)) - return EOF; - handle_gzerror(); - } - - // reset buffer pointers - setg( buffer + (4 - n_putback), // beginning of putback area - buffer + 4, // read position - buffer + 4 + num); // end of buffer - - // return next character - return * reinterpret_cast( gptr()); -} - -int gzstreambuf::flush_buffer() { - // Separate the writing of the buffer from overflow() and - // sync() operation. - int w = pptr() - pbase(); - if ( gzwrite( file, pbase(), w) != w) - handle_gzerror(); - pbump( -w); - return w; -} - -int gzstreambuf::overflow( int c) { // used for output buffer only - if ( ! ( mode & std::ios::out) || ! opened) - return EOF; - if (c != EOF) { - *pptr() = c; - pbump(1); - } - if ( flush_buffer() == EOF) - return EOF; - return c; -} - -int gzstreambuf::sync() { - // Changed to use flush_buffer() instead of overflow( EOF) - // which caused improper behavior with std::endl and flush(), - // bug reported by Vincent Ricard. - if ( pptr() && pptr() > pbase()) { - if ( flush_buffer() == EOF) - return -1; - } - return 0; -} - -// -------------------------------------- -// class gzstreambase: -// -------------------------------------- - -gzstreambase::gzstreambase( const char* name, int mode) { - init( &buf); - open( name, mode); -} - -gzstreambase::~gzstreambase() { - buf.close(); -} - -void gzstreambase::open( const char* name, int open_mode) { - if ( ! buf.open( name, open_mode)) - clear( rdstate() | std::ios::badbit); -} - -void gzstreambase::close() { - if ( buf.is_open()) - if ( ! 
buf.close()) - clear( rdstate() | std::ios::badbit); -} - -#ifdef GZSTREAM_NAMESPACE -} // namespace GZSTREAM_NAMESPACE -#endif - -// ============================================================================ -// EOF // diff --git a/decoder/gzstream.h b/decoder/gzstream.h deleted file mode 100644 index a7effd90..00000000 --- a/decoder/gzstream.h +++ /dev/null @@ -1,127 +0,0 @@ -// ============================================================================ -// gzstream, C++ iostream classes wrapping the zlib compression library. -// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner -// -// This library is free software; you can redistribute it and/or -// modify it under the terms of the GNU Lesser General Public -// License as published by the Free Software Foundation; either -// version 2.1 of the License, or (at your option) any later version. -// -// This library is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -// Lesser General Public License for more details. -// -// You should have received a copy of the GNU Lesser General Public -// License along with this library; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -// ============================================================================ -// -// File : gzstream.h -// Revision : $Revision: 1.5 $ -// Revision_date : $Date: 2002/04/26 23:30:15 $ -// Author(s) : Deepak Bandyopadhyay, Lutz Kettner -// -// Standard streambuf implementation following Nicolai Josuttis, "The -// Standard C++ Library". 
-// ============================================================================ - -#ifndef GZSTREAM_H -#define GZSTREAM_H 1 - -// standard C++ with new header file names and std:: namespace -#include -#include -#include - -#ifdef GZSTREAM_NAMESPACE -namespace GZSTREAM_NAMESPACE { -#endif - -// ---------------------------------------------------------------------------- -// Internal classes to implement gzstream. See below for user classes. -// ---------------------------------------------------------------------------- - -class gzstreambuf : public std::streambuf { -private: - static const int bufferSize = 47+(1024*256); // size of data buff - // totals 512 bytes under g++ for igzstream at the end. - - gzFile file; // file handle for compressed file - char buffer[bufferSize]; // data buffer - char opened; // open/close state of stream - int mode; // I/O mode - - int flush_buffer(); - void handle_gzerror(); // throws exception -public: -#if defined(_WIN32) && !defined(CYGWIN) && !defined(EOF) - enum { - EOF = -1 - }; -#endif - gzstreambuf() : opened(0) { - setp( buffer, buffer + (bufferSize-1)); - setg( buffer + 4, // beginning of putback area - buffer + 4, // read position - buffer + 4); // end position - // ASSERT: both input & output capabilities will not be used together - } - int is_open() { return opened; } - gzstreambuf* open( const char* name, int open_mode); - gzstreambuf* close(); - ~gzstreambuf() { close(); } - - virtual int overflow( int c = EOF); - virtual int underflow(); - virtual int sync(); -}; - -class gzstreambase : virtual public std::ios { -protected: - gzstreambuf buf; -public: - gzstreambase() { init(&buf); } - gzstreambase( const char* name, int open_mode); - ~gzstreambase(); - void open( const char* name, int open_mode); - void close(); - gzstreambuf* rdbuf() { return &buf; } -}; - -// ---------------------------------------------------------------------------- -// User classes. 
Use igzstream and ogzstream analogously to ifstream and -// ofstream respectively. They read and write files based on the gz* -// function interface of the zlib. Files are compatible with gzip compression. -// ---------------------------------------------------------------------------- - -class igzstream : public gzstreambase, public std::istream { -public: - igzstream() : std::istream( &buf) {} - igzstream( const char* name, int open_mode = std::ios::in) - : gzstreambase( name, std::ios::in | open_mode), std::istream( &buf) {} - gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } - void open( const char* name, int open_mode = std::ios::in) { - gzstreambase::open( name, open_mode); - } -}; - -class ogzstream : public gzstreambase, public std::ostream { -public: - ogzstream() : std::ostream( &buf) {} - ogzstream( const char* name, int mode = std::ios::out) - : gzstreambase( name, mode), std::ostream( &buf) {} - gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } - void open( const char* name, int open_mode = std::ios::out) { - gzstreambase::open( name, open_mode); - } -}; - -#ifdef GZSTREAM_NAMESPACE -} // namespace GZSTREAM_NAMESPACE -#endif - -#endif // GZSTREAM_H -// ============================================================================ -// EOF // - diff --git a/decoder/hash.h b/decoder/hash.h deleted file mode 100755 index 3a60a429..00000000 --- a/decoder/hash.h +++ /dev/null @@ -1,54 +0,0 @@ -#ifndef CDEC_HASH_H -#define CDEC_HASH_H - -#include "murmur_hash.h" - -#include "config.h" -#ifdef HAVE_SPARSEHASH -# include -# define HASH_MAP google::dense_hash_map -# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0) -# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0) -#else -# include -# define HASH_MAP std::tr1::unordered_map -# define HASH_MAP_RESERVED(h,empty,deleted) -# define HASH_MAP_EMPTY(h,empty) -#endif - -#include - -// assumes C is POD -template -struct 
murmur_hash -{ - typedef MurmurInt return_type; - typedef C /*const&*/ argument_type; - return_type operator()(argument_type const& c) const { - return MurmurHash((void*)&c,sizeof(c)); - } -}; - -// murmur_hash_array isn't std guaranteed safe (you need to use string::data()) -template <> -struct murmur_hash -{ - typedef MurmurInt return_type; - typedef std::string /*const&*/ argument_type; - return_type operator()(argument_type const& c) const { - return MurmurHash(c.data(),c.size()); - } -}; - -// uses begin(),size() assuming contiguous layout and POD -template -struct murmur_hash_array -{ - typedef MurmurInt return_type; - typedef C /*const&*/ argument_type; - return_type operator()(argument_type const& c) const { - return MurmurHash(&*c.begin(),c.size()*sizeof(*c.begin())); - } -}; - -#endif diff --git a/decoder/have_64_bits.h b/decoder/have_64_bits.h deleted file mode 100755 index d1e6064f..00000000 --- a/decoder/have_64_bits.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef HAVE_64_BITS_H -#define HAVE_64_BITS_H - -#include - -#undef HAVE_64_BITS - -#if INTPTR_MAX == INT32_MAX -# define HAVE_64_BITS 0 -#elif INTPTR_MAX >= INT64_MAX -# define HAVE_64_BITS 1 -#else -# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX" -#endif - - -#endif diff --git a/decoder/hg.h b/decoder/hg.h index d5c8e197..e9510997 100644 --- a/decoder/hg.h +++ b/decoder/hg.h @@ -102,6 +102,8 @@ public: void copy_info(Edge const& o) { #if USE_INFO_EDGE set_info(o.info_.str()); // by convention, each person putting info here starts with a separator (e.g. space). it's empty if nobody put any info there. 
+#else + (void) o; #endif } void copy_pod(Edge const& o) { @@ -142,7 +144,7 @@ public: #else std::string info() const { return std::string(); } void reset_info() { } - void set_info(std::string const& s) { } + void set_info(std::string const& ) { } #endif void show(std::ostream &o,unsigned mask=SPAN|RULE) const { o<<'{'; diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 52a8565a..1af8261e 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -622,56 +622,3 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) { } } -namespace B64 { - -static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; -static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; - -static void encodeblock(const unsigned char* in, ostream* os, int len) { - char out[4]; - out[0] = cb64[ in[0] >> 2 ]; - out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; - out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); - out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); - os->write(out, 4); -} - -void b64encode(const char* data, const size_t size, ostream* out) { - size_t cur = 0; - while(cur < size) { - int len = min(static_cast(3), size - cur); - encodeblock(reinterpret_cast(&data[cur]), out, len); - cur += len; - } -} - -static void decodeblock(const unsigned char* in, unsigned char* out) { - out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); - out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); - out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); -} - -bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { - size_t cur = 0; - size_t ocur = 0; - unsigned char in[4]; - while(cur < insize) { - assert(ocur < outsize); - for (int i = 0; i < 4; ++i) { - unsigned char v = data[cur]; - v = (unsigned char) ((v < 43 || v > 122) ? 
'\0' : cd64[ v - 43 ]); - if (!v) { - cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; - return false; - } - v = (unsigned char) ((v == '$') ? '\0' : v - 61); - if (v) in[i] = v - 1; else in[i] = 0; - ++cur; - } - decodeblock(in, reinterpret_cast(&out[ocur])); - ocur += 3; - } - return true; -} -} - diff --git a/decoder/hg_io.h b/decoder/hg_io.h index b6a176ab..082489d8 100644 --- a/decoder/hg_io.h +++ b/decoder/hg_io.h @@ -31,9 +31,4 @@ struct HypergraphIO { static std::string Escape(const std::string& s); // PLF helper }; -namespace B64 { - bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); - void b64encode(const char* data, const size_t size, std::ostream* out); -} - #endif diff --git a/decoder/int_or_pointer.h b/decoder/int_or_pointer.h deleted file mode 100755 index 4b6a9e4a..00000000 --- a/decoder/int_or_pointer.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INT_OR_POINTER_H -#define INT_OR_POINTER_H - -// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd. - -// check lsb for expected tag? 
-#ifndef IOP_CHECK_LSB -# define IOP_CHECK_LSB 1 -#endif -#if IOP_CHECK_LSB -# define iop_assert(x) assert(x) -#else -# define iop_assert(x) -#endif - -#include -#include - -template -struct IntOrPointer { - typedef Pointed pointed_type; - typedef Int integer_type; - typedef Pointed *value_type; - typedef IntOrPointer self_type; - IntOrPointer(int j) { *this=j; } - IntOrPointer(size_t j) { *this=j; } - IntOrPointer(value_type v) { *this=v; } - bool is_integer() const { return i&1; } - bool is_pointer() const { return !(i&1); } - value_type & pointer() { return p; } - const value_type & pointer() const { iop_assert(is_pointer()); return p; } - integer_type integer() const { iop_assert(is_integer()); return i >> 1; } - void set_integer(Int j) { i=2*j+1; } - void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); } - void operator=(unsigned j) { i = 2*(integer_type)j+1; } - void operator=(int j) { i = 2*(integer_type)j+1; } - template - void operator=(C j) { i = 2*(integer_type)j+1; } - void operator=(value_type v) { p=v; } - IntOrPointer() {} - IntOrPointer(const self_type &s) : p(s.p) {} - void operator=(const self_type &s) { p=s.p; } - template - bool operator ==(C* v) const { return p==v; } - template - bool operator ==(const C* v) const { return p==v; } - template - bool operator ==(C j) const { return integer() == j; } - bool operator ==(self_type s) const { return p==s.p; } - bool operator !=(self_type s) const { return p!=s.p; } - template void print(O&o) const - { - if (is_integer()) - o << integer(); - else { - o << "0x" << std::hex << (size_t)pointer() << std::dec; - } - } - friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) { - s.print(o); return o; - } -protected: - union { - value_type p; // must be even (guaranteed unless you're pointing at packed chars) - integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type - }; -}; - - -#endif diff --git 
a/decoder/intrusive_refcount.hpp b/decoder/intrusive_refcount.hpp deleted file mode 100755 index 4a4b0187..00000000 --- a/decoder/intrusive_refcount.hpp +++ /dev/null @@ -1,84 +0,0 @@ -#ifndef GRAEHL__SHARED__INTRUSIVE_REFCOUNT_HPP -#define GRAEHL__SHARED__INTRUSIVE_REFCOUNT_HPP - -#include -#include -#include -#include - -/** usage: - struct mine : public boost::instrusive_refcount {}; - - boost::intrusive_ptr p(new mine()); -*/ - -namespace boost { -// note: the free functions need to be in boost namespace, OR namespace of involved type. this is the only way to do it. - -template -class intrusive_refcount; - -template -class atomic_intrusive_refcount; - -template -void intrusive_ptr_add_ref(intrusive_refcount* ptr) -{ - ++(ptr->refs); -} - -template -void intrusive_ptr_release(intrusive_refcount* ptr) -{ - if (!--(ptr->refs)) delete static_cast(ptr); -} - - -//WARNING: only 2^32 (unsigned) refs allowed. hope that's ok :) -template -class intrusive_refcount : boost::noncopyable -{ - protected: -// typedef intrusive_refcount pointed_type; - friend void intrusive_ptr_add_ref(intrusive_refcount* ptr); - friend void intrusive_ptr_release(intrusive_refcount* ptr); -// friend class intrusive_ptr; - - intrusive_refcount(): refs(0) {} - ~intrusive_refcount() { assert(refs==0); } - -private: - unsigned refs; -}; - - -template -void intrusive_ptr_add_ref(atomic_intrusive_refcount* ptr) -{ - ++(ptr->refs); -} - -template -void intrusive_ptr_release(atomic_intrusive_refcount* ptr) -{ - if(!--(ptr->refs)) delete static_cast(ptr); -} - -template -class atomic_intrusive_refcount : boost::noncopyable -{ - protected: - friend void intrusive_ptr_add_ref(atomic_intrusive_refcount* ptr); - friend void intrusive_ptr_release(atomic_intrusive_refcount* ptr); - - atomic_intrusive_refcount(): refs(0) {} - ~atomic_intrusive_refcount() { assert(refs==0); } - -private: - boost::detail::atomic_count refs; -}; - -} - - -#endif diff --git a/decoder/logval.h b/decoder/logval.h deleted file mode 
100644 index 37f14ae5..00000000 --- a/decoder/logval.h +++ /dev/null @@ -1,174 +0,0 @@ -#ifndef LOGVAL_H_ -#define LOGVAL_H_ - -#define LOGVAL_CHECK_NEG false - -#include -#include -#include -#include - -template -class LogVal { - public: - LogVal() : s_(), v_(-std::numeric_limits::infinity()) {} - explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} - LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} - LogVal(unsigned x) : s_(0), v_(std::log(x)) { } - LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} - static LogVal exp(T lnx) { return LogVal(lnx,false); } - - static LogVal One() { return LogVal(1); } - static LogVal Zero() { return LogVal(); } - static LogVal e() { return LogVal(1,false); } - void logeq(const T& v) { s_ = false; v_ = v; } - - LogVal& operator+=(const LogVal& a) { - if (a.v_ == -std::numeric_limits::infinity()) return *this; - if (a.s_ == s_) { - if (a.v_ < v_) { - v_ = v_ + log1p(std::exp(a.v_ - v_)); - } else { - v_ = a.v_ + log1p(std::exp(v_ - a.v_)); - } - } else { - if (a.v_ < v_) { - v_ = v_ + log1p(-std::exp(a.v_ - v_)); - } else { - v_ = a.v_ + log1p(-std::exp(v_ - a.v_)); - s_ = !s_; - } - } - return *this; - } - - LogVal& operator*=(const LogVal& a) { - s_ = (s_ != a.s_); - v_ += a.v_; - return *this; - } - - LogVal& operator/=(const LogVal& a) { - s_ = (s_ != a.s_); - v_ -= a.v_; - return *this; - } - - LogVal& operator-=(const LogVal& a) { - LogVal b = a; - b.invert(); - return *this += b; - } - - // LogVal(fabs(log(x)),x.s_) - friend LogVal abslog(LogVal x) { - if (x.v_<0) x.v_=-x.v_; - return x; - } - - LogVal& poweq(const T& power) { -#if LOGVAL_CHECK_NEG - if (s_) { - std::cerr << "poweq(T) not implemented when s_ is true\n"; - std::abort(); - } else -#endif - v_ *= power; - return *this; - } - - void invert() { s_ = !s_; } - - LogVal pow(const T& power) const { - LogVal res = *this; - res.poweq(power); - return res; - } - - LogVal root(const T& root) const { - return pow(1/root); - 
} - - operator T() const { - if (s_) return -std::exp(v_); else return std::exp(v_); - } - - bool s_; - T v_; -}; - -// copy elision - as opposed to explicit copy of LogVal const& o1, we should be able to construct Logval r=a+(b+c) as a single result in place in r. todo: return std::move(o1) - C++0x -template -LogVal operator+(LogVal o1, const LogVal& o2) { - o1 += o2; - return o1; -} - -template -LogVal operator*(LogVal o1, const LogVal& o2) { - o1 *= o2; - return o1; -} - -template -LogVal operator/(LogVal o1, const LogVal& o2) { - o1 /= o2; - return o1; -} - -template -LogVal operator-(LogVal o1, const LogVal& o2) { - o1 -= o2; - return o1; -} - -template -T log(const LogVal& o) { -#ifdef LOGVAL_CHECK_NEG - if (o.s_) return log(-1.0); -#endif - return o.v_; -} - -template -LogVal pow(const LogVal& b, const T& e) { - return b.pow(e); -} - -template -bool operator<(const LogVal& lhs, const LogVal& rhs) { - if (lhs.s_ == rhs.s_) { - return (lhs.v_ < rhs.v_); - } else { - return lhs.s_ > rhs.s_; - } -} - -#if 0 -template -bool operator<=(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ <= rhs.v_); -} - -template -bool operator>(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ > rhs.v_); -} - -template -bool operator>=(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ >= rhs.v_); -} -#endif - -template -bool operator==(const LogVal& lhs, const LogVal& rhs) { - return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_); -} - -template -bool operator!=(const LogVal& lhs, const LogVal& rhs) { - return !(lhs == rhs); -} - -#endif diff --git a/decoder/logval_test.cc b/decoder/logval_test.cc deleted file mode 100644 index 1a23177d..00000000 --- a/decoder/logval_test.cc +++ /dev/null @@ -1,73 +0,0 @@ -#include "logval.h" - -#include -#include - -class LogValTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -using namespace std; - -TEST_F(LogValTest,Order) { - LogVal a(-0.3); - LogVal b(0.3); - LogVal 
c(2.4); - EXPECT_LT(a,b); - EXPECT_LT(b,c); - EXPECT_LT(a,c); - EXPECT_FALSE(b < a); - EXPECT_FALSE(c < a); - EXPECT_FALSE(c < b); - EXPECT_FALSE(c < c); - EXPECT_FALSE(b < b); - EXPECT_FALSE(a < a); -} - -TEST_F(LogValTest,Invert) { - LogVal x(-2.4); - LogVal y(2.4); - y.invert(); - EXPECT_FLOAT_EQ(x,y); -} - -TEST_F(LogValTest,Minus) { - LogVal x(12); - LogVal y(2); - LogVal z1 = x - y; - LogVal z2 = x; - z2 -= y; - EXPECT_FLOAT_EQ(z1, z2); - EXPECT_FLOAT_EQ(z1, 10.0); - EXPECT_FLOAT_EQ(y - x, -10.0); -} - -TEST_F(LogValTest,TestOps) { - LogVal x(-12.12); - LogVal y(x); - cerr << x << endl; - cerr << (x*y) << endl; - cerr << (x*y + x) << endl; - cerr << (x + x*y) << endl; - cerr << log1p(-0.5) << endl; - LogVal aa(0.2); - LogVal bb(-0.3); - cerr << (aa + bb) << endl; - cerr << (bb + aa) << endl; - EXPECT_FLOAT_EQ((aa + bb), (bb + aa)); - EXPECT_FLOAT_EQ((aa + bb), -0.1); -} - -TEST_F(LogValTest,TestSizes) { - cerr << sizeof(LogVal) << endl; - cerr << sizeof(LogVal) << endl; - cerr << sizeof(void*) << endl; -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/decoder/murmur_hash.h b/decoder/murmur_hash.h deleted file mode 100755 index 8dbd7807..00000000 --- a/decoder/murmur_hash.h +++ /dev/null @@ -1,186 +0,0 @@ -#ifndef _MURMUR_HASH_H_ -#define _MURMUR_HASH_H_ - -//NOTE: quite fast, nice collision properties, but endian dependent hash values - -#include "have_64_bits.h" -typedef uintptr_t MurmurInt; - -// MurmurHash2, by Austin Appleby - -static const uint32_t DEFAULT_SEED=2654435769U; - -#if HAVE_64_BITS -//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); - -inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) -{ - const uint64_t m = 0xc6a4a7935bd1e995; - const int r = 47; - - uint64_t h = seed ^ (len * m); - - const uint64_t * data = (const uint64_t *)key; - const uint64_t * end = data + (len/8); - - while(data != end) - { - 
uint64_t k = *data++; - - k *= m; - k ^= k >> r; - k *= m; - - h ^= k; - h *= m; - } - - const unsigned char * data2 = (const unsigned char*)data; - - switch(len & 7) - { - case 7: h ^= uint64_t(data2[6]) << 48; - case 6: h ^= uint64_t(data2[5]) << 40; - case 5: h ^= uint64_t(data2[4]) << 32; - case 4: h ^= uint64_t(data2[3]) << 24; - case 3: h ^= uint64_t(data2[2]) << 16; - case 2: h ^= uint64_t(data2[1]) << 8; - case 1: h ^= uint64_t(data2[0]); - h *= m; - }; - - h ^= h >> r; - h *= m; - h ^= h >> r; - - return h; -} - -inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) -{ - return (uint32_t) MurmurHash64(key,len,seed); -} - -inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) -{ - return MurmurHash64(key,len,seed); -} - -#else -// 32-bit - -// Note - This code makes a few assumptions about how your machine behaves - -// 1. We can read a 4-byte value from any address without crashing -// 2. sizeof(int) == 4 -inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) -{ - // 'm' and 'r' are mixing constants generated offline. - // They're not really 'magic', they just happen to work well. - - const uint32_t m = 0x5bd1e995; - const int r = 24; - - // Initialize the hash to a 'random' value - - uint32_t h = seed ^ len; - - // Mix 4 bytes at a time into the hash - - const unsigned char * data = (const unsigned char *)key; - - while(len >= 4) - { - uint32_t k = *(uint32_t *)data; - - k *= m; - k ^= k >> r; - k *= m; - - h *= m; - h ^= k; - - data += 4; - len -= 4; - } - - // Handle the last few bytes of the input array - - switch(len) - { - case 3: h ^= data[2] << 16; - case 2: h ^= data[1] << 8; - case 1: h ^= data[0]; - h *= m; - }; - - // Do a few final mixes of the hash to ensure the last few - // bytes are well-incorporated. 
- - h ^= h >> 13; - h *= m; - h ^= h >> 15; - - return h; -} - -inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { - return MurmurHash32(key,len,seed); -} - -// 64-bit hash for 32-bit platforms - -inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) -{ - const uint32_t m = 0x5bd1e995; - const int r = 24; - - uint32_t h1 = seed ^ len; - uint32_t h2 = 0; - - const uint32_t * data = (const uint32_t *)key; - - while(len >= 8) - { - uint32_t k1 = *data++; - k1 *= m; k1 ^= k1 >> r; k1 *= m; - h1 *= m; h1 ^= k1; - len -= 4; - - uint32_t k2 = *data++; - k2 *= m; k2 ^= k2 >> r; k2 *= m; - h2 *= m; h2 ^= k2; - len -= 4; - } - - if(len >= 4) - { - uint32_t k1 = *data++; - k1 *= m; k1 ^= k1 >> r; k1 *= m; - h1 *= m; h1 ^= k1; - len -= 4; - } - - switch(len) - { - case 3: h2 ^= ((unsigned char*)data)[2] << 16; - case 2: h2 ^= ((unsigned char*)data)[1] << 8; - case 1: h2 ^= ((unsigned char*)data)[0]; - h2 *= m; - }; - - h1 ^= h2 >> 18; h1 *= m; - h2 ^= h1 >> 22; h2 *= m; - h1 ^= h2 >> 17; h1 *= m; - h2 ^= h1 >> 19; h2 *= m; - - uint64_t h = h1; - - h = (h << 32) | h2; - - return h; -} - -#endif -//32bit - -#endif diff --git a/decoder/null_deleter.h b/decoder/null_deleter.h deleted file mode 100755 index 082ab453..00000000 --- a/decoder/null_deleter.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef NULL_DELETER_H -#define NULL_DELETER_H - -struct null_deleter { - void operator()(void*) const {} - void operator()(void const*) const {} -}; - -#endif diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h index 81a584a7..145c84d1 100755 --- a/decoder/oracle_bleu.h +++ b/decoder/oracle_bleu.h @@ -9,7 +9,7 @@ #include #include #include -#include "../vest/scorer.h" +#include "scorer.h" #include "hg.h" #include "ff_factory.h" #include "ff_bleu.h" diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc index 726b3f9a..d65e44d1 100644 --- a/decoder/phrasebased_translator.cc +++ 
b/decoder/phrasebased_translator.cc @@ -68,7 +68,6 @@ struct PhraseBasedTranslatorImpl { PhraseBasedTranslatorImpl(const boost::program_options::variables_map& conf) : add_pass_through_rules(conf.count("add_pass_through_rules")), max_distortion(conf["pb_max_distortion"].as()), - kSOURCE_RULE(new TRule("[X] ||| [X,1] ||| [X,1]", true)), kCONCAT_RULE(new TRule("[X] ||| [X,1] [X,2] ||| [X,1] [X,2]", true)), kNT_TYPE(TD::Convert("X") * -1) { assert(max_distortion >= 0); @@ -141,6 +140,8 @@ struct PhraseBasedTranslatorImpl { for (int i = 0; i < phrases.size(); ++i) { Hypergraph::Edge* edge = minus_lm_forest->AddEdge(phrases[i], Hypergraph::TailNodeVector()); edge->feature_values_ = edge->rule_->scores_; + edge->i_ = s.i; + edge->j_ = s.j; minus_lm_forest->ConnectEdgeToHeadNode(edge->id_, phrase_head_index); } CoverageNodeMap::iterator cit = c.find(s.coverage); @@ -189,7 +190,6 @@ struct PhraseBasedTranslatorImpl { const bool add_pass_through_rules; const int max_distortion; - TRulePtr kSOURCE_RULE; const TRulePtr kCONCAT_RULE; const WordID kNT_TYPE; boost::shared_ptr fst; diff --git a/decoder/prob.h b/decoder/prob.h deleted file mode 100644 index bc297870..00000000 --- a/decoder/prob.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _PROB_H_ -#define _PROB_H_ - -#include "logval.h" - -typedef LogVal prob_t; - -#endif diff --git a/decoder/sampler.h b/decoder/sampler.h deleted file mode 100644 index 5fef45d0..00000000 --- a/decoder/sampler.h +++ /dev/null @@ -1,147 +0,0 @@ -#ifndef SAMPLER_H_ -#define SAMPLER_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "prob.h" - -struct SampleSet; - -template -struct RandomNumberGenerator { - static uint32_t GetTrulyRandomSeed() { - uint32_t seed; - std::ifstream r("/dev/urandom"); - if (r) { - r.read((char*)&seed,sizeof(uint32_t)); - } - if (r.fail() || !r) { - std::cerr << "Warning: could not read from /dev/urandom. 
Seeding from clock" << std::endl; - seed = std::time(NULL); - } - std::cerr << "Seeding random number sequence to " << seed << std::endl; - return seed; - } - - RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { - uint32_t seed = GetTrulyRandomSeed(); - m_generator.seed(seed); - } - explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { - if (!seed) seed = GetTrulyRandomSeed(); - m_generator.seed(seed); - } - - size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) { - if (T == 1.0) { - if (this->next() > (a / (a + b))) return 1; else return 0; - } else { - assert(!"not implemented"); - } - } - - // T is the annealing temperature, if desired - size_t SelectSample(const SampleSet& ss, double T = 1.0); - - // draw a value from U(0,1) - double next() {return m_random();} - - // draw a value from N(mean,var) - double NextNormal(double mean, double var) { - return boost::normal_distribution(mean, var)(m_random); - } - - // draw a value from a Poisson distribution - // lambda must be greater than 0 - int NextPoisson(int lambda) { - return boost::poisson_distribution(lambda)(m_random); - } - - bool AcceptMetropolisHastings(const prob_t& p_cur, - const prob_t& p_prev, - const prob_t& q_cur, - const prob_t& q_prev) { - const prob_t a = (p_cur / p_prev) * (q_prev / q_cur); - if (log(a) >= 0.0) return true; - return (prob_t(this->next()) < a); - } - - RNG &gen() { return m_generator; } - typedef boost::variate_generator > IntRNG; - IntRNG inclusive(int low,int high_incl) { - assert(high_incl>=low); - return IntRNG(m_generator,boost::uniform_int<>(low,high_incl)); - } - - private: - boost::uniform_real<> m_dist; - RNG m_generator; - boost::variate_generator > m_random; -}; - -typedef RandomNumberGenerator MT19937; - -class SampleSet { - public: - const prob_t& operator[](int i) const { return m_scores[i]; } - prob_t& operator[](int i) { return m_scores[i]; } - bool empty() 
const { return m_scores.empty(); } - void add(const prob_t& s) { m_scores.push_back(s); } - void clear() { m_scores.clear(); } - size_t size() const { return m_scores.size(); } - void resize(int size) { m_scores.resize(size); } - std::vector m_scores; -}; - -template -size_t RandomNumberGenerator::SelectSample(const SampleSet& ss, double T) { - assert(T > 0.0); - assert(ss.m_scores.size() > 0); - if (ss.m_scores.size() == 1) return 0; - const prob_t annealing_factor(1.0 / T); - const bool anneal = (annealing_factor != prob_t::One()); - prob_t sum = prob_t::Zero(); - if (anneal) { - for (int i = 0; i < ss.m_scores.size(); ++i) - sum += ss.m_scores[i].pow(annealing_factor); // p^(1/T) - } else { - sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero()); - } - //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ","; - //std::cerr << std::endl; - - prob_t random(this->next()); // random number between 0 and 1 - random *= sum; // scale with normalization factor - //std::cerr << "Random number " << random << std::endl; - - //now figure out which sample - size_t position = 1; - sum = ss.m_scores[0]; - if (anneal) { - sum.poweq(annealing_factor); - for (; position < ss.m_scores.size() && sum < random; ++position) - sum += ss.m_scores[position].pow(annealing_factor); - } else { - for (; position < ss.m_scores.size() && sum < random; ++position) - sum += ss.m_scores[position]; - } - //std::cout << "random: " << random << " sample: " << position << std::endl; - //std::cerr << "Sample: " << position-1 << std::endl; - //exit(1); - return position-1; -} - -#endif diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h index 21be9b21..593019c8 100644 --- a/decoder/sentence_metadata.h +++ b/decoder/sentence_metadata.h @@ -3,7 +3,7 @@ #include #include "lattice.h" -#include "../vest/scorer.h" +#include "scorer.h" struct SentenceMetadata { SentenceMetadata(int id, const Lattice& ref) : diff --git 
a/decoder/small_vector.h b/decoder/small_vector.h deleted file mode 100644 index 25c52359..00000000 --- a/decoder/small_vector.h +++ /dev/null @@ -1,265 +0,0 @@ -#ifndef _SMALL_VECTOR_H_ -#define _SMALL_VECTOR_H_ - -/* REQUIRES that T is POD (can be memcpy). won't work (yet) due to union with SMALL_VECTOR_POD==0 - may be possible to handle movable types that have ctor/dtor, by using explicit allocation, ctor/dtor calls. but for now JUST USE THIS FOR no-meaningful ctor/dtor POD types. - - stores small element (<=SV_MAX items) vectors inline. recommend SV_MAX=sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1. may not work if SV_MAX==0. - */ - -#define SMALL_VECTOR_POD 1 - -#include // std::max - where to get this? -#include -#include -#include -#include -#include -//sizeof(T)/sizeof(T*)>1?sizeof(T)/sizeof(T*):1 - -template -class SmallVector { -// typedef unsigned short uint16_t; - public: - typedef SmallVector Self; - SmallVector() : size_(0) {} - - typedef T const* const_iterator; - typedef T* iterator; - typedef T value_type; - typedef T &reference; - typedef T const& const_reference; - - T *begin() { return size_>SV_MAX?data_.ptr:data_.vals; } - T const* begin() const { return const_cast(this)->begin(); } - T *end() { return begin()+size_; } - T const* end() const { return begin()+size_; } - - explicit SmallVector(size_t s) : size_(s) { - assert(s < 0xA000); - if (s <= SV_MAX) { - for (int i = 0; i < s; ++i) new(&data_.vals[i]) T(); - } else { - capacity_ = s; - size_ = s; - data_.ptr = new T[s]; // TODO: replace this with allocator or ::operator new(sizeof(T)*s) everywhere - for (int i = 0; i < size_; ++i) new(&data_.ptr[i]) T(); - } - } - - SmallVector(size_t s, T const& v) : size_(s) { - assert(s < 0xA000); - if (s <= SV_MAX) { - for (int i = 0; i < s; ++i) data_.vals[i] = v; - } else { - capacity_ = s; - size_ = s; - data_.ptr = new T[s]; - for (int i = 0; i < size_; ++i) data_.ptr[i] = v; - } - } - - SmallVector(const Self& o) : size_(o.size_) { - if (size_ <= 
SV_MAX) { - std::memcpy(data_.vals,o.data_.vals,size_*sizeof(T)); -// for (int i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i]; - } else { - capacity_ = size_ = o.size_; - data_.ptr = new T[capacity_]; - std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(T)); - } - } - - const Self& operator=(const Self& o) { - if (size_ <= SV_MAX) { - if (o.size_ <= SV_MAX) { - size_ = o.size_; - for (int i = 0; i < SV_MAX; ++i) data_.vals[i] = o.data_.vals[i]; - } else { - capacity_ = size_ = o.size_; - data_.ptr = new T[capacity_]; - std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(T)); - } - } else { - if (o.size_ <= SV_MAX) { - delete[] data_.ptr; - size_ = o.size_; - for (int i = 0; i < size_; ++i) data_.vals[i] = o.data_.vals[i]; - } else { - if (capacity_ < o.size_) { - delete[] data_.ptr; - capacity_ = o.size_; - data_.ptr = new T[capacity_]; - } - size_ = o.size_; - for (int i = 0; i < size_; ++i) - data_.ptr[i] = o.data_.ptr[i]; - } - } - return *this; - } - - ~SmallVector() { - if (size_ <= SV_MAX) { - // skip if pod? yes, we required pod anyway. 
no need to destruct -#if !SMALL_VECTOR_POD - for (int i=0;i SV_MAX) { - delete[] data_.ptr; - } - size_ = 0; - } - - bool empty() const { return size_ == 0; } - size_t size() const { return size_; } - - inline void ensure_capacity(uint16_t min_size) { - assert(min_size > SV_MAX); - if (min_size < capacity_) return; - uint16_t new_cap = std::max(static_cast(capacity_ << 1), min_size); - T* tmp = new T[new_cap]; - std::memcpy(tmp, data_.ptr, capacity_ * sizeof(T)); - delete[] data_.ptr; - data_.ptr = tmp; - capacity_ = new_cap; - } - -private: - inline void copy_vals_to_ptr() { - capacity_ = SV_MAX * 2; - T* tmp = new T[capacity_]; - for (int i = 0; i < SV_MAX; ++i) tmp[i] = data_.vals[i]; - data_.ptr = tmp; - } - inline void ptr_to_small() { - assert(size_<=SV_MAX); - int *tmp=data_.ptr; - for (int i=0;ioperator[](size_ - 1); } - const T& back() const { return this->operator[](size_ - 1); } - T& front() { return this->operator[](0); } - const T& front() const { return this->operator[](0); } - - void pop_back() { - assert(size_>0); - --size_; - if (size_==SV_MAX) - ptr_to_small(); - } - - void compact() { - compact(size_); - } - - // size must be <= size_ - TODO: test - void compact(uint16_t size) { - assert(size<=size_); - if (size_>SV_MAX) { - size_=size; - if (size<=SV_MAX) - ptr_to_small(); - } else - size_=size; - } - - void resize(size_t s, int v = 0) { - if (s <= SV_MAX) { - if (size_ > SV_MAX) { - T *tmp=data_.ptr; - for (int i = 0; i < s; ++i) data_.vals[i] = tmp[i]; - delete[] tmp; - size_ = s; - return; - } - if (s <= size_) { - size_ = s; - return; - } else { - for (int i = size_; i < s; ++i) - data_.vals[i] = v; - size_ = s; - return; - } - } else { - if (size_ <= SV_MAX) - copy_vals_to_ptr(); - if (s > capacity_) - ensure_capacity(s); - if (s > size_) { - for (int i = size_; i < s; ++i) - data_.ptr[i] = v; - } - size_ = s; - } - } - - T& operator[](size_t i) { - if (size_ <= SV_MAX) return data_.vals[i]; - return data_.ptr[i]; - } - - const T& 
operator[](size_t i) const { - if (size_ <= SV_MAX) return data_.vals[i]; - return data_.ptr[i]; - } - - bool operator==(const Self& o) const { - if (size_ != o.size_) return false; - if (size_ <= SV_MAX) { - for (size_t i = 0; i < size_; ++i) - if (data_.vals[i] != o.data_.vals[i]) return false; - return true; - } else { - for (size_t i = 0; i < size_; ++i) - if (data_.ptr[i] != o.data_.ptr[i]) return false; - return true; - } - } - - friend bool operator!=(const Self& a, const Self& b) { - return !(a==b); - } - - private: - union StorageType { - T vals[SV_MAX]; - T* ptr; - }; - StorageType data_; - uint16_t size_; - uint16_t capacity_; // only defined when size_ > __SV_MAX_STATIC -}; - -typedef SmallVector SmallVectorInt; - -template -void memcpy(void *out,SmallVector const& v) { - std::memcpy(out,v.begin(),v.size()*sizeof(T)); -} - -#endif diff --git a/decoder/small_vector_test.cc b/decoder/small_vector_test.cc deleted file mode 100644 index d1d8dcab..00000000 --- a/decoder/small_vector_test.cc +++ /dev/null @@ -1,129 +0,0 @@ -#include "small_vector.h" - -#include -#include -#include -#include - -using namespace std; - -class SVTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -TEST_F(SVTest, LargerThan2) { - SmallVectorInt v; - SmallVectorInt v2; - v.push_back(0); - v.push_back(1); - v.push_back(2); - assert(v.size() == 3); - assert(v[2] == 2); - assert(v[1] == 1); - assert(v[0] == 0); - v2 = v; - SmallVectorInt copy(v); - assert(copy.size() == 3); - assert(copy[0] == 0); - assert(copy[1] == 1); - assert(copy[2] == 2); - assert(copy == v2); - copy[1] = 99; - assert(copy != v2); - assert(v2.size() == 3); - assert(v2[2] == 2); - assert(v2[1] == 1); - assert(v2[0] == 0); - v2[0] = -2; - v2[1] = -1; - v2[2] = 0; - assert(v2[2] == 0); - assert(v2[1] == -1); - assert(v2[0] == -2); - SmallVectorInt v3(1,1); - assert(v3[0] == 1); - v2 = v3; - assert(v2.size() == 1); - assert(v2[0] == 1); - SmallVectorInt v4(10, 
1); - assert(v4.size() == 10); - assert(v4[5] == 1); - assert(v4[9] == 1); - v4 = v; - assert(v4.size() == 3); - assert(v4[2] == 2); - assert(v4[1] == 1); - assert(v4[0] == 0); - SmallVectorInt v5(10, 2); - assert(v5.size() == 10); - assert(v5[7] == 2); - assert(v5[0] == 2); - assert(v.size() == 3); - v = v5; - assert(v.size() == 10); - assert(v[2] == 2); - assert(v[9] == 2); - SmallVectorInt cc; - for (int i = 0; i < 33; ++i) - cc.push_back(i); - for (int i = 0; i < 33; ++i) - assert(cc[i] == i); - cc.resize(20); - assert(cc.size() == 20); - for (int i = 0; i < 20; ++i) - assert(cc[i] == i); - cc[0]=-1; - cc.resize(1, 999); - assert(cc.size() == 1); - assert(cc[0] == -1); - cc.resize(99, 99); - for (int i = 1; i < 99; ++i) { - cerr << i << " " << cc[i] << endl; - assert(cc[i] == 99); - } - cc.clear(); - assert(cc.size() == 0); -} - -TEST_F(SVTest, Small) { - SmallVectorInt v; - SmallVectorInt v1(1,0); - SmallVectorInt v2(2,10); - SmallVectorInt v1a(2,0); - EXPECT_TRUE(v1 != v1a); - EXPECT_TRUE(v1 == v1); - EXPECT_EQ(v1[0], 0); - EXPECT_EQ(v2[1], 10); - EXPECT_EQ(v2[0], 10); - ++v2[1]; - --v2[0]; - EXPECT_EQ(v2[0], 9); - EXPECT_EQ(v2[1], 11); - SmallVectorInt v3(v2); - assert(v3[0] == 9); - assert(v3[1] == 11); - assert(!v3.empty()); - assert(v3.size() == 2); - v3.clear(); - assert(v3.empty()); - assert(v3.size() == 0); - assert(v3 != v2); - assert(v2 != v3); - v3 = v2; - assert(v3 == v2); - assert(v2 == v3); - assert(v3[0] == 9); - assert(v3[1] == 11); - assert(!v3.empty()); - assert(v3.size() == 2); - cerr << sizeof(SmallVectorInt) << endl; - cerr << sizeof(vector) << endl; -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/decoder/sparse_vector.cc b/decoder/sparse_vector.cc deleted file mode 100644 index 4035b9ef..00000000 --- a/decoder/sparse_vector.cc +++ /dev/null @@ -1,98 +0,0 @@ -#include "sparse_vector.h" - -#include -#include - -#include "hg_io.h" - -using namespace std; - -namespace 
B64 { - -void Encode(double objective, const SparseVector& v, ostream* out) { - const int num_feats = v.num_active(); - size_t tot_size = 0; - const size_t off_objective = tot_size; - tot_size += sizeof(double); // objective - const size_t off_num_feats = tot_size; - tot_size += sizeof(int); // num_feats - const size_t off_data = tot_size; - tot_size += sizeof(unsigned char) * num_feats; // lengths of feature names; - typedef SparseVector::const_iterator const_iterator; - for (const_iterator it = v.begin(); it != v.end(); ++it) - tot_size += FD::Convert(it->first).size(); // feature names; - tot_size += sizeof(double) * num_feats; // gradient - const size_t off_magic = tot_size; - tot_size += 4; // magic - - // size_t b64_size = tot_size * 4 / 3; - // cerr << "Sparse vector binary size: " << tot_size << " (b64 size=" << b64_size << ")\n"; - char* data = new char[tot_size]; - *reinterpret_cast(&data[off_objective]) = objective; - *reinterpret_cast(&data[off_num_feats]) = num_feats; - char* cur = &data[off_data]; - assert(cur - data == off_data); - for (const_iterator it = v.begin(); it != v.end(); ++it) { - const string& fname = FD::Convert(it->first); - *cur++ = static_cast(fname.size()); // name len - memcpy(cur, &fname[0], fname.size()); - cur += fname.size(); - *reinterpret_cast(cur) = it->second; - cur += sizeof(double); - } - assert(cur - data == off_magic); - *reinterpret_cast(cur) = 0xBAABABBAu; - cur += sizeof(unsigned int); - assert(cur - data == tot_size); - b64encode(data, tot_size, out); - delete[] data; -} - -bool Decode(double* objective, SparseVector* v, const char* in, size_t size) { - v->clear(); - if (size % 4 != 0) { - cerr << "B64 error - line % 4 != 0\n"; - return false; - } - const size_t decoded_size = size * 3 / 4 - sizeof(unsigned int); - const size_t buf_size = decoded_size + sizeof(unsigned int); - if (decoded_size < 6) { cerr << "SparseVector decoding error: too short!\n"; return false; } - char* data = new char[buf_size]; - if 
(!b64decode(reinterpret_cast(in), size, data, buf_size)) { - delete[] data; - return false; - } - size_t cur = 0; - *objective = *reinterpret_cast(data); - cur += sizeof(double); - const int num_feats = *reinterpret_cast(&data[cur]); - cur += sizeof(int); - int fc = 0; - while(fc < num_feats && cur < decoded_size) { - ++fc; - const int fname_len = data[cur++]; - assert(fname_len > 0); - assert(fname_len < 256); - string fname(fname_len, '\0'); - memcpy(&fname[0], &data[cur], fname_len); - cur += fname_len; - const double val = *reinterpret_cast(&data[cur]); - cur += sizeof(double); - int fid = FD::Convert(fname); - v->set_value(fid, val); - } - if(num_feats != fc) { - cerr << "Expected " << num_feats << " but only decoded " << fc << "!\n"; - delete[] data; - return false; - } - if (*reinterpret_cast(&data[cur]) != 0xBAABABBAu) { - cerr << "SparseVector decodeding error : magic does not match!\n"; - delete[] data; - return false; - } - delete[] data; - return true; -} - -} diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h deleted file mode 100644 index 207489c5..00000000 --- a/decoder/sparse_vector.h +++ /dev/null @@ -1,512 +0,0 @@ -#ifndef _SPARSE_VECTOR_H_ -#define _SPARSE_VECTOR_H_ - -//#define SPARSE_VECTOR_HASH - -#ifdef SPARSE_VECTOR_HASH -#include "hash.h" -# define SPARSE_VECTOR_MAP HASH_MAP -# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted) -#else -# define SPARSE_VECTOR_MAP std::map -# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) -#endif -/* - use SparseVectorList (pair smallvector) for feat funcs / hypergraphs (you rarely need random access; just append a feature to the list) -*/ -/* hack: index 0 never gets printed because cdyer is creative and efficient. features which have no weight got feature dict id 0, see, and the models all clobered that value. nobody wants to see it. except that vlad is also creative and efficient and stored the oracle bleu there. 
*/ -/* NOTE: zero vals may or may not be dropped from map (sparse, but not guaranteed to be so). - - I rely on !v the same as !((bool)v) the same as v==0 and v() same as v(0). - - one exception: - - a local: - T sum = 0; - is used instead of - T sum; - - because T may be a primitive type, and - - T sum(); - - is parsed as a function decl :( - - the alternative T sum=T() is also be reasonable. i've switched to that. -*/ - -// this is a modified version of code originally written -// by Phil Blunsom - -#include -#include -#include -#include -#include - -#include "fdict.h" -#include "small_vector.h" - -template -inline T & extend_vector(std::vector &v,int i) { - if (i>=v.size()) - v.resize(i+1); - return v[i]; -} - -template -class SparseVector { - void init_reserved() { - SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2); - } -public: - T const& get_singleton() const { - assert(values_.size()==1); - return values_.begin()->second; - } - - typedef SparseVector Self; - typedef SPARSE_VECTOR_MAP MapType; - typedef typename MapType::const_iterator const_iterator; - SparseVector() { - init_reserved(); - } - explicit SparseVector(std::vector const& v) { - init_reserved(); - typename MapType::iterator p=values_.begin(); - const T z=0; - for (unsigned i=0;i *vp) const { - init_vector(*vp); - } - - void init_vector(std::vector &v) const { - v.clear(); - for (const_iterator i=values_.begin(),e=values_.end();i!=e;++i) - extend_vector(v,i->first)=i->second; - } - - void set_new_value(int index, T const& val) { - assert(values_.find(index)==values_.end()); - values_[index]=val; - } - - - // warning: exploits the fact that 0 values are always removed from map. change this if you change that. 
- bool nonzero(int index) const { - typename MapType::const_iterator found = values_.find(index); - return found==values_.end() || !found->second; - } - - - T get(int index) const { - typename MapType::const_iterator found = values_.find(index); - return found==values_.end()?T():found->second; - } - - T value(int i) const { return get(i); } - - // same as above but may add a 0 entry. TODO: check that people relying on no entry use get - T & operator[](int index){ - return values_[index]; - } - - inline void set_value(int index, const T &value) { - values_[index] = value; - } - - inline void maybe_add(int index, const T& value) { - if (value) add_value(index,value); - } - - T& add_value(int index, const T &value) { -#if 1 - return values_[index]+=value; -#else - // this is not really going to be any faster, and we already rely on default init = 0 init - std::pair art=values_.insert(std::make_pair(index,value)); - T &val=art.first->second; - if (!art.second) val += value; // already existed - return val; -#endif - } - - - void store(std::valarray* target) const { - (*target) *= 0; - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) { - if (it->first >= target->size()) break; - (*target)[it->first] = it->second; - } - } - - int max_index() const { - if (empty()) return 0; - typename MapType::const_iterator found =values_.end(); - --found; - return found->first; - } - - // dot product with a unit vector of the same length - // as the sparse vector - T dot() const { - T sum = T(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += it->second; - return sum; - } - - template - S cosine_sim(const SparseVector &vec) const { - return dot(vec)/(l2norm()*vec.l2norm()); - } - - // if values are binary, gives |A intersect B|/|A union B| - template - S tanimoto_coef(const SparseVector &vec) const { - S dp=dot(vec); - return dp/(l2norm_sq()+vec.l2norm_sq()-dp); - } - - template - S dot(const 
SparseVector &vec) const { - S sum = S(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - { - typename MapType::const_iterator - found = vec.values_.find(it->first); - if (found != vec.values_.end()) - sum += it->second * found->second; - } - return sum; - } - - template - S dot(const std::vector &vec) const { - S sum = S(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - { - if (it->first < static_cast(vec.size())) - sum += it->second * vec[it->first]; - } - return sum; - } - - template - S dot(const S *vec) const { - // this is not range checked! - S sum = S(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += it->second * vec[it->first]; - std::cout << "dot(*vec) " << sum << std::endl; - return sum; - } - - T l1norm() const { - T sum = T(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += fabs(it->second); - return sum; - } - - T l2norm_sq() const { - T sum = T(); - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) - sum += it->second * it->second; - return sum; - } - - T l2norm() const { - return sqrt(l2norm_sq()); - } - - void erase(int key) { - values_.erase(key); -/* typename MapType::iterator found = values_.find(key); - if (found!=values_end()) - values_.erase(found);*/ - } - - template - void set_from(SparseVector const& other) { - for (typename MapType::const_iterator - it = other.values_.begin(); it != other.values_.end(); ++it) - { - values_[it->first]=it->second; - } - } - - SparseVector &operator+=(const SparseVector &other) { - for (typename MapType::const_iterator - it = other.values_.begin(); it != other.values_.end(); ++it) - { -// T v = - (values_[it->first] += it->second); -// if (!v) values_.erase(it->first); - } - return *this; - } - - SparseVector &operator-=(const SparseVector &other) { - for (typename 
MapType::const_iterator - it = other.values_.begin(); it != other.values_.end(); ++it) - { -// T v = - (values_[it->first] -= it->second); -// if (!v) values_.erase(it->first); - } - return *this; - } - - friend SparseVector operator -(SparseVector x,SparseVector const& y) { - x-=y; - return x; - } - friend SparseVector operator +(SparseVector x,SparseVector const& y) { - x+=y; - return x; - } - -private: - // DEPRECATED: becuase 0 values are dropped from the map, this doesn't even make sense if you have a fully populated (not really sparse re: what you'll ever use) vector - SparseVector &operator-=(T const& x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second -= x; - return *this; - } - - SparseVector &operator+=(T const& x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second += x; - return *this; - } -public: - SparseVector &operator/=(const T &x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second /= x; - return *this; - } - - SparseVector &operator*=(const T& x) { - for (typename MapType::iterator - it = values_.begin(); it != values_.end(); ++it) - it->second *= x; - return *this; - } - - SparseVector operator+(T const& x) const { - SparseVector result = *this; - return result += x; - } - - SparseVector operator-(T const& x) const { - SparseVector result = *this; - return result -= x; - } - - SparseVector operator/(T const& x) const { - SparseVector result = *this; - return result /= x; - } - - std::ostream &operator<<(std::ostream& out) const { - Write(true, &out); - return out; - } - - void Write(const bool with_semi, std::ostream* os) const { - bool first = true; - for (typename MapType::const_iterator - it = values_.begin(); it != values_.end(); ++it) { - // by definition feature id 0 is a dummy value - if (!it->first) continue; - if (with_semi) { - (*os) << (first ? 
"" : ";") - << FD::Convert(it->first) << '=' << it->second; - } else { - (*os) << (first ? "" : " ") - << FD::Convert(it->first) << '=' << it->second; - } - first = false; - } - } - - bool operator==(Self const & other) const { - return size()==other.size() && contains_keys_of(other) && other.contains_i(*this); - } - - bool contains(Self const &o) const { - return size()>o.size() && contains(o); - } - - bool at_equals(int i,T const& val) const { - const_iterator it=values_.find(i); - if (it==values_.end()) return !val; - return it->second==val; - } - - bool contains_i(Self const& o) const { - for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) - if (!at_equals(i->first,i->second)) - return false; - return true; - } - - bool contains_keys_of(Self const& o) const { - for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) - if (values_.find(i)==values_.end()) - return false; - return true; - } - -#ifndef SPARSE_VECTOR_HASH - bool operator<(const SparseVector &other) const { - typename MapType::const_iterator it = values_.begin(); - typename MapType::const_iterator other_it = other.values_.begin(); - - for (; it != values_.end() && other_it != other.values_.end(); ++it, ++other_it) - { - if (it->first < other_it->first) return true; - if (it->first > other_it->first) return false; - if (it->second < other_it->second) return true; - if (it->second > other_it->second) return false; - } - return values_.size() < other.values_.size(); - } -#endif - - int size() const { return values_.size(); } - - int num_active() const { return values_.size(); } - bool empty() const { return values_.empty(); } - - const_iterator begin() const { return values_.begin(); } - const_iterator end() const { return values_.end(); } - - void clear() { - values_.clear(); - } - - void swap(SparseVector& other) { - values_.swap(other.values_); - } - -private: - MapType values_; -}; - -//like a pair but can live in a union, because it lacks default+copy ctors, dtor. 
-template -struct feature_val { - int fid; - T val; -}; - -template -inline feature_val featval(int fid,T const &val) { - feature_val f; - f.fid=fid; - f.val=val; - return f; -} - - -// doesn't support fast indexing directly -template -class SparseVectorList { - typedef feature_val Pair; - typedef SmallVector List; - typedef typename List::const_iterator const_iterator; - SparseVectorList() { } - template - SparseVectorList(I i,I const& end) { - int c=0; - for (;i const& v) { - for (unsigned i=0;i *to) const { - for (int i=0;iset_value(p[i].fid,p[i].val); - } - void copy_to(SparseVector *to) const { - to->clear(); - overlay(to); - } - SparseVector sparse() const { - SparseVector r; - copy_to(r); - return r; - } -private: - List p; -}; - -template -SparseVector operator+(const SparseVector& a, const SparseVector& b) { - SparseVector result = a; - return result += b; -} - -template -SparseVector operator*(const SparseVector& a, const double& b) { - SparseVector result = a; - return result *= b; -} - -template -SparseVector operator*(const SparseVector& a, const T& b) { - SparseVector result = a; - return result *= b; -} - -template -SparseVector operator*(const double& a, const SparseVector& b) { - SparseVector result = b; - return result *= a; -} - -template -std::ostream &operator<<(std::ostream &out, const SparseVector &vec) -{ - return vec.operator<<(out); -} - -namespace B64 { - void Encode(double objective, const SparseVector& v, std::ostream* out); - // returns false if failed to decode - bool Decode(double* objective, SparseVector* v, const char* data, size_t size); -} - -#endif diff --git a/decoder/static_utoa.h b/decoder/static_utoa.h deleted file mode 100755 index fe5f6d92..00000000 --- a/decoder/static_utoa.h +++ /dev/null @@ -1,115 +0,0 @@ -#ifndef STATIC_UTOA_H -#define STATIC_UTOA_H - -#include "threadlocal.h" - - -#include -#include - -#define DIGIT_LOOKUP_TABLE 0 - -namespace { -THREADLOCAL char utoa_buf[] = "01234567890123456789"; // to put end of 
string character at buf[20] -const unsigned utoa_bufsize=sizeof(utoa_buf); -const unsigned utoa_bufsizem1=utoa_bufsize-1; -#ifdef DIGIT_LOOKUP_TABLE -char digits[] = "0123456789"; -#endif -} - -inline char digit_to_char(int d) { - return -#ifdef DIGIT_LOOKUP_TABLE - digits[d]; -#else - '0'+d; -#endif -} - -// returns n in string [return,num); *num=0 yourself before calling if you want a c_str -inline char *utoa(char *num,unsigned n) { - if ( !n ) { - *--num='0'; - } else { - unsigned rem; - // 3digit lookup table, divide by 1000 faster? - while ( n ) { -#if 1 - rem = n; - n /= 10; - rem -= 10*n; // maybe this is faster than mod because we are already dividing -#else - rem = n%10; // would optimizer combine these together? - n = n/10; -#endif - *--num = digit_to_char(rem); - } - } - return num; -} - -inline char *static_utoa(unsigned n) { - return utoa(utoa_buf+utoa_bufsizem1,n); -} - -//returns position of '\0' terminating number written starting at to -inline char* append_utoa(char *to,unsigned n) { - char *s=static_utoa(n); - int ns=(utoa_buf+utoa_bufsize)-s; - std::memcpy(to,s,ns); - return to+ns; -} - -// so named to avoid gcc segfault when named itoa -inline char *itoa(char *p,int n) { - if (n<0) { - p=utoa(p,-n); // TODO: check that (unsigned)(-INT_MIN) == 0x1000000 in 2s complement and not == 0 - *--p='-'; - return p; - } else - return utoa(p,n); -} - -inline char *static_itoa(int n) { - return itoa(utoa_buf+utoa_bufsizem1,n); -} - - -inline std::string utos(unsigned n) { - const int bufsz=20; - char buf[bufsz]; - char *end=buf+bufsz; - char *p=utoa(end,n); - return std::string(p,end); -} - -inline std::string itos(int n) { - const int bufsz=20; - char buf[bufsz]; - char *end=buf+bufsz; - char *p=itoa(end,n); - return std::string(p,end); -} - -#ifdef ITOA_SAMPLE -# include -# include -# include -using namespace std; - -int main(int argc,char *argv[]) { - printf("d U d U d U\n"); - for (int i=1;i -#include -#include -#include -#include - -#include "lattice.h" 
- -using namespace std; - -void ParseTranslatorInput(const string& line, string* input, string* ref) { - size_t hint = 0; - if (line.find("{\"rules\":") == 0) { - hint = line.find("}}"); - if (hint == string::npos) { - cerr << "Syntax error: " << line << endl; - abort(); - } - hint += 2; - } - size_t pos = line.find("|||", hint); - if (pos == string::npos) { *input = line; return; } - ref->clear(); - *input = line.substr(0, pos - 1); - string rline = line.substr(pos + 4); - if (rline.size() > 0) { - assert(ref); - *ref = rline; - } -} - -void ParseTranslatorInputLattice(const string& line, string* input, Lattice* ref) { - string sref; - ParseTranslatorInput(line, input, &sref); - if (sref.size() > 0) { - assert(ref); - LatticeTools::ConvertTextOrPLF(sref, ref); - } -} - -void ProcessAndStripSGML(string* pline, map* out) { - map& meta = *out; - string& line = *pline; - string lline = LowercaseString(line); - if (lline.find(""); - if (close == string::npos) return; // error - size_t end = lline.find(""); - string seg = Trim(lline.substr(4, close-4)); - string text = line.substr(close+1, end - close - 1); - for (size_t i = 1; i < seg.size(); i++) { - if (seg[i] == '=' && seg[i-1] == ' ') { - string less = seg.substr(0, i-1) + seg.substr(i); - seg = less; i = 0; continue; - } - if (seg[i] == '=' && seg[i+1] == ' ') { - string less = seg.substr(0, i+1); - if (i+2 < seg.size()) less += seg.substr(i+2); - seg = less; i = 0; continue; - } - } - line = Trim(text); - if (seg == "") return; - for (size_t i = 1; i < seg.size(); i++) { - if (seg[i] == '=') { - string label = seg.substr(0, i); - string val = seg.substr(i+1); - if (val[0] == '"') { - val = val.substr(1); - size_t close = val.find('"'); - if (close == string::npos) { - cerr << "SGML parse error: missing \"\n"; - seg = ""; - i = 0; - } else { - seg = val.substr(close+1); - val = val.substr(0, close); - i = 0; - } - } else { - size_t close = val.find(' '); - if (close == string::npos) { - seg = ""; - i = 0; - } else 
{ - seg = val.substr(close+1); - val = val.substr(0, close); - } - } - label = Trim(label); - seg = Trim(seg); - meta[label] = val; - } - } -} - diff --git a/decoder/stringlib.h b/decoder/stringlib.h deleted file mode 100644 index 84e95d44..00000000 --- a/decoder/stringlib.h +++ /dev/null @@ -1,267 +0,0 @@ -#ifndef CDEC_STRINGLIB_H_ -#define CDEC_STRINGLIB_H_ - -//usage: string s=MAKESTRE(1<<" "<(ostringstream()< -#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "< -#include -#include -#include -#include -#include -#include - -inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { - return s.find_first_not_of(ws,starting); -} - -// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends -inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { - std::size_t n=s.find_last_not_of(ws,ending); - if (n==std::string::npos) return n; - else return n+1; -} - -//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. 
won't access any memory because begin==end) -inline std::string strip_ws(std::string const& s) { - return std::string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)); -} - - -inline bool is_single_line(std::string const& line) { - return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks -} - -// is_single_line(strip_ws(line)) -inline bool is_single_line_stripped(std::string const& line) { - std::size_t b=skip_ws(line),e=trailing_ws(line); - std::size_t n=line.find('\n',b); - return n==std::string::npos || n>=e; -} - -struct toupperc { - inline char operator()(char c) const { - return std::toupper(c); - } -}; - -inline std::string toupper(std::string s) { - std::transform(s.begin(),s.end(),s.begin(),toupperc()); - return s; -} - -template inline -bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub) -{ - while (bsub != esub) { - if (bstr == estr) - return false; - if (*bsub++ != *bstr++) - return false; - } - return true; -} - -template inline -bool match_begin(Istr bstr,Istr estr,Prefix prefix) -{ - return match_begin(bstr,estr,prefix.begin(),prefix.end()); -} - -template inline -bool match_begin(Str const& str,Prefix const& prefix) -{ - return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end()); -} - - -// read line in the form of either: -// source -// source ||| target -// source will be returned as a string, target must be a sentence or -// a lattice (in PLF format) and will be returned as a Lattice object -void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); -struct Lattice; -void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); - -inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") { - std::string res = str; - res.erase(str.find_last_not_of(dropChars)+1); - return res.erase(0, res.find_first_not_of(dropChars)); -} - -inline void Tokenize(const std::string& str, char delimiter, 
std::vector* res) { - std::string s = str; - int last = 0; - res->clear(); - for (int i=0; i < s.size(); ++i) - if (s[i] == delimiter) { - s[i]=0; - if (last != i) { - res->push_back(&s[last]); - } - last = i + 1; - } - if (last != s.size()) - res->push_back(&s[last]); -} - -inline unsigned NTokens(const std::string& str, char delimiter) -{ - std::vector r; - Tokenize(str,delimiter,&r); - return r.size(); -} - -inline std::string LowercaseString(const std::string& in) { - std::string res(in.size(),' '); - for (int i = 0; i < in.size(); ++i) - res[i] = tolower(in[i]); - return res; -} - -inline int CountSubstrings(const std::string& str, const std::string& sub) { - size_t p = 0; - int res = 0; - while (p < str.size()) { - p = str.find(sub, p); - if (p == std::string::npos) break; - ++res; - p += sub.size(); - } - return res; -} - -inline int SplitOnWhitespace(const std::string& in, std::vector* out) { - out->clear(); - int i = 0; - int start = 0; - std::string cur; - while(i < in.size()) { - if (in[i] == ' ' || in[i] == '\t') { - if (i - start > 0) - out->push_back(in.substr(start, i - start)); - start = i + 1; - } - ++i; - } - if (i > start) - out->push_back(in.substr(start, i - start)); - return out->size(); -} - -inline std::vector SplitOnWhitespace(std::string const& in) -{ - std::vector r; - SplitOnWhitespace(in,&r); - return r; -} - - -struct mutable_c_str { - // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading) - char *p; - mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) { - std::memcpy(p,s.data(),s.size()); - p[s.size()]=0; - } - ~mutable_c_str() { ::operator delete(p); } -private: - mutable_c_str(mutable_c_str const&); -}; - -// ' ' '\t' tokens hardcoded -//NOTE: you should have stripped endline chars out first. 
-inline bool IsWordSep(char c) { - return c==' '||c=='\t'; -} - - -template -// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens -void VisitTokens(char *p,char *const end,F f) { - SLIBDBG("VisitTokens. p="<* out); - -// given the first character of a UTF8 block, find out how wide it is -// see http://en.wikipedia.org/wiki/UTF-8 for more info -inline unsigned int UTF8Len(unsigned char x) { - if (x < 0x80) return 1; - else if ((x >> 5) == 0x06) return 2; - else if ((x >> 4) == 0x0e) return 3; - else if ((x >> 3) == 0x1e) return 4; - else return 0; -} - -#endif diff --git a/decoder/stringlib_test.cc b/decoder/stringlib_test.cc deleted file mode 100755 index f66cdbeb..00000000 --- a/decoder/stringlib_test.cc +++ /dev/null @@ -1,17 +0,0 @@ -#define STRINGLIB_DEBUG -#include "stringlib.h" - -using namespace std; -struct print { - template - void operator()(S const& s) const { - cout<= end() will give a numeric token name (single per-thread shared buffer), which of course won't be Convert-able back to the id, because it's not added to the dict. This is a convenience for logging fake token indices. Any tokens actually added to the dict may cause end() to overlap the range of fake ids you were using - that's up to you to prevent. - -#include -#include -#include -#include "Ngram.h" -#include "dict.h" -#include "tdict.h" -#include "Vocab.h" -#include "stringlib.h" -#include "threadlocal.h" - -using namespace std; - -Vocab TD::dict_(0,TD::max_wordid); -WordID TD::ss=dict_.ssIndex(); -WordID TD::se=dict_.seIndex(); -WordID TD::unk=dict_.unkIndex(); -char const*const TD::ss_str=Vocab_SentStart; -char const*const TD::se_str=Vocab_SentEnd; -char const*const TD::unk_str=Vocab_Unknown; - -// pre+(i-base)+">" for i in [base,e) -inline void pad(std::string const& pre,int base,int e) { - assert(base<=e); - ostringstream o; - for (int i=base;i'; - WordID id=TD::Convert(o.str()); - assert(id==i); // this fails. why? 
- } -} - - -namespace { -struct TD_init { - TD_init() { - /* - // disabled for now since it's breaking trunk - assert(TD::Convert(TD::ss_str)==TD::ss); - assert(TD::Convert(TD::se_str)==TD::se); - assert(TD::Convert(TD::unk_str)==TD::unk); - assert(TD::none==Vocab_None); - pad("=dict_.highIndex()) return undef_token(w); -#endif - return dict_.getWord((VocabIndex)w); -} - - -void TD::GetWordIDs(const std::vector& strings, std::vector* ids) { - ids->clear(); - for (vector::const_iterator i = strings.begin(); i != strings.end(); ++i) - ids->push_back(TD::Convert(*i)); -} - -std::string TD::GetString(const std::vector& str) { - ostringstream o; - for (int i=0;i Ws; - Ws *ids; - explicit add_wordids(Ws *i) : ids(i) { } - add_wordids(const add_wordids& o) : ids(o.ids) { } - void operator()(char const* s) { - ids->push_back(TD::Convert(s)); - } - void operator()(std::string const& s) { - ids->push_back(TD::Convert(s)); - } -}; - -} - -void TD::ConvertSentence(std::string const& s, std::vector* ids) { - ids->clear(); - VisitTokens(s,add_wordids(ids)); -} diff --git a/decoder/tdict.h b/decoder/tdict.h deleted file mode 100644 index a7b3ee1c..00000000 --- a/decoder/tdict.h +++ /dev/null @@ -1,50 +0,0 @@ -#ifndef _TDICT_H_ -#define _TDICT_H_ - -#include -#include -#include "wordid.h" -#include - -class Vocab; - -struct TD { - /* // disabled for now - static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. 
tokens until this get "" - static const int n_reserved=10; // 0...n_reserved-1 get token '' - static inline WordID reserved(int i) { - assert(i>=0 && i"; - static char const* const se_str; //=""; - static char const* const unk_str; //=""; - static WordID ss,se,unk; // x=Convert(x_str) - static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far - static Vocab dict_; - static void ConvertSentence(std::string const& sent, std::vector* ids); - static void GetWordIDs(const std::vector& strings, std::vector* ids); - static std::string GetString(const std::vector& str); - static std::string GetString(WordID const* i,WordID const* e); - static int AppendString(const WordID& w, int pos, int bufsize, char* buffer); - static unsigned int NumWords(); - static WordID Convert(const std::string& s); - static WordID Convert(char const* s); - static const char* Convert(WordID w); -}; - -struct ToTD { - typedef WordID result_type; - result_type operator()(std::string const& t) const { - return TD::Convert(t); - } -}; - - -#endif diff --git a/decoder/test_data/weights b/decoder/test_data/weights deleted file mode 100644 index ea70229c..00000000 --- a/decoder/test_data/weights +++ /dev/null @@ -1,8 +0,0 @@ -# hiero -WordPenalty -0.387029 -LanguageModel 0.253195 -PhraseModel_0 0.142926 -PhraseModel_1 0.465119 -PhraseModel_2 0.079503 -CNPosteriorProbability 0.09259 -Inf -inf diff --git a/decoder/threadlocal.h b/decoder/threadlocal.h deleted file mode 100755 index d79f5d9d..00000000 --- a/decoder/threadlocal.h +++ /dev/null @@ -1,71 +0,0 @@ -#ifndef THREADLOCAL_H -#define THREADLOCAL_H - -#ifndef SETLOCAL_SWAP -# define SETLOCAL_SWAP 0 -#endif - -#ifdef BOOST_NO_MT - -# define THREADLOCAL - -#else - -#ifdef _MSC_VER - -//FIXME: doesn't work with DLLs ... 
use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html) -# define THREADLOCAL __declspec(thread) - -#else - -# define THREADLOCAL __thread - -#endif - -#endif - -#include //swap - -// naturally, the below are only thread-safe if value is THREADLOCAL -template -struct SaveLocal { - D &value; - D old_value; - SaveLocal(D& val) : value(val), old_value(val) {} - ~SaveLocal() { -#if SETLOCAL_SWAP - swap(value,old_value); -#else - value=old_value; -#endif - } -}; - -template -struct SetLocal { - D &value; - D old_value; - SetLocal(D& val,const D &new_value) : value(val), old_value( -#if SETLOCAL_SWAP - new_value -#else - val -#endif - ) { -#if SETLOCAL_SWAP - swap(value,old_value); -#else - value=new_value; -#endif - } - ~SetLocal() { -#if SETLOCAL_SWAP - swap(value,old_value); -#else - value=old_value; -#endif - } -}; - - -#endif diff --git a/decoder/timing_stats.cc b/decoder/timing_stats.cc deleted file mode 100644 index fc8e9df1..00000000 --- a/decoder/timing_stats.cc +++ /dev/null @@ -1,24 +0,0 @@ -#include "timing_stats.h" - -#include -#include "time.h" //cygwin needs -using namespace std; - -map Timer::stats; - -Timer::Timer(const string& timername) : start_t(clock()), cur(stats[timername]) {} - -Timer::~Timer() { - ++cur.calls; - const clock_t end_t = clock(); - const double elapsed = (end_t - start_t) / 1000000.0; - cur.total_time += elapsed; -} - -void Timer::Summarize() { - for (map::iterator it = stats.begin(); it != stats.end(); ++it) { - cerr << it->first << ": " << it->second.total_time << " secs (" << it->second.calls << " calls)\n"; - } - stats.clear(); -} - diff --git a/decoder/timing_stats.h b/decoder/timing_stats.h deleted file mode 100644 index 0a9f7656..00000000 --- a/decoder/timing_stats.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _TIMING_STATS_H_ -#define _TIMING_STATS_H_ - -#include -#include - -struct TimerInfo { - int calls; - double total_time; - TimerInfo() : calls(), total_time() {} -}; - -struct Timer { - Timer(const std::string& 
info); - ~Timer(); - static void Summarize(); - private: - static std::map stats; - clock_t start_t; - TimerInfo& cur; - Timer(const Timer& other); - const Timer& operator=(const Timer& other); -}; - -#endif diff --git a/decoder/weights.cc b/decoder/weights.cc deleted file mode 100644 index 84647585..00000000 --- a/decoder/weights.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include "weights.h" - -#include - -#include "fdict.h" -#include "filelib.h" - -using namespace std; - -void Weights::InitFromFile(const std::string& filename, vector* feature_list) { - cerr << "Reading weights from " << filename << endl; - ReadFile in_file(filename); - istream& in = *in_file.stream(); - assert(in); - int weight_count = 0; - bool fl = false; - while (in) { - double val = 0; - string buf; - getline(in, buf); - if (buf.size() == 0) continue; - if (buf[0] == '#') continue; - for (int i = 0; i < buf.size(); ++i) - if (buf[i] == '=') buf[i] = ' '; - int start = 0; - while(start < buf.size() && buf[start] == ' ') ++start; - int end = 0; - while(end < buf.size() && buf[end] != ' ') ++end; - int fid = FD::Convert(buf.substr(start, end - start)); - while(end < buf.size() && buf[end] == ' ') ++end; - val = strtod(&buf.c_str()[end], NULL); - if (isnan(val)) { - cerr << FD::Convert(fid) << " has weight NaN!\n"; - abort(); - } - if (wv_.size() <= fid) - wv_.resize(fid + 1); - wv_[fid] = val; - if (feature_list) { feature_list->push_back(FD::Convert(fid)); } - ++weight_count; - if (weight_count % 50000 == 0) { cerr << '.' 
<< flush; fl = true; } - if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; } - } - if (fl) { cerr << endl; } - cerr << "Loaded " << weight_count << " feature weights\n"; -} - -void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features) const { - WriteFile out(fname); - ostream& o = *out.stream(); - assert(o); - o.precision(17); - const int num_feats = FD::NumFeats(); - for (int i = 1; i < num_feats; ++i) { - const double val = (i < wv_.size() ? wv_[i] : 0.0); - if (hide_zero_value_features && val == 0.0) continue; - o << FD::Convert(i) << ' ' << val << endl; - } -} - -void Weights::InitVector(std::vector* w) const { - *w = wv_; -} - -void Weights::InitSparseVector(SparseVector* w) const { - for (int i = 1; i < wv_.size(); ++i) { - const double& weight = wv_[i]; - if (weight) w->set_value(i, weight); - } -} - -void Weights::InitFromVector(const std::vector& w) { - wv_ = w; - if (wv_.size() > FD::NumFeats()) - cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; - wv_.resize(FD::NumFeats(), 0); -} diff --git a/decoder/weights.h b/decoder/weights.h deleted file mode 100644 index f19aa3ce..00000000 --- a/decoder/weights.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef _WEIGHTS_H_ -#define _WEIGHTS_H_ - -#include -#include -#include -#include "sparse_vector.h" - -class Weights { - public: - Weights() {} - void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); - void WriteToFile(const std::string& fname, bool hide_zero_value_features = true) const; - void InitVector(std::vector* w) const; - void InitSparseVector(SparseVector* w) const; - void InitFromVector(const std::vector& w); - private: - std::vector wv_; -}; - -#endif diff --git a/decoder/weights_test.cc b/decoder/weights_test.cc deleted file mode 100644 index aa6b3db2..00000000 --- a/decoder/weights_test.cc +++ /dev/null @@ -1,28 +0,0 @@ -#include -#include -#include -#include -#include 
-#include "weights.h" -#include "tdict.h" -#include "hg.h" - -using namespace std; - -class WeightsTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - - -TEST_F(WeightsTest,Load) { - Weights w; - w.InitFromFile("test_data/weights"); - w.WriteToFile("-"); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/decoder/wordid.h b/decoder/wordid.h deleted file mode 100644 index fb50bcc1..00000000 --- a/decoder/wordid.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _WORD_ID_H_ -#define _WORD_ID_H_ - -typedef int WordID; - -#endif diff --git a/extools/Makefile.am b/extools/Makefile.am index 1e82287d..ee363264 100644 --- a/extools/Makefile.am +++ b/extools/Makefile.am @@ -11,20 +11,20 @@ sg_lexer.cc: sg_lexer.l $(LEX) -s -CF -8 -o$@ $< filter_grammar_SOURCES = filter_grammar.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc -filter_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +filter_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz #filter_grammar_LDFLAGS = -all-static featurize_grammar_SOURCES = featurize_grammar.cc extract.cc sentence_pair.cc sg_lexer.cc striped_grammar.cc -featurize_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +featurize_grammar_LDADD = $(top_srcdir)/utils/libutils.a -lz mr_stripe_rule_reduce_SOURCES = mr_stripe_rule_reduce.cc extract.cc sentence_pair.cc striped_grammar.cc sg_lexer.cc -mr_stripe_rule_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_stripe_rule_reduce_LDADD = $(top_srcdir)/utils/libutils.a -lz extractor_SOURCES = sentence_pair.cc extract.cc extractor.cc striped_grammar.cc -extractor_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +extractor_LDADD = $(top_srcdir)/utils/libutils.a -lz extractor_monolingual_SOURCES = extractor_monolingual.cc -extractor_monolingual_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +extractor_monolingual_LDADD = $(top_srcdir)/utils/libutils.a -lz -AM_CPPFLAGS = -W -Wall 
-Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils diff --git a/extools/sg_lexer.l b/extools/sg_lexer.l index 168b835a..d60bd0fc 100644 --- a/extools/sg_lexer.l +++ b/extools/sg_lexer.l @@ -1,6 +1,4 @@ %{ -#include "rule_lexer.h" - #include #include #include @@ -8,7 +6,6 @@ #include #include "tdict.h" #include "fdict.h" -#include "trule.h" #include "striped_grammar.h" int lex_line = 0; diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am index 688746bb..2b1393ac 100644 --- a/gi/clda/src/Makefile.am +++ b/gi/clda/src/Makefile.am @@ -2,5 +2,5 @@ bin_PROGRAMS = clda clda_SOURCES = clda.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/decoder -AM_LDFLAGS = $(top_srcdir)/decoder/libcdec.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils +AM_LDFLAGS = $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am index c22819db..d3f95d0b 100644 --- a/gi/pyp-topics/src/Makefile.am +++ b/gi/pyp-topics/src/Makefile.am @@ -4,13 +4,13 @@ contexts_lexer.cc: contexts_lexer.l $(LEX) -s -CF -8 -o$@ $< pyp_topics_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc -pyp_topics_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +pyp_topics_train_LDADD = $(top_srcdir)/utils/libutils.a -lz pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc pyp-topics.cc contexts_lexer.cc contexts_corpus.cc train-contexts.cc -pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +pyp_contexts_train_LDADD = $(top_srcdir)/utils/libutils.a -lz #mpi_pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc mpi-pyp-topics.cc contexts_lexer.cc contexts_corpus.cc mpi-train-contexts.cc -#mpi_pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +#mpi_pyp_contexts_train_LDADD = $(top_srcdir)/utils/libutils.a -lz 
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I../../../utils diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index dd721361..b2d235cb 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -10,7 +10,7 @@ #include "corpus.hh" #include "contexts_lexer.h" -#include "../../../decoder/dict.h" +#include "dict.h" class BackoffGenerator { diff --git a/gi/pyp-topics/src/contexts_lexer.h b/gi/pyp-topics/src/contexts_lexer.h index 1b79c6fd..66004990 100644 --- a/gi/pyp-topics/src/contexts_lexer.h +++ b/gi/pyp-topics/src/contexts_lexer.h @@ -5,7 +5,7 @@ #include #include -#include "../../../decoder/dict.h" +#include "dict.h" struct ContextsLexer { typedef std::vector Context; diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l index 7a5d9460..64cd7ca3 100644 --- a/gi/pyp-topics/src/contexts_lexer.l +++ b/gi/pyp-topics/src/contexts_lexer.l @@ -101,7 +101,7 @@ INT [\-+]?[0-9]+|inf|[\-+]inf %% -#include "../../../decoder/filelib.h" +#include "filelib.h" void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) { lex_line = 1; diff --git a/mteval/Makefile.am b/mteval/Makefile.am new file mode 100644 index 00000000..7ae14045 --- /dev/null +++ b/mteval/Makefile.am @@ -0,0 +1,23 @@ +bin_PROGRAMS = \ + fast_score \ + mbr_kbest + +if HAVE_GTEST +noinst_PROGRAMS = \ + scorer_test +endif + +noinst_LIBRARIES = libmteval.a + +libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc + +fast_score_SOURCES = fast_score.cc +fast_score_LDADD = $(top_srcdir)/utils/libutils.a libmteval.a -lz + +mbr_kbest_SOURCES = mbr_kbest.cc +mbr_kbest_LDADD = $(top_srcdir)/utils/libutils.a libmteval.a -lz + +scorer_test_SOURCES = scorer_test.cc +scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/utils/libutils.a libmteval.a -lz + +AM_CPPFLAGS = -W -Wall 
-Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils diff --git a/mteval/aer_scorer.cc b/mteval/aer_scorer.cc new file mode 100644 index 00000000..edd4390f --- /dev/null +++ b/mteval/aer_scorer.cc @@ -0,0 +1,135 @@ +#include "aer_scorer.h" + +#include +#include +#include + +#include "tdict.h" +#include "alignment_pharaoh.h" + +using namespace std; + +class AERScore : public ScoreBase { + friend class AERScorer; + public: + AERScore() : num_matches(), num_predicted(), num_in_ref() {} + AERScore(int m, int p, int r) : + num_matches(m), num_predicted(p), num_in_ref(r) {} + virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + virtual void PlusEquals(const Score& delta, const float scale) { + const AERScore& other = static_cast(delta); + num_matches += scale*other.num_matches; + num_predicted += scale*other.num_predicted; + num_in_ref += scale*other.num_in_ref; + } + virtual void PlusEquals(const Score& delta) { + const AERScore& other = static_cast(delta); + num_matches += other.num_matches; + num_predicted += other.num_predicted; + num_in_ref += other.num_in_ref; + } + + + virtual ScoreP GetZero() const { + return ScoreP(new AERScore); + } + virtual ScoreP GetOne() const { + return ScoreP(new AERScore); + } + virtual void Subtract(const Score& rhs, Score* out) const { + AERScore* res = static_cast(out); + const AERScore& other = static_cast(rhs); + res->num_matches = num_matches - other.num_matches; + res->num_predicted = num_predicted - other.num_predicted; + res->num_in_ref = num_in_ref - other.num_in_ref; + } + float Precision() const { + return static_cast(num_matches) / num_predicted; + } + float Recall() const { + return static_cast(num_matches) / num_in_ref; + } + float ComputePartialScore() const { return 0.0;} + virtual float ComputeScore() const { + const float prec = Precision(); + const float rec = Recall(); + const float f = (2.0 * prec * rec) / (rec + prec); + if (isnan(f)) return 1.0f; + return 
1.0f - f; + } + virtual bool IsAdditiveIdentity() const { + return (num_matches == 0) && (num_predicted == 0) && (num_in_ref == 0); + } + virtual void ScoreDetails(std::string* out) const { + ostringstream os; + os << "AER=" << (ComputeScore() * 100.0) + << " F=" << (100 - ComputeScore() * 100.0) + << " P=" << (Precision() * 100.0) << " R=" << (Recall() * 100.0) + << " [" << num_matches << " " << num_predicted << " " << num_in_ref << "]"; + *out = os.str(); + } + virtual void Encode(std::string*out) const { + out->resize(sizeof(int) * 3); + *(int *)&(*out)[sizeof(int) * 0] = num_matches; + *(int *)&(*out)[sizeof(int) * 1] = num_predicted; + *(int *)&(*out)[sizeof(int) * 2] = num_in_ref; + } + private: + int num_matches; + int num_predicted; + int num_in_ref; +}; + +AERScorer::AERScorer(const vector >& refs, const string& src) : src_(src) { + if (refs.size() != 1) { + cerr << "AERScorer can only take a single reference!\n"; + abort(); + } + ref_ = AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); +} + +static inline bool Safe(const Array2D& a, int i, int j) { + if (i >= 0 && j >= 0 && i < a.width() && j < a.height()) + return a(i,j); + else + return false; +} + +ScoreP AERScorer::ScoreCCandidate(const vector& shyp) const { + return ScoreP(); +} + +ScoreP AERScorer::ScoreCandidate(const vector& shyp) const { + boost::shared_ptr > hyp = + AlignmentPharaoh::ReadPharaohAlignmentGrid(TD::GetString(shyp)); + + int m = 0; + int r = 0; + int p = 0; + int i_len = ref_->width(); + int j_len = ref_->height(); + for (int i = 0; i < i_len; ++i) { + for (int j = 0; j < j_len; ++j) { + if ((*ref_)(i,j)) { + ++r; + if (Safe(*hyp, i, j)) ++m; + } + } + } + for (int i = 0; i < hyp->width(); ++i) + for (int j = 0; j < hyp->height(); ++j) + if ((*hyp)(i,j)) ++p; + + return ScoreP(new AERScore(m,p,r)); +} + +ScoreP AERScorer::ScoreFromString(const string& in) { + AERScore* res = new AERScore; + res->num_matches = *(const int *)&in[sizeof(int) * 0]; + 
res->num_predicted = *(const int *)&in[sizeof(int) * 1]; + res->num_in_ref = *(const int *)&in[sizeof(int) * 2]; + return ScoreP(res); +} + +const std::string* AERScorer::GetSource() const { return &src_; } + diff --git a/mteval/aer_scorer.h b/mteval/aer_scorer.h new file mode 100644 index 00000000..6d53d359 --- /dev/null +++ b/mteval/aer_scorer.h @@ -0,0 +1,23 @@ +#ifndef _AER_SCORER_ +#define _AER_SCORER_ + +#include + +#include "scorer.h" +#include "array2d.h" + +class AERScorer : public SentenceScorer { + public: + // when constructing alignment strings from a hypergraph, the source + // is necessary. + AERScorer(const std::vector >& refs, const std::string& src = ""); + ScoreP ScoreCandidate(const std::vector& hyp) const; + ScoreP ScoreCCandidate(const std::vector& hyp) const; + static ScoreP ScoreFromString(const std::string& in); + const std::string* GetSource() const; + private: + std::string src_; + boost::shared_ptr > ref_; +}; + +#endif diff --git a/mteval/comb_scorer.cc b/mteval/comb_scorer.cc new file mode 100644 index 00000000..9fc37868 --- /dev/null +++ b/mteval/comb_scorer.cc @@ -0,0 +1,97 @@ +#include "comb_scorer.h" + +#include + +using namespace std; + +class BLEUTERCombinationScore : public ScoreBase { + friend class BLEUTERCombinationScorer; + public: + ~BLEUTERCombinationScore(); + float ComputePartialScore() const { return 0.0;} + float ComputeScore() const { + return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f; + } + void ScoreDetails(string* details) const { + char buf[160]; + sprintf(buf, "Combi = %.2f, BLEU = %.2f, TER = %.2f", + ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f); + *details = buf; + } + void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + + void PlusEquals(const Score& delta, const float scale) { + bleu->PlusEquals(*static_cast(delta).bleu, scale); + ter->PlusEquals(*static_cast(delta).ter, scale); + } + void PlusEquals(const Score& delta) 
{ + bleu->PlusEquals(*static_cast(delta).bleu); + ter->PlusEquals(*static_cast(delta).ter); + } + + + + ScoreP GetOne() const { + BLEUTERCombinationScore* res = new BLEUTERCombinationScore; + res->bleu = bleu->GetOne(); + res->ter = ter->GetOne(); + return ScoreP(res); + } + ScoreP GetZero() const { + BLEUTERCombinationScore* res = new BLEUTERCombinationScore; + res->bleu = bleu->GetZero(); + res->ter = ter->GetZero(); + return ScoreP(res); + } + void Subtract(const Score& rhs, Score* res) const { + bleu->Subtract(*static_cast(rhs).bleu, + static_cast(res)->bleu.get()); + ter->Subtract(*static_cast(rhs).ter, + static_cast(res)->ter.get()); + } + void Encode(std::string* out) const { + string bs, ts; + bleu->Encode(&bs); + ter->Encode(&ts); + out->clear(); + (*out) += static_cast(bs.size()); + (*out) += bs; + (*out) += ts; + } + bool IsAdditiveIdentity() const { + return bleu->IsAdditiveIdentity() && ter->IsAdditiveIdentity(); + } + private: + ScoreP bleu; + ScoreP ter; +}; + +BLEUTERCombinationScore::~BLEUTERCombinationScore() { +} + +BLEUTERCombinationScorer::BLEUTERCombinationScorer(const vector >& refs) { + bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs); + ter_ = SentenceScorer::CreateSentenceScorer(TER, refs); +} + +BLEUTERCombinationScorer::~BLEUTERCombinationScorer() { +} + +ScoreP BLEUTERCombinationScorer::ScoreCCandidate(const vector& hyp) const { + return ScoreP(); +} + +ScoreP BLEUTERCombinationScorer::ScoreCandidate(const std::vector& hyp) const { + BLEUTERCombinationScore* res = new BLEUTERCombinationScore; + res->bleu = bleu_->ScoreCandidate(hyp); + res->ter = ter_->ScoreCandidate(hyp); + return ScoreP(res); +} + +ScoreP BLEUTERCombinationScorer::ScoreFromString(const std::string& in) { + int bss = in[0]; + BLEUTERCombinationScore* r = new BLEUTERCombinationScore; + r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss)); + r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss)); + return ScoreP(r); +} 
diff --git a/mteval/comb_scorer.h b/mteval/comb_scorer.h new file mode 100644 index 00000000..346be576 --- /dev/null +++ b/mteval/comb_scorer.h @@ -0,0 +1,17 @@ +#ifndef _COMB_SCORER_ +#define _COMB_SCORER_ + +#include "scorer.h" + +class BLEUTERCombinationScorer : public SentenceScorer { + public: + BLEUTERCombinationScorer(const std::vector >& refs); + ~BLEUTERCombinationScorer(); + ScoreP ScoreCandidate(const std::vector& hyp) const; + ScoreP ScoreCCandidate(const std::vector& hyp) const; + static ScoreP ScoreFromString(const std::string& in); + private: + ScorerP bleu_,ter_; +}; + +#endif diff --git a/mteval/fast_score.cc b/mteval/fast_score.cc new file mode 100644 index 00000000..5ee264a6 --- /dev/null +++ b/mteval/fast_score.cc @@ -0,0 +1,72 @@ +#include +#include + +#include +#include + +#include "filelib.h" +#include "tdict.h" +#include "scorer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") + ("loss_function,l",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("in_file,i", po::value()->default_value("-"), "Input file") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r -r ...\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string loss_function = conf["loss_function"].as(); + ScoreType type = ScoreTypeFromString(loss_function); + DocScorer ds(type, 
conf["reference"].as >(), ""); + cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + + ReadFile rf(conf["in_file"].as()); + ScoreP acc; + istream& in = *rf.stream(); + int lc = 0; + while(in) { + string line; + getline(in, line); + if (line.empty() && !in) break; + vector sent; + TD::ConvertSentence(line, &sent); + ScoreP sentscore = ds[lc]->ScoreCandidate(sent); + if (!acc) { acc = sentscore->GetZero(); } + acc->PlusEquals(*sentscore); + ++lc; + } + assert(lc > 0); + if (lc > ds.size()) { + cerr << "Too many (" << lc << ") translations in input, expected " << ds.size() << endl; + return 1; + } + if (lc != ds.size()) + cerr << "Fewer sentences in hyp (" << lc << ") than refs (" + << ds.size() << "): scoring partial set!\n"; + float score = acc->ComputeScore(); + string details; + acc->ScoreDetails(&details); + cerr << details << endl; + cout << score << endl; + return 0; +} diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc new file mode 100644 index 00000000..2867b36b --- /dev/null +++ b/mteval/mbr_kbest.cc @@ -0,0 +1,138 @@ +#include +#include + +#include + +#include "prob.h" +#include "tdict.h" +#include "scorer.h" +#include "filelib.h" +#include "stringlib.h" + +using namespace std; + +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("scale,a",po::value()->default_value(1.0), "Posterior scaling factor (alpha)") + ("loss_function,l",po::value()->default_value("bleu"), "Loss function") + ("input,i",po::value()->default_value("-"), "File to read k-best lists from") + ("output_list,L", "Show reranked list as output") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + 
exit(1); + } +} + +struct LossComparer { + bool operator()(const pair, double>& a, const pair, double>& b) const { + return a.second < b.second; + } +}; + +bool ReadKBestList(istream* in, string* sent_id, vector, prob_t> >* list) { + static string cache_id; + static pair, prob_t> cache_pair; + list->clear(); + string cur_id; + if (cache_pair.first.size() > 0) { + list->push_back(cache_pair); + cur_id = cache_id; + cache_pair.first.clear(); + } + string line; + string tstr; + while(*in) { + getline(*in, line); + if (line.empty()) continue; + size_t p1 = line.find(" ||| "); + if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } + size_t p2 = line.find(" ||| ", p1 + 4); + if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } + size_t p3 = line.rfind(" ||| "); + cache_id = line.substr(0, p1); + tstr = line.substr(p1 + 5, p2 - p1 - 5); + double val = strtod(line.substr(p3 + 5).c_str(), NULL); + TD::ConvertSentence(tstr, &cache_pair.first); + cache_pair.second.logeq(val); + if (cur_id.empty()) cur_id = cache_id; + if (cur_id == cache_id) { + list->push_back(cache_pair); + *sent_id = cur_id; + cache_pair.first.clear(); + } else { break; } + } + return !list->empty(); +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string metric = conf["loss_function"].as(); + const bool output_list = conf.count("output_list") > 0; + const string file = conf["input"].as(); + const double mbr_scale = conf["scale"].as(); + cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; + + ScoreType type = ScoreTypeFromString(metric); + vector, prob_t> > list; + ReadFile rf(file); + string sent_id; + while(ReadKBestList(rf.stream(), &sent_id, &list)) { + vector joints(list.size()); + const prob_t max_score = pow(list.front().second, mbr_scale); + prob_t marginal = prob_t::Zero(); + for (int i = 0 ; i < list.size(); ++i) { + const prob_t joint = pow(list[i].second, mbr_scale) / 
max_score; + joints[i] = joint; + // cerr << "list[" << i << "] joint=" << log(joint) << endl; + marginal += joint; + } + int mbr_idx = -1; + vector mbr_scores(output_list ? list.size() : 0); + double mbr_loss = numeric_limits::max(); + for (int i = 0 ; i < list.size(); ++i) { + vector > refs(1, list[i].first); + //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; + ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); + double wl_acc = 0; + for (int j = 0; j < list.size(); ++j) { + if (i != j) { + ScoreP s = scorer->ScoreCandidate(list[j].first); + double loss = 1.0 - s->ComputeScore(); + if (type == TER || type == AER) loss = 1.0 - loss; + double weighted_loss = loss * (joints[j] / marginal); + wl_acc += weighted_loss; + if ((!output_list) && wl_acc > mbr_loss) break; + } + } + if (output_list) mbr_scores[i] = wl_acc; + if (wl_acc < mbr_loss) { + mbr_loss = wl_acc; + mbr_idx = i; + } + } + // cerr << "ML translation: " << TD::GetString(list[0].first) << endl; + cerr << "MBR Best idx: " << mbr_idx << endl; + if (output_list) { + for (int i = 0; i < list.size(); ++i) + list[i].second.logeq(mbr_scores[i]); + sort(list.begin(), list.end(), LossComparer()); + for (int i = 0; i < list.size(); ++i) + cout << sent_id << " ||| " + << TD::GetString(list[i].first) << " ||| " + << log(list[i].second) << endl; + } else { + cout << TD::GetString(list[mbr_idx].first) << endl; + } + } + return 0; +} + diff --git a/mteval/scorer.cc b/mteval/scorer.cc new file mode 100644 index 00000000..04eeaa93 --- /dev/null +++ b/mteval/scorer.cc @@ -0,0 +1,630 @@ +#include "scorer.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "filelib.h" +#include "ter.h" +#include "aer_scorer.h" +#include "comb_scorer.h" +#include "tdict.h" +#include "stringlib.h" + +using boost::shared_ptr; +using namespace std; + +void Score::TimesEquals(float scale) { + cerr<<"UNIMPLEMENTED except for BLEU (for 
MIRA): Score::TimesEquals"<=0 && st + float operator()(float sum,S const& ref) const { + return sum+ref.size(); + } +}; + +template +float avg_reflength(vector refs) { + unsigned n=refs.size(); + return n?accumulate(refs.begin(),refs.end(),0.,length_accum())/n:0.; +} + + +float SentenceScorer::ComputeRefLength(const Sentence &hyp) const { + return hyp.size(); // reasonable default? :) +} + +const std::string* SentenceScorer::GetSource() const { return NULL; } + +class SERScore : public ScoreBase { + friend class SERScorer; + public: + SERScore() : correct(0), total(0) {} + float ComputePartialScore() const { return 0.0;} + float ComputeScore() const { + return static_cast(correct) / static_cast(total); + } + void ScoreDetails(string* details) const { + ostringstream os; + os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; + *details = os.str(); + } + void PlusPartialEquals(const Score& /* delta */, int /* oracle_e_cover */, int /* oracle_f_cover */, int /* src_len */){} + + void PlusEquals(const Score& delta, const float scale) { + correct += scale*static_cast(delta).correct; + total += scale*static_cast(delta).total; + } + void PlusEquals(const Score& delta) { + correct += static_cast(delta).correct; + total += static_cast(delta).total; + } + ScoreP GetZero() const { return ScoreP(new SERScore); } + ScoreP GetOne() const { return ScoreP(new SERScore); } + void Subtract(const Score& rhs, Score* res) const { + SERScore* r = static_cast(res); + r->correct = correct - static_cast(rhs).correct; + r->total = total - static_cast(rhs).total; + } + void Encode(string* out) const { + assert(!"not implemented"); + } + bool IsAdditiveIdentity() const { + return (total == 0 && correct == 0); // correct is always 0 <= n <= total + } + private: + int correct, total; +}; + +std::string SentenceScorer::verbose_desc() const { + return desc+",ref0={ "+TD::GetString(refs[0])+" }"; +} + +class SERScorer : public SentenceScorer { + public: + SERScorer(const 
vector >& references) : SentenceScorer("SERScorer",references),refs_(references) {} + ScoreP ScoreCCandidate(const vector& /* hyp */) const { + return ScoreP(); + } + ScoreP ScoreCandidate(const vector& hyp) const { + SERScore* res = new SERScore; + res->total = 1; + for (int i = 0; i < refs_.size(); ++i) + if (refs_[i] == hyp) res->correct = 1; + return ScoreP(res); + } + static ScoreP ScoreFromString(const string& data) { + assert(!"Not implemented"); + } + private: + vector > refs_; +}; + +class BLEUScore : public ScoreBase { + friend class BLEUScorerBase; + public: + BLEUScore(int n) : correct_ngram_hit_counts(float(0),n), hyp_ngram_counts(float(0),n) { + ref_len = 0; + hyp_len = 0; } + BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),n), hyp_ngram_counts(float(k),n) { + ref_len = k; + hyp_len = k; } + float ComputeScore() const; + float ComputePartialScore() const; + void ScoreDetails(string* details) const; + void TimesEquals(float scale); + void PlusEquals(const Score& delta); + void PlusEquals(const Score& delta, const float scale); + void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len); + ScoreP GetZero() const; + ScoreP GetOne() const; + void Subtract(const Score& rhs, Score* res) const; + void Encode(string* out) const; + bool IsAdditiveIdentity() const { + if (fabs(ref_len) > 0.1f || hyp_len != 0) return false; + for (int i = 0; i < correct_ngram_hit_counts.size(); ++i) + if (hyp_ngram_counts[i] != 0 || + correct_ngram_hit_counts[i] != 0) return false; + return true; + } + private: + int N() const { + return hyp_ngram_counts.size(); + } + float ComputeScore(vector* precs, float* bp) const; + float ComputePartialScore(vector* prec, float* bp) const; + valarray correct_ngram_hit_counts; + valarray hyp_ngram_counts; + float ref_len; + float hyp_len; +}; + +class BLEUScorerBase : public SentenceScorer { + public: + BLEUScorerBase(const vector >& references, + int n + ); + ScoreP ScoreCandidate(const 
vector& hyp) const; + ScoreP ScoreCCandidate(const vector& hyp) const; + static ScoreP ScoreFromString(const string& in); + + virtual float ComputeRefLength(const vector& hyp) const = 0; + private: + struct NGramCompare { + int operator() (const vector& a, const vector& b) { + size_t as = a.size(); + size_t bs = b.size(); + const size_t s = (as < bs ? as : bs); + for (size_t i = 0; i < s; ++i) { + int d = a[i] - b[i]; + if (d < 0) return true; + if (d > 0) return false; + } + return as < bs; + } + }; + typedef map, pair, NGramCompare> NGramCountMap; + void CountRef(const vector& ref) { + NGramCountMap tc; + vector ngram(n_); + int s = ref.size(); + for (int j=0; j& p = ngrams_[i->first]; + if (p.first < i->second.first) + p = i->second; + } + } + + void ComputeNgramStats(const vector& sent, + valarray* correct, + valarray* hyp, + bool clip_counts) + const { + assert(correct->size() == n_); + assert(hyp->size() == n_); + vector ngram(n_); + (*correct) *= 0; + (*hyp) *= 0; + int s = sent.size(); + for (int j=0; j& p = ngrams_[ngram]; + if(clip_counts){ + if (p.second < p.first) { + ++p.second; + (*correct)[i-1]++; + }} + else { + ++p.second; + (*correct)[i-1]++; + } + // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. 
grams: + if (!p.first) { + for (; i<=k; ++i) + (*hyp)[i-1]++; + } else { + (*hyp)[i-1]++; + } + } + } + } + + mutable NGramCountMap ngrams_; + int n_; + vector lengths_; +}; + +ScoreP BLEUScorerBase::ScoreFromString(const string& in) { + istringstream is(in); + int n; + is >> n; + BLEUScore* r = new BLEUScore(n); + is >> r->ref_len >> r->hyp_len; + + for (int i = 0; i < n; ++i) { + is >> r->correct_ngram_hit_counts[i]; + is >> r->hyp_ngram_counts[i]; + } + return ScoreP(r); +} + +class IBM_BLEUScorer : public BLEUScorerBase { + public: + IBM_BLEUScorer(const vector >& references, + int n=4) : BLEUScorerBase(references, n), lengths_(references.size()) { + for (int i=0; i < references.size(); ++i) + lengths_[i] = references[i].size(); + } + float ComputeRefLength(const vector& hyp) const { + if (lengths_.size() == 1) return lengths_[0]; + int bestd = 2000000; + int hl = hyp.size(); + int bl = -1; + for (vector::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) { + int cl = *ci; + if (abs(cl - hl) < bestd) { + bestd = abs(cl - hl); + bl = cl; + } + } + return bl; + } + private: + vector lengths_; +}; + +class NIST_BLEUScorer : public BLEUScorerBase { + public: + NIST_BLEUScorer(const vector >& references, + int n=4) : BLEUScorerBase(references, n), + shortest_(references[0].size()) { + for (int i=1; i < references.size(); ++i) + if (references[i].size() < shortest_) + shortest_ = references[i].size(); + } + float ComputeRefLength(const vector& /* hyp */) const { + return shortest_; + } + private: + float shortest_; +}; + +class Koehn_BLEUScorer : public BLEUScorerBase { + public: + Koehn_BLEUScorer(const vector >& references, + int n=4) : BLEUScorerBase(references, n), + avg_(0) { + for (int i=0; i < references.size(); ++i) + avg_ += references[i].size(); + avg_ /= references.size(); + } + float ComputeRefLength(const vector& /* hyp */) const { + return avg_; + } + private: + float avg_; +}; + +ScorerP SentenceScorer::CreateSentenceScorer(const 
ScoreType type, + const vector >& refs, + const string& src) +{ + SentenceScorer *r=0; + switch (type) { + case IBM_BLEU: r = new IBM_BLEUScorer(refs, 4);break; + case IBM_BLEU_3 : r = new IBM_BLEUScorer(refs,3);break; + case NIST_BLEU: r = new NIST_BLEUScorer(refs, 4);break; + case Koehn_BLEU: r = new Koehn_BLEUScorer(refs, 4);break; + case AER: r = new AERScorer(refs, src);break; + case TER: r = new TERScorer(refs);break; + case SER: r = new SERScorer(refs);break; + case BLEU_minus_TER_over_2: r = new BLEUTERCombinationScorer(refs);break; + default: + assert(!"Not implemented!"); + } + return ScorerP(r); +} + +ScoreP SentenceScorer::GetOne() const { + Sentence s; + return ScoreCCandidate(s)->GetOne(); +} + +ScoreP SentenceScorer::GetZero() const { + Sentence s; + return ScoreCCandidate(s)->GetZero(); +} + +ScoreP Score::GetOne(ScoreType type) { + std::vector refs; + return SentenceScorer::CreateSentenceScorer(type,refs)->GetOne(); +} + +ScoreP Score::GetZero(ScoreType type) { + std::vector refs; + return SentenceScorer::CreateSentenceScorer(type,refs)->GetZero(); +} + + +ScoreP SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) { + switch (type) { + case IBM_BLEU: + case IBM_BLEU_3: + case NIST_BLEU: + case Koehn_BLEU: + return BLEUScorerBase::ScoreFromString(in); + case TER: + return TERScorer::ScoreFromString(in); + case AER: + return AERScorer::ScoreFromString(in); + case SER: + return SERScorer::ScoreFromString(in); + case BLEU_minus_TER_over_2: + return BLEUTERCombinationScorer::ScoreFromString(in); + default: + assert(!"Not implemented!"); + } +} + +void BLEUScore::ScoreDetails(string* details) const { + char buf[2000]; + vector precs(max(N(),4)); + float bp; + float bleu = ComputeScore(&precs, &bp); + for (int i=N();i<4;++i) + precs[i]=0.; + char *bufn; + bufn=buf+sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", + bleu*100.0, + precs[0]*100.0, + precs[1]*100.0, + precs[2]*100.0, + precs[3]*100.0, + bp); + *details = 
buf; +} + +float BLEUScore::ComputeScore(vector* precs, float* bp) const { + float log_bleu = 0; + if (precs) precs->clear(); + int count = 0; + for (int i = 0; i < N(); ++i) { + if (hyp_ngram_counts[i] > 0) { + float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); + if (precs) precs->push_back(exp(lprec)); + log_bleu += lprec; + ++count; + } + } + log_bleu /= static_cast(count); + float lbp = 0.0; + if (hyp_len < ref_len) + lbp = (hyp_len - ref_len) / hyp_len; + log_bleu += lbp; + if (bp) *bp = exp(lbp); + return exp(log_bleu); +} + + +//comptue scaled score for oracle retrieval +float BLEUScore::ComputePartialScore(vector* precs, float* bp) const { + // cerr << "Then here " << endl; + float log_bleu = 0; + if (precs) precs->clear(); + int count = 0; + for (int i = 0; i < N(); ++i) { + // cerr << "In CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl; + if (hyp_ngram_counts[i] > 0) { + float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); + if (precs) precs->push_back(exp(lprec)); + log_bleu += lprec; + ++count; + } + } + log_bleu /= static_cast(count); + float lbp = 0.0; + if (hyp_len < ref_len) + lbp = (hyp_len - ref_len) / hyp_len; + log_bleu += lbp; + if (bp) *bp = exp(lbp); + return exp(log_bleu); +} + +float BLEUScore::ComputePartialScore() const { + // cerr << "In here first " << endl; + return ComputePartialScore(NULL, NULL); +} + +float BLEUScore::ComputeScore() const { + return ComputeScore(NULL, NULL); +} + +void BLEUScore::Subtract(const Score& rhs, Score* res) const { + const BLEUScore& d = static_cast(rhs); + BLEUScore* o = static_cast(res); + o->ref_len = ref_len - d.ref_len; + o->hyp_len = hyp_len - d.hyp_len; + o->correct_ngram_hit_counts = correct_ngram_hit_counts - d.correct_ngram_hit_counts; + o->hyp_ngram_counts = hyp_ngram_counts - d.hyp_ngram_counts; +} + +void BLEUScore::PlusEquals(const Score& delta) { + const BLEUScore& d = static_cast(delta); + correct_ngram_hit_counts += 
d.correct_ngram_hit_counts; + hyp_ngram_counts += d.hyp_ngram_counts; + ref_len += d.ref_len; + hyp_len += d.hyp_len; +} + +void BLEUScore::TimesEquals(float scale) { + correct_ngram_hit_counts *= scale; + hyp_ngram_counts *= scale; + ref_len *= scale; + hyp_len *= scale; +} + +void BLEUScore::PlusEquals(const Score& delta, const float scale) { + const BLEUScore& d = static_cast(delta); + correct_ngram_hit_counts = correct_ngram_hit_counts + (d.correct_ngram_hit_counts * scale); + hyp_ngram_counts = hyp_ngram_counts + (d.hyp_ngram_counts * scale); + ref_len = ref_len + (d.ref_len * scale); + hyp_len = hyp_len + (d.hyp_len * scale); +} + +void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){ + const BLEUScore& d = static_cast(delta); + correct_ngram_hit_counts += d.correct_ngram_hit_counts; + hyp_ngram_counts += d.hyp_ngram_counts; + //scale the reference length according to the size of the input sentence covered by this rule + + ref_len *= (float)oracle_f_cover / src_len; + ref_len += d.ref_len; + + hyp_len = oracle_e_cover; + hyp_len += d.hyp_len; +} + + +ScoreP BLEUScore::GetZero() const { + return ScoreP(new BLEUScore(N())); +} + +ScoreP BLEUScore::GetOne() const { + return ScoreP(new BLEUScore(N(),1)); +} + + +void BLEUScore::Encode(string* out) const { + ostringstream os; + const int n = correct_ngram_hit_counts.size(); + os << n << ' ' << ref_len << ' ' << hyp_len; + for (int i = 0; i < n; ++i) + os << ' ' << correct_ngram_hit_counts[i] << ' ' << hyp_ngram_counts[i]; + *out = os.str(); +} + +BLEUScorerBase::BLEUScorerBase(const vector >& references, + int n) : SentenceScorer("BLEU"+boost::lexical_cast(n),references),n_(n) { + for (vector >::const_iterator ci = references.begin(); + ci != references.end(); ++ci) { + lengths_.push_back(ci->size()); + CountRef(*ci); + } +} + +ScoreP BLEUScorerBase::ScoreCandidate(const vector& hyp) const { + BLEUScore* bs = new BLEUScore(n_); + for (NGramCountMap::iterator 
i=ngrams_.begin(); i != ngrams_.end(); ++i) + i->second.second = 0; + ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true); + bs->ref_len = ComputeRefLength(hyp); + bs->hyp_len = hyp.size(); + return ScoreP(bs); +} + +ScoreP BLEUScorerBase::ScoreCCandidate(const vector& hyp) const { + BLEUScore* bs = new BLEUScore(n_); + for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) + i->second.second = 0; + bool clip = false; + ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip); + bs->ref_len = ComputeRefLength(hyp); + bs->hyp_len = hyp.size(); + return ScoreP(bs); +} + + +DocScorer::~DocScorer() { +} + +void DocScorer::Init( + const ScoreType type, + const vector& ref_files, + const string& src_file, bool verbose) { + scorers_.clear(); + // TODO stop using valarray, start using ReadFile + cerr << "Loading references (" << ref_files.size() << " files)\n"; + ReadFile srcrf; + if (type == AER && src_file.size() > 0) { + cerr << " (source=" << src_file << ")\n"; + srcrf.Init(src_file); + } + std::vector ifs(ref_files.begin(),ref_files.end()); + for (int i=0; i < ref_files.size(); ++i) ifs[i].Init(ref_files[i]); + char buf[64000]; + bool expect_eof = false; + int line=0; + while (ifs[0].get()) { + vector > refs(ref_files.size()); + for (int i=0; i < ref_files.size(); ++i) { + istream &in=ifs[i].get(); + if (in.eof()) break; + in.getline(buf, 64000); + refs[i].clear(); + if (strlen(buf) == 0) { + if (in.eof()) { + if (!expect_eof) { + assert(i == 0); + expect_eof = true; + } + break; + } + } else { + TD::ConvertSentence(buf, &refs[i]); + assert(!refs[i].empty()); + } + assert(!expect_eof); + } + if (!expect_eof) { + string src_line; + if (srcrf) { + getline(srcrf.get(), src_line); + map dummy; + ProcessAndStripSGML(&src_line, &dummy); + } + scorers_.push_back(ScorerP(SentenceScorer::CreateSentenceScorer(type, refs, src_line))); + if (verbose) + cerr<<"doc_scorer["<verbose_desc()< +#include +#include 
+//TODO: use intrusive shared_ptr in Score (because there are many of them on ErrorSurfaces) +#include "wordid.h" +#include "intrusive_refcount.hpp" + +class Score; +class SentenceScorer; +typedef boost::intrusive_ptr ScoreP; +typedef boost::shared_ptr ScorerP; + +class ViterbiEnvelope; +class ErrorSurface; +class Hypergraph; // needed for alignment + +//TODO: BLEU N (N separate arg, not part of enum)? +enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 }; +ScoreType ScoreTypeFromString(const std::string& st); +std::string StringFromScoreType(ScoreType st); + +class Score : public boost::intrusive_refcount { + public: + virtual ~Score(); + virtual float ComputeScore() const = 0; + virtual float ComputePartialScore() const =0; + virtual void ScoreDetails(std::string* details) const = 0; + std::string ScoreDetails() { + std::string d; + ScoreDetails(&d); + return d; + } + virtual void TimesEquals(float scale); // only for bleu; for mira oracle + /// same as rhs.TimesEquals(scale);PlusEquals(rhs) except doesn't modify rhs. + virtual void PlusEquals(const Score& rhs, const float scale) = 0; + virtual void PlusEquals(const Score& rhs) = 0; + virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0; + virtual void Subtract(const Score& rhs, Score *res) const = 0; + virtual ScoreP GetZero() const = 0; + virtual ScoreP GetOne() const = 0; + virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta + // to another score results in no score change + // under any circumstances + virtual void Encode(std::string* out) const = 0; + static ScoreP GetZero(ScoreType type); + static ScoreP GetOne(ScoreType type); + virtual ScoreP Clone() const = 0; +protected: + Score() { } // we define these explicitly because refcount is noncopyable + Score(Score const&) { } +}; + +//TODO: make sure default copy ctors for score types do what we want. 
+template +struct ScoreBase : public Score { + ScoreP Clone() const { + return ScoreP(new Derived(dynamic_cast(*this))); + } +}; + +class SentenceScorer { + public: + typedef std::vector Sentence; + typedef std::vector Sentences; + std::string desc; + Sentences refs; + SentenceScorer(std::string desc="SentenceScorer_unknown", Sentences const& refs=Sentences()) : desc(desc),refs(refs) { } + std::string verbose_desc() const; + virtual float ComputeRefLength(const Sentence& hyp) const; // default: avg of refs.length + virtual ~SentenceScorer(); + virtual ScoreP GetOne() const; + virtual ScoreP GetZero() const; + virtual ScoreP ScoreCandidate(const Sentence& hyp) const = 0; + virtual ScoreP ScoreCCandidate(const Sentence& hyp) const =0; + virtual const std::string* GetSource() const; + static ScoreP CreateScoreFromString(const ScoreType type, const std::string& in); + static ScorerP CreateSentenceScorer(const ScoreType type, + const std::vector& refs, + const std::string& src = ""); +}; + +//TODO: should be able to GetOne GetZero without supplying sentence (just type) +class DocScorer { + public: + ~DocScorer(); + DocScorer() { } + void Init(const ScoreType type, + const std::vector& ref_files, + const std::string& src_file = "", + bool verbose=false + ); + DocScorer(const ScoreType type, + const std::vector& ref_files, + const std::string& src_file = "", + bool verbose=false + ) + { + Init(type,ref_files,src_file,verbose); + } + + int size() const { return scorers_.size(); } + ScorerP operator[](size_t i) const { return scorers_[i]; } + private: + std::vector scorers_; +}; + + +#endif diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc new file mode 100644 index 00000000..a07a8c4b --- /dev/null +++ b/mteval/scorer_test.cc @@ -0,0 +1,182 @@ +#include +#include +#include +#include + +#include "tdict.h" +#include "scorer.h" +#include "aer_scorer.h" + +using namespace std; + +class ScorerTest : public testing::Test { + protected: + virtual void SetUp() { + 
refs0.resize(4); + refs1.resize(4); + TD::ConvertSentence("export of high-tech products in guangdong in first two months this year reached 3.76 billion us dollars", &refs0[0]); + TD::ConvertSentence("guangdong's export of new high technology products amounts to us $ 3.76 billion in first two months of this year", &refs0[1]); + TD::ConvertSentence("guangdong exports us $ 3.76 billion worth of high technology products in the first two months of this year", &refs0[2]); + TD::ConvertSentence("in the first 2 months this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars .", &refs0[3]); + TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter chen ji ) the latest statistics show that from january through february this year , the export of high-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% over the same period last year and accounted for 25.5 \% of the total export in the province .", &refs1[0]); + TD::ConvertSentence("xinhua news agency , guangzhou , march 16 ( reporter : chen ji ) -- latest statistic indicates that guangdong's export of new high technology products amounts to us $ 3.76 billion , up 34.8 \% over corresponding period and accounts for 25.5 \% of the total exports of the province .", &refs1[1]); + TD::ConvertSentence("xinhua news agency report of march 16 from guangzhou ( by staff reporter chen ji ) - latest statistics indicate guangdong province exported us $ 3.76 billion worth of high technology products , up 34.8 percent from the same period last year , which account for 25.5 percent of the total exports of the province .", &refs1[2]); + TD::ConvertSentence("guangdong , march 16 , ( xinhua ) -- ( chen ji reports ) as the newest statistics shows , in january and feberuary this year , the export volume of new hi-tech products in guangdong province reached 3.76 billion us dollars , up 34.8 \% than last year , making up 25.5 \% of the province's total 
.", &refs1[3]); + TD::ConvertSentence("one guangdong province will next export us $ 3.76 high-tech product two months first this year 3.76 billion us dollars", &hyp1); + TD::ConvertSentence("xinhua news agency , guangzhou , 16th of march ( reporter chen ) -- latest statistics suggest that guangdong exports new advanced technology product totals $ 3.76 million , 34.8 percent last corresponding period and accounts for 25.5 percent of the total export province .", &hyp2); + } + + virtual void TearDown() { } + + vector > refs0; + vector > refs1; + vector hyp1; + vector hyp2; +}; + +TEST_F(ScorerTest, TestCreateFromFiles) { + vector files; + files.push_back("test_data/re.txt.0"); + files.push_back("test_data/re.txt.1"); + files.push_back("test_data/re.txt.2"); + files.push_back("test_data/re.txt.3"); + DocScorer ds(IBM_BLEU, files); +} + +TEST_F(ScorerTest, TestBLEUScorer) { + ScorerP s1 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs0); + ScorerP s2 = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs1); + ScoreP b1 = s1->ScoreCandidate(hyp1); + EXPECT_FLOAT_EQ(0.23185077, b1->ComputeScore()); + ScoreP b2 = s2->ScoreCandidate(hyp2); + EXPECT_FLOAT_EQ(0.38101241, b2->ComputeScore()); + b1->PlusEquals(*b2); + EXPECT_FLOAT_EQ(0.348854, b1->ComputeScore()); + EXPECT_FALSE(b1->IsAdditiveIdentity()); + string details; + b1->ScoreDetails(&details); + EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); + cerr << details << endl; + string enc; + b1->Encode(&enc); + ScoreP b3 = SentenceScorer::CreateScoreFromString(IBM_BLEU, enc); + details.clear(); + cerr << "Encoded BLEU score size: " << enc.size() << endl; + b3->ScoreDetails(&details); + cerr << details << endl; + EXPECT_FALSE(b3->IsAdditiveIdentity()); + EXPECT_EQ("BLEU = 34.89, 81.5|50.8|29.5|18.6 (brev=0.898)", details); + ScoreP bz = b3->GetZero(); + EXPECT_TRUE(bz->IsAdditiveIdentity()); +} + +TEST_F(ScorerTest, TestTERScorer) { + ScorerP s1 = SentenceScorer::CreateSentenceScorer(TER, refs0); + 
ScorerP s2 = SentenceScorer::CreateSentenceScorer(TER, refs1); + string details; + ScoreP t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t1->ComputeScore() << endl; + ScoreP t2 = s2->ScoreCandidate(hyp2); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t2->ComputeScore() << endl; + t1->PlusEquals(*t2); + cerr << t1->ComputeScore() << endl; + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details); + string enc; + t1->Encode(&enc); + ScoreP t3 = SentenceScorer::CreateScoreFromString(TER, enc); + details.clear(); + t3->ScoreDetails(&details); + EXPECT_EQ("TER = 44.16, 4| 8| 16| 6 (len=77)", details); + EXPECT_FALSE(t3->IsAdditiveIdentity()); + ScoreP tz = t3->GetZero(); + EXPECT_TRUE(tz->IsAdditiveIdentity()); +} + +TEST_F(ScorerTest, TestTERScorerSimple) { + vector > ref(1); + TD::ConvertSentence("1 2 3 A B", &ref[0]); + vector hyp; + TD::ConvertSentence("A B 1 2 3", &hyp); + ScorerP s1 = SentenceScorer::CreateSentenceScorer(TER, ref); + string details; + ScoreP t1 = s1->ScoreCandidate(hyp); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; +} + +TEST_F(ScorerTest, TestSERScorerSimple) { + vector > ref(1); + TD::ConvertSentence("A B C D", &ref[0]); + vector hyp1; + TD::ConvertSentence("A B C", &hyp1); + vector hyp2; + TD::ConvertSentence("A B C D", &hyp2); + ScorerP s1 = SentenceScorer::CreateSentenceScorer(SER, ref); + string details; + ScoreP t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + ScoreP t2 = s1->ScoreCandidate(hyp2); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + t2->PlusEquals(*t1); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; +} + +TEST_F(ScorerTest, TestCombiScorer) { + ScorerP s1 = SentenceScorer::CreateSentenceScorer(BLEU_minus_TER_over_2, refs0); + 
string details; + ScoreP t1 = s1->ScoreCandidate(hyp1); + t1->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + cerr << t1->ComputeScore() << endl; + string enc; + t1->Encode(&enc); + ScoreP t2 = SentenceScorer::CreateScoreFromString(BLEU_minus_TER_over_2, enc); + details.clear(); + t2->ScoreDetails(&details); + cerr << "DETAILS: " << details << endl; + ScoreP cz = t2->GetZero(); + EXPECT_FALSE(t2->IsAdditiveIdentity()); + EXPECT_TRUE(cz->IsAdditiveIdentity()); + cz->PlusEquals(*t2); + EXPECT_FALSE(cz->IsAdditiveIdentity()); + string d2; + cz->ScoreDetails(&d2); + EXPECT_EQ(d2, details); +} + +TEST_F(ScorerTest, AERTest) { + vector > refs0(1); + TD::ConvertSentence("0-0 2-1 1-2 3-3", &refs0[0]); + + vector hyp; + TD::ConvertSentence("0-0 1-1", &hyp); + AERScorer* as = new AERScorer(refs0); + ScoreP x = as->ScoreCandidate(hyp); + string details; + x->ScoreDetails(&details); + cerr << details << endl; + string enc; + x->Encode(&enc); + delete as; + cerr << "ENC size: " << enc.size() << endl; + ScoreP y = SentenceScorer::CreateScoreFromString(AER, enc); + string d2; + y->ScoreDetails(&d2); + cerr << d2 << endl; + EXPECT_EQ(d2, details); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/mteval/ter.cc b/mteval/ter.cc new file mode 100644 index 00000000..cacc5b00 --- /dev/null +++ b/mteval/ter.cc @@ -0,0 +1,535 @@ +#include "ter.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tdict.h" + +const bool ter_use_average_ref_len = true; +const int ter_short_circuit_long_sentences = -1; + +using namespace std; +using namespace std::tr1; + +struct COSTS { + static const float substitution; + static const float deletion; + static const float insertion; + static const float shift; +}; +const float COSTS::substitution = 1.0f; +const float COSTS::deletion = 1.0f; +const float COSTS::insertion = 1.0f; +const float COSTS::shift 
= 1.0f; + +static const int MAX_SHIFT_SIZE = 10; +static const int MAX_SHIFT_DIST = 50; + +struct Shift { + unsigned int d_; + Shift() : d_() {} + Shift(int b, int e, int m) : d_() { + begin(b); + end(e); + moveto(m); + } + inline int begin() const { + return d_ & 0x3ff; + } + inline int end() const { + return (d_ >> 10) & 0x3ff; + } + inline int moveto() const { + int m = (d_ >> 20) & 0x7ff; + if (m > 1024) { m -= 1024; m *= -1; } + return m; + } + inline void begin(int b) { + d_ &= 0xfffffc00u; + d_ |= (b & 0x3ff); + } + inline void end(int e) { + d_ &= 0xfff003ffu; + d_ |= (e & 0x3ff) << 10; + } + inline void moveto(int m) { + bool neg = (m < 0); + if (neg) { m *= -1; m += 1024; } + d_ &= 0xfffff; + d_ |= (m & 0x7ff) << 20; + } +}; + +class TERScorerImpl { + + public: + enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; + + explicit TERScorerImpl(const vector& ref) : ref_(ref) { + for (int i = 0; i < ref.size(); ++i) + rwexists_.insert(ref[i]); + } + + float Calculate(const vector& hyp, int* subs, int* ins, int* dels, int* shifts) const { + return CalculateAllShifts(hyp, subs, ins, dels, shifts); + } + + inline int GetRefLength() const { + return ref_.size(); + } + + private: + vector ref_; + set rwexists_; + + typedef unordered_map, set, boost::hash > > NgramToIntsMap; + mutable NgramToIntsMap nmap_; + + static float MinimumEditDistance( + const vector& hyp, + const vector& ref, + vector* path) { + vector > bmat(hyp.size() + 1, vector(ref.size() + 1, MATCH)); + vector > cmat(hyp.size() + 1, vector(ref.size() + 1, 0)); + for (int i = 0; i <= hyp.size(); ++i) + cmat[i][0] = i; + for (int j = 0; j <= ref.size(); ++j) + cmat[0][j] = j; + for (int i = 1; i <= hyp.size(); ++i) { + const WordID& hw = hyp[i-1]; + for (int j = 1; j <= ref.size(); ++j) { + const WordID& rw = ref[j-1]; + float& cur_c = cmat[i][j]; + TransType& cur_b = bmat[i][j]; + + if (rw == hw) { + cur_c = cmat[i-1][j-1]; + cur_b = MATCH; + } else { + cur_c = cmat[i-1][j-1] + 
COSTS::substitution; + cur_b = SUBSTITUTION; + } + float cwoi = cmat[i-1][j]; + if (cur_c > cwoi + COSTS::insertion) { + cur_c = cwoi + COSTS::insertion; + cur_b = INSERTION; + } + float cwod = cmat[i][j-1]; + if (cur_c > cwod + COSTS::deletion) { + cur_c = cwod + COSTS::deletion; + cur_b = DELETION; + } + } + } + + // trace back along the best path and record the transition types + path->clear(); + int i = hyp.size(); + int j = ref.size(); + while (i > 0 || j > 0) { + if (j == 0) { + --i; + path->push_back(INSERTION); + } else if (i == 0) { + --j; + path->push_back(DELETION); + } else { + TransType t = bmat[i][j]; + path->push_back(t); + switch (t) { + case SUBSTITUTION: + case MATCH: + --i; --j; break; + case INSERTION: + --i; break; + case DELETION: + --j; break; + } + } + } + reverse(path->begin(), path->end()); + return cmat[hyp.size()][ref.size()]; + } + + void BuildWordMatches(const vector& hyp, NgramToIntsMap* nmap) const { + nmap->clear(); + set exists_both; + for (int i = 0; i < hyp.size(); ++i) + if (rwexists_.find(hyp[i]) != rwexists_.end()) + exists_both.insert(hyp[i]); + for (int start=0; start cp; + int mlen = min(MAX_SHIFT_SIZE, static_cast(ref_.size() - start)); + for (int len=0; len& in, + int start, int end, int moveto, vector* out) { + // cerr << "ps: " << start << " " << end << " " << moveto << endl; + out->clear(); + if (moveto == -1) { + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto < start) { + for (int i = 0; i <= moveto; ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto > end) { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i <= moveto; ++i) + 
out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < in.size(); ++i) + out->push_back(in[i]); + } else { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = (end + (moveto - start))+1; i < in.size(); ++i) + out->push_back(in[i]); + } + if (out->size() != in.size()) { + cerr << "ps: " << start << " " << end << " " << moveto << endl; + cerr << "in=" << TD::GetString(in) << endl; + cerr << "out=" << TD::GetString(*out) << endl; + } + assert(out->size() == in.size()); + // cerr << "ps: " << TD::GetString(*out) << endl; + } + + void GetAllPossibleShifts(const vector& hyp, + const vector& ralign, + const vector& herr, + const vector& rerr, + const int min_size, + vector >* shifts) const { + for (int start = 0; start < hyp.size(); ++start) { + vector cp(1, hyp[start]); + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) continue; + bool ok = false; + int moveto; + for (set::iterator i = niter->second.begin(); i != niter->second.end(); ++i) { + moveto = *i; + int rm = ralign[moveto]; + ok = (start != rm && + (rm - start) < MAX_SHIFT_DIST && + (start - rm - 1) < MAX_SHIFT_DIST); + if (ok) break; + } + if (!ok) continue; + cp.clear(); + for (int end = start + min_size - 1; + ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) { + cp.push_back(hyp[end]); + vector& sshifts = (*shifts)[end - start]; + ok = false; + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) break; + bool any_herr = false; + for (int i = start; i <= end && !any_herr; ++i) + any_herr = herr[i]; + if (!any_herr) { + ok = true; + continue; + } + for (set::iterator mi = niter->second.begin(); + mi != niter->second.end(); ++mi) { + int moveto = *mi; + int rm = ralign[moveto]; + if (! 
((rm != start) && + ((rm < start) || (rm > end)) && + (rm - start <= MAX_SHIFT_DIST) && + ((start - rm - 1) <= MAX_SHIFT_DIST))) continue; + ok = true; + bool any_rerr = false; + for (int i = 0; (i <= end - start) && (!any_rerr); ++i) + any_rerr = rerr[moveto+i]; + if (!any_rerr) continue; + for (int roff = 0; roff <= (end - start); ++roff) { + int rmr = ralign[moveto+roff]; + if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto]))) + sshifts.push_back(Shift(start, end, moveto + roff)); + } + } + } + } + } + + bool CalculateBestShift(const vector& cur, + const vector& hyp, + float curerr, + const vector& path, + vector* new_hyp, + float* newerr, + vector* new_path) const { + vector herr, rerr; + vector ralign; + int hpos = -1; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case MATCH: + ++hpos; + herr.push_back(false); + rerr.push_back(false); + ralign.push_back(hpos); + break; + case SUBSTITUTION: + ++hpos; + herr.push_back(true); + rerr.push_back(true); + ralign.push_back(hpos); + break; + case INSERTION: + ++hpos; + herr.push_back(true); + break; + case DELETION: + rerr.push_back(true); + ralign.push_back(hpos); + break; + } + } +#if 0 + cerr << "RALIGN: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << ralign[i] << " "; + cerr << endl; + cerr << "RERR: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << (bool)rerr[i] << " "; + cerr << endl; + cerr << "HERR: "; + for (int i = 0; i < herr.size(); ++i) + cerr << (bool)herr[i] << " "; + cerr << endl; +#endif + + vector > shifts(MAX_SHIFT_SIZE + 1); + GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts); + float cur_best_shift_cost = 0; + *newerr = curerr; + vector cur_best_path; + vector cur_best_hyp; + + bool res = false; + for (int i = shifts.size() - 1; i >=0; --i) { + float curfix = curerr - (cur_best_shift_cost + *newerr); + float maxfix = 2.0f * (1 + i) - COSTS::shift; + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break; + for (int j = 0; j < 
shifts[i].size(); ++j) { + const Shift& s = shifts[i][j]; + curfix = curerr - (cur_best_shift_cost + *newerr); + maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove? + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue; + vector shifted(cur.size()); + PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted); + vector try_path; + float try_cost = MinimumEditDistance(shifted, ref_, &try_path); + float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift); + if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) { + *newerr = try_cost; + cur_best_shift_cost = COSTS::shift; + new_path->swap(try_path); + new_hyp->swap(shifted); + res = true; + // cerr << "Found better shift " << s.begin() << "..." << s.end() << " moveto " << s.moveto() << endl; + } + } + } + + return res; + } + + static void GetPathStats(const vector& path, int* subs, int* ins, int* dels) { + *subs = *ins = *dels = 0; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case SUBSTITUTION: + ++(*subs); + case MATCH: + break; + case INSERTION: + ++(*ins); break; + case DELETION: + ++(*dels); break; + } + } + } + + float CalculateAllShifts(const vector& hyp, + int* subs, int* ins, int* dels, int* shifts) const { + BuildWordMatches(hyp, &nmap_); + vector path; + float med_cost = MinimumEditDistance(hyp, ref_, &path); + float edits = 0; + vector cur = hyp; + *shifts = 0; + if (ter_short_circuit_long_sentences < 0 || + ref_.size() < ter_short_circuit_long_sentences) { + while (true) { + vector new_hyp; + vector new_path; + float new_med_cost; + if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path)) + break; + edits += COSTS::shift; + ++(*shifts); + med_cost = new_med_cost; + path.swap(new_path); + cur.swap(new_hyp); + } + } + GetPathStats(path, subs, ins, dels); + return med_cost + edits; + } +}; + +class TERScore : public ScoreBase { + friend class TERScorer; + + public: + static const 
unsigned kINSERTIONS = 0; + static const unsigned kDELETIONS = 1; + static const unsigned kSUBSTITUTIONS = 2; + static const unsigned kSHIFTS = 3; + static const unsigned kREF_WORDCOUNT = 4; + static const unsigned kDUMMY_LAST_ENTRY = 5; + + TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} + float ComputePartialScore() const { return 0.0;} + float ComputeScore() const { + float edits = static_cast(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); + return edits / static_cast(stats[kREF_WORDCOUNT]); + } + void ScoreDetails(string* details) const; + void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + void PlusEquals(const Score& delta, const float scale) { + if (scale==1) + stats += static_cast(delta).stats; + if (scale==-1) + stats -= static_cast(delta).stats; + throw std::runtime_error("TERScore::PlusEquals with scale != +-1"); + } + void PlusEquals(const Score& delta) { + stats += static_cast(delta).stats; + } + + ScoreP GetZero() const { + return ScoreP(new TERScore); + } + ScoreP GetOne() const { + return ScoreP(new TERScore); + } + void Subtract(const Score& rhs, Score* res) const { + static_cast(res)->stats = stats - static_cast(rhs).stats; + } + void Encode(std::string* out) const { + ostringstream os; + os << stats[kINSERTIONS] << ' ' + << stats[kDELETIONS] << ' ' + << stats[kSUBSTITUTIONS] << ' ' + << stats[kSHIFTS] << ' ' + << stats[kREF_WORDCOUNT]; + *out = os.str(); + } + bool IsAdditiveIdentity() const { + for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i) + if (stats[i] != 0) return false; + return true; + } + private: + valarray stats; +}; + +ScoreP TERScorer::ScoreFromString(const std::string& data) { + istringstream is(data); + TERScore* r = new TERScore; + is >> r->stats[TERScore::kINSERTIONS] + >> r->stats[TERScore::kDELETIONS] + >> r->stats[TERScore::kSUBSTITUTIONS] + >> r->stats[TERScore::kSHIFTS] + >> r->stats[TERScore::kREF_WORDCOUNT]; + return ScoreP(r); +} + +void 
TERScore::ScoreDetails(std::string* details) const { + char buf[200]; + sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", + ComputeScore() * 100.0f, + stats[kINSERTIONS], + stats[kDELETIONS], + stats[kSUBSTITUTIONS], + stats[kSHIFTS], + stats[kREF_WORDCOUNT]); + *details = buf; +} + +TERScorer::~TERScorer() { + for (vector::iterator i = impl_.begin(); i != impl_.end(); ++i) + delete *i; +} + +TERScorer::TERScorer(const vector >& refs) : impl_(refs.size()) { + for (int i = 0; i < refs.size(); ++i) + impl_[i] = new TERScorerImpl(refs[i]); +} + +ScoreP TERScorer::ScoreCCandidate(const vector& hyp) const { + return ScoreP(); +} + +ScoreP TERScorer::ScoreCandidate(const std::vector& hyp) const { + float best_score = numeric_limits::max(); + TERScore* res = new TERScore; + int avg_len = 0; + for (int i = 0; i < impl_.size(); ++i) + avg_len += impl_[i]->GetRefLength(); + avg_len /= impl_.size(); + for (int i = 0; i < impl_.size(); ++i) { + int subs, ins, dels, shifts; + float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts); + // cerr << "Component TER cost: " << score << endl; + if (score < best_score) { + res->stats[TERScore::kINSERTIONS] = ins; + res->stats[TERScore::kDELETIONS] = dels; + res->stats[TERScore::kSUBSTITUTIONS] = subs; + res->stats[TERScore::kSHIFTS] = shifts; + if (ter_use_average_ref_len) { + res->stats[TERScore::kREF_WORDCOUNT] = avg_len; + } else { + res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength(); + } + + best_score = score; + } + } + return ScoreP(res); +} diff --git a/mteval/ter.h b/mteval/ter.h new file mode 100644 index 00000000..43314791 --- /dev/null +++ b/mteval/ter.h @@ -0,0 +1,19 @@ +#ifndef _TER_H_ +#define _TER_H_ + +#include "scorer.h" + +class TERScorerImpl; + +class TERScorer : public SentenceScorer { + public: + TERScorer(const std::vector >& references); + ~TERScorer(); + ScoreP ScoreCandidate(const std::vector& hyp) const; + ScoreP ScoreCCandidate(const std::vector& hyp) const; + static ScoreP 
ScoreFromString(const std::string& data); + private: + std::vector impl_; +}; + +#endif diff --git a/mteval/test_data/re.txt.0 b/mteval/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/mteval/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/mteval/test_data/re.txt.1 b/mteval/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/mteval/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . 
" diff --git a/mteval/test_data/re.txt.2 b/mteval/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/mteval/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/mteval/test_data/re.txt.3 b/mteval/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/mteval/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . 
" diff --git a/training/Makefile.am b/training/Makefile.am index 490de774..48b19932 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -14,37 +14,36 @@ noinst_PROGRAMS = \ optimize_test atools_SOURCES = atools.cc -atools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz model1_SOURCES = model1.cc -model1_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz grammar_convert_SOURCES = grammar_convert.cc -grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz optimize_test_SOURCES = optimize_test.cc optimize.cc -optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +optimize_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz collapse_weights_SOURCES = collapse_weights.cc -collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +collapse_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz lbfgs_test_SOURCES = lbfgs_test.cc -lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +lbfgs_test_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_optimize_reduce_SOURCES = mr_optimize_reduce.cc optimize.cc -mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_optimize_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_em_map_adapter_SOURCES = mr_em_map_adapter.cc -mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_em_map_adapter_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_reduce_to_weights_SOURCES = mr_reduce_to_weights.cc -mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_reduce_to_weights_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz mr_em_adapted_reduce_SOURCES = 
mr_em_adapted_reduce.cc -mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_em_adapted_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz plftools_SOURCES = plftools.cc -plftools_LDADD = $(top_srcdir)/decoder/libcdec.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +plftools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/utils -I$(top_srcdir)/mteval diff --git a/training/atools.cc b/training/atools.cc index af62804d..805e3c1d 100644 --- a/training/atools.cc +++ b/training/atools.cc @@ -9,6 +9,7 @@ #include "filelib.h" #include "aligner.h" +#include "alignment_pharaoh.h" namespace po = boost::program_options; using namespace std; @@ -349,9 +350,9 @@ int main(int argc, char **argv) { } if (line1.empty() && !*in1) break; shared_ptr > out(new Array2D); - shared_ptr > a1 = AlignerTools::ReadPharaohAlignmentGrid(line1); + shared_ptr > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1); if (in2) { - shared_ptr > a2 = AlignerTools::ReadPharaohAlignmentGrid(line2); + shared_ptr > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2); cmd.Apply(*a1, *a2, out.get()); } else { Array2D dummy; @@ -359,7 +360,7 @@ int main(int argc, char **argv) { } if (cmd.Result() == 1) { - AlignerTools::SerializePharaohFormat(*out, &cout); + AlignmentPharaoh::SerializePharaohFormat(*out, &cout); } } if (cmd.Result() == 2) diff --git a/utils/Makefile.am b/utils/Makefile.am new file mode 100644 index 00000000..e513febd --- /dev/null +++ b/utils/Makefile.am @@ -0,0 +1,38 @@ +if HAVE_GTEST +noinst_PROGRAMS = \ + dict_test \ + weights_test \ + logval_test \ + small_vector_test +endif + +noinst_LIBRARIES = libutils.a + +libutils_a_SOURCES = \ + alignment_pharaoh.cc \ + b64tools.cc \ + dict.cc \ + tdict.cc \ + fdict.cc \ + gzstream.cc \ + filelib.cc \ + stringlib.cc \ + 
sparse_vector.cc \ + timing_stats.cc \ + weights.cc + +dict_test_SOURCES = dict_test.cc +dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +weights_test_SOURCES = weights_test.cc +weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +logval_test_SOURCES = logval_test.cc +logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +small_vector_test_SOURCES = small_vector_test.cc +small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) + +AM_LDFLAGS = libutils.a -lz + +################################################################ +# do NOT NOT NOT add any other -I includes NO NO NO NO NO ###### +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I. +################################################################ diff --git a/utils/alignment_pharaoh.cc b/utils/alignment_pharaoh.cc new file mode 100644 index 00000000..890ff565 --- /dev/null +++ b/utils/alignment_pharaoh.cc @@ -0,0 +1,77 @@ +#include "utils/alignment_pharaoh.h" + +#include + +using namespace std; + +static bool is_digit(char x) { return x >= '0' && x <= '9'; } + +boost::shared_ptr > AlignmentPharaoh::ReadPharaohAlignmentGrid(const string& al) { + int max_x = 0; + int max_y = 0; + int i = 0; + size_t pos = al.rfind(" ||| "); + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + x += al[i] - '0'; + ++i; + } + if (x > max_x) max_x = x; + assert(i < al.size()); + if(al[i] != '-') { + cerr << "BAD ALIGNMENT: " << al << endl; + abort(); + } + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + if (y > max_y) max_y = y; + while(i < al.size() && al[i] == ' ') { ++i; } + } + + boost::shared_ptr > grid(new Array2D(max_x + 1, max_y + 1)); + i = 0; + if (pos != string::npos) { i = pos + 5; } + while (i < al.size()) { + if (al[i] == '\n' || al[i] == '\r') break; + int x = 0; + while(i < al.size() && is_digit(al[i])) { + x *= 10; + 
x += al[i] - '0'; + ++i; + } + assert(i < al.size()); + assert(al[i] == '-'); + ++i; + int y = 0; + while(i < al.size() && is_digit(al[i])) { + y *= 10; + y += al[i] - '0'; + ++i; + } + (*grid)(x, y) = true; + while(i < al.size() && al[i] == ' ') { ++i; } + } + // cerr << *grid << endl; + return grid; +} + +void AlignmentPharaoh::SerializePharaohFormat(const Array2D& alignment, ostream* out) { + bool need_space = false; + for (int i = 0; i < alignment.width(); ++i) + for (int j = 0; j < alignment.height(); ++j) + if (alignment(i,j)) { + if (need_space) (*out) << ' '; else need_space = true; + (*out) << i << '-' << j; + } + (*out) << endl; +} + diff --git a/utils/alignment_pharaoh.h b/utils/alignment_pharaoh.h new file mode 100644 index 00000000..d111c8bf --- /dev/null +++ b/utils/alignment_pharaoh.h @@ -0,0 +1,14 @@ +#ifndef _PHARAOH_ALIGNMENT_H_ +#define _PHARAOH_ALIGNMENT_H_ + +#include +#include +#include +#include "array2d.h" + +struct AlignmentPharaoh { + static boost::shared_ptr > ReadPharaohAlignmentGrid(const std::string& al); + static void SerializePharaohFormat(const Array2D& alignment, std::ostream* out); +}; + +#endif diff --git a/utils/array2d.h b/utils/array2d.h new file mode 100644 index 00000000..e63eda0d --- /dev/null +++ b/utils/array2d.h @@ -0,0 +1,172 @@ +#ifndef ARRAY2D_H_ +#define ARRAY2D_H_ + +#include +#include +#include +#include +#include + +template +class Array2D { + public: + typedef typename std::vector::reference reference; + typedef typename std::vector::const_reference const_reference; + typedef typename std::vector::iterator iterator; + typedef typename std::vector::const_iterator const_iterator; + Array2D() : width_(0), height_(0) {} + Array2D(int w, int h, const T& d = T()) : + width_(w), height_(h), data_(w*h, d) {} + Array2D(const Array2D& rhs) : + width_(rhs.width_), height_(rhs.height_), data_(rhs.data_) {} + bool empty() const { return data_.empty(); } + void resize(int w, int h, const T& d = T()) { + data_.resize(w * h, d); 
+ width_ = w; + height_ = h; + } + const Array2D& operator=(const Array2D& rhs) { + data_ = rhs.data_; + width_ = rhs.width_; + height_ = rhs.height_; + return *this; + } + void fill(const T& v) { data_.assign(data_.size(), v); } + int width() const { return width_; } + int height() const { return height_; } + reference operator()(int i, int j) { + return data_[offset(i, j)]; + } + void clear() { data_.clear(); width_=0; height_=0; } + const_reference operator()(int i, int j) const { + return data_[offset(i, j)]; + } + iterator begin_col(int j) { + return data_.begin() + offset(0,j); + } + const_iterator begin_col(int j) const { + return data_.begin() + offset(0,j); + } + iterator end_col(int j) { + return data_.begin() + offset(0,j) + width_; + } + const_iterator end_col(int j) const { + return data_.begin() + offset(0,j) + width_; + } + iterator end() { return data_.end(); } + const_iterator end() const { return data_.end(); } + const Array2D& operator*=(const T& x) { + std::transform(data_.begin(), data_.end(), data_.begin(), + std::bind2nd(std::multiplies(), x)); + } + const Array2D& operator/=(const T& x) { + std::transform(data_.begin(), data_.end(), data_.begin(), + std::bind2nd(std::divides(), x)); + } + const Array2D& operator+=(const Array2D& m) { + std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::plus()); + } + const Array2D& operator-=(const Array2D& m) { + std::transform(m.data_.begin(), m.data_.end(), data_.begin(), data_.begin(), std::minus()); + } + + private: + inline int offset(int i, int j) const { + assert(i data_; +}; + +template +Array2D operator*(const Array2D& l, const T& scalar) { + Array2D res(l); + res *= scalar; + return res; +} + +template +Array2D operator*(const T& scalar, const Array2D& l) { + Array2D res(l); + res *= scalar; + return res; +} + +template +Array2D operator/(const Array2D& l, const T& scalar) { + Array2D res(l); + res /= scalar; + return res; +} + +template +Array2D operator+(const 
Array2D& l, const Array2D& r) { + Array2D res(l); + res += r; + return res; +} + +template +Array2D operator-(const Array2D& l, const Array2D& r) { + Array2D res(l); + res -= r; + return res; +} + +template +inline std::ostream& operator<<(std::ostream& os, const Array2D& m) { + for (int i=0; i& m) { + os << ' '; + for (int j=0; j >& m) { + os << ' '; + for (int j=0; j& ar = m(i,j); + for (int k=0; k +#include + +using namespace std; + +namespace B64 { + +static const char cb64[]="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; +static const char cd64[]="|$$$}rstuvwxyz{$$$$$$$>?@ABCDEFGHIJKLMNOPQRSTUVW$$$$$$XYZ[\\]^_`abcdefghijklmnopq"; + +static void encodeblock(const unsigned char* in, ostream* os, int len) { + char out[4]; + out[0] = cb64[ in[0] >> 2 ]; + out[1] = cb64[ ((in[0] & 0x03) << 4) | ((in[1] & 0xf0) >> 4) ]; + out[2] = (len > 1 ? cb64[ ((in[1] & 0x0f) << 2) | ((in[2] & 0xc0) >> 6) ] : '='); + out[3] = (len > 2 ? cb64[ in[2] & 0x3f ] : '='); + os->write(out, 4); +} + +void b64encode(const char* data, const size_t size, ostream* out) { + size_t cur = 0; + while(cur < size) { + int len = min(static_cast(3), size - cur); + encodeblock(reinterpret_cast(&data[cur]), out, len); + cur += len; + } +} + +static void decodeblock(const unsigned char* in, unsigned char* out) { + out[0] = (unsigned char ) (in[0] << 2 | in[1] >> 4); + out[1] = (unsigned char ) (in[1] << 4 | in[2] >> 2); + out[2] = (unsigned char ) (((in[2] << 6) & 0xc0) | in[3]); +} + +bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize) { + size_t cur = 0; + size_t ocur = 0; + unsigned char in[4]; + while(cur < insize) { + assert(ocur < outsize); + for (int i = 0; i < 4; ++i) { + unsigned char v = data[cur]; + v = (unsigned char) ((v < 43 || v > 122) ? 
'\0' : cd64[ v - 43 ]); + if (!v) { + cerr << "B64 decode error at offset " << cur << " offending character: " << (int)data[cur] << endl; + return false; + } + v = (unsigned char) ((v == '$') ? '\0' : v - 61); + if (v) in[i] = v - 1; else in[i] = 0; + ++cur; + } + decodeblock(in, reinterpret_cast(&out[ocur])); + ocur += 3; + } + return true; +} + +} + diff --git a/utils/b64tools.h b/utils/b64tools.h new file mode 100644 index 00000000..c821fc8f --- /dev/null +++ b/utils/b64tools.h @@ -0,0 +1,9 @@ +#ifndef _B64_TOOLS_H_ +#define _B64_TOOLS_H_ + +namespace B64 { + bool b64decode(const unsigned char* data, const size_t insize, char* out, const size_t outsize); + void b64encode(const char* data, const size_t size, std::ostream* out); +} + +#endif diff --git a/utils/dict.cc b/utils/dict.cc new file mode 100644 index 00000000..2d6986c8 --- /dev/null +++ b/utils/dict.cc @@ -0,0 +1,27 @@ +#include "dict.h" + +#include +#include + +void TokenizeStringSeparator( + const std::string& str, + const std::string& separator, + std::vector* tokens) { + + size_t pos = 0; + std::string::size_type nextPos = str.find(separator, pos); + + while (nextPos != std::string::npos) { + tokens->push_back(str.substr(pos, nextPos - pos)); + pos = nextPos + separator.size(); + nextPos = str.find(separator, pos); + } + tokens->push_back(str.substr(pos, nextPos - pos)); +} + + +void Dict::AsVector(const WordID& id, std::vector* results) const { + results->clear(); + TokenizeStringSeparator(Convert(id), " ||| ", results); +} + diff --git a/utils/dict.h b/utils/dict.h new file mode 100644 index 00000000..348a97e3 --- /dev/null +++ b/utils/dict.h @@ -0,0 +1,66 @@ +#ifndef DICT_H_ +#define DICT_H_ + + +#include +#include + +#include +#include +#include "hash.h" +#include "wordid.h" + +class Dict { + typedef + HASH_MAP > Map; + public: + Dict() : b0_("") { + HASH_MAP_EMPTY(d_,""); + words_.reserve(1000); + } + + inline int max() const { return words_.size(); } + + inline WordID Convert(const std::string& 
word, bool frozen = false) { + Map::iterator i = d_.find(word); + if (i == d_.end()) { + if (frozen) + return 0; + words_.push_back(word); + d_[word] = words_.size(); + return words_.size(); + } else { + return i->second; + } + } + + inline WordID Convert(const std::vector& words, bool frozen = false) + { return Convert(toString(words), frozen); } + + static inline std::string toString(const std::vector& words) { + std::string word= ""; + for (std::vector::const_iterator it=words.begin(); + it != words.end(); ++it) { + if (it != words.begin()) word += " ||| "; + word += *it; + } + return word; + } + + inline const std::string& Convert(const WordID& id) const { + if (id == 0) return b0_; + assert(id <= (int)words_.size()); + return words_[id-1]; + } + + void AsVector(const WordID& id, std::vector* results) const; + + void clear() { words_.clear(); d_.clear(); } + + private: + const std::string b0_; + std::vector words_; + Map d_; +}; + +#endif diff --git a/utils/dict_test.cc b/utils/dict_test.cc new file mode 100644 index 00000000..2049ec27 --- /dev/null +++ b/utils/dict_test.cc @@ -0,0 +1,47 @@ +#include "dict.h" + +#include "fdict.h" + +#include +#include +#include + +using namespace std; + +class DTest : public testing::Test { + public: + DTest() {} + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(DTest, Convert) { + Dict d; + WordID a = d.Convert("foo"); + WordID b = d.Convert("bar"); + std::string x = "foo"; + WordID c = d.Convert(x); + EXPECT_NE(a, b); + EXPECT_EQ(a, c); + EXPECT_EQ(d.Convert(a), "foo"); + EXPECT_EQ(d.Convert(b), "bar"); +} + +TEST_F(DTest, FDictTest) { + int fid = FD::Convert("First"); + EXPECT_GT(fid, 0); + EXPECT_EQ(FD::Convert(fid), "First"); + string x = FD::Escape("="); + cerr << x << endl; + EXPECT_NE(x, "="); + x = FD::Escape(";"); + cerr << x << endl; + EXPECT_NE(x, ";"); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git 
a/utils/fdict.cc b/utils/fdict.cc new file mode 100644 index 00000000..baa0b552 --- /dev/null +++ b/utils/fdict.cc @@ -0,0 +1,143 @@ +#include "fdict.h" +#include "stdlib.h" +//for malloc (need on cygwin); todo and std::malloc +#include +#include + +using namespace std; + +Dict FD::dict_; +bool FD::frozen_ = false; + +std::string FD::Convert(std::vector const& v) { + return Convert(&*v.begin(),&*v.end()); +} + +std::string FD::Convert(WordID const *b,WordID const* e) { + ostringstream o; + for (WordID const* i=b;ib) o << ' '; + o << FD::Convert(*i); + } + return o.str(); +} + +static int HexPairValue(const char * code) { + int value = 0; + const char * pch = code; + for (;;) { + int digit = *pch++; + if (digit >= '0' && digit <= '9') { + value += digit - '0'; + } + else if (digit >= 'A' && digit <= 'F') { + value += digit - 'A' + 10; + } + else if (digit >= 'a' && digit <= 'f') { + value += digit - 'a' + 10; + } + else { + return -1; + } + if (pch == code + 2) + return value; + value <<= 4; + } +} + +int UrlDecode(const char *source, char *dest) +{ + char * start = dest; + + while (*source) { + switch (*source) { + case '+': + *(dest++) = ' '; + break; + case '%': + if (source[1] && source[2]) { + int value = HexPairValue(source + 1); + if (value >= 0) { + *(dest++) = value; + source += 2; + } + else { + *dest++ = '?'; + } + } + else { + *dest++ = '?'; + } + break; + default: + *dest++ = *source; + } + source++; + } + + *dest = 0; + return dest - start; +} + +int UrlEncode(const char *source, char *dest, unsigned max) { + static const char *digits = "0123456789ABCDEF"; + unsigned char ch; + unsigned len = 0; + char *start = dest; + + while (len < max - 4 && *source) + { + ch = (unsigned char)*source; + if (*source == ' ') { + *dest++ = '+'; + } + else if (strchr("=:;,_| %", ch)) { + *dest++ = '%'; + *dest++ = digits[(ch >> 4) & 0x0F]; + *dest++ = digits[ ch & 0x0F]; + } + else { + *dest++ = *source; + } + source++; + } + *dest = 0; + return start - dest; +} + 
+std::string UrlDecodeString(const std::string & encoded) { + const char * sz_encoded = encoded.c_str(); + size_t needed_length = encoded.length(); + for (const char * pch = sz_encoded; *pch; pch++) { + if (*pch == '%') + needed_length += 2; + } + needed_length += 10; + char stackalloc[64]; + char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ? + (char *)malloc(needed_length) : stackalloc; + UrlDecode(encoded.c_str(), buf); + std::string result(buf); + if (buf != stackalloc) { + free(buf); + } + return result; +} + +std::string UrlEncodeString(const std::string & decoded) { + size_t needed_length = decoded.length() * 3 + 3; + char stackalloc[64]; + char * buf = needed_length > sizeof(stackalloc)/sizeof(*stackalloc) ? + (char *)malloc(needed_length) : stackalloc; + UrlEncode(decoded.c_str(), buf, needed_length); + std::string result(buf); + if (buf != stackalloc) { + free(buf); + } + return result; +} + +string FD::Escape(const string& s) { + return UrlEncodeString(s); +} + diff --git a/utils/fdict.h b/utils/fdict.h new file mode 100644 index 00000000..f9673023 --- /dev/null +++ b/utils/fdict.h @@ -0,0 +1,34 @@ +#ifndef _FDICT_H_ +#define _FDICT_H_ + +#include +#include +#include "dict.h" + +struct FD { + // once the FD is frozen, new features not already in the + // dictionary will return 0 + static void Freeze() { + frozen_ = true; + } + static inline int NumFeats() { + return dict_.max() + 1; + } + static inline WordID Convert(const std::string& s) { + return dict_.Convert(s, frozen_); + } + static inline const std::string& Convert(const WordID& w) { + return dict_.Convert(w); + } + static std::string Convert(WordID const *i,WordID const* e); + static std::string Convert(std::vector const& v); + + // Escape any string to a form that can be used as the name + // of a weight in a weights file + static std::string Escape(const std::string& s); + static Dict dict_; + private: + static bool frozen_; +}; + +#endif diff --git a/utils/feature_accum.h 
b/utils/feature_accum.h new file mode 100755 index 00000000..851b29db --- /dev/null +++ b/utils/feature_accum.h @@ -0,0 +1,129 @@ +#ifndef FEATURE_ACCUM_H +#define FEATURE_ACCUM_H + +#include "ff.h" +#include "sparse_vector.h" +#include "value_array.h" + +struct SparseFeatureAccumulator : public FeatureVector { + typedef FeatureVector State; + SparseFeatureAccumulator() { } + template + FeatureVector const& describe(FF const& ) { return *this; } + void Store(FeatureVector *fv) const { + fv->set_from(*this); + } + template + void Store(FF const& /* ff */,FeatureVector *fv) const { + fv->set_from(*this); + } + template + void Add(FF const& /* ff */,FeatureVector const& fv) { + (*this)+=fv; + } + void Add(FeatureVector const& fv) { + (*this)+=fv; + } + /* + SparseFeatureAccumulator(FeatureVector const& fv) : State(fv) {} + FeatureAccumulator(Features const& fids) {} + FeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fv) {} + void Add(Features const& fids,FeatureVector const& fv) { + *this += fv; + } + */ + void Add(int i,Featval v) { + (*this)[i]+=v; + } + void Add(Features const& fids,int i,Featval v) { + (*this)[i]+=v; + } +}; + +struct SingleFeatureAccumulator { + typedef Featval State; + typedef SingleFeatureAccumulator Self; + State v; + /* + void operator +=(State const& o) { + v+=o; + } + */ + void operator +=(Self const& s) { + v+=s.v; + } + SingleFeatureAccumulator() : v() {} + template + State const& describe(FF const& ) const { return v; } + + template + void Store(FF const& ff,FeatureVector *fv) const { + fv->set_value(ff.fid_,v); + } + void Store(Features const& fids,FeatureVector *fv) const { + assert(fids.size()==1); + fv->set_value(fids[0],v); + } + /* + SingleFeatureAccumulator(Features const& fids) { assert(fids.size()==1); } + SingleFeatureAccumulator(Features const& fids,FeatureVector const& fv) + { + assert(fids.size()==1); + v=fv.get_singleton(); + } + */ + + template + void Add(FF const& ff,FeatureVector const& fv) { + 
v+=fv.get(ff.fid_); + } + void Add(FeatureVector const& fv) { + v+=fv.get_singleton(); + } + + void Add(Features const& fids,FeatureVector const& fv) { + v += fv.get(fids[0]); + } + void Add(Featval dv) { + v+=dv; + } + void Add(int,Featval dv) { + v+=dv; + } + void Add(FeatureVector const& fids,int i,Featval dv) { + assert(fids.size()==1 && i==0); + v+=dv; + } +}; + + +#if 0 +// omitting this so we can default construct an accum. might be worth resurrecting in the future +struct ArrayFeatureAccumulator : public ValueArray { + typedef ValueArray State; + template + ArrayFeatureAccumulator(Fsa const& fsa) : State(fsa.features_.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids) : State(fids.size()) { } + ArrayFeatureAccumulator(Features const& fids,FeatureVector const& fv) : State(fids.size()) { + for (int i=0,e=iset_value(fids[i],(*this)[i]); + } + void Add(Features const& fids,FeatureVector const& fv) { + for (int i=0,e=i +#include "sparse_vector.h" +#include "fdict.h" + +typedef double Featval; +typedef SparseVectorList FeatureVectorList; +typedef SparseVector FeatureVector; +typedef SparseVector WeightVector; +typedef std::vector DenseWeightVector; + +inline void sparse_to_dense(WeightVector const& wv,DenseWeightVector *dv) { + wv.init_vector(dv); +} + +#endif diff --git a/utils/filelib.cc b/utils/filelib.cc new file mode 100644 index 00000000..79ad2847 --- /dev/null +++ b/utils/filelib.cc @@ -0,0 +1,22 @@ +#include "filelib.h" + +#include +#include + +using namespace std; + +bool FileExists(const std::string& fn) { + struct stat info; + int s = stat(fn.c_str(), &info); + return (s==0); +} + +bool DirectoryExists(const string& dir) { + if (access(dir.c_str(),0) == 0) { + struct stat status; + stat(dir.c_str(), &status); + if (status.st_mode & S_IFDIR) return true; + } + return false; +} + diff --git a/utils/filelib.h b/utils/filelib.h new file mode 100644 index 00000000..b9fef9a7 
--- /dev/null
+++ b/utils/filelib.h
@@ -0,0 +1,114 @@
+#ifndef _FILELIB_H_
+#define _FILELIB_H_
+
+#include <cassert>
+#include <string>
+#include <iostream>
+#include <cstdlib>
+#include <boost/shared_ptr.hpp>
+#include <stdexcept>
+#include "gzstream.h"
+#include "null_deleter.h"
+
+bool FileExists(const std::string& file_name);
+bool DirectoryExists(const std::string& dir_name);
+
+// reads from standard in if filename is -
+// uncompresses if file ends with .gz
+// otherwise, reads from a normal file
+
+// Shared-ownership handle around a stream; ReadFile/WriteFile fill it in.
+template <class Stream>
+struct BaseFile {
+  typedef Stream S;
+  typedef boost::shared_ptr<Stream> PS;
+  void Reset() {
+    ps_.reset();
+  }
+  bool is_null() const { return !ps_; }
+  operator bool() const {
+    return ps_;
+  }
+  S* stream() { return ps_.get(); }
+  S* operator->() { return ps_.get(); } // compat with old ReadFile * -> new Readfile. remove?
+  S &operator *() const { return get(); }
+  S &get() const { return *ps_; }
+  // true when the file name is "-", i.e. std::cin / std::cout is used
+  bool is_std() const {
+    return filename_=="-";
+  }
+  std::string filename_;
+protected:
+  // always throws std::runtime_error("File <filename> - <reason>")
+  void error(std::string const& reason,std::string const& filename) {
+    throw std::runtime_error("File "+filename+" - "+reason);
+  }
+
+  PS ps_;
+  // true if f is strictly longer than suf and ends with it
+  static bool EndsWith(const std::string& f, const std::string& suf) {
+    return (f.size() > suf.size()) && (f.rfind(suf) == f.size() - suf.size());
+  }
+};
+
+// Input stream chosen by file name: "-" is std::cin, a name ending in ".gz"
+// is read through igzstream, anything else through std::ifstream.
+class ReadFile : public BaseFile<std::istream> {
+ public:
+  ReadFile() { }
+  explicit ReadFile(const std::string& filename) {
+    Init(filename);
+  }
+  // Opens filename; throws std::runtime_error if it is missing or cannot be opened.
+  void Init(const std::string& filename) {
+    filename_=filename;
+    if (is_std()) {
+      ps_=PS(&std::cin,null_deleter());
+    } else {
+      if (!FileExists(filename)) {
+        std::cerr << "File does not exist: " << filename << std::endl;
+        // error() takes (reason, filename) and never returns
+        error("couldn't read nonexistant file.",filename);
+      }
+      char const* file=filename_.c_str(); // just in case the gzstream keeps using the filename for longer than the constructor, e.g. inflateReset2. warning in valgrind that I'm hoping will disappear - it makes no sense.
+      ps_=PS(EndsWith(filename, ".gz") ?
+             static_cast<std::istream*>(new igzstream(file)) :
+             static_cast<std::istream*>(new std::ifstream(file)));
+      if (!*ps_) {
+        std::cerr << "Failed to open " << filename << std::endl;
+        error("open for reading failed.",filename);
+      }
+    }
+  }
+
+};
+
+// Output counterpart of ReadFile: "-" is std::cout, a name ending in ".gz" is
+// written through ogzstream, anything else through std::ofstream.
+class WriteFile : public BaseFile<std::ostream> {
+ public:
+  WriteFile() {}
+  explicit WriteFile(std::string const& filename) { Init(filename); }
+  // Opens filename for writing; throws std::runtime_error if that fails.
+  void Init(const std::string& filename) {
+    filename_=filename;
+    if (is_std()) {
+      ps_=PS(&std::cout,null_deleter());
+    } else {
+      char const* file=filename_.c_str(); // just in case the gzstream keeps using the filename for longer than the constructor, e.g. inflateReset2. warning in valgrind that I'm hoping will disappear - it makes no sense.
+      ps_=PS(EndsWith(filename, ".gz") ?
+             static_cast<std::ostream*>(new ogzstream(file)) :
+             static_cast<std::ostream*>(new std::ofstream(file)));
+      if (!*ps_) {
+        std::cerr << "Failed to open " << filename << std::endl;
+        error("open for writing failed.",filename);
+      }
+    }
+  }
+  ~WriteFile() {
+    if (ps_)
+      get() << std::flush;
+  }
+};
+
+#endif
diff --git a/utils/gzstream.cc b/utils/gzstream.cc
new file mode 100644
index 00000000..88cd1bd2
--- /dev/null
+++ b/utils/gzstream.cc
@@ -0,0 +1,182 @@
+// ============================================================================
+// gzstream, C++ iostream classes wrapping the zlib compression library.
+// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner
+//
+// This library is free software; you can redistribute it and/or
+// modify it under the terms of the GNU Lesser General Public
+// License as published by the Free Software Foundation; either
+// version 2.1 of the License, or (at your option) any later version.
+//
+// This library is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+// Lesser General Public License for more details.
+// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.C +// Revision : $Revision: 1.7 $ +// Revision_date : $Date: 2003/01/08 14:41:27 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". +// ============================================================================ + +#include +#include +#include // for memcpy +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See header file for user classes. +// ---------------------------------------------------------------------------- + +// -------------------------------------- +// class gzstreambuf: +// -------------------------------------- + +gzstreambuf* gzstreambuf::open( const char* name, int open_mode) { + if ( is_open()) + return (gzstreambuf*)0; + mode = open_mode; + // no append nor read/write mode + if ((mode & std::ios::ate) || (mode & std::ios::app) + || ((mode & std::ios::in) && (mode & std::ios::out))) + return (gzstreambuf*)0; + const int Nmode=10; + char fmode[Nmode]; + char* fmodeptr = fmode; + if ( mode & std::ios::in) + *fmodeptr++ = 'r'; + else if ( mode & std::ios::out) + *fmodeptr++ = 'w'; + *fmodeptr++ = 'b'; + while (fmodeptr( gptr()); + + if ( ! (mode & std::ios::in) || ! 
opened) + return EOF; + // Josuttis' implementation of inbuf + int n_putback = gptr() - eback(); + if ( n_putback > 4) + n_putback = 4; + std::memcpy( buffer + (4 - n_putback), gptr() - n_putback, n_putback); + + int num = gzread( file, buffer+4, bufferSize-4); + if (num <= 0) // ERROR or EOF + { + if (gzeof(file)) + return EOF; + handle_gzerror(); + } + + // reset buffer pointers + setg( buffer + (4 - n_putback), // beginning of putback area + buffer + 4, // read position + buffer + 4 + num); // end of buffer + + // return next character + return * reinterpret_cast( gptr()); +} + +int gzstreambuf::flush_buffer() { + // Separate the writing of the buffer from overflow() and + // sync() operation. + int w = pptr() - pbase(); + if ( gzwrite( file, pbase(), w) != w) + handle_gzerror(); + pbump( -w); + return w; +} + +int gzstreambuf::overflow( int c) { // used for output buffer only + if ( ! ( mode & std::ios::out) || ! opened) + return EOF; + if (c != EOF) { + *pptr() = c; + pbump(1); + } + if ( flush_buffer() == EOF) + return EOF; + return c; +} + +int gzstreambuf::sync() { + // Changed to use flush_buffer() instead of overflow( EOF) + // which caused improper behavior with std::endl and flush(), + // bug reported by Vincent Ricard. + if ( pptr() && pptr() > pbase()) { + if ( flush_buffer() == EOF) + return -1; + } + return 0; +} + +// -------------------------------------- +// class gzstreambase: +// -------------------------------------- + +gzstreambase::gzstreambase( const char* name, int mode) { + init( &buf); + open( name, mode); +} + +gzstreambase::~gzstreambase() { + buf.close(); +} + +void gzstreambase::open( const char* name, int open_mode) { + if ( ! buf.open( name, open_mode)) + clear( rdstate() | std::ios::badbit); +} + +void gzstreambase::close() { + if ( buf.is_open()) + if ( ! 
buf.close()) + clear( rdstate() | std::ios::badbit); +} + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +// ============================================================================ +// EOF // diff --git a/utils/gzstream.h b/utils/gzstream.h new file mode 100644 index 00000000..a7effd90 --- /dev/null +++ b/utils/gzstream.h @@ -0,0 +1,127 @@ +// ============================================================================ +// gzstream, C++ iostream classes wrapping the zlib compression library. +// Copyright (C) 2001 Deepak Bandyopadhyay, Lutz Kettner +// +// This library is free software; you can redistribute it and/or +// modify it under the terms of the GNU Lesser General Public +// License as published by the Free Software Foundation; either +// version 2.1 of the License, or (at your option) any later version. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +// Lesser General Public License for more details. +// +// You should have received a copy of the GNU Lesser General Public +// License along with this library; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +// ============================================================================ +// +// File : gzstream.h +// Revision : $Revision: 1.5 $ +// Revision_date : $Date: 2002/04/26 23:30:15 $ +// Author(s) : Deepak Bandyopadhyay, Lutz Kettner +// +// Standard streambuf implementation following Nicolai Josuttis, "The +// Standard C++ Library". 
+// ============================================================================ + +#ifndef GZSTREAM_H +#define GZSTREAM_H 1 + +// standard C++ with new header file names and std:: namespace +#include +#include +#include + +#ifdef GZSTREAM_NAMESPACE +namespace GZSTREAM_NAMESPACE { +#endif + +// ---------------------------------------------------------------------------- +// Internal classes to implement gzstream. See below for user classes. +// ---------------------------------------------------------------------------- + +class gzstreambuf : public std::streambuf { +private: + static const int bufferSize = 47+(1024*256); // size of data buff + // totals 512 bytes under g++ for igzstream at the end. + + gzFile file; // file handle for compressed file + char buffer[bufferSize]; // data buffer + char opened; // open/close state of stream + int mode; // I/O mode + + int flush_buffer(); + void handle_gzerror(); // throws exception +public: +#if defined(_WIN32) && !defined(CYGWIN) && !defined(EOF) + enum { + EOF = -1 + }; +#endif + gzstreambuf() : opened(0) { + setp( buffer, buffer + (bufferSize-1)); + setg( buffer + 4, // beginning of putback area + buffer + 4, // read position + buffer + 4); // end position + // ASSERT: both input & output capabilities will not be used together + } + int is_open() { return opened; } + gzstreambuf* open( const char* name, int open_mode); + gzstreambuf* close(); + ~gzstreambuf() { close(); } + + virtual int overflow( int c = EOF); + virtual int underflow(); + virtual int sync(); +}; + +class gzstreambase : virtual public std::ios { +protected: + gzstreambuf buf; +public: + gzstreambase() { init(&buf); } + gzstreambase( const char* name, int open_mode); + ~gzstreambase(); + void open( const char* name, int open_mode); + void close(); + gzstreambuf* rdbuf() { return &buf; } +}; + +// ---------------------------------------------------------------------------- +// User classes. 
Use igzstream and ogzstream analogously to ifstream and +// ofstream respectively. They read and write files based on the gz* +// function interface of the zlib. Files are compatible with gzip compression. +// ---------------------------------------------------------------------------- + +class igzstream : public gzstreambase, public std::istream { +public: + igzstream() : std::istream( &buf) {} + igzstream( const char* name, int open_mode = std::ios::in) + : gzstreambase( name, std::ios::in | open_mode), std::istream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::in) { + gzstreambase::open( name, open_mode); + } +}; + +class ogzstream : public gzstreambase, public std::ostream { +public: + ogzstream() : std::ostream( &buf) {} + ogzstream( const char* name, int mode = std::ios::out) + : gzstreambase( name, mode), std::ostream( &buf) {} + gzstreambuf* rdbuf() { return gzstreambase::rdbuf(); } + void open( const char* name, int open_mode = std::ios::out) { + gzstreambase::open( name, open_mode); + } +}; + +#ifdef GZSTREAM_NAMESPACE +} // namespace GZSTREAM_NAMESPACE +#endif + +#endif // GZSTREAM_H +// ============================================================================ +// EOF // + diff --git a/utils/hash.h b/utils/hash.h new file mode 100755 index 00000000..3a60a429 --- /dev/null +++ b/utils/hash.h @@ -0,0 +1,54 @@ +#ifndef CDEC_HASH_H +#define CDEC_HASH_H + +#include "murmur_hash.h" + +#include "config.h" +#ifdef HAVE_SPARSEHASH +# include +# define HASH_MAP google::dense_hash_map +# define HASH_MAP_RESERVED(h,empty,deleted) do { h.set_empty_key(empty); h.set_deleted_key(deleted); } while(0) +# define HASH_MAP_EMPTY(h,empty) do { h.set_empty_key(empty); } while(0) +#else +# include +# define HASH_MAP std::tr1::unordered_map +# define HASH_MAP_RESERVED(h,empty,deleted) +# define HASH_MAP_EMPTY(h,empty) +#endif + +#include + +// assumes C is POD +template +struct murmur_hash +{ + 
typedef MurmurInt return_type; + typedef C /*const&*/ argument_type; + return_type operator()(argument_type const& c) const { + return MurmurHash((void*)&c,sizeof(c)); + } +}; + +// murmur_hash_array isn't std guaranteed safe (you need to use string::data()) +template <> +struct murmur_hash +{ + typedef MurmurInt return_type; + typedef std::string /*const&*/ argument_type; + return_type operator()(argument_type const& c) const { + return MurmurHash(c.data(),c.size()); + } +}; + +// uses begin(),size() assuming contiguous layout and POD +template +struct murmur_hash_array +{ + typedef MurmurInt return_type; + typedef C /*const&*/ argument_type; + return_type operator()(argument_type const& c) const { + return MurmurHash(&*c.begin(),c.size()*sizeof(*c.begin())); + } +}; + +#endif diff --git a/utils/have_64_bits.h b/utils/have_64_bits.h new file mode 100755 index 00000000..d1e6064f --- /dev/null +++ b/utils/have_64_bits.h @@ -0,0 +1,17 @@ +#ifndef HAVE_64_BITS_H +#define HAVE_64_BITS_H + +#include + +#undef HAVE_64_BITS + +#if INTPTR_MAX == INT32_MAX +# define HAVE_64_BITS 0 +#elif INTPTR_MAX >= INT64_MAX +# define HAVE_64_BITS 1 +#else +# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX" +#endif + + +#endif diff --git a/utils/int_or_pointer.h b/utils/int_or_pointer.h new file mode 100755 index 00000000..4b6a9e4a --- /dev/null +++ b/utils/int_or_pointer.h @@ -0,0 +1,70 @@ +#ifndef INT_OR_POINTER_H +#define INT_OR_POINTER_H + +// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd. + +// check lsb for expected tag? 
+#ifndef IOP_CHECK_LSB +# define IOP_CHECK_LSB 1 +#endif +#if IOP_CHECK_LSB +# define iop_assert(x) assert(x) +#else +# define iop_assert(x) +#endif + +#include +#include + +template +struct IntOrPointer { + typedef Pointed pointed_type; + typedef Int integer_type; + typedef Pointed *value_type; + typedef IntOrPointer self_type; + IntOrPointer(int j) { *this=j; } + IntOrPointer(size_t j) { *this=j; } + IntOrPointer(value_type v) { *this=v; } + bool is_integer() const { return i&1; } + bool is_pointer() const { return !(i&1); } + value_type & pointer() { return p; } + const value_type & pointer() const { iop_assert(is_pointer()); return p; } + integer_type integer() const { iop_assert(is_integer()); return i >> 1; } + void set_integer(Int j) { i=2*j+1; } + void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); } + void operator=(unsigned j) { i = 2*(integer_type)j+1; } + void operator=(int j) { i = 2*(integer_type)j+1; } + template + void operator=(C j) { i = 2*(integer_type)j+1; } + void operator=(value_type v) { p=v; } + IntOrPointer() {} + IntOrPointer(const self_type &s) : p(s.p) {} + void operator=(const self_type &s) { p=s.p; } + template + bool operator ==(C* v) const { return p==v; } + template + bool operator ==(const C* v) const { return p==v; } + template + bool operator ==(C j) const { return integer() == j; } + bool operator ==(self_type s) const { return p==s.p; } + bool operator !=(self_type s) const { return p!=s.p; } + template void print(O&o) const + { + if (is_integer()) + o << integer(); + else { + o << "0x" << std::hex << (size_t)pointer() << std::dec; + } + } + friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) { + s.print(o); return o; + } +protected: + union { + value_type p; // must be even (guaranteed unless you're pointing at packed chars) + integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type + }; +}; + + +#endif diff --git 
a/utils/intrusive_refcount.hpp b/utils/intrusive_refcount.hpp
new file mode 100755
index 00000000..4a4b0187
--- /dev/null
+++ b/utils/intrusive_refcount.hpp
@@ -0,0 +1,90 @@
+#ifndef GRAEHL__SHARED__INTRUSIVE_REFCOUNT_HPP
+#define GRAEHL__SHARED__INTRUSIVE_REFCOUNT_HPP
+
+#include <boost/intrusive_ptr.hpp>
+#include <boost/noncopyable.hpp>
+#include <boost/detail/atomic_count.hpp>
+#include <cassert>
+
+/** usage:
+    struct mine : public boost::intrusive_refcount<mine> {};
+
+    boost::intrusive_ptr<mine> p(new mine());
+*/
+
+namespace boost {
+// note: the free functions need to be in boost namespace, OR namespace of involved type. this is the only way to do it.
+
+template<class T>
+class intrusive_refcount;
+
+template<class T>
+class atomic_intrusive_refcount;
+
+// hook used by boost::intrusive_ptr when it takes a new reference to *ptr
+template<class T>
+void intrusive_ptr_add_ref(intrusive_refcount<T>* ptr)
+{
+  ++(ptr->refs);
+}
+
+// hook used by boost::intrusive_ptr when it drops a reference to *ptr;
+// deletes the object (as a T) once the count reaches zero
+template<class T>
+void intrusive_ptr_release(intrusive_refcount<T>* ptr)
+{
+  if (!--(ptr->refs)) delete static_cast<T*>(ptr);
+}
+
+
+// Base class that gives T a plain (non-atomic) reference count; derive as in the usage note.
+//WARNING: only 2^32 (unsigned) refs allowed.  hope that's ok :)
+template<class T>
+class intrusive_refcount : boost::noncopyable
+{
+ protected:
+//  typedef intrusive_refcount<T> pointed_type;
+  friend void intrusive_ptr_add_ref<T>(intrusive_refcount<T>* ptr);
+  friend void intrusive_ptr_release<T>(intrusive_refcount<T>* ptr);
+//  friend class intrusive_ptr<T>;
+
+  intrusive_refcount(): refs(0) {}
+  ~intrusive_refcount() { assert(refs==0); }
+
+private:
+  unsigned refs;
+};
+
+
+// the same two hooks for atomic_intrusive_refcount
+template<class T>
+void intrusive_ptr_add_ref(atomic_intrusive_refcount<T>* ptr)
+{
+  ++(ptr->refs);
+}
+
+template<class T>
+void intrusive_ptr_release(atomic_intrusive_refcount<T>* ptr)
+{
+  if(!--(ptr->refs)) delete static_cast<T*>(ptr);
+}
+
+// Like intrusive_refcount, but the count is a boost::detail::atomic_count.
+template<class T>
+class atomic_intrusive_refcount : boost::noncopyable
+{
+ protected:
+  friend void intrusive_ptr_add_ref<T>(atomic_intrusive_refcount<T>* ptr);
+  friend void intrusive_ptr_release<T>(atomic_intrusive_refcount<T>* ptr);
+
+  atomic_intrusive_refcount(): refs(0) {}
+  ~atomic_intrusive_refcount() { assert(refs==0); }
+
+private:
+  boost::detail::atomic_count refs;
+};
+
+}
+
+
+#endif
diff --git a/utils/logval.h b/utils/logval.h
new file mode 100644
index
00000000..37f14ae5 --- /dev/null +++ b/utils/logval.h @@ -0,0 +1,174 @@ +#ifndef LOGVAL_H_ +#define LOGVAL_H_ + +#define LOGVAL_CHECK_NEG false + +#include +#include +#include +#include + +template +class LogVal { + public: + LogVal() : s_(), v_(-std::numeric_limits::infinity()) {} + explicit LogVal(double x) : s_(std::signbit(x)), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(int x) : s_(x<0), v_(s_ ? std::log(-x) : std::log(x)) {} + LogVal(unsigned x) : s_(0), v_(std::log(x)) { } + LogVal(double lnx,bool sign) : s_(sign),v_(lnx) {} + static LogVal exp(T lnx) { return LogVal(lnx,false); } + + static LogVal One() { return LogVal(1); } + static LogVal Zero() { return LogVal(); } + static LogVal e() { return LogVal(1,false); } + void logeq(const T& v) { s_ = false; v_ = v; } + + LogVal& operator+=(const LogVal& a) { + if (a.v_ == -std::numeric_limits::infinity()) return *this; + if (a.s_ == s_) { + if (a.v_ < v_) { + v_ = v_ + log1p(std::exp(a.v_ - v_)); + } else { + v_ = a.v_ + log1p(std::exp(v_ - a.v_)); + } + } else { + if (a.v_ < v_) { + v_ = v_ + log1p(-std::exp(a.v_ - v_)); + } else { + v_ = a.v_ + log1p(-std::exp(v_ - a.v_)); + s_ = !s_; + } + } + return *this; + } + + LogVal& operator*=(const LogVal& a) { + s_ = (s_ != a.s_); + v_ += a.v_; + return *this; + } + + LogVal& operator/=(const LogVal& a) { + s_ = (s_ != a.s_); + v_ -= a.v_; + return *this; + } + + LogVal& operator-=(const LogVal& a) { + LogVal b = a; + b.invert(); + return *this += b; + } + + // LogVal(fabs(log(x)),x.s_) + friend LogVal abslog(LogVal x) { + if (x.v_<0) x.v_=-x.v_; + return x; + } + + LogVal& poweq(const T& power) { +#if LOGVAL_CHECK_NEG + if (s_) { + std::cerr << "poweq(T) not implemented when s_ is true\n"; + std::abort(); + } else +#endif + v_ *= power; + return *this; + } + + void invert() { s_ = !s_; } + + LogVal pow(const T& power) const { + LogVal res = *this; + res.poweq(power); + return res; + } + + LogVal root(const T& root) const { + return pow(1/root); + } + + operator 
T() const { + if (s_) return -std::exp(v_); else return std::exp(v_); + } + + bool s_; + T v_; +}; + +// copy elision - as opposed to explicit copy of LogVal const& o1, we should be able to construct Logval r=a+(b+c) as a single result in place in r. todo: return std::move(o1) - C++0x +template +LogVal operator+(LogVal o1, const LogVal& o2) { + o1 += o2; + return o1; +} + +template +LogVal operator*(LogVal o1, const LogVal& o2) { + o1 *= o2; + return o1; +} + +template +LogVal operator/(LogVal o1, const LogVal& o2) { + o1 /= o2; + return o1; +} + +template +LogVal operator-(LogVal o1, const LogVal& o2) { + o1 -= o2; + return o1; +} + +template +T log(const LogVal& o) { +#ifdef LOGVAL_CHECK_NEG + if (o.s_) return log(-1.0); +#endif + return o.v_; +} + +template +LogVal pow(const LogVal& b, const T& e) { + return b.pow(e); +} + +template +bool operator<(const LogVal& lhs, const LogVal& rhs) { + if (lhs.s_ == rhs.s_) { + return (lhs.v_ < rhs.v_); + } else { + return lhs.s_ > rhs.s_; + } +} + +#if 0 +template +bool operator<=(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ <= rhs.v_); +} + +template +bool operator>(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ > rhs.v_); +} + +template +bool operator>=(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ >= rhs.v_); +} +#endif + +template +bool operator==(const LogVal& lhs, const LogVal& rhs) { + return (lhs.v_ == rhs.v_) && (lhs.s_ == rhs.s_); +} + +template +bool operator!=(const LogVal& lhs, const LogVal& rhs) { + return !(lhs == rhs); +} + +#endif diff --git a/utils/logval_test.cc b/utils/logval_test.cc new file mode 100644 index 00000000..1a23177d --- /dev/null +++ b/utils/logval_test.cc @@ -0,0 +1,73 @@ +#include "logval.h" + +#include +#include + +class LogValTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +using namespace std; + +TEST_F(LogValTest,Order) { + LogVal a(-0.3); + LogVal b(0.3); + LogVal c(2.4); + EXPECT_LT(a,b); + 
EXPECT_LT(b,c); + EXPECT_LT(a,c); + EXPECT_FALSE(b < a); + EXPECT_FALSE(c < a); + EXPECT_FALSE(c < b); + EXPECT_FALSE(c < c); + EXPECT_FALSE(b < b); + EXPECT_FALSE(a < a); +} + +TEST_F(LogValTest,Invert) { + LogVal x(-2.4); + LogVal y(2.4); + y.invert(); + EXPECT_FLOAT_EQ(x,y); +} + +TEST_F(LogValTest,Minus) { + LogVal x(12); + LogVal y(2); + LogVal z1 = x - y; + LogVal z2 = x; + z2 -= y; + EXPECT_FLOAT_EQ(z1, z2); + EXPECT_FLOAT_EQ(z1, 10.0); + EXPECT_FLOAT_EQ(y - x, -10.0); +} + +TEST_F(LogValTest,TestOps) { + LogVal x(-12.12); + LogVal y(x); + cerr << x << endl; + cerr << (x*y) << endl; + cerr << (x*y + x) << endl; + cerr << (x + x*y) << endl; + cerr << log1p(-0.5) << endl; + LogVal aa(0.2); + LogVal bb(-0.3); + cerr << (aa + bb) << endl; + cerr << (bb + aa) << endl; + EXPECT_FLOAT_EQ((aa + bb), (bb + aa)); + EXPECT_FLOAT_EQ((aa + bb), -0.1); +} + +TEST_F(LogValTest,TestSizes) { + cerr << sizeof(LogVal) << endl; + cerr << sizeof(LogVal) << endl; + cerr << sizeof(void*) << endl; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/utils/murmur_hash.h b/utils/murmur_hash.h new file mode 100755 index 00000000..8dbd7807 --- /dev/null +++ b/utils/murmur_hash.h @@ -0,0 +1,186 @@ +#ifndef _MURMUR_HASH_H_ +#define _MURMUR_HASH_H_ + +//NOTE: quite fast, nice collision properties, but endian dependent hash values + +#include "have_64_bits.h" +typedef uintptr_t MurmurInt; + +// MurmurHash2, by Austin Appleby + +static const uint32_t DEFAULT_SEED=2654435769U; + +#if HAVE_64_BITS +//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); + +inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) +{ + const uint64_t m = 0xc6a4a7935bd1e995; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= 
k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return (uint32_t) MurmurHash64(key,len,seed); +} + +inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return MurmurHash64(key,len,seed); +} + +#else +// 32-bit + +// Note - This code makes a few assumptions about how your machine behaves - +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + uint32_t k = *(uint32_t *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. 
+ + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { + return MurmurHash32(key,len,seed); +} + +// 64-bit hash for 32-bit platforms + +inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + const uint32_t m = 0x5bd1e995; + const int r = 24; + + uint32_t h1 = seed ^ len; + uint32_t h2 = 0; + + const uint32_t * data = (const uint32_t *)key; + + while(len >= 8) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; + len -= 4; + } + + if(len >= 4) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + } + + switch(len) + { + case 3: h2 ^= ((unsigned char*)data)[2] << 16; + case 2: h2 ^= ((unsigned char*)data)[1] << 8; + case 1: h2 ^= ((unsigned char*)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; h1 *= m; + h2 ^= h1 >> 22; h2 *= m; + h1 ^= h2 >> 17; h1 *= m; + h2 ^= h1 >> 19; h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} + +#endif +//32bit + +#endif diff --git a/utils/null_deleter.h b/utils/null_deleter.h new file mode 100755 index 00000000..082ab453 --- /dev/null +++ b/utils/null_deleter.h @@ -0,0 +1,9 @@ +#ifndef NULL_DELETER_H +#define NULL_DELETER_H + +struct null_deleter { + void operator()(void*) const {} + void operator()(void const*) const {} +}; + +#endif diff --git a/utils/prob.h b/utils/prob.h new file mode 100644 index 00000000..bc297870 --- /dev/null +++ b/utils/prob.h @@ -0,0 +1,8 @@ +#ifndef _PROB_H_ +#define _PROB_H_ + +#include "logval.h" + +typedef LogVal prob_t; + +#endif diff --git a/utils/sampler.h b/utils/sampler.h new file mode 100644 index 00000000..5fef45d0 --- /dev/null +++ b/utils/sampler.h @@ -0,0 +1,147 @@ +#ifndef SAMPLER_H_ +#define SAMPLER_H_ + +#include +#include +#include +#include +#include +#include +#include + 
+#include +#include +#include +#include +#include +#include + +#include "prob.h" + +struct SampleSet; + +template +struct RandomNumberGenerator { + static uint32_t GetTrulyRandomSeed() { + uint32_t seed; + std::ifstream r("/dev/urandom"); + if (r) { + r.read((char*)&seed,sizeof(uint32_t)); + } + if (r.fail() || !r) { + std::cerr << "Warning: could not read from /dev/urandom. Seeding from clock" << std::endl; + seed = std::time(NULL); + } + std::cerr << "Seeding random number sequence to " << seed << std::endl; + return seed; + } + + RandomNumberGenerator() : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { + uint32_t seed = GetTrulyRandomSeed(); + m_generator.seed(seed); + } + explicit RandomNumberGenerator(uint32_t seed) : m_dist(0,1), m_generator(), m_random(m_generator,m_dist) { + if (!seed) seed = GetTrulyRandomSeed(); + m_generator.seed(seed); + } + + size_t SelectSample(const prob_t& a, const prob_t& b, double T = 1.0) { + if (T == 1.0) { + if (this->next() > (a / (a + b))) return 1; else return 0; + } else { + assert(!"not implemented"); + } + } + + // T is the annealing temperature, if desired + size_t SelectSample(const SampleSet& ss, double T = 1.0); + + // draw a value from U(0,1) + double next() {return m_random();} + + // draw a value from N(mean,var) + double NextNormal(double mean, double var) { + return boost::normal_distribution(mean, var)(m_random); + } + + // draw a value from a Poisson distribution + // lambda must be greater than 0 + int NextPoisson(int lambda) { + return boost::poisson_distribution(lambda)(m_random); + } + + bool AcceptMetropolisHastings(const prob_t& p_cur, + const prob_t& p_prev, + const prob_t& q_cur, + const prob_t& q_prev) { + const prob_t a = (p_cur / p_prev) * (q_prev / q_cur); + if (log(a) >= 0.0) return true; + return (prob_t(this->next()) < a); + } + + RNG &gen() { return m_generator; } + typedef boost::variate_generator > IntRNG; + IntRNG inclusive(int low,int high_incl) { + assert(high_incl>=low); + 
return IntRNG(m_generator,boost::uniform_int<>(low,high_incl)); + } + + private: + boost::uniform_real<> m_dist; + RNG m_generator; + boost::variate_generator > m_random; +}; + +typedef RandomNumberGenerator MT19937; + +class SampleSet { + public: + const prob_t& operator[](int i) const { return m_scores[i]; } + prob_t& operator[](int i) { return m_scores[i]; } + bool empty() const { return m_scores.empty(); } + void add(const prob_t& s) { m_scores.push_back(s); } + void clear() { m_scores.clear(); } + size_t size() const { return m_scores.size(); } + void resize(int size) { m_scores.resize(size); } + std::vector m_scores; +}; + +template +size_t RandomNumberGenerator::SelectSample(const SampleSet& ss, double T) { + assert(T > 0.0); + assert(ss.m_scores.size() > 0); + if (ss.m_scores.size() == 1) return 0; + const prob_t annealing_factor(1.0 / T); + const bool anneal = (annealing_factor != prob_t::One()); + prob_t sum = prob_t::Zero(); + if (anneal) { + for (int i = 0; i < ss.m_scores.size(); ++i) + sum += ss.m_scores[i].pow(annealing_factor); // p^(1/T) + } else { + sum = std::accumulate(ss.m_scores.begin(), ss.m_scores.end(), prob_t::Zero()); + } + //for (size_t i = 0; i < ss.m_scores.size(); ++i) std::cerr << ss.m_scores[i] << ","; + //std::cerr << std::endl; + + prob_t random(this->next()); // random number between 0 and 1 + random *= sum; // scale with normalization factor + //std::cerr << "Random number " << random << std::endl; + + //now figure out which sample + size_t position = 1; + sum = ss.m_scores[0]; + if (anneal) { + sum.poweq(annealing_factor); + for (; position < ss.m_scores.size() && sum < random; ++position) + sum += ss.m_scores[position].pow(annealing_factor); + } else { + for (; position < ss.m_scores.size() && sum < random; ++position) + sum += ss.m_scores[position]; + } + //std::cout << "random: " << random << " sample: " << position << std::endl; + //std::cerr << "Sample: " << position-1 << std::endl; + //exit(1); + return position-1; +} + 
#ifndef _SMALL_VECTOR_H_
#define _SMALL_VECTOR_H_

/* SmallVector<T,SV_MAX>: a vector that keeps up to SV_MAX elements inline and
   switches to a heap array once it grows beyond that.

   REQUIRES that T is POD (can be memcpy'd); the inline storage lives in a
   union, so types with meaningful constructors/destructors are not supported.
   At most 65535 elements can be held because size and capacity are 16-bit.
   May not work if SV_MAX==0.
 */

#define SMALL_VECTOR_POD 1

#include <algorithm>  // std::max, std::min
#include <cassert>
#include <cstring>
#include <new>
#include <stddef.h>
#include <stdint.h>

template <class T, int SV_MAX = 2>
class SmallVector {
 public:
  typedef SmallVector<T,SV_MAX> Self;
  typedef T const* const_iterator;
  typedef T* iterator;
  typedef T value_type;
  typedef T &reference;
  typedef T const& const_reference;

  SmallVector() : size_(0), capacity_(0) {}

  T *begin() { return size_>SV_MAX?data_.ptr:data_.vals; }
  T const* begin() const { return const_cast<Self*>(this)->begin(); }
  T *end() { return begin()+size_; }
  T const* end() const { return begin()+size_; }

  // s value-initialised elements
  explicit SmallVector(size_t s) : size_(static_cast<uint16_t>(s)), capacity_(0) {
    assert(s < 0xA000);
    if (size_ <= SV_MAX) {
      for (size_t i = 0; i < s; ++i) new(&data_.vals[i]) T();
    } else {
      capacity_ = size_;
      data_.ptr = new T[s]();  // () value-initialises every element once
    }
  }

  // s copies of v
  SmallVector(size_t s, T const& v) : size_(static_cast<uint16_t>(s)), capacity_(0) {
    assert(s < 0xA000);
    if (size_ > SV_MAX) {
      capacity_ = size_;
      data_.ptr = new T[s];
    }
    T* dst = begin();
    for (size_t i = 0; i < s; ++i) dst[i] = v;
  }

  SmallVector(const Self& o) : size_(o.size_), capacity_(0) {
    if (size_ <= SV_MAX) {
      std::memcpy(data_.vals,o.data_.vals,size_*sizeof(T));
    } else {
      capacity_ = size_;
      data_.ptr = new T[capacity_];
      std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(T));
    }
  }

  const Self& operator=(const Self& o) {
    if (this == &o) return *this;
    if (o.size_ <= SV_MAX) {
      if (size_ > SV_MAX) delete[] data_.ptr;
      size_ = o.size_;
      // only the o.size_ live elements are copied
      std::memcpy(data_.vals, o.data_.vals, size_ * sizeof(T));
    } else {
      if (size_ <= SV_MAX || capacity_ < o.size_) {
        // allocate before releasing so a failed allocation leaves *this intact
        T* fresh = new T[o.size_];
        if (size_ > SV_MAX) delete[] data_.ptr;
        data_.ptr = fresh;
        capacity_ = o.size_;
      }
      size_ = o.size_;
      std::memcpy(data_.ptr, o.data_.ptr, size_ * sizeof(T));
    }
    return *this;
  }

  ~SmallVector() {
    // inline elements are POD: nothing to destroy
    if (size_ > SV_MAX) delete[] data_.ptr;
  }

  void clear() {
    if (size_ > SV_MAX) {
      delete[] data_.ptr;
    }
    size_ = 0;
  }

  bool empty() const { return size_ == 0; }
  size_t size() const { return size_; }

  // Grows the heap array so it can hold at least min_size elements.  Only
  // valid while the elements live on the heap (data_.ptr is the active union
  // member).  Capacity doubles, saturating at the 16-bit maximum.
  inline void ensure_capacity(uint16_t min_size) {
    assert(min_size > SV_MAX);
    if (min_size <= capacity_) return;
    const unsigned doubled = std::min<unsigned>(static_cast<unsigned>(capacity_) * 2u, 0xFFFFu);
    const uint16_t new_cap = static_cast<uint16_t>(std::max<unsigned>(doubled, min_size));
    T* tmp = new T[new_cap];
    std::memcpy(tmp, data_.ptr, std::min(size_, capacity_) * sizeof(T));
    delete[] data_.ptr;
    data_.ptr = tmp;
    capacity_ = new_cap;
  }

private:
  // inline storage -> heap array (capacity 2*SV_MAX); size_ is left untouched
  inline void copy_vals_to_ptr() {
    capacity_ = SV_MAX * 2;
    T* tmp = new T[capacity_];
    for (int i = 0; i < size_; ++i) tmp[i] = data_.vals[i];
    data_.ptr = tmp;
  }
  // heap array -> inline storage; size_ must already fit
  inline void ptr_to_small() {
    assert(size_<=SV_MAX);
    T *tmp=data_.ptr;  // saved first: writing data_.vals overwrites data_.ptr
    for (int i=0;i<size_;++i)
      data_.vals[i]=tmp[i];
    delete[] tmp;
  }

public:
  inline void push_back(T const& v) {
    assert(size_ < 0xFFFF);
    if (size_ < SV_MAX) {
      data_.vals[size_] = v;
      ++size_;
      return;
    } else if (size_ == SV_MAX) {
      copy_vals_to_ptr();
    } else if (size_ == capacity_) {
      ensure_capacity(size_ + 1);
    }
    data_.ptr[size_] = v;
    ++size_;
  }

  T& back() { return this->operator[](size_ - 1); }
  const T& back() const { return this->operator[](size_ - 1); }
  T& front() { return this->operator[](0); }
  const T& front() const { return this->operator[](0); }

  void pop_back() {
    assert(size_>0);
    --size_;
    if (size_==SV_MAX)
      ptr_to_small();
  }

  void compact() {
    compact(size_);
  }

  // Truncates to `size` elements; size must be <= size()
  void compact(uint16_t size) {
    assert(size<=size_);
    if (size_>SV_MAX) {
      size_=size;
      if (size<=SV_MAX)
        ptr_to_small();
    } else
      size_=size;
  }

  // Resizes to s elements; new elements (if any) are copies of v.
  void resize(size_t s, T v = T()) {
    assert(s <= 0xFFFFu);
    if (s <= static_cast<size_t>(SV_MAX)) {
      if (size_ > SV_MAX) {
        T *tmp=data_.ptr;
        for (size_t i = 0; i < s; ++i) data_.vals[i] = tmp[i];
        delete[] tmp;
      } else {
        for (size_t i = size_; i < s; ++i)
          data_.vals[i] = v;
      }
      size_ = static_cast<uint16_t>(s);
      return;
    }
    if (size_ <= SV_MAX)
      copy_vals_to_ptr();
    if (s > capacity_)
      ensure_capacity(static_cast<uint16_t>(s));
    for (size_t i = size_; i < s; ++i)
      data_.ptr[i] = v;
    size_ = static_cast<uint16_t>(s);
  }

  T& operator[](size_t i) {
    if (size_ <= SV_MAX) return data_.vals[i];
    return data_.ptr[i];
  }

  const T& operator[](size_t i) const {
    if (size_ <= SV_MAX) return data_.vals[i];
    return data_.ptr[i];
  }

  bool operator==(const Self& o) const {
    if (size_ != o.size_) return false;
    T const* mine = begin();
    T const* theirs = o.begin();
    for (size_t i = 0; i < size_; ++i)
      if (mine[i] != theirs[i]) return false;
    return true;
  }

  friend bool operator!=(const Self& a, const Self& b) {
    return !(a==b);
  }

 private:
  union StorageType {
    T vals[SV_MAX];
    T* ptr;
  };
  StorageType data_;
  uint16_t size_;
  uint16_t capacity_;  // only meaningful when size_ > SV_MAX
};

typedef SmallVector<int,2> SmallVectorInt;

// Copies the elements of v, as raw bytes, to out.
template <class T,int M>
void memcpy(void *out,SmallVector<T,M> const& v) {
  std::memcpy(out,v.begin(),v.size()*sizeof(T));
}

#endif
+void Encode(double objective, const SparseVector& v, ostream* out) { + const int num_feats = v.num_active(); + size_t tot_size = 0; + const size_t off_objective = tot_size; + tot_size += sizeof(double); // objective + const size_t off_num_feats = tot_size; + tot_size += sizeof(int); // num_feats + const size_t off_data = tot_size; + tot_size += sizeof(unsigned char) * num_feats; // lengths of feature names; + typedef SparseVector::const_iterator const_iterator; + for (const_iterator it = v.begin(); it != v.end(); ++it) + tot_size += FD::Convert(it->first).size(); // feature names; + tot_size += sizeof(double) * num_feats; // gradient + const size_t off_magic = tot_size; + tot_size += 4; // magic + + // size_t b64_size = tot_size * 4 / 3; + // cerr << "Sparse vector binary size: " << tot_size << " (b64 size=" << b64_size << ")\n"; + char* data = new char[tot_size]; + *reinterpret_cast(&data[off_objective]) = objective; + *reinterpret_cast(&data[off_num_feats]) = num_feats; + char* cur = &data[off_data]; + assert(cur - data == off_data); + for (const_iterator it = v.begin(); it != v.end(); ++it) { + const string& fname = FD::Convert(it->first); + *cur++ = static_cast(fname.size()); // name len + memcpy(cur, &fname[0], fname.size()); + cur += fname.size(); + *reinterpret_cast(cur) = it->second; + cur += sizeof(double); + } + assert(cur - data == off_magic); + *reinterpret_cast(cur) = 0xBAABABBAu; + cur += sizeof(unsigned int); + assert(cur - data == tot_size); + b64encode(data, tot_size, out); + delete[] data; +} + +bool Decode(double* objective, SparseVector* v, const char* in, size_t size) { + v->clear(); + if (size % 4 != 0) { + cerr << "B64 error - line % 4 != 0\n"; + return false; + } + const size_t decoded_size = size * 3 / 4 - sizeof(unsigned int); + const size_t buf_size = decoded_size + sizeof(unsigned int); + if (decoded_size < 6) { cerr << "SparseVector decoding error: too short!\n"; return false; } + char* data = new char[buf_size]; + if 
(!b64decode(reinterpret_cast(in), size, data, buf_size)) { + delete[] data; + return false; + } + size_t cur = 0; + *objective = *reinterpret_cast(data); + cur += sizeof(double); + const int num_feats = *reinterpret_cast(&data[cur]); + cur += sizeof(int); + int fc = 0; + while(fc < num_feats && cur < decoded_size) { + ++fc; + const int fname_len = data[cur++]; + assert(fname_len > 0); + assert(fname_len < 256); + string fname(fname_len, '\0'); + memcpy(&fname[0], &data[cur], fname_len); + cur += fname_len; + const double val = *reinterpret_cast(&data[cur]); + cur += sizeof(double); + int fid = FD::Convert(fname); + v->set_value(fid, val); + } + if(num_feats != fc) { + cerr << "Expected " << num_feats << " but only decoded " << fc << "!\n"; + delete[] data; + return false; + } + if (*reinterpret_cast(&data[cur]) != 0xBAABABBAu) { + cerr << "SparseVector decodeding error : magic does not match!\n"; + delete[] data; + return false; + } + delete[] data; + return true; +} + +} diff --git a/utils/sparse_vector.h b/utils/sparse_vector.h new file mode 100644 index 00000000..207489c5 --- /dev/null +++ b/utils/sparse_vector.h @@ -0,0 +1,512 @@ +#ifndef _SPARSE_VECTOR_H_ +#define _SPARSE_VECTOR_H_ + +//#define SPARSE_VECTOR_HASH + +#ifdef SPARSE_VECTOR_HASH +#include "hash.h" +# define SPARSE_VECTOR_MAP HASH_MAP +# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) HASH_MAP_RESERVED(h,empty,deleted) +#else +# define SPARSE_VECTOR_MAP std::map +# define SPARSE_VECTOR_MAP_RESERVED(h,empty,deleted) +#endif +/* + use SparseVectorList (pair smallvector) for feat funcs / hypergraphs (you rarely need random access; just append a feature to the list) +*/ +/* hack: index 0 never gets printed because cdyer is creative and efficient. features which have no weight got feature dict id 0, see, and the models all clobered that value. nobody wants to see it. except that vlad is also creative and efficient and stored the oracle bleu there. 
*/ +/* NOTE: zero vals may or may not be dropped from map (sparse, but not guaranteed to be so). + + I rely on !v the same as !((bool)v) the same as v==0 and v() same as v(0). + + one exception: + + a local: + T sum = 0; + is used instead of + T sum; + + because T may be a primitive type, and + + T sum(); + + is parsed as a function decl :( + + the alternative T sum=T() is also be reasonable. i've switched to that. +*/ + +// this is a modified version of code originally written +// by Phil Blunsom + +#include +#include +#include +#include +#include + +#include "fdict.h" +#include "small_vector.h" + +template +inline T & extend_vector(std::vector &v,int i) { + if (i>=v.size()) + v.resize(i+1); + return v[i]; +} + +template +class SparseVector { + void init_reserved() { + SPARSE_VECTOR_MAP_RESERVED(values_,-1,-2); + } +public: + T const& get_singleton() const { + assert(values_.size()==1); + return values_.begin()->second; + } + + typedef SparseVector Self; + typedef SPARSE_VECTOR_MAP MapType; + typedef typename MapType::const_iterator const_iterator; + SparseVector() { + init_reserved(); + } + explicit SparseVector(std::vector const& v) { + init_reserved(); + typename MapType::iterator p=values_.begin(); + const T z=0; + for (unsigned i=0;i *vp) const { + init_vector(*vp); + } + + void init_vector(std::vector &v) const { + v.clear(); + for (const_iterator i=values_.begin(),e=values_.end();i!=e;++i) + extend_vector(v,i->first)=i->second; + } + + void set_new_value(int index, T const& val) { + assert(values_.find(index)==values_.end()); + values_[index]=val; + } + + + // warning: exploits the fact that 0 values are always removed from map. change this if you change that. 
+ bool nonzero(int index) const { + typename MapType::const_iterator found = values_.find(index); + return found==values_.end() || !found->second; + } + + + T get(int index) const { + typename MapType::const_iterator found = values_.find(index); + return found==values_.end()?T():found->second; + } + + T value(int i) const { return get(i); } + + // same as above but may add a 0 entry. TODO: check that people relying on no entry use get + T & operator[](int index){ + return values_[index]; + } + + inline void set_value(int index, const T &value) { + values_[index] = value; + } + + inline void maybe_add(int index, const T& value) { + if (value) add_value(index,value); + } + + T& add_value(int index, const T &value) { +#if 1 + return values_[index]+=value; +#else + // this is not really going to be any faster, and we already rely on default init = 0 init + std::pair art=values_.insert(std::make_pair(index,value)); + T &val=art.first->second; + if (!art.second) val += value; // already existed + return val; +#endif + } + + + void store(std::valarray* target) const { + (*target) *= 0; + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) { + if (it->first >= target->size()) break; + (*target)[it->first] = it->second; + } + } + + int max_index() const { + if (empty()) return 0; + typename MapType::const_iterator found =values_.end(); + --found; + return found->first; + } + + // dot product with a unit vector of the same length + // as the sparse vector + T dot() const { + T sum = T(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += it->second; + return sum; + } + + template + S cosine_sim(const SparseVector &vec) const { + return dot(vec)/(l2norm()*vec.l2norm()); + } + + // if values are binary, gives |A intersect B|/|A union B| + template + S tanimoto_coef(const SparseVector &vec) const { + S dp=dot(vec); + return dp/(l2norm_sq()+vec.l2norm_sq()-dp); + } + + template + S dot(const 
SparseVector &vec) const { + S sum = S(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + { + typename MapType::const_iterator + found = vec.values_.find(it->first); + if (found != vec.values_.end()) + sum += it->second * found->second; + } + return sum; + } + + template + S dot(const std::vector &vec) const { + S sum = S(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + { + if (it->first < static_cast(vec.size())) + sum += it->second * vec[it->first]; + } + return sum; + } + + template + S dot(const S *vec) const { + // this is not range checked! + S sum = S(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += it->second * vec[it->first]; + std::cout << "dot(*vec) " << sum << std::endl; + return sum; + } + + T l1norm() const { + T sum = T(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += fabs(it->second); + return sum; + } + + T l2norm_sq() const { + T sum = T(); + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) + sum += it->second * it->second; + return sum; + } + + T l2norm() const { + return sqrt(l2norm_sq()); + } + + void erase(int key) { + values_.erase(key); +/* typename MapType::iterator found = values_.find(key); + if (found!=values_end()) + values_.erase(found);*/ + } + + template + void set_from(SparseVector const& other) { + for (typename MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { + values_[it->first]=it->second; + } + } + + SparseVector &operator+=(const SparseVector &other) { + for (typename MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { +// T v = + (values_[it->first] += it->second); +// if (!v) values_.erase(it->first); + } + return *this; + } + + SparseVector &operator-=(const SparseVector &other) { + for (typename 
MapType::const_iterator + it = other.values_.begin(); it != other.values_.end(); ++it) + { +// T v = + (values_[it->first] -= it->second); +// if (!v) values_.erase(it->first); + } + return *this; + } + + friend SparseVector operator -(SparseVector x,SparseVector const& y) { + x-=y; + return x; + } + friend SparseVector operator +(SparseVector x,SparseVector const& y) { + x+=y; + return x; + } + +private: + // DEPRECATED: becuase 0 values are dropped from the map, this doesn't even make sense if you have a fully populated (not really sparse re: what you'll ever use) vector + SparseVector &operator-=(T const& x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second -= x; + return *this; + } + + SparseVector &operator+=(T const& x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second += x; + return *this; + } +public: + SparseVector &operator/=(const T &x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second /= x; + return *this; + } + + SparseVector &operator*=(const T& x) { + for (typename MapType::iterator + it = values_.begin(); it != values_.end(); ++it) + it->second *= x; + return *this; + } + + SparseVector operator+(T const& x) const { + SparseVector result = *this; + return result += x; + } + + SparseVector operator-(T const& x) const { + SparseVector result = *this; + return result -= x; + } + + SparseVector operator/(T const& x) const { + SparseVector result = *this; + return result /= x; + } + + std::ostream &operator<<(std::ostream& out) const { + Write(true, &out); + return out; + } + + void Write(const bool with_semi, std::ostream* os) const { + bool first = true; + for (typename MapType::const_iterator + it = values_.begin(); it != values_.end(); ++it) { + // by definition feature id 0 is a dummy value + if (!it->first) continue; + if (with_semi) { + (*os) << (first ? 
"" : ";") + << FD::Convert(it->first) << '=' << it->second; + } else { + (*os) << (first ? "" : " ") + << FD::Convert(it->first) << '=' << it->second; + } + first = false; + } + } + + bool operator==(Self const & other) const { + return size()==other.size() && contains_keys_of(other) && other.contains_i(*this); + } + + bool contains(Self const &o) const { + return size()>o.size() && contains(o); + } + + bool at_equals(int i,T const& val) const { + const_iterator it=values_.find(i); + if (it==values_.end()) return !val; + return it->second==val; + } + + bool contains_i(Self const& o) const { + for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) + if (!at_equals(i->first,i->second)) + return false; + return true; + } + + bool contains_keys_of(Self const& o) const { + for (typename MapType::const_iterator i=o.begin(),e=o.end();i!=e;++i) + if (values_.find(i)==values_.end()) + return false; + return true; + } + +#ifndef SPARSE_VECTOR_HASH + bool operator<(const SparseVector &other) const { + typename MapType::const_iterator it = values_.begin(); + typename MapType::const_iterator other_it = other.values_.begin(); + + for (; it != values_.end() && other_it != other.values_.end(); ++it, ++other_it) + { + if (it->first < other_it->first) return true; + if (it->first > other_it->first) return false; + if (it->second < other_it->second) return true; + if (it->second > other_it->second) return false; + } + return values_.size() < other.values_.size(); + } +#endif + + int size() const { return values_.size(); } + + int num_active() const { return values_.size(); } + bool empty() const { return values_.empty(); } + + const_iterator begin() const { return values_.begin(); } + const_iterator end() const { return values_.end(); } + + void clear() { + values_.clear(); + } + + void swap(SparseVector& other) { + values_.swap(other.values_); + } + +private: + MapType values_; +}; + +//like a pair but can live in a union, because it lacks default+copy ctors, dtor. 
+template +struct feature_val { + int fid; + T val; +}; + +template +inline feature_val featval(int fid,T const &val) { + feature_val f; + f.fid=fid; + f.val=val; + return f; +} + + +// doesn't support fast indexing directly +template +class SparseVectorList { + typedef feature_val Pair; + typedef SmallVector List; + typedef typename List::const_iterator const_iterator; + SparseVectorList() { } + template + SparseVectorList(I i,I const& end) { + int c=0; + for (;i const& v) { + for (unsigned i=0;i *to) const { + for (int i=0;iset_value(p[i].fid,p[i].val); + } + void copy_to(SparseVector *to) const { + to->clear(); + overlay(to); + } + SparseVector sparse() const { + SparseVector r; + copy_to(r); + return r; + } +private: + List p; +}; + +template +SparseVector operator+(const SparseVector& a, const SparseVector& b) { + SparseVector result = a; + return result += b; +} + +template +SparseVector operator*(const SparseVector& a, const double& b) { + SparseVector result = a; + return result *= b; +} + +template +SparseVector operator*(const SparseVector& a, const T& b) { + SparseVector result = a; + return result *= b; +} + +template +SparseVector operator*(const double& a, const SparseVector& b) { + SparseVector result = b; + return result *= a; +} + +template +std::ostream &operator<<(std::ostream &out, const SparseVector &vec) +{ + return vec.operator<<(out); +} + +namespace B64 { + void Encode(double objective, const SparseVector& v, std::ostream* out); + // returns false if failed to decode + bool Decode(double* objective, SparseVector* v, const char* data, size_t size); +} + +#endif diff --git a/utils/static_utoa.h b/utils/static_utoa.h new file mode 100755 index 00000000..fe5f6d92 --- /dev/null +++ b/utils/static_utoa.h @@ -0,0 +1,115 @@ +#ifndef STATIC_UTOA_H +#define STATIC_UTOA_H + +#include "threadlocal.h" + + +#include +#include + +#define DIGIT_LOOKUP_TABLE 0 + +namespace { +THREADLOCAL char utoa_buf[] = "01234567890123456789"; // to put end of string 
#ifndef THREADLOCAL
// threadlocal.h normally supplies THREADLOCAL.  Without it the scratch buffer
// below is an ordinary static and is shared by all threads.
# define THREADLOCAL
#endif

namespace {
// scratch buffer for the static_* functions: room for 20 characters, with the
// string terminator permanently at utoa_buf[20]
THREADLOCAL char utoa_buf[] = "01234567890123456789";
const unsigned utoa_bufsize=sizeof(utoa_buf);
const unsigned utoa_bufsizem1=utoa_bufsize-1;
#if DIGIT_LOOKUP_TABLE
char digits[] = "0123456789";
#endif
}

// Character for a single decimal digit d (0..9).
inline char digit_to_char(int d) {
  return
#if DIGIT_LOOKUP_TABLE
    digits[d];
#else
    static_cast<char>('0'+d);
#endif
}

// Writes the decimal digits of n backwards, ending just before num, and
// returns the address of the first digit: the number occupies [return,num).
// Nothing is written at num itself; set *num=0 beforehand if you want a c_str.
inline char *utoa(char *num,unsigned n) {
  if ( !n ) {
    *--num='0';
  } else {
    unsigned rem;
    while ( n ) {
      rem = n;
      n /= 10;
      rem -= 10*n;  // remainder from the quotient that was needed anyway
      *--num = digit_to_char(rem);
    }
  }
  return num;
}

// Decimal text of n as a NUL-terminated string inside the scratch buffer;
// valid until the next static_utoa / static_itoa / append_utoa call.
inline char *static_utoa(unsigned n) {
  return utoa(utoa_buf+utoa_bufsizem1,n);
}

// Writes the decimal text of n, NUL terminated, at `to`.
//returns position of '\0' terminating number written starting at to
inline char* append_utoa(char *to,unsigned n) {
  char *s=static_utoa(n);
  const int ns=static_cast<int>((utoa_buf+utoa_bufsize)-s);  // digits plus the NUL
  std::memcpy(to,s,ns);
  return to+ns-1;
}

// Signed counterpart of utoa: the text (with a leading '-' for negative n)
// occupies [return,p).
// so named to avoid gcc segfault when named itoa
inline char *itoa(char *p,int n) {
  if (n<0) {
    // negate in unsigned arithmetic so that INT_MIN is handled as well
    p=utoa(p,0u-static_cast<unsigned>(n));
    *--p='-';
    return p;
  } else
    return utoa(p,static_cast<unsigned>(n));
}

// Signed counterpart of static_utoa; uses the same scratch buffer.
inline char *static_itoa(int n) {
  return itoa(utoa_buf+utoa_bufsizem1,n);
}


// Decimal text of n as a std::string.
inline std::string utos(unsigned n) {
  const int bufsz=20;
  char buf[bufsz];
  char *end=buf+bufsz;
  char *p=utoa(end,n);
  return std::string(p,end);
}

// Decimal text of n (with sign) as a std::string.
inline std::string itos(int n) {
  const int bufsz=20;
  char buf[bufsz];
  char *end=buf+bufsz;
  char *p=itoa(end,n);
  return std::string(p,end);
}
// Splits one line of translator input of the form
//     source
//     source ||| reference
// into *input and *ref.
//
// A line starting with {"rules": carries an inline grammar; the "|||"
// delimiter is then only searched for after the first "}}", and the process
// aborts (after printing the line to stderr) if that "}}" is missing.
//
// Without a delimiter *input receives the whole line and *ref is left
// untouched.  With a delimiter, *input receives the text before it and *ref
// the text after it, each with the single space that conventionally
// surrounds " ||| " removed; *ref is emptied when nothing follows the
// delimiter.  ref may be NULL if the caller does not want the reference.
void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref) {
  static const char kDelim[] = "|||";
  const std::string::size_type kDelimLen = sizeof(kDelim) - 1;

  std::string::size_type hint = 0;
  if (line.find("{\"rules\":") == 0) {
    hint = line.find("}}");
    if (hint == std::string::npos) {
      std::cerr << "Syntax error: " << line << std::endl;
      abort();
    }
    hint += 2;
  }
  const std::string::size_type pos = line.find(kDelim, hint);
  if (pos == std::string::npos) { *input = line; return; }

  // source side: drop the one space in front of the delimiter, if any
  std::string::size_type src_end = pos;
  if (src_end > 0 && line[src_end - 1] == ' ') --src_end;
  *input = line.substr(0, src_end);

  // reference side: skip the one space after the delimiter, if any
  std::string::size_type ref_begin = pos + kDelimLen;
  if (ref_begin < line.size() && line[ref_begin] == ' ') ++ref_begin;
  if (ref) *ref = line.substr(ref_begin);
}
b/utils/stringlib.h @@ -0,0 +1,267 @@ +#ifndef CDEC_STRINGLIB_H_ +#define CDEC_STRINGLIB_H_ + +//usage: string s=MAKESTRE(1<<" "<(ostringstream()< +#define SLIBDBG(x) do { std::cerr<<"DBG(stringlib): "< +#include +#include +#include +#include +#include +#include + +inline std::size_t skip_ws(std::string const& s,std::size_t starting=0,char const* ws=" \t\n\r") { + return s.find_first_not_of(ws,starting); +} + +// returns position of end of all non-ws chars before ending, i.e. string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)) strips both ends +inline std::size_t trailing_ws(std::string const& s,std::size_t ending=std::string::npos,char const* ws=" \t\n\r") { + std::size_t n=s.find_last_not_of(ws,ending); + if (n==std::string::npos) return n; + else return n+1; +} + +//TEST: if string is all whitespace, make sure that string(a+npos,a+npos) can't segfault (i.e. won't access any memory because begin==end) +inline std::string strip_ws(std::string const& s) { + return std::string(s.begin()+skip_ws(s),s.begin()+trailing_ws(s)); +} + + +inline bool is_single_line(std::string const& line) { + return std::count(line.begin(),line.end(),'\n')==0; // but we want to allow terminal newlines/blanks +} + +// is_single_line(strip_ws(line)) +inline bool is_single_line_stripped(std::string const& line) { + std::size_t b=skip_ws(line),e=trailing_ws(line); + std::size_t n=line.find('\n',b); + return n==std::string::npos || n>=e; +} + +struct toupperc { + inline char operator()(char c) const { + return std::toupper(c); + } +}; + +inline std::string toupper(std::string s) { + std::transform(s.begin(),s.end(),s.begin(),toupperc()); + return s; +} + +template inline +bool match_begin(Istr bstr,Istr estr,Isubstr bsub,Isubstr esub) +{ + while (bsub != esub) { + if (bstr == estr) + return false; + if (*bsub++ != *bstr++) + return false; + } + return true; +} + +template inline +bool match_begin(Istr bstr,Istr estr,Prefix prefix) +{ + return 
match_begin(bstr,estr,prefix.begin(),prefix.end()); +} + +template inline +bool match_begin(Str const& str,Prefix const& prefix) +{ + return match_begin(str.begin(),str.end(),prefix.begin(),prefix.end()); +} + + +// read line in the form of either: +// source +// source ||| target +// source will be returned as a string, target must be a sentence or +// a lattice (in PLF format) and will be returned as a Lattice object +void ParseTranslatorInput(const std::string& line, std::string* input, std::string* ref); +struct Lattice; +void ParseTranslatorInputLattice(const std::string& line, std::string* input, Lattice* ref); + +inline std::string Trim(const std::string& str, const std::string& dropChars = " \t") { + std::string res = str; + res.erase(str.find_last_not_of(dropChars)+1); + return res.erase(0, res.find_first_not_of(dropChars)); +} + +inline void Tokenize(const std::string& str, char delimiter, std::vector* res) { + std::string s = str; + int last = 0; + res->clear(); + for (int i=0; i < s.size(); ++i) + if (s[i] == delimiter) { + s[i]=0; + if (last != i) { + res->push_back(&s[last]); + } + last = i + 1; + } + if (last != s.size()) + res->push_back(&s[last]); +} + +inline unsigned NTokens(const std::string& str, char delimiter) +{ + std::vector r; + Tokenize(str,delimiter,&r); + return r.size(); +} + +inline std::string LowercaseString(const std::string& in) { + std::string res(in.size(),' '); + for (int i = 0; i < in.size(); ++i) + res[i] = tolower(in[i]); + return res; +} + +inline int CountSubstrings(const std::string& str, const std::string& sub) { + size_t p = 0; + int res = 0; + while (p < str.size()) { + p = str.find(sub, p); + if (p == std::string::npos) break; + ++res; + p += sub.size(); + } + return res; +} + +inline int SplitOnWhitespace(const std::string& in, std::vector* out) { + out->clear(); + int i = 0; + int start = 0; + std::string cur; + while(i < in.size()) { + if (in[i] == ' ' || in[i] == '\t') { + if (i - start > 0) + 
out->push_back(in.substr(start, i - start)); + start = i + 1; + } + ++i; + } + if (i > start) + out->push_back(in.substr(start, i - start)); + return out->size(); +} + +inline std::vector SplitOnWhitespace(std::string const& in) +{ + std::vector r; + SplitOnWhitespace(in,&r); + return r; +} + + +struct mutable_c_str { + // because making a copy of a string might not copy its storage, so modifying a c_str() could screw up original (nobody uses cow nowadays because it needs locking under threading) + char *p; + mutable_c_str(std::string const& s) : p((char *)::operator new(s.size()+1)) { + std::memcpy(p,s.data(),s.size()); + p[s.size()]=0; + } + ~mutable_c_str() { ::operator delete(p); } +private: + mutable_c_str(mutable_c_str const&); +}; + +// ' ' '\t' tokens hardcoded +//NOTE: you should have stripped endline chars out first. +inline bool IsWordSep(char c) { + return c==' '||c=='\t'; +} + + +template +// *end must be 0 (i.e. [p,end] is valid storage, which will be written to with 0 to separate c string tokens +void VisitTokens(char *p,char *const end,F f) { + SLIBDBG("VisitTokens. p="<* out); + +// given the first character of a UTF8 block, find out how wide it is +// see http://en.wikipedia.org/wiki/UTF-8 for more info +inline unsigned int UTF8Len(unsigned char x) { + if (x < 0x80) return 1; + else if ((x >> 5) == 0x06) return 2; + else if ((x >> 4) == 0x0e) return 3; + else if ((x >> 3) == 0x1e) return 4; + else return 0; +} + +#endif diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc new file mode 100755 index 00000000..f66cdbeb --- /dev/null +++ b/utils/stringlib_test.cc @@ -0,0 +1,17 @@ +#define STRINGLIB_DEBUG +#include "stringlib.h" + +using namespace std; +struct print { + template + void operator()(S const& s) const { + cout<= end() will give a numeric token name (single per-thread shared buffer), which of course won't be Convert-able back to the id, because it's not added to the dict. This is a convenience for logging fake token indices. 
Any tokens actually added to the dict may cause end() to overlap the range of fake ids you were using - that's up to you to prevent. + +#include +#include +#include +#include "Ngram.h" +#include "dict.h" +#include "tdict.h" +#include "Vocab.h" +#include "stringlib.h" +#include "threadlocal.h" + +using namespace std; + +Vocab TD::dict_(0,TD::max_wordid); +WordID TD::ss=dict_.ssIndex(); +WordID TD::se=dict_.seIndex(); +WordID TD::unk=dict_.unkIndex(); +char const*const TD::ss_str=Vocab_SentStart; +char const*const TD::se_str=Vocab_SentEnd; +char const*const TD::unk_str=Vocab_Unknown; + +// pre+(i-base)+">" for i in [base,e) +inline void pad(std::string const& pre,int base,int e) { + assert(base<=e); + ostringstream o; + for (int i=base;i'; + WordID id=TD::Convert(o.str()); + assert(id==i); // this fails. why? + } +} + + +namespace { +struct TD_init { + TD_init() { + /* + // disabled for now since it's breaking trunk + assert(TD::Convert(TD::ss_str)==TD::ss); + assert(TD::Convert(TD::se_str)==TD::se); + assert(TD::Convert(TD::unk_str)==TD::unk); + assert(TD::none==Vocab_None); + pad("=dict_.highIndex()) return undef_token(w); +#endif + return dict_.getWord((VocabIndex)w); +} + + +void TD::GetWordIDs(const std::vector& strings, std::vector* ids) { + ids->clear(); + for (vector::const_iterator i = strings.begin(); i != strings.end(); ++i) + ids->push_back(TD::Convert(*i)); +} + +std::string TD::GetString(const std::vector& str) { + ostringstream o; + for (int i=0;i Ws; + Ws *ids; + explicit add_wordids(Ws *i) : ids(i) { } + add_wordids(const add_wordids& o) : ids(o.ids) { } + void operator()(char const* s) { + ids->push_back(TD::Convert(s)); + } + void operator()(std::string const& s) { + ids->push_back(TD::Convert(s)); + } +}; + +} + +void TD::ConvertSentence(std::string const& s, std::vector* ids) { + ids->clear(); + VisitTokens(s,add_wordids(ids)); +} diff --git a/utils/tdict.h b/utils/tdict.h new file mode 100644 index 00000000..a7b3ee1c --- /dev/null +++ 
b/utils/tdict.h @@ -0,0 +1,50 @@ +#ifndef _TDICT_H_ +#define _TDICT_H_ + +#include +#include +#include "wordid.h" +#include + +class Vocab; + +struct TD { + /* // disabled for now + static const int reserved_begin=10; // allow room for SRI special tokens e.g. unk ss se pause. tokens until this get "" + static const int n_reserved=10; // 0...n_reserved-1 get token '' + static inline WordID reserved(int i) { + assert(i>=0 && i"; + static char const* const se_str; //=""; + static char const* const unk_str; //=""; + static WordID ss,se,unk; // x=Convert(x_str) + static WordID end(); // next id to be assigned; [begin,end) give the non-reserved tokens seen so far + static Vocab dict_; + static void ConvertSentence(std::string const& sent, std::vector* ids); + static void GetWordIDs(const std::vector& strings, std::vector* ids); + static std::string GetString(const std::vector& str); + static std::string GetString(WordID const* i,WordID const* e); + static int AppendString(const WordID& w, int pos, int bufsize, char* buffer); + static unsigned int NumWords(); + static WordID Convert(const std::string& s); + static WordID Convert(char const* s); + static const char* Convert(WordID w); +}; + +struct ToTD { + typedef WordID result_type; + result_type operator()(std::string const& t) const { + return TD::Convert(t); + } +}; + + +#endif diff --git a/utils/test_data/weights b/utils/test_data/weights new file mode 100644 index 00000000..ea70229c --- /dev/null +++ b/utils/test_data/weights @@ -0,0 +1,8 @@ +# hiero +WordPenalty -0.387029 +LanguageModel 0.253195 +PhraseModel_0 0.142926 +PhraseModel_1 0.465119 +PhraseModel_2 0.079503 +CNPosteriorProbability 0.09259 +Inf -inf diff --git a/utils/threadlocal.h b/utils/threadlocal.h new file mode 100755 index 00000000..d79f5d9d --- /dev/null +++ b/utils/threadlocal.h @@ -0,0 +1,71 @@ +#ifndef THREADLOCAL_H +#define THREADLOCAL_H + +#ifndef SETLOCAL_SWAP +# define SETLOCAL_SWAP 0 +#endif + +#ifdef BOOST_NO_MT + +# define THREADLOCAL + 
+#else + +#ifdef _MSC_VER + +//FIXME: doesn't work with DLLs ... use TLS apis instead (http://www.boost.org/libs/thread/doc/tss.html) +# define THREADLOCAL __declspec(thread) + +#else + +# define THREADLOCAL __thread + +#endif + +#endif + +#include //swap + +// naturally, the below are only thread-safe if value is THREADLOCAL +template +struct SaveLocal { + D &value; + D old_value; + SaveLocal(D& val) : value(val), old_value(val) {} + ~SaveLocal() { +#if SETLOCAL_SWAP + swap(value,old_value); +#else + value=old_value; +#endif + } +}; + +template +struct SetLocal { + D &value; + D old_value; + SetLocal(D& val,const D &new_value) : value(val), old_value( +#if SETLOCAL_SWAP + new_value +#else + val +#endif + ) { +#if SETLOCAL_SWAP + swap(value,old_value); +#else + value=new_value; +#endif + } + ~SetLocal() { +#if SETLOCAL_SWAP + swap(value,old_value); +#else + value=old_value; +#endif + } +}; + + +#endif diff --git a/utils/timing_stats.cc b/utils/timing_stats.cc new file mode 100644 index 00000000..fc8e9df1 --- /dev/null +++ b/utils/timing_stats.cc @@ -0,0 +1,24 @@ +#include "timing_stats.h" + +#include +#include "time.h" //cygwin needs +using namespace std; + +map Timer::stats; + +Timer::Timer(const string& timername) : start_t(clock()), cur(stats[timername]) {} + +Timer::~Timer() { + ++cur.calls; + const clock_t end_t = clock(); + const double elapsed = (end_t - start_t) / 1000000.0; + cur.total_time += elapsed; +} + +void Timer::Summarize() { + for (map::iterator it = stats.begin(); it != stats.end(); ++it) { + cerr << it->first << ": " << it->second.total_time << " secs (" << it->second.calls << " calls)\n"; + } + stats.clear(); +} + diff --git a/utils/timing_stats.h b/utils/timing_stats.h new file mode 100644 index 00000000..0a9f7656 --- /dev/null +++ b/utils/timing_stats.h @@ -0,0 +1,25 @@ +#ifndef _TIMING_STATS_H_ +#define _TIMING_STATS_H_ + +#include +#include + +struct TimerInfo { + int calls; + double total_time; + TimerInfo() : calls(), total_time() {} +}; 
+ +struct Timer { + Timer(const std::string& info); + ~Timer(); + static void Summarize(); + private: + static std::map stats; + clock_t start_t; + TimerInfo& cur; + Timer(const Timer& other); + const Timer& operator=(const Timer& other); +}; + +#endif diff --git a/utils/weights.cc b/utils/weights.cc new file mode 100644 index 00000000..84647585 --- /dev/null +++ b/utils/weights.cc @@ -0,0 +1,77 @@ +#include "weights.h" + +#include + +#include "fdict.h" +#include "filelib.h" + +using namespace std; + +void Weights::InitFromFile(const std::string& filename, vector* feature_list) { + cerr << "Reading weights from " << filename << endl; + ReadFile in_file(filename); + istream& in = *in_file.stream(); + assert(in); + int weight_count = 0; + bool fl = false; + while (in) { + double val = 0; + string buf; + getline(in, buf); + if (buf.size() == 0) continue; + if (buf[0] == '#') continue; + for (int i = 0; i < buf.size(); ++i) + if (buf[i] == '=') buf[i] = ' '; + int start = 0; + while(start < buf.size() && buf[start] == ' ') ++start; + int end = 0; + while(end < buf.size() && buf[end] != ' ') ++end; + int fid = FD::Convert(buf.substr(start, end - start)); + while(end < buf.size() && buf[end] == ' ') ++end; + val = strtod(&buf.c_str()[end], NULL); + if (isnan(val)) { + cerr << FD::Convert(fid) << " has weight NaN!\n"; + abort(); + } + if (wv_.size() <= fid) + wv_.resize(fid + 1); + wv_[fid] = val; + if (feature_list) { feature_list->push_back(FD::Convert(fid)); } + ++weight_count; + if (weight_count % 50000 == 0) { cerr << '.' 
<< flush; fl = true; } + if (weight_count % 2000000 == 0) { cerr << " [" << weight_count << "]\n"; fl = false; } + } + if (fl) { cerr << endl; } + cerr << "Loaded " << weight_count << " feature weights\n"; +} + +void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features) const { + WriteFile out(fname); + ostream& o = *out.stream(); + assert(o); + o.precision(17); + const int num_feats = FD::NumFeats(); + for (int i = 1; i < num_feats; ++i) { + const double val = (i < wv_.size() ? wv_[i] : 0.0); + if (hide_zero_value_features && val == 0.0) continue; + o << FD::Convert(i) << ' ' << val << endl; + } +} + +void Weights::InitVector(std::vector* w) const { + *w = wv_; +} + +void Weights::InitSparseVector(SparseVector* w) const { + for (int i = 1; i < wv_.size(); ++i) { + const double& weight = wv_[i]; + if (weight) w->set_value(i, weight); + } +} + +void Weights::InitFromVector(const std::vector& w) { + wv_ = w; + if (wv_.size() > FD::NumFeats()) + cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n"; + wv_.resize(FD::NumFeats(), 0); +} diff --git a/utils/weights.h b/utils/weights.h new file mode 100644 index 00000000..f19aa3ce --- /dev/null +++ b/utils/weights.h @@ -0,0 +1,21 @@ +#ifndef _WEIGHTS_H_ +#define _WEIGHTS_H_ + +#include +#include +#include +#include "sparse_vector.h" + +class Weights { + public: + Weights() {} + void InitFromFile(const std::string& fname, std::vector* feature_list = NULL); + void WriteToFile(const std::string& fname, bool hide_zero_value_features = true) const; + void InitVector(std::vector* w) const; + void InitSparseVector(SparseVector* w) const; + void InitFromVector(const std::vector& w); + private: + std::vector wv_; +}; + +#endif diff --git a/utils/weights_test.cc b/utils/weights_test.cc new file mode 100644 index 00000000..8a4c26ef --- /dev/null +++ b/utils/weights_test.cc @@ -0,0 +1,27 @@ +#include +#include +#include +#include +#include +#include "weights.h" 
+#include "tdict.h" + +using namespace std; + +class WeightsTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + + +TEST_F(WeightsTest,Load) { + Weights w; + w.InitFromFile("test_data/weights"); + w.WriteToFile("-"); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/utils/wordid.h b/utils/wordid.h new file mode 100644 index 00000000..fb50bcc1 --- /dev/null +++ b/utils/wordid.h @@ -0,0 +1,6 @@ +#ifndef _WORD_ID_H_ +#define _WORD_ID_H_ + +typedef int WordID; + +#endif diff --git a/vest/Makefile.am b/vest/Makefile.am index abdc8146..b869672b 100644 --- a/vest/Makefile.am +++ b/vest/Makefile.am @@ -1,15 +1,12 @@ bin_PROGRAMS = \ - mbr_kbest \ mr_vest_map \ mr_vest_reduce \ mr_vest_generate_mapper_input \ - fast_score \ sentserver \ sentclient if HAVE_GTEST noinst_PROGRAMS = \ - scorer_test \ lo_test endif @@ -17,25 +14,16 @@ sentserver_SOURCES = sentserver.c sentclient_SOURCES = sentclient.c -mbr_kbest_SOURCES = mbr_kbest.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc -mbr_kbest_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc +mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -fast_score_SOURCES = fast_score.cc ter.cc comb_scorer.cc aer_scorer.cc scorer.cc viterbi_envelope.cc -fast_score_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc +mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc $(top_srcdir)/decoder/timing_stats.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a -lz 
+mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc +mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_map_SOURCES = viterbi_envelope.cc error_surface.cc aer_scorer.cc mr_vest_map.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a -lz +lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz -mr_vest_reduce_SOURCES = error_surface.cc aer_scorer.cc mr_vest_reduce.cc scorer.cc ter.cc comb_scorer.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a -lz - -scorer_test_SOURCES = aer_scorer.cc scorer_test.cc scorer.cc ter.cc comb_scorer.cc viterbi_envelope.cc -scorer_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz - -lo_test_SOURCES = lo_test.cc scorer.cc ter.cc aer_scorer.cc comb_scorer.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/vest/aer_scorer.cc b/vest/aer_scorer.cc deleted file mode 100644 index 25b58b5e..00000000 --- a/vest/aer_scorer.cc +++ /dev/null @@ -1,135 +0,0 @@ -#include "aer_scorer.h" - -#include -#include -#include - -#include "tdict.h" -#include "aligner.h" - -using namespace std; - -class AERScore : public ScoreBase { - friend class AERScorer; - public: - AERScore() : num_matches(), num_predicted(), num_in_ref() {} - AERScore(int m, int p, int r) : - num_matches(m), num_predicted(p), 
num_in_ref(r) {} - virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} - virtual void PlusEquals(const Score& delta, const float scale) { - const AERScore& other = static_cast(delta); - num_matches += scale*other.num_matches; - num_predicted += scale*other.num_predicted; - num_in_ref += scale*other.num_in_ref; - } - virtual void PlusEquals(const Score& delta) { - const AERScore& other = static_cast(delta); - num_matches += other.num_matches; - num_predicted += other.num_predicted; - num_in_ref += other.num_in_ref; - } - - - virtual ScoreP GetZero() const { - return ScoreP(new AERScore); - } - virtual ScoreP GetOne() const { - return ScoreP(new AERScore); - } - virtual void Subtract(const Score& rhs, Score* out) const { - AERScore* res = static_cast(out); - const AERScore& other = static_cast(rhs); - res->num_matches = num_matches - other.num_matches; - res->num_predicted = num_predicted - other.num_predicted; - res->num_in_ref = num_in_ref - other.num_in_ref; - } - float Precision() const { - return static_cast(num_matches) / num_predicted; - } - float Recall() const { - return static_cast(num_matches) / num_in_ref; - } - float ComputePartialScore() const { return 0.0;} - virtual float ComputeScore() const { - const float prec = Precision(); - const float rec = Recall(); - const float f = (2.0 * prec * rec) / (rec + prec); - if (isnan(f)) return 1.0f; - return 1.0f - f; - } - virtual bool IsAdditiveIdentity() const { - return (num_matches == 0) && (num_predicted == 0) && (num_in_ref == 0); - } - virtual void ScoreDetails(std::string* out) const { - ostringstream os; - os << "AER=" << (ComputeScore() * 100.0) - << " F=" << (100 - ComputeScore() * 100.0) - << " P=" << (Precision() * 100.0) << " R=" << (Recall() * 100.0) - << " [" << num_matches << " " << num_predicted << " " << num_in_ref << "]"; - *out = os.str(); - } - virtual void Encode(std::string*out) const { - out->resize(sizeof(int) * 3); - *(int 
*)&(*out)[sizeof(int) * 0] = num_matches; - *(int *)&(*out)[sizeof(int) * 1] = num_predicted; - *(int *)&(*out)[sizeof(int) * 2] = num_in_ref; - } - private: - int num_matches; - int num_predicted; - int num_in_ref; -}; - -AERScorer::AERScorer(const vector >& refs, const string& src) : src_(src) { - if (refs.size() != 1) { - cerr << "AERScorer can only take a single reference!\n"; - abort(); - } - ref_ = AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(refs.front())); -} - -static inline bool Safe(const Array2D& a, int i, int j) { - if (i >= 0 && j >= 0 && i < a.width() && j < a.height()) - return a(i,j); - else - return false; -} - -ScoreP AERScorer::ScoreCCandidate(const vector& shyp) const { - return ScoreP(); -} - -ScoreP AERScorer::ScoreCandidate(const vector& shyp) const { - boost::shared_ptr > hyp = - AlignerTools::ReadPharaohAlignmentGrid(TD::GetString(shyp)); - - int m = 0; - int r = 0; - int p = 0; - int i_len = ref_->width(); - int j_len = ref_->height(); - for (int i = 0; i < i_len; ++i) { - for (int j = 0; j < j_len; ++j) { - if ((*ref_)(i,j)) { - ++r; - if (Safe(*hyp, i, j)) ++m; - } - } - } - for (int i = 0; i < hyp->width(); ++i) - for (int j = 0; j < hyp->height(); ++j) - if ((*hyp)(i,j)) ++p; - - return ScoreP(new AERScore(m,p,r)); -} - -ScoreP AERScorer::ScoreFromString(const string& in) { - AERScore* res = new AERScore; - res->num_matches = *(const int *)&in[sizeof(int) * 0]; - res->num_predicted = *(const int *)&in[sizeof(int) * 1]; - res->num_in_ref = *(const int *)&in[sizeof(int) * 2]; - return ScoreP(res); -} - -const std::string* AERScorer::GetSource() const { return &src_; } - diff --git a/vest/aer_scorer.h b/vest/aer_scorer.h deleted file mode 100644 index 6d53d359..00000000 --- a/vest/aer_scorer.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _AER_SCORER_ -#define _AER_SCORER_ - -#include - -#include "scorer.h" -#include "array2d.h" - -class AERScorer : public SentenceScorer { - public: - // when constructing alignment strings from a 
hypergraph, the source - // is necessary. - AERScorer(const std::vector >& refs, const std::string& src = ""); - ScoreP ScoreCandidate(const std::vector& hyp) const; - ScoreP ScoreCCandidate(const std::vector& hyp) const; - static ScoreP ScoreFromString(const std::string& in); - const std::string* GetSource() const; - private: - std::string src_; - boost::shared_ptr > ref_; -}; - -#endif diff --git a/vest/comb_scorer.cc b/vest/comb_scorer.cc deleted file mode 100644 index 9fc37868..00000000 --- a/vest/comb_scorer.cc +++ /dev/null @@ -1,97 +0,0 @@ -#include "comb_scorer.h" - -#include - -using namespace std; - -class BLEUTERCombinationScore : public ScoreBase { - friend class BLEUTERCombinationScorer; - public: - ~BLEUTERCombinationScore(); - float ComputePartialScore() const { return 0.0;} - float ComputeScore() const { - return (bleu->ComputeScore() - ter->ComputeScore()) / 2.0f; - } - void ScoreDetails(string* details) const { - char buf[160]; - sprintf(buf, "Combi = %.2f, BLEU = %.2f, TER = %.2f", - ComputeScore()*100.0f, bleu->ComputeScore()*100.0f, ter->ComputeScore()*100.0f); - *details = buf; - } - void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} - - void PlusEquals(const Score& delta, const float scale) { - bleu->PlusEquals(*static_cast(delta).bleu, scale); - ter->PlusEquals(*static_cast(delta).ter, scale); - } - void PlusEquals(const Score& delta) { - bleu->PlusEquals(*static_cast(delta).bleu); - ter->PlusEquals(*static_cast(delta).ter); - } - - - - ScoreP GetOne() const { - BLEUTERCombinationScore* res = new BLEUTERCombinationScore; - res->bleu = bleu->GetOne(); - res->ter = ter->GetOne(); - return ScoreP(res); - } - ScoreP GetZero() const { - BLEUTERCombinationScore* res = new BLEUTERCombinationScore; - res->bleu = bleu->GetZero(); - res->ter = ter->GetZero(); - return ScoreP(res); - } - void Subtract(const Score& rhs, Score* res) const { - bleu->Subtract(*static_cast(rhs).bleu, - 
static_cast(res)->bleu.get()); - ter->Subtract(*static_cast(rhs).ter, - static_cast(res)->ter.get()); - } - void Encode(std::string* out) const { - string bs, ts; - bleu->Encode(&bs); - ter->Encode(&ts); - out->clear(); - (*out) += static_cast(bs.size()); - (*out) += bs; - (*out) += ts; - } - bool IsAdditiveIdentity() const { - return bleu->IsAdditiveIdentity() && ter->IsAdditiveIdentity(); - } - private: - ScoreP bleu; - ScoreP ter; -}; - -BLEUTERCombinationScore::~BLEUTERCombinationScore() { -} - -BLEUTERCombinationScorer::BLEUTERCombinationScorer(const vector >& refs) { - bleu_ = SentenceScorer::CreateSentenceScorer(IBM_BLEU, refs); - ter_ = SentenceScorer::CreateSentenceScorer(TER, refs); -} - -BLEUTERCombinationScorer::~BLEUTERCombinationScorer() { -} - -ScoreP BLEUTERCombinationScorer::ScoreCCandidate(const vector& hyp) const { - return ScoreP(); -} - -ScoreP BLEUTERCombinationScorer::ScoreCandidate(const std::vector& hyp) const { - BLEUTERCombinationScore* res = new BLEUTERCombinationScore; - res->bleu = bleu_->ScoreCandidate(hyp); - res->ter = ter_->ScoreCandidate(hyp); - return ScoreP(res); -} - -ScoreP BLEUTERCombinationScorer::ScoreFromString(const std::string& in) { - int bss = in[0]; - BLEUTERCombinationScore* r = new BLEUTERCombinationScore; - r->bleu = SentenceScorer::CreateScoreFromString(IBM_BLEU, in.substr(1, bss)); - r->ter = SentenceScorer::CreateScoreFromString(TER, in.substr(1 + bss)); - return ScoreP(r); -} diff --git a/vest/comb_scorer.h b/vest/comb_scorer.h deleted file mode 100644 index 346be576..00000000 --- a/vest/comb_scorer.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef _COMB_SCORER_ -#define _COMB_SCORER_ - -#include "scorer.h" - -class BLEUTERCombinationScorer : public SentenceScorer { - public: - BLEUTERCombinationScorer(const std::vector >& refs); - ~BLEUTERCombinationScorer(); - ScoreP ScoreCandidate(const std::vector& hyp) const; - ScoreP ScoreCCandidate(const std::vector& hyp) const; - static ScoreP ScoreFromString(const std::string& 
in); - private: - ScorerP bleu_,ter_; -}; - -#endif diff --git a/vest/fast_score.cc b/vest/fast_score.cc deleted file mode 100644 index 5ee264a6..00000000 --- a/vest/fast_score.cc +++ /dev/null @@ -1,72 +0,0 @@ -#include -#include - -#include -#include - -#include "filelib.h" -#include "tdict.h" -#include "scorer.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") - ("loss_function,l",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") - ("in_file,i", po::value()->default_value("-"), "Input file") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (!conf->count("reference")) { - cerr << "Please specify one or more references using -r -r ...\n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as >(), ""); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; - - ReadFile rf(conf["in_file"].as()); - ScoreP acc; - istream& in = *rf.stream(); - int lc = 0; - while(in) { - string line; - getline(in, line); - if (line.empty() && !in) break; - vector sent; - TD::ConvertSentence(line, &sent); - ScoreP sentscore = ds[lc]->ScoreCandidate(sent); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); - ++lc; - } - assert(lc > 0); - if (lc > ds.size()) { - cerr << "Too 
many (" << lc << ") translations in input, expected " << ds.size() << endl; - return 1; - } - if (lc != ds.size()) - cerr << "Fewer sentences in hyp (" << lc << ") than refs (" - << ds.size() << "): scoring partial set!\n"; - float score = acc->ComputeScore(); - string details; - acc->ScoreDetails(&details); - cerr << details << endl; - cout << score << endl; - return 0; -} diff --git a/vest/lo_test.cc b/vest/lo_test.cc index 577113bb..9200eb34 100644 --- a/vest/lo_test.cc +++ b/vest/lo_test.cc @@ -5,6 +5,7 @@ #include #include +#include "ces.h" #include "fdict.h" #include "hg.h" #include "kbest.h" @@ -166,8 +167,8 @@ TEST_F(OptTest, TestS1) { envs[1] = Inside(hg2, NULL, wf); vector es(2); - scorer1->ComputeErrorSurface(envs[0], &es[0], IBM_BLEU, hg); - scorer2->ComputeErrorSurface(envs[1], &es[1], IBM_BLEU, hg2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); cerr << envs[0].size() << " " << envs[1].size() << endl; cerr << es[0].size() << " " << es[1].size() << endl; envs.clear(); diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index b3acc5dd..1506a99f 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -6,6 +6,7 @@ #include #include +#include "ces.h" #include "filelib.h" #include "stringlib.h" #include "sparse_vector.h" @@ -13,7 +14,7 @@ #include "viterbi_envelope.h" #include "inside_outside.h" #include "error_surface.h" -#include "hg.h" +#include "b64tools.h" #include "hg_io.h" using namespace std; @@ -90,7 +91,7 @@ int main(int argc, char** argv) { ViterbiEnvelopeWeightFunction wf(origin, axis); ViterbiEnvelope ve = Inside(hg, NULL, wf); ErrorSurface es; - ds[sent_id]->ComputeErrorSurface(ve, &es, type, hg); + ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; // cerr << "Error surface has " << es.size() << " segments\n"; string val; diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc 
index 5efcc19a..3df52020 100644 --- a/vest/mr_vest_reduce.cc +++ b/vest/mr_vest_reduce.cc @@ -9,7 +9,7 @@ #include "sparse_vector.h" #include "error_surface.h" #include "line_optimizer.h" -#include "hg_io.h" +#include "b64tools.h" using namespace std; namespace po = boost::program_options; diff --git a/vest/scorer.cc b/vest/scorer.cc deleted file mode 100644 index 70fdef34..00000000 --- a/vest/scorer.cc +++ /dev/null @@ -1,708 +0,0 @@ -#include "scorer.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "filelib.h" -#include "aligner.h" -#include "viterbi_envelope.h" -#include "error_surface.h" -#include "ter.h" -#include "aer_scorer.h" -#include "comb_scorer.h" -#include "tdict.h" -#include "stringlib.h" -#include "lattice.h" - - -using boost::shared_ptr; -using namespace std; - -const bool minimize_segments = true; // if adjacent segments have equal scores, merge them - -void Score::TimesEquals(float scale) { - cerr<<"UNIMPLEMENTED except for BLEU (for MIRA): Score::TimesEquals"<=0 && st - float operator()(float sum,S const& ref) const { - return sum+ref.size(); - } -}; - -template -float avg_reflength(vector refs) { - unsigned n=refs.size(); - return n?accumulate(refs.begin(),refs.end(),0.,length_accum())/n:0.; -} - - -float SentenceScorer::ComputeRefLength(const Sentence &hyp) const { - return hyp.size(); // reasonable default? 
:) -} - -const std::string* SentenceScorer::GetSource() const { return NULL; } - -class SERScore : public ScoreBase { - friend class SERScorer; - public: - SERScore() : correct(0), total(0) {} - float ComputePartialScore() const { return 0.0;} - float ComputeScore() const { - return static_cast(correct) / static_cast(total); - } - void ScoreDetails(string* details) const { - ostringstream os; - os << "SER= " << ComputeScore() << " (" << correct << '/' << total << ')'; - *details = os.str(); - } - void PlusPartialEquals(const Score& /* delta */, int /* oracle_e_cover */, int /* oracle_f_cover */, int /* src_len */){} - - void PlusEquals(const Score& delta, const float scale) { - correct += scale*static_cast(delta).correct; - total += scale*static_cast(delta).total; - } - void PlusEquals(const Score& delta) { - correct += static_cast(delta).correct; - total += static_cast(delta).total; - } - ScoreP GetZero() const { return ScoreP(new SERScore); } - ScoreP GetOne() const { return ScoreP(new SERScore); } - void Subtract(const Score& rhs, Score* res) const { - SERScore* r = static_cast(res); - r->correct = correct - static_cast(rhs).correct; - r->total = total - static_cast(rhs).total; - } - void Encode(string* out) const { - assert(!"not implemented"); - } - bool IsAdditiveIdentity() const { - return (total == 0 && correct == 0); // correct is always 0 <= n <= total - } - private: - int correct, total; -}; - -std::string SentenceScorer::verbose_desc() const { - return desc+",ref0={ "+TD::GetString(refs[0])+" }"; -} - -class SERScorer : public SentenceScorer { - public: - SERScorer(const vector >& references) : SentenceScorer("SERScorer",references),refs_(references) {} - ScoreP ScoreCCandidate(const vector& /* hyp */) const { - return ScoreP(); - } - ScoreP ScoreCandidate(const vector& hyp) const { - SERScore* res = new SERScore; - res->total = 1; - for (int i = 0; i < refs_.size(); ++i) - if (refs_[i] == hyp) res->correct = 1; - return ScoreP(res); - } - static ScoreP 
ScoreFromString(const string& data) { - assert(!"Not implemented"); - } - private: - vector > refs_; -}; - -class BLEUScore : public ScoreBase { - friend class BLEUScorerBase; - public: - BLEUScore(int n) : correct_ngram_hit_counts(float(0),n), hyp_ngram_counts(float(0),n) { - ref_len = 0; - hyp_len = 0; } - BLEUScore(int n, int k) : correct_ngram_hit_counts(float(k),n), hyp_ngram_counts(float(k),n) { - ref_len = k; - hyp_len = k; } - float ComputeScore() const; - float ComputePartialScore() const; - void ScoreDetails(string* details) const; - void TimesEquals(float scale); - void PlusEquals(const Score& delta); - void PlusEquals(const Score& delta, const float scale); - void PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len); - ScoreP GetZero() const; - ScoreP GetOne() const; - void Subtract(const Score& rhs, Score* res) const; - void Encode(string* out) const; - bool IsAdditiveIdentity() const { - if (fabs(ref_len) > 0.1f || hyp_len != 0) return false; - for (int i = 0; i < correct_ngram_hit_counts.size(); ++i) - if (hyp_ngram_counts[i] != 0 || - correct_ngram_hit_counts[i] != 0) return false; - return true; - } - private: - int N() const { - return hyp_ngram_counts.size(); - } - float ComputeScore(vector* precs, float* bp) const; - float ComputePartialScore(vector* prec, float* bp) const; - valarray correct_ngram_hit_counts; - valarray hyp_ngram_counts; - float ref_len; - float hyp_len; -}; - -class BLEUScorerBase : public SentenceScorer { - public: - BLEUScorerBase(const vector >& references, - int n - ); - ScoreP ScoreCandidate(const vector& hyp) const; - ScoreP ScoreCCandidate(const vector& hyp) const; - static ScoreP ScoreFromString(const string& in); - - virtual float ComputeRefLength(const vector& hyp) const = 0; - private: - struct NGramCompare { - int operator() (const vector& a, const vector& b) { - size_t as = a.size(); - size_t bs = b.size(); - const size_t s = (as < bs ? 
as : bs); - for (size_t i = 0; i < s; ++i) { - int d = a[i] - b[i]; - if (d < 0) return true; - if (d > 0) return false; - } - return as < bs; - } - }; - typedef map, pair, NGramCompare> NGramCountMap; - void CountRef(const vector& ref) { - NGramCountMap tc; - vector ngram(n_); - int s = ref.size(); - for (int j=0; j& p = ngrams_[i->first]; - if (p.first < i->second.first) - p = i->second; - } - } - - void ComputeNgramStats(const vector& sent, - valarray* correct, - valarray* hyp, - bool clip_counts) - const { - assert(correct->size() == n_); - assert(hyp->size() == n_); - vector ngram(n_); - (*correct) *= 0; - (*hyp) *= 0; - int s = sent.size(); - for (int j=0; j& p = ngrams_[ngram]; - if(clip_counts){ - if (p.second < p.first) { - ++p.second; - (*correct)[i-1]++; - }} - else { - ++p.second; - (*correct)[i-1]++; - } - // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams: - if (!p.first) { - for (; i<=k; ++i) - (*hyp)[i-1]++; - } else { - (*hyp)[i-1]++; - } - } - } - } - - mutable NGramCountMap ngrams_; - int n_; - vector lengths_; -}; - -ScoreP BLEUScorerBase::ScoreFromString(const string& in) { - istringstream is(in); - int n; - is >> n; - BLEUScore* r = new BLEUScore(n); - is >> r->ref_len >> r->hyp_len; - - for (int i = 0; i < n; ++i) { - is >> r->correct_ngram_hit_counts[i]; - is >> r->hyp_ngram_counts[i]; - } - return ScoreP(r); -} - -class IBM_BLEUScorer : public BLEUScorerBase { - public: - IBM_BLEUScorer(const vector >& references, - int n=4) : BLEUScorerBase(references, n), lengths_(references.size()) { - for (int i=0; i < references.size(); ++i) - lengths_[i] = references[i].size(); - } - float ComputeRefLength(const vector& hyp) const { - if (lengths_.size() == 1) return lengths_[0]; - int bestd = 2000000; - int hl = hyp.size(); - int bl = -1; - for (vector::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) { - int cl = *ci; - if (abs(cl - hl) < bestd) { - bestd = abs(cl - hl); - bl = cl; - } - } - 
return bl; - } - private: - vector lengths_; -}; - -class NIST_BLEUScorer : public BLEUScorerBase { - public: - NIST_BLEUScorer(const vector >& references, - int n=4) : BLEUScorerBase(references, n), - shortest_(references[0].size()) { - for (int i=1; i < references.size(); ++i) - if (references[i].size() < shortest_) - shortest_ = references[i].size(); - } - float ComputeRefLength(const vector& /* hyp */) const { - return shortest_; - } - private: - float shortest_; -}; - -class Koehn_BLEUScorer : public BLEUScorerBase { - public: - Koehn_BLEUScorer(const vector >& references, - int n=4) : BLEUScorerBase(references, n), - avg_(0) { - for (int i=0; i < references.size(); ++i) - avg_ += references[i].size(); - avg_ /= references.size(); - } - float ComputeRefLength(const vector& /* hyp */) const { - return avg_; - } - private: - float avg_; -}; - -ScorerP SentenceScorer::CreateSentenceScorer(const ScoreType type, - const vector >& refs, - const string& src) -{ - SentenceScorer *r=0; - switch (type) { - case IBM_BLEU: r = new IBM_BLEUScorer(refs, 4);break; - case IBM_BLEU_3 : r = new IBM_BLEUScorer(refs,3);break; - case NIST_BLEU: r = new NIST_BLEUScorer(refs, 4);break; - case Koehn_BLEU: r = new Koehn_BLEUScorer(refs, 4);break; - case AER: r = new AERScorer(refs, src);break; - case TER: r = new TERScorer(refs);break; - case SER: r = new SERScorer(refs);break; - case BLEU_minus_TER_over_2: r = new BLEUTERCombinationScorer(refs);break; - default: - assert(!"Not implemented!"); - } - return ScorerP(r); -} - -ScoreP SentenceScorer::GetOne() const { - Sentence s; - return ScoreCCandidate(s)->GetOne(); -} - -ScoreP SentenceScorer::GetZero() const { - Sentence s; - return ScoreCCandidate(s)->GetZero(); -} - -ScoreP Score::GetOne(ScoreType type) { - std::vector refs; - return SentenceScorer::CreateSentenceScorer(type,refs)->GetOne(); -} - -ScoreP Score::GetZero(ScoreType type) { - std::vector refs; - return SentenceScorer::CreateSentenceScorer(type,refs)->GetZero(); -} - - 
-ScoreP SentenceScorer::CreateScoreFromString(const ScoreType type, const string& in) { - switch (type) { - case IBM_BLEU: - case IBM_BLEU_3: - case NIST_BLEU: - case Koehn_BLEU: - return BLEUScorerBase::ScoreFromString(in); - case TER: - return TERScorer::ScoreFromString(in); - case AER: - return AERScorer::ScoreFromString(in); - case SER: - return SERScorer::ScoreFromString(in); - case BLEU_minus_TER_over_2: - return BLEUTERCombinationScorer::ScoreFromString(in); - default: - assert(!"Not implemented!"); - } -} - -void SentenceScorer::ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) const { - vector prev_trans; - const vector >& ienv = ve.GetSortedSegs(); - env->resize(ienv.size()); - ScoreP prev_score; - int j = 0; - for (int i = 0; i < ienv.size(); ++i) { - const Segment& seg = *ienv[i]; - vector trans; - if (type == AER) { - vector edges(hg.edges_.size(), false); - seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi - // alignment - ostringstream os; - const string* psrc = this->GetSource(); - if (psrc == NULL) { - cerr << "AER scoring in VEST requires source, but it is missing!\n"; - abort(); - } - size_t pos = psrc->rfind(" ||| "); - if (pos == string::npos) { - cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; - abort(); - } - Lattice src; - Lattice ref; - LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); - LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); - AlignerTools::WriteAlignment(src, ref, hg, &os, true, &edges); - string tstr = os.str(); - TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); - } else { - seg.ConstructTranslation(&trans); - } - // cerr << "Scoring: " << TD::GetString(trans) << endl; - if (trans == prev_trans) { - if (!minimize_segments) { - assert(prev_score); // if this fails, it means - // the decoder can generate null translations - ErrorSegment& out = (*env)[j]; - out.delta = 
prev_score->GetZero(); - out.x = seg.x; - ++j; - } - // cerr << "Identical translation, skipping scoring\n"; - } else { - ScoreP score = ScoreCandidate(trans); - // cerr << "score= " << score->ComputeScore() << "\n"; - ScoreP cur_delta_p = score->GetZero(); - Score* cur_delta = cur_delta_p.get(); - // just record the score diffs - if (!prev_score) - prev_score = score->GetZero(); - - score->Subtract(*prev_score, cur_delta); - prev_trans.swap(trans); - prev_score = score; - if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { - ErrorSegment& out = (*env)[j]; - out.delta = cur_delta_p; - out.x = seg.x; - ++j; - } - } - } - // cerr << " In segments: " << ienv.size() << endl; - // cerr << "Out segments: " << j << endl; - assert(j > 0); - env->resize(j); -} - -void BLEUScore::ScoreDetails(string* details) const { - char buf[2000]; - vector precs(max(N(),4)); - float bp; - float bleu = ComputeScore(&precs, &bp); - for (int i=N();i<4;++i) - precs[i]=0.; - char *bufn; - bufn=buf+sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", - bleu*100.0, - precs[0]*100.0, - precs[1]*100.0, - precs[2]*100.0, - precs[3]*100.0, - bp); - *details = buf; -} - -float BLEUScore::ComputeScore(vector* precs, float* bp) const { - float log_bleu = 0; - if (precs) precs->clear(); - int count = 0; - for (int i = 0; i < N(); ++i) { - if (hyp_ngram_counts[i] > 0) { - float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); - if (precs) precs->push_back(exp(lprec)); - log_bleu += lprec; - ++count; - } - } - log_bleu /= static_cast(count); - float lbp = 0.0; - if (hyp_len < ref_len) - lbp = (hyp_len - ref_len) / hyp_len; - log_bleu += lbp; - if (bp) *bp = exp(lbp); - return exp(log_bleu); -} - - -//comptue scaled score for oracle retrieval -float BLEUScore::ComputePartialScore(vector* precs, float* bp) const { - // cerr << "Then here " << endl; - float log_bleu = 0; - if (precs) precs->clear(); - int count = 0; - for (int i = 0; i < N(); ++i) { - // cerr << "In 
CPS " << hyp_ngram_counts[i] << " " << correct_ngram_hit_counts[i] << endl; - if (hyp_ngram_counts[i] > 0) { - float lprec = log(correct_ngram_hit_counts[i]) - log(hyp_ngram_counts[i]); - if (precs) precs->push_back(exp(lprec)); - log_bleu += lprec; - ++count; - } - } - log_bleu /= static_cast(count); - float lbp = 0.0; - if (hyp_len < ref_len) - lbp = (hyp_len - ref_len) / hyp_len; - log_bleu += lbp; - if (bp) *bp = exp(lbp); - return exp(log_bleu); -} - -float BLEUScore::ComputePartialScore() const { - // cerr << "In here first " << endl; - return ComputePartialScore(NULL, NULL); -} - -float BLEUScore::ComputeScore() const { - return ComputeScore(NULL, NULL); -} - -void BLEUScore::Subtract(const Score& rhs, Score* res) const { - const BLEUScore& d = static_cast(rhs); - BLEUScore* o = static_cast(res); - o->ref_len = ref_len - d.ref_len; - o->hyp_len = hyp_len - d.hyp_len; - o->correct_ngram_hit_counts = correct_ngram_hit_counts - d.correct_ngram_hit_counts; - o->hyp_ngram_counts = hyp_ngram_counts - d.hyp_ngram_counts; -} - -void BLEUScore::PlusEquals(const Score& delta) { - const BLEUScore& d = static_cast(delta); - correct_ngram_hit_counts += d.correct_ngram_hit_counts; - hyp_ngram_counts += d.hyp_ngram_counts; - ref_len += d.ref_len; - hyp_len += d.hyp_len; -} - -void BLEUScore::TimesEquals(float scale) { - correct_ngram_hit_counts *= scale; - hyp_ngram_counts *= scale; - ref_len *= scale; - hyp_len *= scale; -} - -void BLEUScore::PlusEquals(const Score& delta, const float scale) { - const BLEUScore& d = static_cast(delta); - correct_ngram_hit_counts = correct_ngram_hit_counts + (d.correct_ngram_hit_counts * scale); - hyp_ngram_counts = hyp_ngram_counts + (d.hyp_ngram_counts * scale); - ref_len = ref_len + (d.ref_len * scale); - hyp_len = hyp_len + (d.hyp_len * scale); -} - -void BLEUScore::PlusPartialEquals(const Score& delta, int oracle_e_cover, int oracle_f_cover, int src_len){ - const BLEUScore& d = static_cast(delta); - correct_ngram_hit_counts += 
d.correct_ngram_hit_counts; - hyp_ngram_counts += d.hyp_ngram_counts; - //scale the reference length according to the size of the input sentence covered by this rule - - ref_len *= (float)oracle_f_cover / src_len; - ref_len += d.ref_len; - - hyp_len = oracle_e_cover; - hyp_len += d.hyp_len; -} - - -ScoreP BLEUScore::GetZero() const { - return ScoreP(new BLEUScore(N())); -} - -ScoreP BLEUScore::GetOne() const { - return ScoreP(new BLEUScore(N(),1)); -} - - -void BLEUScore::Encode(string* out) const { - ostringstream os; - const int n = correct_ngram_hit_counts.size(); - os << n << ' ' << ref_len << ' ' << hyp_len; - for (int i = 0; i < n; ++i) - os << ' ' << correct_ngram_hit_counts[i] << ' ' << hyp_ngram_counts[i]; - *out = os.str(); -} - -BLEUScorerBase::BLEUScorerBase(const vector >& references, - int n) : SentenceScorer("BLEU"+boost::lexical_cast(n),references),n_(n) { - for (vector >::const_iterator ci = references.begin(); - ci != references.end(); ++ci) { - lengths_.push_back(ci->size()); - CountRef(*ci); - } -} - -ScoreP BLEUScorerBase::ScoreCandidate(const vector& hyp) const { - BLEUScore* bs = new BLEUScore(n_); - for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) - i->second.second = 0; - ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts, true); - bs->ref_len = ComputeRefLength(hyp); - bs->hyp_len = hyp.size(); - return ScoreP(bs); -} - -ScoreP BLEUScorerBase::ScoreCCandidate(const vector& hyp) const { - BLEUScore* bs = new BLEUScore(n_); - for (NGramCountMap::iterator i=ngrams_.begin(); i != ngrams_.end(); ++i) - i->second.second = 0; - bool clip = false; - ComputeNgramStats(hyp, &bs->correct_ngram_hit_counts, &bs->hyp_ngram_counts,clip); - bs->ref_len = ComputeRefLength(hyp); - bs->hyp_len = hyp.size(); - return ScoreP(bs); -} - - -DocScorer::~DocScorer() { -} - -void DocScorer::Init( - const ScoreType type, - const vector& ref_files, - const string& src_file, bool verbose) { - scorers_.clear(); - // TODO 
stop using valarray, start using ReadFile - cerr << "Loading references (" << ref_files.size() << " files)\n"; - ReadFile srcrf; - if (type == AER && src_file.size() > 0) { - cerr << " (source=" << src_file << ")\n"; - srcrf.Init(src_file); - } - std::vector ifs(ref_files.begin(),ref_files.end()); - for (int i=0; i < ref_files.size(); ++i) ifs[i].Init(ref_files[i]); - char buf[64000]; - bool expect_eof = false; - int line=0; - while (ifs[0].get()) { - vector > refs(ref_files.size()); - for (int i=0; i < ref_files.size(); ++i) { - istream &in=ifs[i].get(); - if (in.eof()) break; - in.getline(buf, 64000); - refs[i].clear(); - if (strlen(buf) == 0) { - if (in.eof()) { - if (!expect_eof) { - assert(i == 0); - expect_eof = true; - } - break; - } - } else { - TD::ConvertSentence(buf, &refs[i]); - assert(!refs[i].empty()); - } - assert(!expect_eof); - } - if (!expect_eof) { - string src_line; - if (srcrf) { - getline(srcrf.get(), src_line); - map dummy; - ProcessAndStripSGML(&src_line, &dummy); - } - scorers_.push_back(ScorerP(SentenceScorer::CreateSentenceScorer(type, refs, src_line))); - if (verbose) - cerr<<"doc_scorer["<verbose_desc()< -#include -#include -//TODO: use intrusive shared_ptr in Score (because there are many of them on ErrorSurfaces) -#include "wordid.h" -#include "intrusive_refcount.hpp" - -class Score; -class SentenceScorer; -typedef boost::intrusive_ptr ScoreP; -typedef boost::shared_ptr ScorerP; - -class ViterbiEnvelope; -class ErrorSurface; -class Hypergraph; // needed for alignment - -//TODO: BLEU N (N separate arg, not part of enum)? 
-enum ScoreType { IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, BLEU_minus_TER_over_2, SER, AER, IBM_BLEU_3 }; -ScoreType ScoreTypeFromString(const std::string& st); -std::string StringFromScoreType(ScoreType st); - -class Score : public boost::intrusive_refcount { - public: - virtual ~Score(); - virtual float ComputeScore() const = 0; - virtual float ComputePartialScore() const =0; - virtual void ScoreDetails(std::string* details) const = 0; - std::string ScoreDetails() { - std::string d; - ScoreDetails(&d); - return d; - } - virtual void TimesEquals(float scale); // only for bleu; for mira oracle - /// same as rhs.TimesEquals(scale);PlusEquals(rhs) except doesn't modify rhs. - virtual void PlusEquals(const Score& rhs, const float scale) = 0; - virtual void PlusEquals(const Score& rhs) = 0; - virtual void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len) = 0; - virtual void Subtract(const Score& rhs, Score *res) const = 0; - virtual ScoreP GetZero() const = 0; - virtual ScoreP GetOne() const = 0; - virtual bool IsAdditiveIdentity() const = 0; // returns true if adding this delta - // to another score results in no score change - // under any circumstances - virtual void Encode(std::string* out) const = 0; - static ScoreP GetZero(ScoreType type); - static ScoreP GetOne(ScoreType type); - virtual ScoreP Clone() const = 0; -protected: - Score() { } // we define these explicitly because refcount is noncopyable - Score(Score const& o) { } -}; - -//TODO: make sure default copy ctors for score types do what we want. 
-template -struct ScoreBase : public Score { - ScoreP Clone() const { - return ScoreP(new Derived(dynamic_cast(*this))); - } -}; - -class SentenceScorer { - public: - typedef std::vector Sentence; - typedef std::vector Sentences; - std::string desc; - Sentences refs; - SentenceScorer(std::string desc="SentenceScorer_unknown", Sentences const& refs=Sentences()) : desc(desc),refs(refs) { } - std::string verbose_desc() const; - virtual float ComputeRefLength(const Sentence& hyp) const; // default: avg of refs.length - virtual ~SentenceScorer(); - virtual ScoreP GetOne() const; - virtual ScoreP GetZero() const; - void ComputeErrorSurface(const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg) const; - virtual ScoreP ScoreCandidate(const Sentence& hyp) const = 0; - virtual ScoreP ScoreCCandidate(const Sentence& hyp) const =0; - virtual const std::string* GetSource() const; - static ScoreP CreateScoreFromString(const ScoreType type, const std::string& in); - static ScorerP CreateSentenceScorer(const ScoreType type, - const std::vector& refs, - const std::string& src = ""); -}; - -//TODO: should be able to GetOne GetZero without supplying sentence (just type) -class DocScorer { - public: - ~DocScorer(); - DocScorer() { } - void Init(const ScoreType type, - const std::vector& ref_files, - const std::string& src_file = "", - bool verbose=false - ); - DocScorer(const ScoreType type, - const std::vector& ref_files, - const std::string& src_file = "", - bool verbose=false - ) - { - Init(type,ref_files,src_file,verbose); - } - - int size() const { return scorers_.size(); } - ScorerP operator[](size_t i) const { return scorers_[i]; } - private: - std::vector scorers_; -}; - - -#endif diff --git a/vest/ter.cc b/vest/ter.cc deleted file mode 100644 index cacc5b00..00000000 --- a/vest/ter.cc +++ /dev/null @@ -1,535 +0,0 @@ -#include "ter.h" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include 
-#include "tdict.h" - -const bool ter_use_average_ref_len = true; -const int ter_short_circuit_long_sentences = -1; - -using namespace std; -using namespace std::tr1; - -struct COSTS { - static const float substitution; - static const float deletion; - static const float insertion; - static const float shift; -}; -const float COSTS::substitution = 1.0f; -const float COSTS::deletion = 1.0f; -const float COSTS::insertion = 1.0f; -const float COSTS::shift = 1.0f; - -static const int MAX_SHIFT_SIZE = 10; -static const int MAX_SHIFT_DIST = 50; - -struct Shift { - unsigned int d_; - Shift() : d_() {} - Shift(int b, int e, int m) : d_() { - begin(b); - end(e); - moveto(m); - } - inline int begin() const { - return d_ & 0x3ff; - } - inline int end() const { - return (d_ >> 10) & 0x3ff; - } - inline int moveto() const { - int m = (d_ >> 20) & 0x7ff; - if (m > 1024) { m -= 1024; m *= -1; } - return m; - } - inline void begin(int b) { - d_ &= 0xfffffc00u; - d_ |= (b & 0x3ff); - } - inline void end(int e) { - d_ &= 0xfff003ffu; - d_ |= (e & 0x3ff) << 10; - } - inline void moveto(int m) { - bool neg = (m < 0); - if (neg) { m *= -1; m += 1024; } - d_ &= 0xfffff; - d_ |= (m & 0x7ff) << 20; - } -}; - -class TERScorerImpl { - - public: - enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; - - explicit TERScorerImpl(const vector& ref) : ref_(ref) { - for (int i = 0; i < ref.size(); ++i) - rwexists_.insert(ref[i]); - } - - float Calculate(const vector& hyp, int* subs, int* ins, int* dels, int* shifts) const { - return CalculateAllShifts(hyp, subs, ins, dels, shifts); - } - - inline int GetRefLength() const { - return ref_.size(); - } - - private: - vector ref_; - set rwexists_; - - typedef unordered_map, set, boost::hash > > NgramToIntsMap; - mutable NgramToIntsMap nmap_; - - static float MinimumEditDistance( - const vector& hyp, - const vector& ref, - vector* path) { - vector > bmat(hyp.size() + 1, vector(ref.size() + 1, MATCH)); - vector > cmat(hyp.size() + 1, 
vector(ref.size() + 1, 0)); - for (int i = 0; i <= hyp.size(); ++i) - cmat[i][0] = i; - for (int j = 0; j <= ref.size(); ++j) - cmat[0][j] = j; - for (int i = 1; i <= hyp.size(); ++i) { - const WordID& hw = hyp[i-1]; - for (int j = 1; j <= ref.size(); ++j) { - const WordID& rw = ref[j-1]; - float& cur_c = cmat[i][j]; - TransType& cur_b = bmat[i][j]; - - if (rw == hw) { - cur_c = cmat[i-1][j-1]; - cur_b = MATCH; - } else { - cur_c = cmat[i-1][j-1] + COSTS::substitution; - cur_b = SUBSTITUTION; - } - float cwoi = cmat[i-1][j]; - if (cur_c > cwoi + COSTS::insertion) { - cur_c = cwoi + COSTS::insertion; - cur_b = INSERTION; - } - float cwod = cmat[i][j-1]; - if (cur_c > cwod + COSTS::deletion) { - cur_c = cwod + COSTS::deletion; - cur_b = DELETION; - } - } - } - - // trace back along the best path and record the transition types - path->clear(); - int i = hyp.size(); - int j = ref.size(); - while (i > 0 || j > 0) { - if (j == 0) { - --i; - path->push_back(INSERTION); - } else if (i == 0) { - --j; - path->push_back(DELETION); - } else { - TransType t = bmat[i][j]; - path->push_back(t); - switch (t) { - case SUBSTITUTION: - case MATCH: - --i; --j; break; - case INSERTION: - --i; break; - case DELETION: - --j; break; - } - } - } - reverse(path->begin(), path->end()); - return cmat[hyp.size()][ref.size()]; - } - - void BuildWordMatches(const vector& hyp, NgramToIntsMap* nmap) const { - nmap->clear(); - set exists_both; - for (int i = 0; i < hyp.size(); ++i) - if (rwexists_.find(hyp[i]) != rwexists_.end()) - exists_both.insert(hyp[i]); - for (int start=0; start cp; - int mlen = min(MAX_SHIFT_SIZE, static_cast(ref_.size() - start)); - for (int len=0; len& in, - int start, int end, int moveto, vector* out) { - // cerr << "ps: " << start << " " << end << " " << moveto << endl; - out->clear(); - if (moveto == -1) { - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = 0; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; i < in.size(); ++i) - 
out->push_back(in[i]); - } else if (moveto < start) { - for (int i = 0; i <= moveto; ++i) - out->push_back(in[i]); - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = moveto+1; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; i < in.size(); ++i) - out->push_back(in[i]); - } else if (moveto > end) { - for (int i = 0; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; i <= moveto; ++i) - out->push_back(in[i]); - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = moveto+1; i < in.size(); ++i) - out->push_back(in[i]); - } else { - for (int i = 0; i < start; ++i) - out->push_back(in[i]); - for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i) - out->push_back(in[i]); - for (int i = start; i <= end; ++i) - out->push_back(in[i]); - for (int i = (end + (moveto - start))+1; i < in.size(); ++i) - out->push_back(in[i]); - } - if (out->size() != in.size()) { - cerr << "ps: " << start << " " << end << " " << moveto << endl; - cerr << "in=" << TD::GetString(in) << endl; - cerr << "out=" << TD::GetString(*out) << endl; - } - assert(out->size() == in.size()); - // cerr << "ps: " << TD::GetString(*out) << endl; - } - - void GetAllPossibleShifts(const vector& hyp, - const vector& ralign, - const vector& herr, - const vector& rerr, - const int min_size, - vector >* shifts) const { - for (int start = 0; start < hyp.size(); ++start) { - vector cp(1, hyp[start]); - NgramToIntsMap::iterator niter = nmap_.find(cp); - if (niter == nmap_.end()) continue; - bool ok = false; - int moveto; - for (set::iterator i = niter->second.begin(); i != niter->second.end(); ++i) { - moveto = *i; - int rm = ralign[moveto]; - ok = (start != rm && - (rm - start) < MAX_SHIFT_DIST && - (start - rm - 1) < MAX_SHIFT_DIST); - if (ok) break; - } - if (!ok) continue; - cp.clear(); - for (int end = start + min_size - 1; - ok && end < hyp.size() && end < (start + MAX_SHIFT_SIZE); ++end) { - cp.push_back(hyp[end]); - 
vector& sshifts = (*shifts)[end - start]; - ok = false; - NgramToIntsMap::iterator niter = nmap_.find(cp); - if (niter == nmap_.end()) break; - bool any_herr = false; - for (int i = start; i <= end && !any_herr; ++i) - any_herr = herr[i]; - if (!any_herr) { - ok = true; - continue; - } - for (set::iterator mi = niter->second.begin(); - mi != niter->second.end(); ++mi) { - int moveto = *mi; - int rm = ralign[moveto]; - if (! ((rm != start) && - ((rm < start) || (rm > end)) && - (rm - start <= MAX_SHIFT_DIST) && - ((start - rm - 1) <= MAX_SHIFT_DIST))) continue; - ok = true; - bool any_rerr = false; - for (int i = 0; (i <= end - start) && (!any_rerr); ++i) - any_rerr = rerr[moveto+i]; - if (!any_rerr) continue; - for (int roff = 0; roff <= (end - start); ++roff) { - int rmr = ralign[moveto+roff]; - if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto]))) - sshifts.push_back(Shift(start, end, moveto + roff)); - } - } - } - } - } - - bool CalculateBestShift(const vector& cur, - const vector& hyp, - float curerr, - const vector& path, - vector* new_hyp, - float* newerr, - vector* new_path) const { - vector herr, rerr; - vector ralign; - int hpos = -1; - for (int i = 0; i < path.size(); ++i) { - switch (path[i]) { - case MATCH: - ++hpos; - herr.push_back(false); - rerr.push_back(false); - ralign.push_back(hpos); - break; - case SUBSTITUTION: - ++hpos; - herr.push_back(true); - rerr.push_back(true); - ralign.push_back(hpos); - break; - case INSERTION: - ++hpos; - herr.push_back(true); - break; - case DELETION: - rerr.push_back(true); - ralign.push_back(hpos); - break; - } - } -#if 0 - cerr << "RALIGN: "; - for (int i = 0; i < rerr.size(); ++i) - cerr << ralign[i] << " "; - cerr << endl; - cerr << "RERR: "; - for (int i = 0; i < rerr.size(); ++i) - cerr << (bool)rerr[i] << " "; - cerr << endl; - cerr << "HERR: "; - for (int i = 0; i < herr.size(); ++i) - cerr << (bool)herr[i] << " "; - cerr << endl; -#endif - - vector > shifts(MAX_SHIFT_SIZE + 1); - 
GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts); - float cur_best_shift_cost = 0; - *newerr = curerr; - vector cur_best_path; - vector cur_best_hyp; - - bool res = false; - for (int i = shifts.size() - 1; i >=0; --i) { - float curfix = curerr - (cur_best_shift_cost + *newerr); - float maxfix = 2.0f * (1 + i) - COSTS::shift; - if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break; - for (int j = 0; j < shifts[i].size(); ++j) { - const Shift& s = shifts[i][j]; - curfix = curerr - (cur_best_shift_cost + *newerr); - maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove? - if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue; - vector shifted(cur.size()); - PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted); - vector try_path; - float try_cost = MinimumEditDistance(shifted, ref_, &try_path); - float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift); - if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) { - *newerr = try_cost; - cur_best_shift_cost = COSTS::shift; - new_path->swap(try_path); - new_hyp->swap(shifted); - res = true; - // cerr << "Found better shift " << s.begin() << "..." 
<< s.end() << " moveto " << s.moveto() << endl; - } - } - } - - return res; - } - - static void GetPathStats(const vector& path, int* subs, int* ins, int* dels) { - *subs = *ins = *dels = 0; - for (int i = 0; i < path.size(); ++i) { - switch (path[i]) { - case SUBSTITUTION: - ++(*subs); - case MATCH: - break; - case INSERTION: - ++(*ins); break; - case DELETION: - ++(*dels); break; - } - } - } - - float CalculateAllShifts(const vector& hyp, - int* subs, int* ins, int* dels, int* shifts) const { - BuildWordMatches(hyp, &nmap_); - vector path; - float med_cost = MinimumEditDistance(hyp, ref_, &path); - float edits = 0; - vector cur = hyp; - *shifts = 0; - if (ter_short_circuit_long_sentences < 0 || - ref_.size() < ter_short_circuit_long_sentences) { - while (true) { - vector new_hyp; - vector new_path; - float new_med_cost; - if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path)) - break; - edits += COSTS::shift; - ++(*shifts); - med_cost = new_med_cost; - path.swap(new_path); - cur.swap(new_hyp); - } - } - GetPathStats(path, subs, ins, dels); - return med_cost + edits; - } -}; - -class TERScore : public ScoreBase { - friend class TERScorer; - - public: - static const unsigned kINSERTIONS = 0; - static const unsigned kDELETIONS = 1; - static const unsigned kSUBSTITUTIONS = 2; - static const unsigned kSHIFTS = 3; - static const unsigned kREF_WORDCOUNT = 4; - static const unsigned kDUMMY_LAST_ENTRY = 5; - - TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} - float ComputePartialScore() const { return 0.0;} - float ComputeScore() const { - float edits = static_cast(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); - return edits / static_cast(stats[kREF_WORDCOUNT]); - } - void ScoreDetails(string* details) const; - void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} - void PlusEquals(const Score& delta, const float scale) { - if (scale==1) - stats += 
static_cast(delta).stats; - if (scale==-1) - stats -= static_cast(delta).stats; - throw std::runtime_error("TERScore::PlusEquals with scale != +-1"); - } - void PlusEquals(const Score& delta) { - stats += static_cast(delta).stats; - } - - ScoreP GetZero() const { - return ScoreP(new TERScore); - } - ScoreP GetOne() const { - return ScoreP(new TERScore); - } - void Subtract(const Score& rhs, Score* res) const { - static_cast(res)->stats = stats - static_cast(rhs).stats; - } - void Encode(std::string* out) const { - ostringstream os; - os << stats[kINSERTIONS] << ' ' - << stats[kDELETIONS] << ' ' - << stats[kSUBSTITUTIONS] << ' ' - << stats[kSHIFTS] << ' ' - << stats[kREF_WORDCOUNT]; - *out = os.str(); - } - bool IsAdditiveIdentity() const { - for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i) - if (stats[i] != 0) return false; - return true; - } - private: - valarray stats; -}; - -ScoreP TERScorer::ScoreFromString(const std::string& data) { - istringstream is(data); - TERScore* r = new TERScore; - is >> r->stats[TERScore::kINSERTIONS] - >> r->stats[TERScore::kDELETIONS] - >> r->stats[TERScore::kSUBSTITUTIONS] - >> r->stats[TERScore::kSHIFTS] - >> r->stats[TERScore::kREF_WORDCOUNT]; - return ScoreP(r); -} - -void TERScore::ScoreDetails(std::string* details) const { - char buf[200]; - sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", - ComputeScore() * 100.0f, - stats[kINSERTIONS], - stats[kDELETIONS], - stats[kSUBSTITUTIONS], - stats[kSHIFTS], - stats[kREF_WORDCOUNT]); - *details = buf; -} - -TERScorer::~TERScorer() { - for (vector::iterator i = impl_.begin(); i != impl_.end(); ++i) - delete *i; -} - -TERScorer::TERScorer(const vector >& refs) : impl_(refs.size()) { - for (int i = 0; i < refs.size(); ++i) - impl_[i] = new TERScorerImpl(refs[i]); -} - -ScoreP TERScorer::ScoreCCandidate(const vector& hyp) const { - return ScoreP(); -} - -ScoreP TERScorer::ScoreCandidate(const std::vector& hyp) const { - float best_score = numeric_limits::max(); - TERScore* res = new 
TERScore; - int avg_len = 0; - for (int i = 0; i < impl_.size(); ++i) - avg_len += impl_[i]->GetRefLength(); - avg_len /= impl_.size(); - for (int i = 0; i < impl_.size(); ++i) { - int subs, ins, dels, shifts; - float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts); - // cerr << "Component TER cost: " << score << endl; - if (score < best_score) { - res->stats[TERScore::kINSERTIONS] = ins; - res->stats[TERScore::kDELETIONS] = dels; - res->stats[TERScore::kSUBSTITUTIONS] = subs; - res->stats[TERScore::kSHIFTS] = shifts; - if (ter_use_average_ref_len) { - res->stats[TERScore::kREF_WORDCOUNT] = avg_len; - } else { - res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength(); - } - - best_score = score; - } - } - return ScoreP(res); -} diff --git a/vest/ter.h b/vest/ter.h deleted file mode 100644 index 43314791..00000000 --- a/vest/ter.h +++ /dev/null @@ -1,19 +0,0 @@ -#ifndef _TER_H_ -#define _TER_H_ - -#include "scorer.h" - -class TERScorerImpl; - -class TERScorer : public SentenceScorer { - public: - TERScorer(const std::vector >& references); - ~TERScorer(); - ScoreP ScoreCandidate(const std::vector& hyp) const; - ScoreP ScoreCCandidate(const std::vector& hyp) const; - static ScoreP ScoreFromString(const std::string& data); - private: - std::vector impl_; -}; - -#endif -- cgit v1.2.3