From ba939df399a160f9a8370911c840635d6cee4f58 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 20 Dec 2011 18:34:14 -0500 Subject: migrate fast_score to the new API --- vest/dist-vest.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'vest') diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 11e791c1..c382a972 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -308,7 +308,7 @@ while (1){ $retries++; } die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); - my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); chomp $dec_score; print STDERR "DECODER SCORE: $dec_score\n"; -- cgit v1.2.3 From 3c1c98b5aec7aec34432ddc37385df06d301bdd5 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 02:31:00 -0500 Subject: migrate mert to the new scorer interface --- gi/pf/base_distributions.cc | 241 ++++++++++++++++++++++++++++++++++++++++ gi/pf/base_distributions.h | 261 ++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 241 ---------------------------------------- gi/pf/base_measures.h | 247 ----------------------------------------- mteval/ns.cc | 4 + mteval/ns.h | 10 +- vest/ces.cc | 42 +++---- vest/ces.h | 10 +- vest/dist-vest.pl | 4 +- vest/error_surface.cc | 11 +- vest/error_surface.h | 6 +- vest/line_optimizer.cc | 20 ++-- vest/line_optimizer.h | 2 + vest/lo_test.cc | 21 ++-- vest/mr_vest_map.cc | 16 +-- vest/mr_vest_reduce.cc | 34 +++--- 16 files changed, 602 insertions(+), 568 deletions(-) create mode 100644 gi/pf/base_distributions.cc create mode 100644 gi/pf/base_distributions.h delete mode 100644 gi/pf/base_measures.cc delete mode 100644 gi/pf/base_measures.h (limited to 'vest') diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc new file mode 100644 index 00000000..4b1863fa --- /dev/null +++ b/gi/pf/base_distributions.cc @@ -0,0 +1,241 @@ +#include "base_measures.h" + +#include + +#include "filelib.h" + +using namespace std; + +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + vector le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x].logeq(atof(TD::Convert(cur))); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform + return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= kUNIFORM_TARGET; // draw e_i ~Uniform + return p; +} + +void Model1::LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; +} + +prob_t PhraseConditionalBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) { // for each position i in e-RHS + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +prob_t PhraseJointBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + // elen | flen ~Pois(flen + 0.01) + prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + p *= ptrglen; + p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform + for (int i = 0; i < elen; ++i) { // for each position i in E + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); + + prob_t p1; + p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + // elen | flen ~Pois(flen + 0.01) + prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + p1 *= ptrglen; + p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform + for (int i = 0; i < elen; ++i) { // for each position i in E + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p1.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + + prob_t p2; + p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) + // flen | elen ~Pois(flen + 0.01) + prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); + p2 *= psrclen; + p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform + for (int i = 0; i < flen; ++i) { // for each position i in E + const WordID src = vsrc[i + start_src]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < elen; ++j) { + const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; + tp += kM1MIXTURE * invmodel1(trg, src); + tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; + } + tp *= uniform_trg_alignment; // draw a_i ~uniform + p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p2.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + + static const prob_t kHALF(0.5); + return (p1 + p2) * kHALF; +} + +JumpBase::JumpBase() : p(200) { + for (unsigned src_len = 1; src_len < 200; ++src_len) { + map& cpd = p[src_len]; + int min_jump = 1 - src_len; + int max_jump = src_len; + prob_t z; + for (int j = min_jump; j <= max_jump; ++j) { + prob_t& cp = cpd[j]; + if (j < 0) + cp.logeq(log_poisson(1.5-j, 1)); + else if (j > 0) + cp.logeq(log_poisson(j, 1)); + cp.poweq(0.2); + z += cp; + } + for (int j = min_jump; j <= max_jump; ++j) { + cpd[j] /= z; + } + } +} + diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h new file mode 100644 index 00000000..a23ac32b --- /dev/null +++ b/gi/pf/base_distributions.h @@ -0,0 +1,261 @@ +#ifndef _BASE_MEASURES_H_ +#define _BASE_MEASURES_H_ + +#include +#include +#include +#include +#include +#include + +#include "unigrams.h" +#include "trule.h" +#include "prob.h" +#include "tdict.h" +#include "sampler.h" + +inline double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +inline double log_binom_coeff(unsigned n, unsigned k) { + assert(n >= k); + if (n == k) return 0.0; + return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); +} + +// http://en.wikipedia.org/wiki/Negative_binomial_distribution +inline double log_negative_binom(unsigned x, unsigned r, double p) { + assert(p > 0.0); + assert(p < 1.0); + return log_binom_coeff(x + r - 1, x) + r * log(1 - p) + x * log(p); +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +struct Model1 { + explicit Model1(const std::string& fname) : + kNULL(TD::Convert("")), + kZERO() { + LoadModel1(fname); + } + + void LoadModel1(const std::string& fname); + + // returns prob 0 if src or trg is not found + const prob_t& operator()(WordID src, WordID trg) const { + if (src == 0) src = kNULL; + if (src < ttable.size()) { + const std::map& cpd = ttable[src]; + const std::map::const_iterator it = cpd.find(trg); + if (it != cpd.end()) + return it->second; + } + return kZERO; + } + + const WordID kNULL; + const prob_t kZERO; + std::vector > ttable; +}; + +struct PoissonUniformUninformativeBase { + explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); + prob_t q = kUNIFORM; q.poweq(r.e_.size()); + p *= q; + return p; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct CompletelyUniformBase { + explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule&) const { + return kUNIFORM; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct UnigramWordBase { + explicit UnigramWordBase(const std::string& fname) : un(fname) {} + prob_t operator()(const TRule& r) const { + return un(r.e_); + } + const UnigramWordModel un; +}; + +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map::const_iterator it = table.find(rule); + if (it == table.end()) { + std::cerr << rule << " not found\n"; + abort(); + } + return it->second; + } + + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + void Summary() const {} + + std::tr1::unordered_map table; +}; + +struct PhraseConditionalUninformativeBase { + explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { + explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const UnigramModel u; +}; + +struct PhraseConditionalBase { + explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase { + explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_SOURCE(1.0 / vocab_f_size), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ , rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_SOURCE; + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase_BiDir { + explicit PhraseJointBase_BiDir(const Model1& m1, + const Model1& im1, + const double m1mixture, + const unsigned vocab_e_size, + const unsigned vocab_f_size) : + model1(m1), + invmodel1(im1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_SOURCE(1.0 / vocab_f_size), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ , rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const Model1& invmodel1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_SOURCE; + const prob_t kUNIFORM_TARGET; +}; + +// base distribution for jump size multinomials +// basically p(0) = 0 and then, p(1) is max, and then +// you drop as you move to the max jump distance +struct JumpBase { + JumpBase(); + + const prob_t& operator()(int jump, unsigned src_len) const { + assert(jump != 0); + const std::map::const_iterator it = p[src_len].find(jump); + assert(it != p[src_len].end()); + return it->second; + } + std::vector > p; +}; + + +#endif diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc deleted file mode 100644 index 4b1863fa..00000000 --- a/gi/pf/base_measures.cc +++ /dev/null @@ -1,241 +0,0 @@ -#include "base_measures.h" - -#include - -#include "filelib.h" - -using namespace std; - -TableLookupBase::TableLookupBase(const string& fname) { - cerr << "TableLookupBase reading from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - vector le, lf; - TRule x; - x.lhs_ = -TD::Convert("X"); - bool flag = false; - while(getline(in, line)) { - ++lc; - if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } - else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } - tmp.clear(); - TD::ConvertSentence(line, &tmp); - x.f_.clear(); - x.e_.clear(); - size_t pos = 0; - int cc = 0; - while(pos < tmp.size()) { - const WordID cur = tmp[pos++]; - if (cur == kDIV) { - ++cc; - } else if (cc == 0) { - x.f_.push_back(cur); - } else if (cc == 1) { - x.e_.push_back(cur); - } else if (cc == 2) { - table[x].logeq(atof(TD::Convert(cur))); - ++cc; - } else { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (cc != 3) { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (flag) cerr << endl; - cerr << " read " << lc << " entries\n"; -} - -prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform - return p; -} - -prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= kUNIFORM_TARGET; // draw e_i ~Uniform - return p; -} - -void Model1::LoadModel1(const string& fname) { - cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - int cur = 0; - int start = 0; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - const WordID src = TD::Convert(&line[0]); - ++cur; - start = cur; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - WordID trg = TD::Convert(&line[start]); - const double logprob = strtod(&line[cur + 1], NULL); - if (src >= ttable.size()) ttable.resize(src + 1); - ttable[src][trg].logeq(logprob); - } - cerr << " read " << lc << " parameters.\n"; -} - -prob_t PhraseConditionalBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) { // for each position i in e-RHS - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); - p *= ptrglen; - p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); - - prob_t p1; - p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); - p1 *= ptrglen; - p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p1.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - prob_t p2; - p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) - // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); - p2 *= psrclen; - p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform - for (int i = 0; i < flen; ++i) { // for each position i in E - const WordID src = vsrc[i + start_src]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < elen; ++j) { - const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; - tp += kM1MIXTURE * invmodel1(trg, src); - tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; - } - tp *= uniform_trg_alignment; // draw a_i ~uniform - p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p2.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - static const prob_t kHALF(0.5); - return (p1 + p2) * kHALF; -} - -JumpBase::JumpBase() : p(200) { - for (unsigned src_len = 1; src_len < 200; ++src_len) { - map& cpd = p[src_len]; - int min_jump = 1 - src_len; - int max_jump = src_len; - prob_t z; - for (int j = min_jump; j <= max_jump; ++j) { - prob_t& cp = cpd[j]; - if (j < 0) - cp.logeq(log_poisson(1.5-j, 1)); - else if (j > 0) - cp.logeq(log_poisson(j, 1)); - cp.poweq(0.2); - z += cp; - } - for (int j = min_jump; j <= max_jump; ++j) { - cpd[j] /= z; - } - } -} - diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h deleted file mode 100644 index b0495bfd..00000000 --- a/gi/pf/base_measures.h +++ /dev/null @@ -1,247 +0,0 @@ -#ifndef _BASE_MEASURES_H_ -#define _BASE_MEASURES_H_ - -#include -#include -#include -#include -#include - -#include "unigrams.h" -#include "trule.h" -#include "prob.h" -#include "tdict.h" -#include "sampler.h" - -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? "" : " ") << TD::Convert(p[i]); - return os << ']'; -} - -struct Model1 { - explicit Model1(const std::string& fname) : - kNULL(TD::Convert("")), - kZERO() { - LoadModel1(fname); - } - - void LoadModel1(const std::string& fname); - - // returns prob 0 if src or trg is not found - const prob_t& operator()(WordID src, WordID trg) const { - if (src == 0) src = kNULL; - if (src < ttable.size()) { - const std::map& cpd = ttable[src]; - const std::map::const_iterator it = cpd.find(trg); - if (it != cpd.end()) - return it->second; - } - return kZERO; - } - - const WordID kNULL; - const prob_t kZERO; - std::vector > ttable; -}; - -struct PoissonUniformUninformativeBase { - explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); - prob_t q = kUNIFORM; q.poweq(r.e_.size()); - p *= q; - return p; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct CompletelyUniformBase { - explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule&) const { - return kUNIFORM; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct UnigramWordBase { - explicit UnigramWordBase(const std::string& fname) : un(fname) {} - prob_t operator()(const TRule& r) const { - return un(r.e_); - } - const UnigramWordModel un; -}; - -struct RuleHasher { - size_t operator()(const TRule& r) const { - return hash_value(r); - } -}; - -struct TableLookupBase { - TableLookupBase(const std::string& fname); - - prob_t operator()(const TRule& rule) const { - const std::tr1::unordered_map::const_iterator it = table.find(rule); - if (it == table.end()) { - std::cerr << rule << " not found\n"; - abort(); - } - return it->second; - } - - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - void Summary() const {} - - std::tr1::unordered_map table; -}; - -struct PhraseConditionalUninformativeBase { - explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseConditionalUninformativeUnigramBase { - explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const UnigramModel u; -}; - -struct PhraseConditionalBase { - explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase { - explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase_BiDir { - explicit PhraseJointBase_BiDir(const Model1& m1, - const Model1& im1, - const double m1mixture, - const unsigned vocab_e_size, - const unsigned vocab_f_size) : - model1(m1), - invmodel1(im1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const Model1& invmodel1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -// base distribution for jump size multinomials -// basically p(0) = 0 and then, p(1) is max, and then -// you drop as you move to the max jump distance -struct JumpBase { - JumpBase(); - - const prob_t& operator()(int jump, unsigned src_len) const { - assert(jump != 0); - const std::map::const_iterator it = p[src_len].find(jump); - assert(it != p[src_len].end()); - return it->second; - } - std::vector > p; -}; - - -#endif diff --git a/mteval/ns.cc b/mteval/ns.cc index 68c8deaa..da678b84 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -136,6 +136,10 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { float* correct, // N elements reserved float* hyp, // N elements reserved bool clip_counts = true) const { + // clear clipping stats + for (typename NGramCountMap::iterator it = ngrams_.begin(); it != ngrams_.end(); ++it) + it->second.second = 0; + vector ngram(N); *correct *= 0; *hyp *= 0; diff --git a/mteval/ns.h b/mteval/ns.h index 622265db..d88c263b 100644 --- a/mteval/ns.h +++ b/mteval/ns.h @@ -6,6 +6,7 @@ #include #include #include "wordid.h" +#include class SufficientStats { public: @@ -43,6 +44,11 @@ class SufficientStats { bool operator==(const SufficientStats& other) const { return other.fields == fields; } + bool IsAdditiveIdentity() const { + for (unsigned i = 0; i < fields.size(); ++i) + if (fields[i]) return false; + return true; + } size_t size() const { return fields.size(); } float operator[](size_t i) const { if (i < fields.size()) return fields[i]; @@ -54,12 +60,12 @@ class SufficientStats { std::vector fields; }; -inline const SufficientStats& operator+(const SufficientStats& a, const SufficientStats& b) { +inline const SufficientStats operator+(const SufficientStats& a, const SufficientStats& b) { SufficientStats res(a); return res += b; } -inline const SufficientStats& operator-(const SufficientStats& a, const SufficientStats& b) { +inline const SufficientStats operator-(const SufficientStats& a, const SufficientStats& b) { SufficientStats res(a); return res -= b; } diff --git a/vest/ces.cc b/vest/ces.cc index 4ae6b695..cd89aa69 100644 --- a/vest/ces.cc +++ b/vest/ces.cc @@ -4,25 +4,32 @@ #include #include -#include "aligner.h" +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" #include "lattice.h" #include "viterbi_envelope.h" #include "error_surface.h" +#include "ns.h" using boost::shared_ptr; using namespace std; const bool minimize_segments = true; // if adjacent segments have equal scores, merge them -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) { +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ViterbiEnvelope& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { vector prev_trans; const vector >& ienv = ve.GetSortedSegs(); env->resize(ienv.size()); - ScoreP prev_score; + SufficientStats prev_score; // defaults to 0 int j = 0; for (int i = 0; i < ienv.size(); ++i) { const Segment& seg = *ienv[i]; vector trans; +#if 0 if (type == AER) { vector edges(hg.edges_.size(), false); seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi @@ -46,34 +53,31 @@ void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, Er string tstr = os.str(); TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); } else { +#endif seg.ConstructTranslation(&trans); - } - // cerr << "Scoring: " << TD::GetString(trans) << endl; + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; if (trans == prev_trans) { if (!minimize_segments) { - assert(prev_score); // if this fails, it means - // the decoder can generate null translations ErrorSegment& out = (*env)[j]; - out.delta = prev_score->GetZero(); + out.delta.fields.clear(); out.x = seg.x; ++j; } - // cerr << "Identical translation, skipping scoring\n"; + //cerr << "Identical translation, skipping scoring\n"; } else { - ScoreP score = ss.ScoreCandidate(trans); + SufficientStats score; + ss.Evaluate(trans, &score); // cerr << "score= " << score->ComputeScore() << "\n"; - ScoreP cur_delta_p = score->GetZero(); - Score* cur_delta = cur_delta_p.get(); - // just record the score diffs - if (!prev_score) - prev_score = score->GetZero(); - - score->Subtract(*prev_score, cur_delta); + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; prev_trans.swap(trans); prev_score = score; - if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { ErrorSegment& out = (*env)[j]; - out.delta = cur_delta_p; + out.delta = delta; out.x = seg.x; ++j; } diff --git a/vest/ces.h b/vest/ces.h index 2f098990..e021e715 100644 --- a/vest/ces.h +++ b/vest/ces.h @@ -1,12 +1,16 @@ #ifndef _CES_H_ #define _CES_H_ -#include "scorer.h" - class ViterbiEnvelope; class Hypergraph; +class SegmentEvaluator; class ErrorSurface; +class EvaluationMetric; -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg); +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ViterbiEnvelope& ve, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); #endif diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index c382a972..8cde748b 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -364,7 +364,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; if ($use_make) { my $script_file = "$dir/scripts/map.$shard"; open F, ">$script_file" or die "Can't write $script_file: $!"; @@ -424,7 +424,7 @@ while (1){ print STDERR "Results for $tol/$til lines\n"; print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; print STDERR unchecked_output("date"); - $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; print STDERR "COMMAND:\n$cmd\n"; check_bash_call($cmd); $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; diff --git a/vest/error_surface.cc b/vest/error_surface.cc index 754aa8de..515b67f8 100644 --- a/vest/error_surface.cc +++ b/vest/error_surface.cc @@ -5,8 +5,7 @@ using namespace std; -ErrorSurface::~ErrorSurface() { -} +ErrorSurface::~ErrorSurface() {} void ErrorSurface::Serialize(std::string* out) const { const int segments = this->size(); @@ -15,8 +14,8 @@ void ErrorSurface::Serialize(std::string* out) const { for (int i = 0; i < segments; ++i) { const ErrorSegment& cur = (*this)[i]; string senc; - cur.delta->Encode(&senc); - assert(senc.size() < 256); + cur.delta.Encode(&senc); + assert(senc.size() < 1024); unsigned char len = senc.size(); os.write((const char*)&cur.x, sizeof(cur.x)); os.write((const char*)&len, sizeof(len)); @@ -25,7 +24,7 @@ void ErrorSurface::Serialize(std::string* out) const { *out = os.str(); } -void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { +void ErrorSurface::Deserialize(const std::string& in) { istringstream is(in, ios::binary); int segments; is.read((char*)&segments, sizeof(segments)); @@ -37,7 +36,7 @@ void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { is.read((char*)&len, sizeof(len)); string senc(len, '\0'); assert(senc.size() == len); is.read((char*)&senc[0], len); - cur.delta = SentenceScorer::CreateScoreFromString(type, senc); + cur.delta = SufficientStats(senc); } } diff --git a/vest/error_surface.h b/vest/error_surface.h index ad728cfa..bb65847b 100644 --- a/vest/error_surface.h +++ b/vest/error_surface.h @@ -4,13 +4,13 @@ #include #include -#include "scorer.h" +#include "ns.h" class Score; struct ErrorSegment { double x; - ScoreP delta; + SufficientStats delta; ErrorSegment() : x(0), delta() {} }; @@ -18,7 +18,7 @@ class ErrorSurface : public std::vector { public: ~ErrorSurface(); void Serialize(std::string* out) const; - void Deserialize(ScoreType type, const std::string& in); + void Deserialize(const std::string& in); }; #endif diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc index 7303df8d..49443fbe 100644 --- a/vest/line_optimizer.cc +++ b/vest/line_optimizer.cc @@ -4,7 +4,7 @@ #include #include "sparse_vector.h" -#include "scorer.h" +#include "ns.h" using namespace std; @@ -18,6 +18,7 @@ struct IntervalComp { }; double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, const vector& surfaces, const LineOptimizer::ScoreType type, float* best_score, @@ -32,8 +33,7 @@ double LineOptimizer::LineOptimize( } sort(all_ints.begin(), all_ints.end(), IntervalComp()); double last_boundary = all_ints.front()->x; - ScoreP accp = all_ints.front()->delta->GetZero(); - Score *acc=accp.get(); + SufficientStats acc; float& cur_best_score = *best_score; cur_best_score = (type == MAXIMIZE_SCORE ? -numeric_limits::max() : numeric_limits::max()); @@ -42,9 +42,8 @@ double LineOptimizer::LineOptimize( for (vector::iterator i = all_ints.begin(); i != all_ints.end(); ++i) { const ErrorSegment& seg = **i; - assert(seg.delta); if (seg.x - last_boundary > epsilon) { - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -54,16 +53,18 @@ double LineOptimizer::LineOptimize( } else { pos = last_boundary + (seg.x - last_boundary) / 2; } - // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; } - // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx; + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; // cerr << "---- s=" << sco << "\n"; last_boundary = seg.x; } // cerr << "x-boundary=" << seg.x << "\n"; - acc->PlusEquals(*seg.delta); + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; } - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -107,3 +108,4 @@ void LineOptimizer::CreateOptimizationDirections( RandomUnitVector(features_to_optimize, &out[i], rng); cerr << "Generated " << out.size() << " total axes to optimize along.\n"; } + diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h index 99a591f4..83819f41 100644 --- a/vest/line_optimizer.h +++ b/vest/line_optimizer.h @@ -7,6 +7,7 @@ #include "error_surface.h" #include "sampler.h" +class EvaluationMetric; class Weights; struct LineOptimizer { @@ -18,6 +19,7 @@ struct LineOptimizer { // merge all the error surfaces together into a global // error surface and find (the middle of) the best segment static double LineOptimize( + const EvaluationMetric* metric, const std::vector& envs, const LineOptimizer::ScoreType type, float* best_score, diff --git a/vest/lo_test.cc b/vest/lo_test.cc index f5638600..a67f65e1 100644 --- a/vest/lo_test.cc +++ b/vest/lo_test.cc @@ -5,6 +5,8 @@ #include #include +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "fdict.h" #include "hg.h" @@ -15,7 +17,6 @@ #include "viterbi.h" #include "viterbi_envelope.h" #include "line_optimizer.h" -#include "scorer.h" using namespace std; using boost::shared_ptr; @@ -141,9 +142,6 @@ TEST_F(OptTest, TestS1) { TD::ConvertSentence(ref22, &refs2[1]); TD::ConvertSentence(ref32, &refs2[2]); TD::ConvertSentence(ref42, &refs2[3]); - ScoreType type = ScoreTypeFromString("ibm_bleu"); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1); - ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2); vector envs(2); RandomNumberGenerator rng; @@ -167,14 +165,17 @@ TEST_F(OptTest, TestS1) { envs[1] = Inside(hg2, NULL, wf); vector es(2); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); - ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); cerr << envs[0].size() << " " << envs[1].size() << endl; cerr << es[0].size() << " " << es[1].size() << endl; envs.clear(); clock_t t_env=clock(); float score; - double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score); + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); clock_t t_opt=clock(); cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; EXPECT_FLOAT_EQ(0.48719698, score); @@ -217,15 +218,15 @@ TEST_F(OptTest,TestZeroOrigin) { vector envs(1); envs[0] = Inside(hg, NULL, wf); - ScoreType type = ScoreTypeFromString("ibm_bleu"); vector > mr(4); TD::ConvertSentence("untitled", &mr[0]); TD::ConvertSentence("with no title", &mr[1]); TD::ConvertSentence("without a title", &mr[2]); TD::ConvertSentence("without title", &mr[3]); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); vector es(1); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); } int main(int argc, char **argv) { diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index 71dda6d7..8f6e085d 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -6,11 +6,12 @@ #include #include +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "filelib.h" #include "stringlib.h" #include "sparse_vector.h" -#include "scorer.h" #include "viterbi_envelope.h" #include "inside_outside.h" #include "error_surface.h" @@ -25,7 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") ("source,s",po::value(), "Source file (ignored, except for AER)") - ("loss_function,l",po::value()->default_value("ibm_bleu"), "Loss function being optimized") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") ("help,h", "Help"); po::options_description dcmdline_options; @@ -67,10 +68,10 @@ bool ReadSparseVectorString(const string& s, SparseVector* v) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as >(), conf["source"].as()); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; Hypergraph hg; string last_file; ReadFile in_read(conf["input"].as()); @@ -97,7 +98,8 @@ int main(int argc, char** argv) { ViterbiEnvelopeWeightFunction wf(origin, axis); ViterbiEnvelope ve = Inside(hg, NULL, wf); ErrorSurface es; - ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); + + ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg); //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; // cerr << "Error surface has " << es.size() << " segments\n"; string val; diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc index 3df52020..dda61f88 100644 --- a/vest/mr_vest_reduce.cc +++ b/vest/mr_vest_reduce.cc @@ -10,6 +10,7 @@ #include "error_surface.h" #include "line_optimizer.h" #include "b64tools.h" +#include "stringlib.h" using namespace std; namespace po = boost::program_options; @@ -17,12 +18,12 @@ namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("loss_function,l",po::value(), "Loss function being optimized") + ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = conf->count("loss_function") == 0; + bool flag = conf->count("evaluation_metric") == 0; if (flag || conf->count("help")) { cerr << dcmdline_options << endl; exit(1); @@ -32,30 +33,27 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); + const string evaluation_metric = conf["evaluation_metric"].as(); LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (type == TER || type == AER) { + if (UppercaseString(evaluation_metric) == "TER") opt_type = LineOptimizer::MINIMIZE_SCORE; - } - string last_key; + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + vector esv; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; + string last_key, line, key, val; + while(getline(cin, line)) { size_t ks = line.find("\t"); assert(string::npos != ks); assert(ks > 2); - string key = line.substr(2, ks - 2); - string val = line.substr(ks + 1); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); if (key != last_key) { if (!last_key.empty()) { float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } - last_key = key; + last_key.swap(key); esv.clear(); } if (val.size() % 4 != 0) { @@ -68,13 +66,11 @@ int main(int argc, char** argv) { continue; } esv.push_back(ErrorSurface()); - esv.back().Deserialize(type, encoded); + esv.back().Deserialize(encoded); } if (!esv.empty()) { - // cerr << "ESV=" << esv.size() << endl; - // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; } float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } return 0; -- cgit v1.2.3 From 3d17bf9ae1ba67cd091794839d4d5f4c393a0e2c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 13:19:27 -0500 Subject: migration to new metric api for vest, clean up of unsupported/not functional code --- mteval/mbr_kbest.cc | 21 +- utils/fast_sparse_vector.h | 6 + vest/dist-vest.pl | 22 +-- vest/mbr_kbest.cc | 138 ------------- vest/mr_vest_generate_mapper_input.cc | 356 ++++++---------------------------- vest/mr_vest_map.cc | 16 +- 6 files changed, 84 insertions(+), 475 deletions(-) delete mode 100644 vest/mbr_kbest.cc (limited to 'vest') diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index 64a6a8bf..b5e4750c 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -5,7 +5,7 @@ #include "prob.h" #include "tdict.h" -#include "scorer.h" +#include "ns.h" #include "filelib.h" #include "stringlib.h" @@ -17,7 +17,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("scale,a",po::value()->default_value(1.0), "Posterior scaling factor (alpha)") - ("loss_function,l",po::value()->default_value("bleu"), "Loss function") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric") ("input,i",po::value()->default_value("-"), "File to read k-best lists from") ("output_list,L", "Show reranked list as output") ("help,h", "Help"); @@ -75,13 +75,14 @@ bool ReadKBestList(istream* in, string* sent_id, vector, pro int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string metric = conf["loss_function"].as(); + const string smetric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(smetric); + const bool is_loss = (UppercaseString(smetric) == "TER"); const bool output_list = conf.count("output_list") > 0; const string file = conf["input"].as(); const double mbr_scale = conf["scale"].as(); cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; - ScoreType type = ScoreTypeFromString(metric); vector, prob_t> > list; ReadFile rf(file); string sent_id; @@ -99,15 +100,15 @@ int main(int argc, char** argv) { vector mbr_scores(output_list ? list.size() : 0); double mbr_loss = numeric_limits::max(); for (int i = 0 ; i < list.size(); ++i) { - vector > refs(1, list[i].first); - //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; - ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); + const vector > refs(1, list[i].first); + double wl_acc = 0; for (int j = 0; j < list.size(); ++j) { if (i != j) { - ScoreP s = scorer->ScoreCandidate(list[j].first); - double loss = 1.0 - s->ComputeScore(); - if (type == TER || type == AER) loss = 1.0 - loss; + SufficientStats ss; + metric->ComputeSufficientStatistics(list[j].first, refs, &ss); + double loss = 1.0 - metric->ComputeScore(ss); + if (is_loss) loss = 1.0 - loss; double weighted_loss = loss * (joints[j] / marginal).as_float(); wl_acc += weighted_loss; if ((!output_list) && wl_acc > mbr_loss) break; diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 1301581a..17fa47bf 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -178,6 +178,12 @@ class FastSparseVector { T l2norm() const { return sqrt(l2norm_sq()); } + T pnorm(const double p) const { + T sum = T(); + for (const_iterator it = begin(), e = end(); it != e; ++it) + sum += pow(fabs(it->second), p); + return pow(sum, 1.0 / p); + } // if values are binary, gives |A intersect B|/|A union B| template S tanimoto_coef(const FastSparseVector &vec) const { diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 8cde748b..1ec8c6b1 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -65,8 +65,6 @@ my $oraclen=0; my $oracleb=20; my $bleu_weight=1; my $use_make = 1; # use make to parallelize line search -my $dirargs=''; -my $density_prune; my $useqsub; my $pass_suffix = ''; my $cpbin=1; @@ -75,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( "decoder=s" => \$decoderOpt, "jobs=i" => \$jobs, - "density-prune=f" => \$density_prune, "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, "dry-run" => \$dryrun, @@ -87,15 +84,7 @@ if (GetOptions( "normalize=s" => \$normalize, "pmem=s" => \$pmem, "cpbin!" => \$cpbin, - "rand-directions=i" => \$rand_directions, - "random_directions=i" => \$rand_directions, - "bleu_weight=s" => \$bleu_weight, - "no-primary!" => \$noprimary, - "max-similarity=s" => \$maxsim, - "oracle-directions=i" => \$oraclen, - "n-oracle=i" => \$oraclen, - "oracle-batch=i" => \$oracleb, - "directions-args=s" => \$dirargs, + "random-directions=i" => \$rand_directions, "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, @@ -107,10 +96,6 @@ if (GetOptions( exit; } -if (defined $density_prune) { - die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0; -} - if ($useqsub) { $use_make = 0; die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); @@ -328,10 +313,7 @@ while (1){ print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; print STDERR unchecked_output("date"); $icc++; - my $nop=$noprimary?"--no_primary":""; - my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):""; - my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":""; - $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter"; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; print STDERR "COMMAND:\n$cmd\n"; check_call($cmd); check_call("mkdir -p $dir/splag.$im1"); diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc deleted file mode 100644 index 2867b36b..00000000 --- a/vest/mbr_kbest.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include -#include - -#include - -#include "prob.h" -#include "tdict.h" -#include "scorer.h" -#include "filelib.h" -#include "stringlib.h" - -using namespace std; - -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("scale,a",po::value()->default_value(1.0), "Posterior scaling factor (alpha)") - ("loss_function,l",po::value()->default_value("bleu"), "Loss function") - ("input,i",po::value()->default_value("-"), "File to read k-best lists from") - ("output_list,L", "Show reranked list as output") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct LossComparer { - bool operator()(const pair, double>& a, const pair, double>& b) const { - return a.second < b.second; - } -}; - -bool ReadKBestList(istream* in, string* sent_id, vector, prob_t> >* list) { - static string cache_id; - static pair, prob_t> cache_pair; - list->clear(); - string cur_id; - if (cache_pair.first.size() > 0) { - list->push_back(cache_pair); - cur_id = cache_id; - cache_pair.first.clear(); - } - string line; - string tstr; - while(*in) { - getline(*in, line); - if (line.empty()) continue; - size_t p1 = line.find(" ||| "); - if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p2 = line.find(" ||| ", p1 + 4); - if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p3 = line.rfind(" ||| "); - cache_id = line.substr(0, p1); - tstr = line.substr(p1 + 5, p2 - p1 - 5); - double val = strtod(line.substr(p3 + 5).c_str(), NULL); - TD::ConvertSentence(tstr, &cache_pair.first); - cache_pair.second.logeq(val); - if (cur_id.empty()) cur_id = cache_id; - if (cur_id == cache_id) { - list->push_back(cache_pair); - *sent_id = cur_id; - cache_pair.first.clear(); - } else { break; } - } - return !list->empty(); -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string metric = conf["loss_function"].as(); - const bool output_list = conf.count("output_list") > 0; - const string file = conf["input"].as(); - const double mbr_scale = conf["scale"].as(); - cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; - - ScoreType type = ScoreTypeFromString(metric); - vector, prob_t> > list; - ReadFile rf(file); - string sent_id; - while(ReadKBestList(rf.stream(), &sent_id, &list)) { - vector joints(list.size()); - const prob_t max_score = pow(list.front().second, mbr_scale); - prob_t marginal = prob_t::Zero(); - for (int i = 0 ; i < list.size(); ++i) { - const prob_t joint = pow(list[i].second, mbr_scale) / max_score; - joints[i] = joint; - // cerr << "list[" << i << "] joint=" << log(joint) << endl; - marginal += joint; - } - int mbr_idx = -1; - vector mbr_scores(output_list ? list.size() : 0); - double mbr_loss = numeric_limits::max(); - for (int i = 0 ; i < list.size(); ++i) { - vector > refs(1, list[i].first); - //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; - ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); - double wl_acc = 0; - for (int j = 0; j < list.size(); ++j) { - if (i != j) { - ScoreP s = scorer->ScoreCandidate(list[j].first); - double loss = 1.0 - s->ComputeScore(); - if (type == TER || type == AER) loss = 1.0 - loss; - double weighted_loss = loss * (joints[j] / marginal); - wl_acc += weighted_loss; - if ((!output_list) && wl_acc > mbr_loss) break; - } - } - if (output_list) mbr_scores[i] = wl_acc; - if (wl_acc < mbr_loss) { - mbr_loss = wl_acc; - mbr_idx = i; - } - } - // cerr << "ML translation: " << TD::GetString(list[0].first) << endl; - cerr << "MBR Best idx: " << mbr_idx << endl; - if (output_list) { - for (int i = 0; i < list.size(); ++i) - list[i].second.logeq(mbr_scores[i]); - sort(list.begin(), list.end(), LossComparer()); - for (int i = 0; i < list.size(); ++i) - cout << sent_id << " ||| " - << TD::GetString(list[i].first) << " ||| " - << log(list[i].second) << endl; - } else { - cout << TD::GetString(list[mbr_idx].first) << endl; - } - } - return 0; -} - diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index 0c094fd5..59d4f24f 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -1,320 +1,78 @@ -//TODO: debug segfault when references supplied, null shared_ptr when oracle #include #include -#include #include #include -#include "sampler.h" #include "filelib.h" #include "weights.h" #include "line_optimizer.h" -#include "hg.h" -#include "hg_io.h" -#include "scorer.h" -#include "oracle_bleu.h" -#include "ff_bleu.h" - -const bool DEBUG_ORACLE=true; - -//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle -//void register_feature_functions(); -//FFRegistry ff_registry; -namespace { -void init_bleumodel() { - ff_registry.clear(); - ff_registry.Register(new FFFactory); -} - -struct init_ff { - init_ff() { - init_bleumodel(); - } -}; -//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead. -} using namespace std; namespace po = boost::program_options; -typedef SparseVector Dir; -typedef Dir Point; - -void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) { - // return; //TODO: debug - if (min_dist<=0) return; - double max_s=1.-min_dist; - if (log&&verbose) *log<<"max allowed S="< "<add_options() - ("dev_set_size,s",po::value(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value(&forest_repository),"[REQD] Path to forest repository") - ("weights,w",po::value(&weights_file),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value(&n_random)->default_value(10),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") - ("oracle_batch,b",po::value(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") - ("no_old_to_hope","don't emit the usual old -> hope oracle") - ("decoder_translations",po::value(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU") - ; - } - void InitCommandLine(int argc, char *argv[], po::variables_map *conf) { - po::options_description opts("Configuration options"); - AddOptions(&opts); - opts.add_options()("help,h", "Help"); - - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -s N\n"; - goto bad_cmdline; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w \n"; - goto bad_cmdline; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r \n"; - goto bad_cmdline; - } - if (n_oracle && oracle.refs.empty()) { - cerr<<"Specify references when using oracle directions\n"; - goto bad_cmdline; - } - if (conf->count("help")) { - cout << dcmdline_options << endl; - exit(0); - } - - return; - bad_cmdline: - cerr << dcmdline_options << endl; - exit(1); +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; } - - int main(int argc, char *argv[]) { - po::variables_map conf; - InitCommandLine(argc,argv,&conf); - init_bleumodel(); - UseConf(conf); - Run(); - return 0; + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; } - bool verbose() const { return oracle.verbose; } - void Run() { -// register_feature_functions(); - AddPrimaryAndRandomDirections(); - AddOracleDirections(); - compress_similar(directions,max_similarity,&cerr,true,verbose()); - Print(); + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; } - - - Point origin; // old weights that gave model 1best. - vector optimize_features; - void UseConf(po::variables_map const& conf) { - oracle.UseConf(conf); - include_primary=!conf.count("no_primary"); - old_to_hope=!conf.count("no_old_to_hope"); - - if (conf.count("optimize_feature") > 0) - optimize_features=conf["optimize_feature"].as >(); - Init(); + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); } +} - string weights_file; - double max_similarity; - unsigned n_oracle, oracle_batch; - string forest_repository; - unsigned dev_set_size; - vector oracles; - vector fids; - string forest_file(unsigned i) const { - ostringstream o; - o << forest_repository << '/' << i << ".json.gz"; - return o.str(); - } - - oracle_directions() { } - - Sentences model_hyps; - - vector model_scores; - bool have_doc; - void Init() { - have_doc=!decoder_translations_file.empty(); - if (have_doc) { - model_hyps.Load(decoder_translations_file); - if (verbose()) model_hyps.Print(cerr,5); - model_scores.resize(model_hyps.size()); - if (dev_set_size!=model_hyps.size()) { - cerr<<"You supplied decoder_translations with a different number of lines ("<ScoreCandidate(model_hyps[i]); - assert(model_scores[i]); - if (verbose()) cerr<<"Before model["<ScoreDetails()<PlusEquals(*model_scores[i]); - if (verbose()) cerr<<"After model["< features; - vector dorigin; - Weights::InitFromFile(weights_file, &dorigin, &features); - if (optimize_features.size()) - features=optimize_features; - Weights::InitSparseVector(dorigin, &origin); - fids.clear(); - AddFeatureIds(features); - oracles.resize(dev_set_size); - } - - void AddFeatureIds(vector const& features) { - int i = fids.size(); - fids.resize(fids.size()+features.size()); - for (; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - } - - - std::string decoder_translations_file; // one per line - //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg - void adjust_doc(unsigned i,double scale=1.) { - oracle.doc_score->PlusEquals(*model_scores[i],scale); - } - - Score &ds() { - return *oracle.doc_score; - } - - Oracle const& ComputeOracle(unsigned i) { - Oracle &o=oracles[i]; - if (o.is_null()) { - if (have_doc) { - if (verbose()) cerr<<"Before removing i="<PlusEquals(*hopesc,1); - cerr<<"With hope: "<PlusEquals(*hopesc,-1); - cerr<<"Without hope: "<ScoreDetails()<=dev_set_size) ? rsg() : b); - - if (old_to_hope) - o2hope+=o.ModelHopeGradient(); - if (fear_to_hope) - fear2hope+=o.FearHopeGradient(); - } - double N=(double)oracle_batch; - if (old_to_hope) { - o2hope/=N; - directions.push_back(o2hope); - } - if (fear_to_hope) { - fear2hope/=N; - directions.push_back(fear2hope); - } +int main(int argc, char** argv) { + RandomNumberGenerator rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector features; + SparseVector origin; + vector w; + Weights::InitFromFile(conf["weights"].as(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as >(); + vector > directions; + vector fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; } } -}; - -int main(int argc, char** argv) { - oracle_directions od; - return od.main(argc,argv); + return 0; } diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index 8f6e085d..7d9625bc 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -82,20 +82,20 @@ int main(int argc, char** argv) { if (line.empty()) continue; istringstream is(line); int sent_id; - string file, s_origin, s_axis; + string file, s_origin, s_direction; // path-to-file (JSON) sent_ed starting-point search-direction - is >> file >> sent_id >> s_origin >> s_axis; + is >> file >> sent_id >> s_origin >> s_direction; SparseVector origin; - assert(ReadSparseVectorString(s_origin, &origin)); - SparseVector axis; - assert(ReadSparseVectorString(s_axis, &axis)); - // cerr << "File: " << file << "\nAxis: " << axis << "\n X: " << origin << endl; + ReadSparseVectorString(s_origin, &origin); + SparseVector direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; if (last_file != file) { last_file = file; ReadFile rf(file); HypergraphIO::ReadFromJSON(rf.stream(), &hg); } - ViterbiEnvelopeWeightFunction wf(origin, axis); + ViterbiEnvelopeWeightFunction wf(origin, direction); ViterbiEnvelope ve = Inside(hg, NULL, wf); ErrorSurface es; @@ -104,7 +104,7 @@ int main(int argc, char** argv) { // cerr << "Error surface has " << es.size() << " segments\n"; string val; es.Serialize(&val); - cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t'; + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; B64::b64encode(val.c_str(), val.size(), &cout); cout << endl << flush; } -- cgit v1.2.3 From 89b662b51373f0f466d62a65d3f0a164d1d31b1c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 14:49:08 -0500 Subject: rename vest to dpmert (dynamic programming mert), rename variables and types to correspond to standard geometric concepts --- Makefile.am | 2 +- configure.ac | 2 +- dpmert/Makefile.am | 35 ++ dpmert/README.shared-mem | 9 + dpmert/cat.pl | 4 + dpmert/ces.cc | 91 ++++ dpmert/ces.h | 16 + dpmert/dpmert.pl | 700 ++++++++++++++++++++++++++++++ dpmert/error_surface.cc | 42 ++ dpmert/error_surface.h | 24 + dpmert/libcall.pl | 71 +++ dpmert/line_mediator.pl | 116 +++++ dpmert/line_optimizer.cc | 111 +++++ dpmert/line_optimizer.h | 48 ++ dpmert/lo_test.cc | 236 ++++++++++ dpmert/mert_geometry.cc | 186 ++++++++ dpmert/mert_geometry.h | 81 ++++ dpmert/mr_dpmert_generate_mapper_input.cc | 78 ++++ dpmert/mr_dpmert_map.cc | 112 +++++ dpmert/mr_dpmert_reduce.cc | 77 ++++ dpmert/parallelize.pl | 423 ++++++++++++++++++ dpmert/sentclient.c | 76 ++++ dpmert/sentserver.c | 515 ++++++++++++++++++++++ dpmert/sentserver.h | 6 + dpmert/tac.pl | 8 + dpmert/test_aer/README | 8 + dpmert/test_aer/cdec.ini | 3 + dpmert/test_aer/corpus.src | 3 + dpmert/test_aer/grammar | 12 + dpmert/test_aer/ref.0 | 3 + dpmert/test_aer/weights | 13 + dpmert/test_data/0.json.gz | Bin 0 -> 13709 bytes dpmert/test_data/1.json.gz | Bin 0 -> 204803 bytes dpmert/test_data/c2e.txt.0 | 2 + dpmert/test_data/c2e.txt.1 | 2 + dpmert/test_data/c2e.txt.2 | 2 + dpmert/test_data/c2e.txt.3 | 2 + dpmert/test_data/re.txt.0 | 5 + dpmert/test_data/re.txt.1 | 5 + dpmert/test_data/re.txt.2 | 5 + dpmert/test_data/re.txt.3 | 5 + vest/Makefile.am | 35 -- vest/README.shared-mem | 9 - vest/cat.pl | 4 - vest/ces.cc | 91 ---- vest/ces.h | 16 - vest/dist-vest.pl | 700 ------------------------------ vest/error_surface.cc | 42 -- vest/error_surface.h | 24 - vest/libcall.pl | 71 --- vest/line_mediator.pl | 116 ----- vest/line_optimizer.cc | 111 ----- vest/line_optimizer.h | 48 -- vest/lo_test.cc | 236 ---------- vest/mr_vest_generate_mapper_input.cc | 78 ---- vest/mr_vest_map.cc | 112 ----- vest/mr_vest_reduce.cc | 77 ---- vest/parallelize.pl | 423 ------------------ vest/sentclient.c | 76 ---- vest/sentserver.c | 515 ---------------------- vest/sentserver.h | 6 - vest/tac.pl | 8 - vest/test_aer/README | 8 - vest/test_aer/cdec.ini | 3 - vest/test_aer/corpus.src | 3 - vest/test_aer/grammar | 12 - vest/test_aer/ref.0 | 3 - vest/test_aer/weights | 13 - vest/test_data/0.json.gz | Bin 13709 -> 0 bytes vest/test_data/1.json.gz | Bin 204803 -> 0 bytes vest/test_data/c2e.txt.0 | 2 - vest/test_data/c2e.txt.1 | 2 - vest/test_data/c2e.txt.2 | 2 - vest/test_data/c2e.txt.3 | 2 - vest/test_data/re.txt.0 | 5 - vest/test_data/re.txt.1 | 5 - vest/test_data/re.txt.2 | 5 - vest/test_data/re.txt.3 | 5 - vest/viterbi_envelope.cc | 177 -------- vest/viterbi_envelope.h | 81 ---- 80 files changed, 3137 insertions(+), 3128 deletions(-) create mode 100644 dpmert/Makefile.am create mode 100644 dpmert/README.shared-mem create mode 100755 dpmert/cat.pl create mode 100644 dpmert/ces.cc create mode 100644 dpmert/ces.h create mode 100755 dpmert/dpmert.pl create mode 100644 dpmert/error_surface.cc create mode 100644 dpmert/error_surface.h create mode 100644 dpmert/libcall.pl create mode 100755 dpmert/line_mediator.pl create mode 100644 dpmert/line_optimizer.cc create mode 100644 dpmert/line_optimizer.h create mode 100644 dpmert/lo_test.cc create mode 100644 dpmert/mert_geometry.cc create mode 100644 dpmert/mert_geometry.h create mode 100644 dpmert/mr_dpmert_generate_mapper_input.cc create mode 100644 dpmert/mr_dpmert_map.cc create mode 100644 dpmert/mr_dpmert_reduce.cc create mode 100755 dpmert/parallelize.pl create mode 100644 dpmert/sentclient.c create mode 100644 dpmert/sentserver.c create mode 100644 dpmert/sentserver.h create mode 100755 dpmert/tac.pl create mode 100644 dpmert/test_aer/README create mode 100644 dpmert/test_aer/cdec.ini create mode 100644 dpmert/test_aer/corpus.src create mode 100644 dpmert/test_aer/grammar create mode 100644 dpmert/test_aer/ref.0 create mode 100644 dpmert/test_aer/weights create mode 100644 dpmert/test_data/0.json.gz create mode 100644 dpmert/test_data/1.json.gz create mode 100644 dpmert/test_data/c2e.txt.0 create mode 100644 dpmert/test_data/c2e.txt.1 create mode 100644 dpmert/test_data/c2e.txt.2 create mode 100644 dpmert/test_data/c2e.txt.3 create mode 100644 dpmert/test_data/re.txt.0 create mode 100644 dpmert/test_data/re.txt.1 create mode 100644 dpmert/test_data/re.txt.2 create mode 100644 dpmert/test_data/re.txt.3 delete mode 100644 vest/Makefile.am delete mode 100644 vest/README.shared-mem delete mode 100755 vest/cat.pl delete mode 100644 vest/ces.cc delete mode 100644 vest/ces.h delete mode 100755 vest/dist-vest.pl delete mode 100644 vest/error_surface.cc delete mode 100644 vest/error_surface.h delete mode 100644 vest/libcall.pl delete mode 100755 vest/line_mediator.pl delete mode 100644 vest/line_optimizer.cc delete mode 100644 vest/line_optimizer.h delete mode 100644 vest/lo_test.cc delete mode 100644 vest/mr_vest_generate_mapper_input.cc delete mode 100644 vest/mr_vest_map.cc delete mode 100644 vest/mr_vest_reduce.cc delete mode 100755 vest/parallelize.pl delete mode 100644 vest/sentclient.c delete mode 100644 vest/sentserver.c delete mode 100644 vest/sentserver.h delete mode 100755 vest/tac.pl delete mode 100644 vest/test_aer/README delete mode 100644 vest/test_aer/cdec.ini delete mode 100644 vest/test_aer/corpus.src delete mode 100644 vest/test_aer/grammar delete mode 100644 vest/test_aer/ref.0 delete mode 100644 vest/test_aer/weights delete mode 100644 vest/test_data/0.json.gz delete mode 100644 vest/test_data/1.json.gz delete mode 100644 vest/test_data/c2e.txt.0 delete mode 100644 vest/test_data/c2e.txt.1 delete mode 100644 vest/test_data/c2e.txt.2 delete mode 100644 vest/test_data/c2e.txt.3 delete mode 100644 vest/test_data/re.txt.0 delete mode 100644 vest/test_data/re.txt.1 delete mode 100644 vest/test_data/re.txt.2 delete mode 100644 vest/test_data/re.txt.3 delete mode 100644 vest/viterbi_envelope.cc delete mode 100644 vest/viterbi_envelope.h (limited to 'vest') diff --git a/Makefile.am b/Makefile.am index 59c2fc0a..c0fcb1f6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,7 @@ # warning - the subdirectories in the following list should # be kept in topologically sorted order. Also, DO NOT introduce # cyclic dependencies between these directories! -SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest pro-train extools gi/pf gi/markov_al +SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train extools gi/pf gi/markov_al #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava diff --git a/configure.ac b/configure.ac index 131a1705..cd78ee72 100644 --- a/configure.ac +++ b/configure.ac @@ -113,4 +113,4 @@ then AM_CONDITIONAL([GLC], true) fi -AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) +AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) diff --git a/dpmert/Makefile.am b/dpmert/Makefile.am new file mode 100644 index 00000000..2676fb50 --- /dev/null +++ b/dpmert/Makefile.am @@ -0,0 +1,35 @@ +bin_PROGRAMS = \ + mr_dpmert_map \ + mr_dpmert_reduce \ + mr_dpmert_generate_mapper_input \ + sentserver \ + sentclient + +if HAVE_GTEST +noinst_PROGRAMS = \ + lo_test +TESTS = lo_test +endif + +sentserver_SOURCES = sentserver.c +sentserver_LDFLAGS = -all-static -pthread + +sentclient_SOURCES = sentclient.c +sentclient_LDFLAGS = -all-static -pthread + +mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc +mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +# nbest2hg_SOURCES = nbest2hg.cc +# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz + +mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc +mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc +mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem new file mode 100644 index 00000000..7728efc0 --- /dev/null +++ b/dpmert/README.shared-mem @@ -0,0 +1,9 @@ +If you want to run dist-vest.pl on a very large shared memory machine, do the +following: + + ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini + +This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the +decoder must load grammars, language models, etc., J should be smaller than I, but this will depend +on the system you are running on and the complexity of the models used for decoding. + diff --git a/dpmert/cat.pl b/dpmert/cat.pl new file mode 100755 index 00000000..2ecba3f9 --- /dev/null +++ b/dpmert/cat.pl @@ -0,0 +1,4 @@ +#!/usr/bin/perl + +$|=1; +print while(<>); diff --git a/dpmert/ces.cc b/dpmert/ces.cc new file mode 100644 index 00000000..a85454da --- /dev/null +++ b/dpmert/ces.cc @@ -0,0 +1,91 @@ +#include "ces.h" + +#include +#include +#include + +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" +#include "lattice.h" +#include "mert_geometry.h" +#include "error_surface.h" +#include "ns.h" + +using boost::shared_ptr; +using namespace std; + +const bool minimize_segments = true; // if adjacent segments have equal scores, merge them + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { + vector prev_trans; + const vector >& ienv = ve.GetSortedSegs(); + env->resize(ienv.size()); + SufficientStats prev_score; // defaults to 0 + int j = 0; + for (int i = 0; i < ienv.size(); ++i) { + const MERTPoint& seg = *ienv[i]; + vector trans; +#if 0 + if (type == AER) { + vector edges(hg.edges_.size(), false); + seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi + // alignment + ostringstream os; + const string* psrc = ss.GetSource(); + if (psrc == NULL) { + cerr << "AER scoring in VEST requires source, but it is missing!\n"; + abort(); + } + size_t pos = psrc->rfind(" ||| "); + if (pos == string::npos) { + cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; + abort(); + } + Lattice src; + Lattice ref; + LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); + LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); + AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); + string tstr = os.str(); + TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); + } else { +#endif + seg.ConstructTranslation(&trans); + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; + if (trans == prev_trans) { + if (!minimize_segments) { + ErrorSegment& out = (*env)[j]; + out.delta.fields.clear(); + out.x = seg.x; + ++j; + } + //cerr << "Identical translation, skipping scoring\n"; + } else { + SufficientStats score; + ss.Evaluate(trans, &score); + // cerr << "score= " << score->ComputeScore() << "\n"; + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; + prev_trans.swap(trans); + prev_score = score; + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { + ErrorSegment& out = (*env)[j]; + out.delta = delta; + out.x = seg.x; + ++j; + } + } + } + // cerr << " In segments: " << ienv.size() << endl; + // cerr << "Out segments: " << j << endl; + assert(j > 0); + env->resize(j); +} + diff --git a/dpmert/ces.h b/dpmert/ces.h new file mode 100644 index 00000000..e4fa2080 --- /dev/null +++ b/dpmert/ces.h @@ -0,0 +1,16 @@ +#ifndef _CES_H_ +#define _CES_H_ + +class ConvexHull; +class Hypergraph; +class SegmentEvaluator; +class ErrorSurface; +class EvaluationMetric; + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& convex_hull, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); + +#endif diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl new file mode 100755 index 00000000..52ce0fc0 --- /dev/null +++ b/dpmert/dpmert.pl @@ -0,0 +1,700 @@ +#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_dpmert_map"; +my $REDUCER = "$bin_dir/mr_dpmert_reduce"; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 400; +my $rand_directions = 15; +my $iteration = 1; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $normalize; +my $help = 0; +my $epsilon = 0.0001; +my $interval = 5; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $decoderOpt; +my $noprimary; +my $maxsim=0; +my $oraclen=0; +my $oracleb=20; +my $bleu_weight=1; +my $use_make = 1; # use make to parallelize line search +my $useqsub; +my $pass_suffix = ''; +my $cpbin=1; +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "decoder=s" => \$decoderOpt, + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "pass-suffix=s" => \$pass_suffix, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "interval" => \$interval, + "qsub" => \$useqsub, + "max-iterations=i" => \$max_iterations, + "normalize=s" => \$normalize, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "random-directions=i" => \$rand_directions, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "weights=s" => \$initialWeights, + "workdir=s" => \$dir, + "opt-iterations=i" => \$optimization_iters, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initialWeights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { + $lines_per_mapper = 2000; # start up time is really high +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "dpmert"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-dpmert.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. + my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + unless (-e $initialWeights) { + print STDERR "Please specify an initial weights file with --initial-weights\n"; + print_help(); + exit; + } + check_call("cp $initialWeights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while() { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print STDERR unchecked_output("date"); + $icc++; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + my @mapoutputs = (); + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "dpmert.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + if ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . $jobid; } + } + } + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print STDERR "Results for $tol/$til lines\n"; + print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; + print STDERR unchecked_output("date"); + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; + print STDERR "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print STDERR "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my $psd = $score - $last_score; + $last_score = $score; + if (abs($psd) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + my $norm = 0; + for my $k (sort keys %ori) { + my $dd = $ori{$k} + $axi{$k} * $x; + $norm += $dd * $dd; + } + $norm = sqrt($norm); + $norm = 1; + for my $k (sort keys %ori) { + my $v = ($ori{$k} + $axi{$k} * $x) / $norm; + print W "$k $v\n"; + } + check_call("rm $dir/splag.$im1/*"); + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + check_call("cp $inweights $lastWeightsFile"); + if ($icc < 2) { + print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub normalize_weights { + my ($rfn, $rpts, $feat) = @_; + my @feat_names = @$rfn; + my @pts = @$rpts; + my $z = 1.0; + for (my $i=0; $i < scalar @feat_names; $i++) { + if ($feat_names[$i] eq $feat) { + $z = $pts[$i]; + last; + } + } + for (my $i=0; $i < scalar @feat_names; $i++) { + $pts[$i] /= $z; + } + print STDERR " NORM WEIGHTS: @pts\n"; + return @pts; +} + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while() { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . join(" -$r ", @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while() { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "START ITERATION: $iteration\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "PARALLEL JOBS: $jobs\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; + print $fh "INITIAL WEIGHTS: $initialWeights\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=){ + chomp $line; + if ($line =~ /^\s* tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "$line\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete MERT optimization using the decoder configuration + in . Required options are --weights, --source-file, and + --ref-files. + +Options: + + --help + Print this message and exit. + + --max-iterations + Maximum number of iterations to run. If not specified, defaults + to 10. + + --pass-suffix + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --ref-files + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --normalize + After each iteration, rescale all feature weights such that feature- + name has a weight of 1.0. + + --rand-directions + MERT will attempt to optimize along all of the principle directions, + set this parameter to explore other directions. Defaults to 5. + + --source-file + Dev set source file. + + --weights + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + **All and only the weights listed in will be optimized!** + + --workdir + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. an ini file named decoder.foo.ini would have + a default working directory name foo. + +Job control options: + + --jobs + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/dpmert/error_surface.cc b/dpmert/error_surface.cc new file mode 100644 index 00000000..515b67f8 --- /dev/null +++ b/dpmert/error_surface.cc @@ -0,0 +1,42 @@ +#include "error_surface.h" + +#include +#include + +using namespace std; + +ErrorSurface::~ErrorSurface() {} + +void ErrorSurface::Serialize(std::string* out) const { + const int segments = this->size(); + ostringstream os(ios::binary); + os.write((const char*)&segments,sizeof(segments)); + for (int i = 0; i < segments; ++i) { + const ErrorSegment& cur = (*this)[i]; + string senc; + cur.delta.Encode(&senc); + assert(senc.size() < 1024); + unsigned char len = senc.size(); + os.write((const char*)&cur.x, sizeof(cur.x)); + os.write((const char*)&len, sizeof(len)); + os.write((const char*)&senc[0], len); + } + *out = os.str(); +} + +void ErrorSurface::Deserialize(const std::string& in) { + istringstream is(in, ios::binary); + int segments; + is.read((char*)&segments, sizeof(segments)); + this->resize(segments); + for (int i = 0; i < segments; ++i) { + ErrorSegment& cur = (*this)[i]; + unsigned char len; + is.read((char*)&cur.x, sizeof(cur.x)); + is.read((char*)&len, sizeof(len)); + string senc(len, '\0'); assert(senc.size() == len); + is.read((char*)&senc[0], len); + cur.delta = SufficientStats(senc); + } +} + diff --git a/dpmert/error_surface.h b/dpmert/error_surface.h new file mode 100644 index 00000000..bb65847b --- /dev/null +++ b/dpmert/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include +#include + +#include "ns.h" + +class Score; + +struct ErrorSegment { + double x; + SufficientStats delta; + ErrorSegment() : x(0), delta() {} +}; + +class ErrorSurface : public std::vector { + public: + ~ErrorSurface(); + void Serialize(std::string* out) const; + void Deserialize(const std::string& in); +}; + +#endif diff --git a/dpmert/libcall.pl b/dpmert/libcall.pl new file mode 100644 index 00000000..c7d0f128 --- /dev/null +++ b/dpmert/libcall.pl @@ -0,0 +1,71 @@ +use IPC::Open3; +use Symbol qw(gensym); + +$DUMMY_STDERR = gensym(); +$DUMMY_STDIN = gensym(); + +# Run the command and ignore failures +sub unchecked_call { + system("@_") +} + +# Run the command and return its output, if any ignoring failures +sub unchecked_output { + return `@_` +} + +# WARNING: Do not use this for commands that will return large amounts +# of stdout or stderr -- they might block indefinitely +sub check_output { + print STDERR "Executing and gathering output: @_\n"; + + my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); + my $proc_output = ""; + while( ) { + $proc_output .= $_; + } + waitpid($pid, 0); + # TODO: Grab signal that the process died from + my $child_exit_status = $? >> 8; + if($child_exit_status == 0) { + return $proc_output; + } else { + print STDERR "ERROR: Execution of @_ failed.\n"; + exit(1); + } +} + +# Based on Moses' safesystem sub +sub check_call { + print STDERR "Executing: @_\n"; + system(@_); + my $exitcode = $? >> 8; + if($exitcode == 0) { + return 0; + } elsif ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + + } elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + + } else { + print STDERR "Failed with exit code: $exitcode\n" if $exitcode; + exit($exitcode); + } +} + +sub check_bash_call { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + check_call(@args); +} + +sub check_bash_output { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + return check_output(@args); +} + +# perl module weirdness... +return 1; diff --git a/dpmert/line_mediator.pl b/dpmert/line_mediator.pl new file mode 100755 index 00000000..bc2bb24c --- /dev/null +++ b/dpmert/line_mediator.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication + +# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) + +#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. + +use strict; +use IPC::Open2; +use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); + +my $quiet=!$ENV{DEBUG}; +$quiet=1 if $ENV{QUIET}; +sub info { + local $,=' '; + print STDERR @_ unless $quiet; +} + +my $mode='CROSS'; +my $ser='DIRECT'; +$mode='PIPE' if $ENV{PIPE}; +$mode='SNAKE' if $ENV{SNAKE}; +$mode='CROSS' if $ENV{CROSS}; +$ser='SERIAL' if $ENV{SERIAL}; +$ser='DIRECT' if $ENV{DIRECT}; +$ser='SERIAL' if $mode eq 'SNAKE'; +info("mode: $mode\n"); +info("connection: $ser\n"); + + +my @c1; +if (scalar @ARGV) { + do { + push @c1,shift + } while scalar @ARGV && $c1[$#c1] ne '--'; +} +pop @c1; +my @c2=@ARGV; +@ARGV=(); +(scalar @c1 && scalar @c2) || die qq{ +usage: $0 cmd1 args -- cmd2 args +all options are environment variables. +DEBUG=1 env var enables debugging output. +CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). +SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. +if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. +if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. +DIRECT=1 (default) will override SERIAL=1. +CROSS=1 (default) will override SNAKE or PIPE. +}; + +info("1 cmd:",@c1,"\n"); +info("2 cmd:",@c2,"\n"); + +sub lineto { + select $_[0]; + $|=1; + shift; + print @_; +} + +if ($ser eq 'SERIAL') { + my ($R1,$W1,$R2,$W2); + my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. + my $c2p=open2($R2,$W2,@c2); + if ($mode eq 'CROSS') { + while(<$R1>) { + info("1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("1|2:",$_); + lineto($W1,$_); + } + } else { + my $snake=$mode eq 'SNAKE'; + while() { + info("IN:",$_); + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("IN|1|2:",$_); + if ($snake) { + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1|2|1:",$_); + } + lineto(*STDOUT,$_); + } + } +} else { + info("DIRECT mode\n"); + my @rw1=POSIX::pipe(); + my @rw2=POSIX::pipe(); + my $pid=undef; + $SIG{CHLD} = sub { wait }; + while (not defined ($pid=fork())) { + sleep 1; + } + my $pipe = $mode eq 'PIPE'; + unless ($pipe) { + POSIX::close(STDOUT_FILENO); + POSIX::close(STDIN_FILENO); + } + if ($pid) { + POSIX::dup2($rw1[1],STDOUT_FILENO); + POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; + exec @c1; + } else { + POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; + POSIX::dup2($rw1[0],STDIN_FILENO); + exec @c2; + } + while (wait()!=-1) {} +} diff --git a/dpmert/line_optimizer.cc b/dpmert/line_optimizer.cc new file mode 100644 index 00000000..49443fbe --- /dev/null +++ b/dpmert/line_optimizer.cc @@ -0,0 +1,111 @@ +#include "line_optimizer.h" + +#include +#include + +#include "sparse_vector.h" +#include "ns.h" + +using namespace std; + +typedef ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { + bool operator() (const ErrorIter& a, const ErrorIter& b) const { + return a->x < b->x; + } +}; + +double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, + const vector& surfaces, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon) { + // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl; + vector all_ints; + for (vector::const_iterator i = surfaces.begin(); + i != surfaces.end(); ++i) { + const ErrorSurface& surface = *i; + for (ErrorIter j = surface.begin(); j != surface.end(); ++j) + all_ints.push_back(j); + } + sort(all_ints.begin(), all_ints.end(), IntervalComp()); + double last_boundary = all_ints.front()->x; + SufficientStats acc; + float& cur_best_score = *best_score; + cur_best_score = (type == MAXIMIZE_SCORE ? + -numeric_limits::max() : numeric_limits::max()); + bool left_edge = true; + double pos = numeric_limits::quiet_NaN(); + for (vector::iterator i = all_ints.begin(); + i != all_ints.end(); ++i) { + const ErrorSegment& seg = **i; + if (seg.x - last_boundary > epsilon) { + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = seg.x - 0.1; + left_edge = false; + } else { + pos = last_boundary + (seg.x - last_boundary) / 2; + } + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + } + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; + // cerr << "---- s=" << sco << "\n"; + last_boundary = seg.x; + } + // cerr << "x-boundary=" << seg.x << "\n"; + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; + } + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = 0; + } else { + pos = last_boundary + 1000.0; + } + } + return pos; +} + +void LineOptimizer::RandomUnitVector(const vector& features_to_optimize, + SparseVector* axis, + RandomNumberGenerator* rng) { + axis->clear(); + for (int i = 0; i < features_to_optimize.size(); ++i) + axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); + (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( + const vector& features_to_optimize, + int additional_random_directions, + RandomNumberGenerator* rng, + vector >* dirs + , bool include_orthogonal + ) { + dirs->clear(); + typedef SparseVector Dir; + vector &out=*dirs; + int i=0; + if (include_orthogonal) + for (;i + +#include "sparse_vector.h" +#include "error_surface.h" +#include "sampler.h" + +class EvaluationMetric; +class Weights; + +struct LineOptimizer { + + // use MINIMIZE_SCORE for things like TER, WER + // MAXIMIZE_SCORE for things like BLEU + enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + + // merge all the error surfaces together into a global + // error surface and find (the middle of) the best segment + static double LineOptimize( + const EvaluationMetric* metric, + const std::vector& envs, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon = 1.0/65536.0); + + // return a random vector of length 1 where all dimensions + // not listed in dimensions will be 0. + static void RandomUnitVector(const std::vector& dimensions, + SparseVector* axis, + RandomNumberGenerator* rng); + + // generate a list of directions to optimize; the list will + // contain the orthogonal vectors corresponding to the dimensions in + // primary and then additional_random_directions directions in those + // dimensions as well. All vectors will be length 1. + static void CreateOptimizationDirections( + const std::vector& primary, + int additional_random_directions, + RandomNumberGenerator* rng, + std::vector >* dirs + , bool include_primary=true + ); + +}; + +#endif diff --git a/dpmert/lo_test.cc b/dpmert/lo_test.cc new file mode 100644 index 00000000..d9b909b8 --- /dev/null +++ b/dpmert/lo_test.cc @@ -0,0 +1,236 @@ +#include +#include +#include + +#include +#include + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "mert_geometry.h" +#include "line_optimizer.h" + +using namespace std; +using boost::shared_ptr; + +class OptTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +TEST_F(OptTest, TestCheckNaN) { + double x = 0; + double y = 0; + double z = x / y; + EXPECT_EQ(true, isnan(z)); +} + +TEST_F(OptTest,TestConvexHull) { + shared_ptr a1(new MERTPoint(-1, 0)); + shared_ptr b1(new MERTPoint(1, 0)); + shared_ptr a2(new MERTPoint(-1, 1)); + shared_ptr b2(new MERTPoint(1, -1)); + vector > sa; sa.push_back(a1); sa.push_back(b1); + vector > sb; sb.push_back(a2); sb.push_back(b2); + ConvexHull a(sa); + cerr << a << endl; + ConvexHull b(sb); + ConvexHull c = a; + c *= b; + cerr << a << " (*) " << b << " = " << c << endl; + EXPECT_EQ(3, c.size()); +} + +TEST_F(OptTest,TestConvexHullInside) { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector, prob_t> > list; + std::vector > features; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + SparseVector dir; dir.set_value(FD::Convert("f1"), 1.0); + ConvexHullWeightFunction wf(wts, dir); + ConvexHull env = Inside(hg, NULL, wf); + cerr << env << endl; + const vector >& segs = env.GetSortedSegs(); + dir *= segs[1]->x; + wts += dir; + hg.Reweight(wts); + KBest::KBestDerivations, ESentenceTraversal> kbest2(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest2.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + for (int i = 0; i < segs.size(); ++i) { + cerr << "seg=" << i << endl; + vector trans; + segs[i]->ConstructTranslation(&trans); + cerr << TD::GetString(trans) << endl; + } +} + +TEST_F(OptTest, TestS1) { + int fPhraseModel_0 = FD::Convert("PhraseModel_0"); + int fPhraseModel_1 = FD::Convert("PhraseModel_1"); + int fPhraseModel_2 = FD::Convert("PhraseModel_2"); + int fLanguageModel = FD::Convert("LanguageModel"); + int fWordPenalty = FD::Convert("WordPenalty"); + int fPassThrough = FD::Convert("PassThrough"); + SparseVector wts; + wts.set_value(fWordPenalty, 4.25); + wts.set_value(fLanguageModel, -1.1165); + wts.set_value(fPhraseModel_0, -0.96); + wts.set_value(fPhraseModel_1, -0.65); + wts.set_value(fPhraseModel_2, -0.77); + wts.set_value(fPassThrough, -10.0); + + vector to_optimize; + to_optimize.push_back(fWordPenalty); + to_optimize.push_back(fLanguageModel); + to_optimize.push_back(fPhraseModel_0); + to_optimize.push_back(fPhraseModel_1); + to_optimize.push_back(fPhraseModel_2); + + Hypergraph hg; + ReadFile rf("./test_data/0.json.gz"); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(wts); + + Hypergraph hg2; + ReadFile rf2("./test_data/1.json.gz"); + HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); + hg2.Reweight(wts); + + vector > refs1(4); + TD::ConvertSentence(ref11, &refs1[0]); + TD::ConvertSentence(ref21, &refs1[1]); + TD::ConvertSentence(ref31, &refs1[2]); + TD::ConvertSentence(ref41, &refs1[3]); + vector > refs2(4); + TD::ConvertSentence(ref12, &refs2[0]); + TD::ConvertSentence(ref22, &refs2[1]); + TD::ConvertSentence(ref32, &refs2[2]); + TD::ConvertSentence(ref42, &refs2[3]); + vector envs(2); + + RandomNumberGenerator rng; + + vector > axes; // directions to search + LineOptimizer::CreateOptimizationDirections( + to_optimize, + 10, + &rng, + &axes); + assert(axes.size() == 10 + to_optimize.size()); + for (int i = 0; i < axes.size(); ++i) + cerr << axes[i] << endl; + const SparseVector& axis = axes[0]; + + cerr << "Computing Viterbi envelope using inside algorithm...\n"; + cerr << "axis: " << axis << endl; + clock_t t_start=clock(); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + envs[0] = Inside(hg, NULL, wf); + envs[1] = Inside(hg2, NULL, wf); + + vector es(2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); + cerr << envs[0].size() << " " << envs[1].size() << endl; + cerr << es[0].size() << " " << es[1].size() << endl; + envs.clear(); + clock_t t_env=clock(); + float score; + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); + clock_t t_opt=clock(); + cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; + EXPECT_FLOAT_EQ(0.48719698, score); + SparseVector res = axis; + res *= m; + res += wts; + cerr << "res: " << res << endl; + cerr << "ENVELOPE PROCESSING=" << (static_cast(t_env - t_start) / 1000.0) << endl; + cerr << " LINE OPTIMIZATION=" << (static_cast(t_opt - t_env) / 1000.0) << endl; + hg.Reweight(res); + hg2.Reweight(res); + vector t1,t2; + ViterbiESentence(hg, &t1); + ViterbiESentence(hg2, &t2); + cerr << TD::GetString(t1) << endl; + cerr << TD::GetString(t2) << endl; +} + +TEST_F(OptTest,TestZeroOrigin) { + const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector wts; + wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); + hg.Reweight(wts); + + vector, prob_t> > list; + std::vector > features; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + + SparseVector axis; axis.set_value(FD::Convert("Glue"),1.0); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + vector envs(1); + envs[0] = Inside(hg, NULL, wf); + + vector > mr(4); + TD::ConvertSentence("untitled", &mr[0]); + TD::ConvertSentence("with no title", &mr[1]); + TD::ConvertSentence("without a title", &mr[2]); + TD::ConvertSentence("without title", &mr[3]); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); + vector es(1); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/dpmert/mert_geometry.cc b/dpmert/mert_geometry.cc new file mode 100644 index 00000000..81b25af9 --- /dev/null +++ b/dpmert/mert_geometry.cc @@ -0,0 +1,186 @@ +#include "mert_geometry.h" + +#include +#include + +using namespace std; +using boost::shared_ptr; + +ConvexHull::ConvexHull(int i) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + points.push_back(shared_ptr(new MERTPoint(0, 0, 0, shared_ptr(), shared_ptr()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << "Only can create ConvexHull semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { + const double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + MERTPoint* point = new MERTPoint(m, b, e); + return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { + os << '<'; + const vector >& points = env.GetSortedSegs(); + for (int i = 0; i < points.size(); ++i) + os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; + return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { + bool operator() (const shared_ptr& a, const shared_ptr& b) const { + return a->m < b->m; + } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { + if (!other.is_sorted) other.Sort(); + if (points.empty()) { + points = other.points; + return *this; + } + is_sorted = false; + int j = points.size(); + points.resize(points.size() + other.points.size()); + for (int i = 0; i < other.points.size(); ++i) + points[j++] = other.points[i]; + assert(j == points.size()); + return *this; +} + +void ConvexHull::Sort() const { + sort(points.begin(), points.end(), SlopeCompare()); + const int k = points.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + MERTPoint l = *points[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (points[j-1]->m == l.m) { // lines are parallel + if (l.b <= points[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); + if (points[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *points[j++] = l; + } + points.resize(j); + is_sorted = true; +} + +const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if (!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + shared_ptr edge_parent = points[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + points.clear(); + for (int i = 0; i < other.points.size(); ++i) { + const MERTPoint& p = *other.points[i]; + const double m = p.m + edge_m; + const double b = p.b + edge_b; + const double& x = p.x; // x's don't change with * + points.push_back(shared_ptr(new MERTPoint(x, m, b, edge_parent, other.points[i]))); + assert(points.back()->p1->edge); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector > new_points; + int this_i = 0; + int other_i = 0; + const int this_size = points.size(); + const int other_size = other.points.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? other.points[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const MERTPoint& this_point = *points[this_i]; + const MERTPoint& other_point= *other.points[other_i]; + const double m = this_point.m + other_point.m; + const double b = this_point.b + other_point.b; + + new_points.push_back(shared_ptr(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } + } + } + points.swap(new_points); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void MERTPoint::ConstructTranslation(vector* trans) const { + const MERTPoint* cur = this; + vector > ant_trans; + while(!cur->edge) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector*> pants(ant_size); + assert(ant_size == cur->edge->tail_nodes_.size()); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->edge->rule_->ESubstitute(pants, trans); +} + +void MERTPoint::CollectEdgesUsed(std::vector* edges_used) const { + if (edge) { + assert(edge->id_ < edges_used->size()); + (*edges_used)[edge->id_] = true; + } + if (p1) p1->CollectEdgesUsed(edges_used); + if (p2) p2->CollectEdgesUsed(edges_used); +} + +#else + +// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS + +#endif + diff --git a/dpmert/mert_geometry.h b/dpmert/mert_geometry.h new file mode 100644 index 00000000..a8b6959e --- /dev/null +++ b/dpmert/mert_geometry.h @@ -0,0 +1,81 @@ +#ifndef _MERT_GEOMETRY_H_ +#define _MERT_GEOMETRY_H_ + +#include +#include +#include + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits::infinity(); +static const double kPlusInfinity = std::numeric_limits::infinity(); + +struct MERTPoint { + MERTPoint() : x(), m(), b(), edge() {} + MERTPoint(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b), edge() {} + MERTPoint(double _x, double _m, double _b, const boost::shared_ptr& p1_, const boost::shared_ptr& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} + MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : + x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr p1; + boost::shared_ptr p2; + + // only MERTPoints created from an edge using the ConvexHullWeightFunction + // have rules + // TRulePtr rule; + const Hypergraph::Edge* edge; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ConvexHull + void ConstructTranslation(std::vector* trans) const; + void CollectEdgesUsed(std::vector* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { + // create semiring zero + ConvexHull() : is_sorted(true) {} // zero + // for debugging: + ConvexHull(const std::vector >& s) : points(s) { Sort(); } + // create semiring 1 or 0 + explicit ConvexHull(int i); + ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr(point)) {} + const ConvexHull& operator+=(const ConvexHull& other); + const ConvexHull& operator*=(const ConvexHull& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } + const std::vector >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return points; + } + size_t size() const { return points.size(); } + + private: + bool IsEdgeEnvelope() const { + return points.size() == 1 && points[0]->edge; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { + ConvexHullWeightFunction(const SparseVector& ori, + const SparseVector& dir) : origin(ori), direction(dir) {} + const ConvexHull operator()(const Hypergraph::Edge& e) const; + const SparseVector origin; + const SparseVector direction; +}; + +#endif diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..59d4f24f --- /dev/null +++ b/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,78 @@ +#include +#include + +#include +#include + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector features; + SparseVector origin; + vector w; + Weights::InitFromFile(conf["weights"].as(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as >(); + vector > directions; + vector fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; + } + } + return 0; +} diff --git a/dpmert/mr_dpmert_map.cc b/dpmert/mr_dpmert_map.cc new file mode 100644 index 00000000..f3304f0f --- /dev/null +++ b/dpmert/mr_dpmert_map.cc @@ -0,0 +1,112 @@ +#include +#include +#include +#include + +#include +#include + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "mert_geometry.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "b64tools.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") + ("source,s",po::value(), "Source file (ignored, except for AER)") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") + ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +bool ReadSparseVectorString(const string& s, SparseVector* v) { +#if 0 + // this should work, but untested. + std::istringstream i(s); + i>>*v; +#else + vector fields; + Tokenize(s, ';', &fields); + if (fields.empty()) return false; + for (int i = 0; i < fields.size(); ++i) { + vector pair(2); + Tokenize(fields[i], '=', &pair); + if (pair.size() != 2) { + cerr << "Error parsing vector string: " << fields[i] << endl; + return false; + } + v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); + } + return true; +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as()); + istream &in=*in_read.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file, s_origin, s_direction; + // path-to-file (JSON) sent_ed starting-point search-direction + is >> file >> sent_id >> s_origin >> s_direction; + SparseVector origin; + ReadSparseVectorString(s_origin, &origin); + SparseVector direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; + if (last_file != file) { + last_file = file; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + const ConvexHullWeightFunction wf(origin, direction); + const ConvexHull hull = Inside(hg, NULL, wf); + + ErrorSurface es; + ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); + //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; + // cerr << "Error surface has " << es.size() << " segments\n"; + string val; + es.Serialize(&val); + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; + B64::b64encode(val.c_str(), val.size(), &cout); + cout << endl << flush; + } + return 0; +} diff --git a/dpmert/mr_dpmert_reduce.cc b/dpmert/mr_dpmert_reduce.cc new file mode 100644 index 00000000..dda61f88 --- /dev/null +++ b/dpmert/mr_dpmert_reduce.cc @@ -0,0 +1,77 @@ +#include +#include +#include +#include + +#include +#include + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "b64tools.h" +#include "stringlib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = conf->count("evaluation_metric") == 0; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as(); + LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; + if (UppercaseString(evaluation_metric) == "TER") + opt_type = LineOptimizer::MINIMIZE_SCORE; + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + + vector esv; + string last_key, line, key, val; + while(getline(cin, line)) { + size_t ks = line.find("\t"); + assert(string::npos != ks); + assert(ks > 2); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); + if (key != last_key) { + if (!last_key.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + last_key.swap(key); + esv.clear(); + } + if (val.size() % 4 != 0) { + cerr << "B64 encoding error 1! Skipping.\n"; + continue; + } + string encoded(val.size() / 4 * 3, '\0'); + if (!B64::b64decode(reinterpret_cast(&val[0]), val.size(), &encoded[0], encoded.size())) { + cerr << "B64 encoding error 2! Skipping.\n"; + continue; + } + esv.push_back(ErrorSurface()); + esv.back().Deserialize(encoded); + } + if (!esv.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + return 0; +} diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl new file mode 100755 index 00000000..7d0365cc --- /dev/null +++ b/dpmert/parallelize.pl @@ -0,0 +1,423 @@ +#!/usr/bin/env perl + +# Author: Adam Lopez +# +# This script takes a command that processes input +# from stdin one-line-at-time, and parallelizes it +# on the cluster using David Chiang's sentserver/ +# sentclient architecture. +# +# Prerequisites: the command *must* read each line +# without waiting for subsequent lines of input +# (for instance, a command which must read all lines +# of input before processing will not work) and +# return it to the output *without* buffering +# multiple lines. + +#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder + +#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +use LocalConfig; + +use Cwd qw/ abs_path cwd getcwd /; +use File::Temp qw/ tempfile /; +use Getopt::Long; +use IPC::Open2; +use strict; +use POSIX ":sys_wait_h"; + +use File::Basename; +my $myDir = dirname(__FILE__); +print STDERR __FILE__." -> $myDir\n"; +push(@INC, $myDir); +require "libcall.pl"; + +my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines +my $recycle_clients; # spawn new clients when previous ones terminate +my $stay_alive; # dont let server die when having zero clients +my $joblist = ""; +my $errordir=""; +my $multiline; +my @files_to_stage; +my $numnodes = 8; +my $user = $ENV{"USER"}; +my $pmem = "9g"; +my $basep=50300; +my $randp=300; +my $tryp=50; +my $no_which; +my $no_cd; + +my $DEBUG=$ENV{DEBUG}; +print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; +my $verbose = 1; +sub verbose { + if ($verbose) { + print STDERR @_,"\n"; + } +} +sub debug { + if ($DEBUG) { + my ($package, $filename, $line) = caller; + print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; + } +} +my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; +my $shell_escape_in_quote=qr.[\\"\$`!].; +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + return '""' unless $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} +sub preview_files { + my ($l,$skipempty,$footer,$n)=@_; + $n=$tailn unless defined $n; + my @f=grep { ! ($skipempty && -z $_) } @$l; + my $fn=join(' ',map {escape_shell($_)} @f); + my $cmd="tail -n $n $fn"; + unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); +} +sub prefix_dirname($) { + #like `dirname but if ends in / then return the whole thing + local ($_)=@_; + if (/\/$/) { + $_; + } else { + s#/[^/]$##; + $_ ? $_ : ''; + } +} +sub ensure_final_slash($) { + local ($_)=@_; + m#/$# ? $_ : ($_."/"); +} +sub extend_path($$;$$) { + my ($base,$ext,$mkdir,$baseisdir)=@_; + if (-d $base) { + $base.="/"; + } else { + my $dir; + if ($baseisdir) { + $dir=$base; + $base.='/' unless $base =~ /\/$/; + } else { + $dir=prefix_dirname($base); + } + my @cmd=("/bin/mkdir","-p",$dir); + check_call(@cmd) if $mkdir; + } + return $base.$ext; +} + +my $abscwd=abs_path(&getcwd); +sub print_help; + +my $use_fork; +my @pids; + +# Process command-line options +unless (GetOptions( + "stay-alive" => \$stay_alive, + "recycle-clients" => \$recycle_clients, + "error-dir=s" => \$errordir, + "multi-line" => \$multiline, + "file=s" => \@files_to_stage, + "use-fork" => \$use_fork, + "verbose" => \$verbose, + "jobs=i" => \$numnodes, + "pmem=s" => \$pmem, + "baseport=i" => \$basep, +# "iport=i" => \$randp, #for short name -i + "no-which!" => \$no_which, + "no-cd!" => \$no_cd, + "tailn=s" => \$tailn, +) && scalar @ARGV){ + print_help(); + die "bad options."; +} + +my $cmd = ""; +my $prog=shift; +if ($no_which) { + $cmd=$prog; +} else { + $cmd=check_output("which $prog"); + chomp $cmd; + die "$prog not found - $cmd" unless $cmd; +} +#$cmd=abs_path($cmd); +for my $arg (@ARGV) { + $cmd .= " ".escape_shell($arg); +} +die "Please specify a command to parallelize\n" if $cmd eq ''; + +my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n"); + +my $executable = $cmd; +$executable =~ s/^\s*(\S+)($|\s.*)/$1/; +$executable=check_output("basename $executable"); +chomp $executable; + + +print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; + +# create -e dir and save .sh +use File::Temp qw/tempdir/; +unless ($errordir) { + $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); +} +if ($errordir) { + my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); + -d $errordir || die "should have created -e dir $errordir"; + open SF,">",$scriptfile || die; + print SF "$cdcmd$cmd\n"; + close SF; + chmod 0755,$scriptfile; + $errordir=abs_path($errordir); + &verbose("-e dir: $errordir"); +} + +# set cleanup handler +my @cleanup_cmds; +sub cleanup; +sub cleanup_and_die; +$SIG{INT} = "cleanup_and_die"; +$SIG{TERM} = "cleanup_and_die"; +$SIG{HUP} = "cleanup_and_die"; + +# other subs: +sub numof_live_jobs; +sub launch_job_on_node; + + +# vars +my $mydir = check_output("dirname $0"); chomp $mydir; +my $sentserver = "$mydir/sentserver"; +my $sentclient = "$mydir/sentclient"; +my $host = check_output("hostname"); +chomp $host; + + +# find open port +srand; +my $port = 50300+int(rand($randp)); +my $endp=$port+$tryp; +sub listening_port_lines { + my $quiet=$verbose?'':'2>/dev/null'; + return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); +} +my $netstat=&listening_port_lines; + +if ($verbose){ print STDERR "Testing port $port...";} + +while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ + if ($verbose){ print STDERR "port is busy\n";} + $port++; + if ($port > $endp){ + die "Unable to find open port\n"; + } + if ($verbose){ print STDERR "Testing port $port... "; } +} +if ($verbose){ + print STDERR "port $port is available\n"; +} + +my $key = int(rand()*1000000); + +my $multiflag = ""; +if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } +my $stay_alive_flag = ""; +if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } + +my $node_count = 0; +my $script = ""; +# fork == one thread runs the sentserver, while the +# other spawns the sentclient commands. +my $pid = fork; +if ($pid == 0) { # child + sleep 8; # give other thread time to start sentserver + $script = "$cdcmd$sentclient $host:$port:$key $cmd"; + + if ($verbose){ + print STDERR "Client script:\n====\n"; + print STDERR $script; + print STDERR "====\n"; + } + for (my $jobn=0; $jobn<$numnodes; $jobn++){ + launch_job(); + } + if ($recycle_clients) { + my $ret; + my $livejobs; + while (1) { + $ret = waitpid($pid, WNOHANG); + #print STDERR "waitpid $pid ret = $ret \n"; + last if ($ret != 0); + $livejobs = numof_live_jobs(); + if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j + print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; + launch_job(); + } else { + sleep 15; + } + } + } + print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n"; + for my $p (@pids) { + waitpid($p, 0); + } +} else { +# my $todo = "$sentserver -k $key $multiflag $port "; + my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; + if ($verbose){ print STDERR "Running: $todo\n"; } + check_call($todo); + print STDERR "Call to $sentserver returned.\n"; + cleanup(); + exit(0); +} + +sub numof_live_jobs { + if ($use_fork) { + die "not implemented"; + } else { + # We can probably continue decoding if the qstat error is only temporary + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); + return ($#livejobs + 1); + } +} +my (@errors,@outs,@cmds); + +sub launch_job { + if ($use_fork) { return launch_job_fork(); } + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; + push @cmds,$todo; + + print STDERR "Running: $todo\n"; + local(*QOUT, *QIN); + open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; + print QIN $script; + close QIN; + while (my $jobid=){ + chomp $jobid; + if ($verbose){ print STDERR "Launched client job: $jobid"; } + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + print STDERR " short job id $jobid\n"; + if ($verbose){ + print STDERR "cd: $abscwd\n"; + print STDERR "cmd: $cmd\n"; + } + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . $jobid; } + my $cleanfn="qdel $jobid 2> /dev/null"; + push(@cleanup_cmds, $cleanfn); + } + close QOUT; +} + +sub launch_job_fork { + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $pid = fork; + if ($pid == 0) { + my ($fh, $scr_name) = get_temp_script(); + print $fh $script; + close $fh; + my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; + print STDERR "EXEC: $todo\n"; + my $out = check_output("$todo"); + unlink $scr_name or warn "Failed to remove $scr_name"; + exit 0; + } else { + push @pids, $pid; + } +} + +sub get_temp_script { + my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh'); + return ($fh, $filename); +} + +sub cleanup_and_die { + cleanup(); + die "\n"; +} + +sub cleanup { + print STDERR "Cleaning up...\n"; + for $cmd (@cleanup_cmds){ + print STDERR " Cleanup command: $cmd\n"; + eval $cmd; + } + print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; + print STDERR "errors:\n",preview_files(\@errors,1),"\n"; + print STDERR "cmd:\n",$cmd,"\n"; + print STDERR " cat $errordir/*.ER\nfor logs.\n"; + print STDERR "Cleanup finished.\n"; +} + +sub print_help +{ + my $name = check_output("basename $0"); chomp $name; + print << "Help"; + +usage: $name [options] + + Automatic black-box parallelization of commands. + +options: + + --use-fork + Instead of using qsub, use fork. + + -e, --error-dir + Retain output files from jobs in , rather + than silently deleting them. + + -m, --multi-line + Expect that command may produce multiple output + lines for a single input line. $name makes a + reasonable attempt to obtain all output before + processing additional inputs. However, use of this + option is inherently unsafe. + + -v, --verbose + Print diagnostic informatoin on stderr. + + -j, --jobs + Number of jobs to use. + + -p, --pmem + pmem setting for each job. + +Help +} diff --git a/dpmert/sentclient.c b/dpmert/sentclient.c new file mode 100644 index 00000000..91d994ab --- /dev/null +++ b/dpmert/sentclient.c @@ -0,0 +1,76 @@ +/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sentserver.h" + +int main (int argc, char *argv[]) { + int sock, port; + char *s, *key; + struct hostent *hp; + struct sockaddr_in server; + int errors = 0; + + if (argc < 3) { + fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n"); + exit(1); + } + + s = strchr(argv[1], ':'); + key = NULL; + + if (s == NULL) { + port = DEFAULT_PORT; + } else { + *s = '\0'; + s+=1; + /* dumb hack */ + key = strchr(s, ':'); + if (key != NULL){ + *key = '\0'; + key += 1; + } + port = atoi(s); + } + + sock = socket(AF_INET, SOCK_STREAM, 0); + + hp = gethostbyname(argv[1]); + if (hp == NULL) { + fprintf(stderr, "unknown host %s\n", argv[1]); + exit(1); + } + + bzero((char *)&server, sizeof(server)); + bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); + server.sin_family = hp->h_addrtype; + server.sin_port = htons(port); + + while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { + perror("connect()"); + sleep(1); + errors++; + if (errors > 5) + exit(1); + } + + close(0); + close(1); + dup2(sock, 0); + dup2(sock, 1); + + if (key != NULL){ + write(1, key, strlen(key)); + write(1, "\n", 1); + } + + execvp(argv[2], argv+2); + return 0; +} diff --git a/dpmert/sentserver.c b/dpmert/sentserver.c new file mode 100644 index 00000000..c20b4fa6 --- /dev/null +++ b/dpmert/sentserver.c @@ -0,0 +1,515 @@ +/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sentserver.h" + +#define MAX_CLIENTS 64 + +struct clientinfo { + int s; + struct sockaddr_in sin; +}; + +struct line { + int id; + char *s; + int status; + struct line *next; +} *head, **ptail; + +int n_sent = 0, n_received=0, n_flushed=0; + +#define STATUS_RUNNING 0 +#define STATUS_ABORTED 1 +#define STATUS_FINISHED 2 + +pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER; + +int n_clients = 0; +int s; +int expect_multiline_output = 0; +int log_mutex = 0; +int stay_alive = 0; /* dont panic and die with zero clients */ + +void queue_finish(struct line *node, char *s, int fid); +char * read_line(int fd, int multiline); +void done (int code); + +struct line * queue_get(int fid) { + struct line *cur; + char *s, *synch; + + if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid); + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + /* First, check for aborted sentences. */ + + if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid); + for (cur = head; cur != NULL; cur = cur->next) { + if (cur->status == STATUS_ABORTED) { + cur->status = STATUS_RUNNING; + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return cur; + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + /* Otherwise, read a new one. */ + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + s = read_line(0,0); + + while (s) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + + cur = malloc(sizeof (struct line)); + cur->id = n_sent; + cur->s = s; + cur->next = NULL; + + *ptail = cur; + ptail = &cur->next; + + n_sent++; + + if (strcmp(s,"===SYNCH===\n")==0){ + fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid); + // Note: queue_finish calls free(cur->s). + // Therefore we need to create a new string here. + synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char)); + synch = strcpy(synch, s); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + queue_finish(cur, synch, fid); /* handles its own lock */ + + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + + s = read_line(0,0); + } else { + if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid); + cur->status = STATUS_RUNNING; + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + return cur; + } + } + + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + /* Only way to reach this point: no more output */ + + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (head == NULL) { + fprintf(stderr, "Reached end of file. Exiting.\n"); + done(0); + } else + ptail = NULL; /* This serves as a signal that there is no more input */ + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return NULL; +} + +void queue_panic() { + struct line *next; + while (head && head->status == STATUS_FINISHED) { + /* Write out finished sentences */ + if (head->status == STATUS_FINISHED) { + fputs(head->s, stdout); + fflush(stdout); + } + /* Write out blank line for unfinished sentences */ + if (head->status == STATUS_ABORTED) { + fputs("\n", stdout); + fflush(stdout); + } + /* By defition, there cannot be any RUNNING sentences, since + function is only called when n_clients == 0 */ + free(head->s); + next = head->next; + free(head); + head = next; + n_flushed++; + } + fclose(stdout); + fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n"); + done(1); +} + +void queue_abort(struct line *node, int fid) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + node->status = STATUS_ABORTED; + if (n_clients == 0) { + if (stay_alive) { + fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n"); + } else { + queue_panic(); + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); +} + + +void queue_print() { + struct line *cur; + + fprintf(stderr, " Queue\n"); + + for (cur = head; cur != NULL; cur = cur->next) { + switch(cur->status) { + case STATUS_RUNNING: + fprintf(stderr, " %d running ", cur->id); break; + case STATUS_ABORTED: + fprintf(stderr, " %d aborted ", cur->id); break; + case STATUS_FINISHED: + fprintf(stderr, " %d finished ", cur->id); break; + + } + fprintf(stderr, "\n"); + //fprintf(stderr, cur->s); + } +} + +void queue_finish(struct line *node, char *s, int fid) { + struct line *next; + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + free(node->s); + node->s = s; + node->status = STATUS_FINISHED; + n_received++; + + /* Flush out finished nodes */ + while (head && head->status == STATUS_FINISHED) { + + if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id); + + fputs(head->s, stdout); + fflush(stdout); + if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id); + free(head->s); + + next = head->next; + free(head); + + head = next; + + n_flushed++; + + if (head == NULL) { /* empty queue */ + if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */ + fprintf(stderr, "All sentences finished. Exiting.\n"); + done(0); + } else /* ptail pointed at something which was just popped off the stack -- reset to head*/ + ptail = &head; + } + } + + if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id); + fflush(stdout); + fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + +} + +char * read_line(int fd, int multiline) { + int size = 80; + char errorbuf[100]; + char *s = malloc(size+2); + int result, errors=0; + int i = 0; + + result = read(fd, s+i, 1); + + while (1) { + if (result < 0) { + perror("read()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + errors++; + if (errors > 5) { + free(s); + return NULL; + } else { + sleep(1); /* retry after delay */ + } + } else if (result == 0) { + break; + } else if (multiline==0 && s[i] == '\n') { + break; + } else { + if (s[i] == '\n'){ + /* if we've reached this point, + then multiline must be 1, and we're + going to poll the fd for an additional + line of data. The basic design is to + run a select on the filedescriptor fd. + Select will return under two conditions: + if there is data on the fd, or if a + timeout is reached. We'll select on this + fd. If select returns because there's data + ready, keep going; else assume there's no + more and return the data we already have. + */ + + fd_set set; + FD_ZERO(&set); + FD_SET(fd, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + break; // no more data, stop looping + } + } + i++; + + if (i == size) { + size = size*2; + s = realloc(s, size+2); + } + } + + result = read(fd, s+i, 1); + } + + if (result == 0 && i == 0) { /* end of file */ + free(s); + return NULL; + } + + s[i] = '\n'; + s[i+1] = '\0'; + + return s; +} + +void * new_client(void *arg) { + struct clientinfo *client = (struct clientinfo *)arg; + struct line *cur; + int result; + char *s; + char errorbuf[100]; + + pthread_mutex_lock(&clients_mutex); + n_clients++; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client connected (%d connected)\n", n_clients); + + for (;;) { + + cur = queue_get(client->s); + + if (cur) { + /* fprintf(stderr, "Sending to client: %s", cur->s); */ + fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s); + result = write(client->s, cur->s, strlen(cur->s)); + if (result < strlen(cur->s)){ + perror("write()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + } else { + close(client->s); + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + fprintf(stderr, "Client dismissed (%d connected)\n", n_clients); + pthread_exit(NULL); + } + + s = read_line(client->s,expect_multiline_output); + if (s) { + /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */ + fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id); +// queue_print(); + queue_finish(cur, s, client->s); + } else { + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + + } + return 0; +} + +void done (int code) { + close(s); + exit(code); +} + + + +int main (int argc, char *argv[]) { + struct sockaddr_in sin, from; + int g; + socklen_t len; + struct clientinfo *client; + int port; + int opt; + int errors = 0; + int argi; + char *key = NULL, *client_key; + int use_key = 0; + /* the key stuff here doesn't provide any + real measure of security, it's mainly to keep + jobs from bumping into each other. */ + + pthread_t tid; + port = DEFAULT_PORT; + + for (argi=1; argi < argc; argi++){ + if (strcmp(argv[argi], "-m")==0){ + expect_multiline_output = 1; + } else if (strcmp(argv[argi], "-k")==0){ + argi++; + if (argi == argc){ + fprintf(stderr, "Key must be specified after -k\n"); + exit(1); + } + key = argv[argi]; + use_key = 1; + } else if (strcmp(argv[argi], "--stay-alive")==0){ + stay_alive = 1; /* dont panic and die with zero clients */ + } else { + port = atoi(argv[argi]); + } + } + + /* Initialize data structures */ + head = NULL; + ptail = &head; + + /* Set up listener */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + opt = 1; + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + perror("bind()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + len = sizeof(sin); + getsockname(s, (struct sockaddr *) &sin, &len); + + fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port)); + + while (listen(s, MAX_CLIENTS) < 0) { + perror("listen()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + for (;;) { + len = sizeof(from); + g = accept(s, (struct sockaddr *)&from, &len); + if (g < 0) { + perror("accept()"); + sleep(1); + continue; + } + client = malloc(sizeof(struct clientinfo)); + client->s = g; + bcopy(&from, &client->sin, len); + + if (use_key){ + fd_set set; + FD_ZERO(&set); + FD_SET(client->s, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } else { + client_key = read_line(client->s,0); + client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */ + if (strcmp(key, client_key)==0){ + pthread_create(&tid, NULL, new_client, client); + } else { + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } + free(client_key); + } + } else { + pthread_create(&tid, NULL, new_client, client); + } + } + +} + + + diff --git a/dpmert/sentserver.h b/dpmert/sentserver.h new file mode 100644 index 00000000..cd17a546 --- /dev/null +++ b/dpmert/sentserver.h @@ -0,0 +1,6 @@ +#ifndef SENTSERVER_H +#define SENTSERVER_H + +#define DEFAULT_PORT 50000 + +#endif diff --git a/dpmert/tac.pl b/dpmert/tac.pl new file mode 100755 index 00000000..9fb525c1 --- /dev/null +++ b/dpmert/tac.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl + +while(<>) { + chomp; + $|=1; + print (scalar reverse($_)); + print "\n"; +} diff --git a/dpmert/test_aer/README b/dpmert/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/dpmert/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. + diff --git a/dpmert/test_aer/cdec.ini b/dpmert/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/dpmert/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/dpmert/test_aer/corpus.src b/dpmert/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/dpmert/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/dpmert/test_aer/grammar b/dpmert/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/dpmert/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/dpmert/test_aer/ref.0 b/dpmert/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/dpmert/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/dpmert/test_aer/weights b/dpmert/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/dpmert/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/dpmert/test_data/0.json.gz b/dpmert/test_data/0.json.gz new file mode 100644 index 00000000..30f8dd77 Binary files /dev/null and b/dpmert/test_data/0.json.gz differ diff --git a/dpmert/test_data/1.json.gz b/dpmert/test_data/1.json.gz new file mode 100644 index 00000000..c82cc179 Binary files /dev/null and b/dpmert/test_data/1.json.gz differ diff --git a/dpmert/test_data/c2e.txt.0 b/dpmert/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/dpmert/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.1 b/dpmert/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/dpmert/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.2 b/dpmert/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/dpmert/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/dpmert/test_data/c2e.txt.3 b/dpmert/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/dpmert/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . diff --git a/dpmert/test_data/re.txt.0 b/dpmert/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/dpmert/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.1 b/dpmert/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/dpmert/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/dpmert/test_data/re.txt.2 b/dpmert/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/dpmert/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.3 b/dpmert/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/dpmert/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . +erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/vest/Makefile.am b/vest/Makefile.am deleted file mode 100644 index 05fa5639..00000000 --- a/vest/Makefile.am +++ /dev/null @@ -1,35 +0,0 @@ -bin_PROGRAMS = \ - mr_vest_map \ - mr_vest_reduce \ - mr_vest_generate_mapper_input \ - sentserver \ - sentclient - -if HAVE_GTEST -noinst_PROGRAMS = \ - lo_test -TESTS = lo_test -endif - -sentserver_SOURCES = sentserver.c -sentserver_LDFLAGS = -all-static -pthread - -sentclient_SOURCES = sentclient.c -sentclient_LDFLAGS = -all-static -pthread - -mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -# nbest2hg_SOURCES = nbest2hg.cc -# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz - -mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/vest/README.shared-mem b/vest/README.shared-mem deleted file mode 100644 index 7728efc0..00000000 --- a/vest/README.shared-mem +++ /dev/null @@ -1,9 +0,0 @@ -If you want to run dist-vest.pl on a very large shared memory machine, do the -following: - - ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini - -This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the -decoder must load grammars, language models, etc., J should be smaller than I, but this will depend -on the system you are running on and the complexity of the models used for decoding. - diff --git a/vest/cat.pl b/vest/cat.pl deleted file mode 100755 index 2ecba3f9..00000000 --- a/vest/cat.pl +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/perl - -$|=1; -print while(<>); diff --git a/vest/ces.cc b/vest/ces.cc deleted file mode 100644 index cd89aa69..00000000 --- a/vest/ces.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include "ces.h" - -#include -#include -#include - -// TODO, if AER is to be optimized again, we will need this -// #include "aligner.h" -#include "lattice.h" -#include "viterbi_envelope.h" -#include "error_surface.h" -#include "ns.h" - -using boost::shared_ptr; -using namespace std; - -const bool minimize_segments = true; // if adjacent segments have equal scores, merge them - -void ComputeErrorSurface(const SegmentEvaluator& ss, - const ViterbiEnvelope& ve, - ErrorSurface* env, - const EvaluationMetric* metric, - const Hypergraph& hg) { - vector prev_trans; - const vector >& ienv = ve.GetSortedSegs(); - env->resize(ienv.size()); - SufficientStats prev_score; // defaults to 0 - int j = 0; - for (int i = 0; i < ienv.size(); ++i) { - const Segment& seg = *ienv[i]; - vector trans; -#if 0 - if (type == AER) { - vector edges(hg.edges_.size(), false); - seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi - // alignment - ostringstream os; - const string* psrc = ss.GetSource(); - if (psrc == NULL) { - cerr << "AER scoring in VEST requires source, but it is missing!\n"; - abort(); - } - size_t pos = psrc->rfind(" ||| "); - if (pos == string::npos) { - cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; - abort(); - } - Lattice src; - Lattice ref; - LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); - LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); - AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); - string tstr = os.str(); - TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); - } else { -#endif - seg.ConstructTranslation(&trans); - //} - //cerr << "Scoring: " << TD::GetString(trans) << endl; - if (trans == prev_trans) { - if (!minimize_segments) { - ErrorSegment& out = (*env)[j]; - out.delta.fields.clear(); - out.x = seg.x; - ++j; - } - //cerr << "Identical translation, skipping scoring\n"; - } else { - SufficientStats score; - ss.Evaluate(trans, &score); - // cerr << "score= " << score->ComputeScore() << "\n"; - //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; - const SufficientStats delta = score - prev_score; - //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; - //string xx; delta.Encode(&xx); cerr << xx << endl; - prev_trans.swap(trans); - prev_score = score; - if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { - ErrorSegment& out = (*env)[j]; - out.delta = delta; - out.x = seg.x; - ++j; - } - } - } - // cerr << " In segments: " << ienv.size() << endl; - // cerr << "Out segments: " << j << endl; - assert(j > 0); - env->resize(j); -} - diff --git a/vest/ces.h b/vest/ces.h deleted file mode 100644 index e021e715..00000000 --- a/vest/ces.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _CES_H_ -#define _CES_H_ - -class ViterbiEnvelope; -class Hypergraph; -class SegmentEvaluator; -class ErrorSurface; -class EvaluationMetric; - -void ComputeErrorSurface(const SegmentEvaluator& ss, - const ViterbiEnvelope& ve, - ErrorSurface* es, - const EvaluationMetric* metric, - const Hypergraph& hg); - -#endif diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl deleted file mode 100755 index 1ec8c6b1..00000000 --- a/vest/dist-vest.pl +++ /dev/null @@ -1,700 +0,0 @@ -#!/usr/bin/env perl -use strict; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } - -# Skip local config (used for distributing jobs) if we're running in local-only mode -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; -my $QSUB_CMD = qsub_args(mert_memory()); - -require "libcall.pl"; - -# Default settings -my $srcFile; -my $refFiles; -my $default_jobs = env_default_jobs(); -my $bin_dir = $SCRIPT_DIR; -die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; -die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; -my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input"; -my $MAPPER = "$bin_dir/mr_vest_map"; -my $REDUCER = "$bin_dir/mr_vest_reduce"; -my $parallelize = "$bin_dir/parallelize.pl"; -my $libcall = "$bin_dir/libcall.pl"; -my $sentserver = "$bin_dir/sentserver"; -my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; - -my $SCORER = $FAST_SCORE; -die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; -die "Can't find decoder in $cdec" unless -x $cdec; -die "Can't find $parallelize" unless -x $parallelize; -die "Can't find $libcall" unless -e $libcall; -my $decoder = $cdec; -my $lines_per_mapper = 400; -my $rand_directions = 15; -my $iteration = 1; -my $best_weights; -my $max_iterations = 15; -my $optimization_iters = 6; -my $jobs = $default_jobs; # number of decode nodes -my $pmem = "9g"; -my $disable_clean = 0; -my %seen_weights; -my $normalize; -my $help = 0; -my $epsilon = 0.0001; -my $interval = 5; -my $dryrun = 0; -my $last_score = -10000000; -my $metric = "ibm_bleu"; -my $dir; -my $iniFile; -my $weights; -my $initialWeights; -my $decoderOpt; -my $noprimary; -my $maxsim=0; -my $oraclen=0; -my $oracleb=20; -my $bleu_weight=1; -my $use_make = 1; # use make to parallelize line search -my $useqsub; -my $pass_suffix = ''; -my $cpbin=1; -# Process command-line options -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "decoder=s" => \$decoderOpt, - "jobs=i" => \$jobs, - "dont-clean" => \$disable_clean, - "pass-suffix=s" => \$pass_suffix, - "dry-run" => \$dryrun, - "epsilon=s" => \$epsilon, - "help" => \$help, - "interval" => \$interval, - "qsub" => \$useqsub, - "max-iterations=i" => \$max_iterations, - "normalize=s" => \$normalize, - "pmem=s" => \$pmem, - "cpbin!" => \$cpbin, - "random-directions=i" => \$rand_directions, - "ref-files=s" => \$refFiles, - "metric=s" => \$metric, - "source-file=s" => \$srcFile, - "weights=s" => \$initialWeights, - "workdir=s" => \$dir, - "opt-iterations=i" => \$optimization_iters, -) == 0 || @ARGV!=1 || $help) { - print_help(); - exit; -} - -if ($useqsub) { - $use_make = 0; - die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); -} - -my @missing_args = (); -if (!defined $srcFile) { push @missing_args, "--source-file"; } -if (!defined $refFiles) { push @missing_args, "--ref-files"; } -if (!defined $initialWeights) { push @missing_args, "--weights"; } -die "Please specify missing arguments: " . join (', ', @missing_args) . "\n" if (@missing_args); - -if ($metric =~ /^(combi|ter)$/i) { - $lines_per_mapper = 40; -} elsif ($metric =~ /^meteor$/i) { - $lines_per_mapper = 2000; # start up time is really high -} - -($iniFile) = @ARGV; - - -sub write_config; -sub enseg; -sub print_help; - -my $nodelist; -my $host =check_output("hostname"); chomp $host; -my $bleu; -my $interval_count = 0; -my $logfile; -my $projected_score; - -# used in sorting scores -my $DIR_FLAG = '-r'; -if ($metric =~ /^ter$|^aer$/i) { - $DIR_FLAG = ''; -} - -my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); - -unless ($dir){ - $dir = "vest"; -} -unless ($dir =~ /^\//){ # convert relative path to absolute path - my $basedir = check_output("pwd"); - chomp $basedir; - $dir = "$basedir/$dir"; -} - -if ($decoderOpt){ $decoder = $decoderOpt; } - - -# Initializations and helper functions -srand; - -my @childpids = (); -my @cleanupcmds = (); - -sub cleanup { - print STDERR "Cleanup...\n"; - for my $pid (@childpids){ unchecked_call("kill $pid"); } - for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } - exit 1; -}; -# Always call cleanup, no matter how we exit -*CORE::GLOBAL::exit = - sub{ cleanup(); }; -$SIG{INT} = "cleanup"; -$SIG{TERM} = "cleanup"; -$SIG{HUP} = "cleanup"; - -my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; -my $newIniFile = "$dir/$decoderBase.ini"; -my $inputFileName = "$dir/input"; -my $user = $ENV{"USER"}; - - -# process ini file --e $iniFile || die "Error: could not open $iniFile for reading\n"; -open(INI, $iniFile); - -use File::Basename qw(basename); -#pass bindir, refs to vars holding bin -sub modbin { - local $_; - my $bindir=shift; - check_call("mkdir -p $bindir"); - -d $bindir || die "couldn't make bindir $bindir"; - for (@_) { - my $src=$$_; - $$_="$bindir/".basename($src); - check_call("cp -p $src $$_"); - } -} -sub dirsize { - opendir ISEMPTY,$_[0]; - return scalar(readdir(ISEMPTY))-1; -} -if ($dryrun){ - write_config(*STDERR); - exit 0; -} else { - if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs - die "ERROR: working dir $dir already exists\n\n"; - } else { - -e $dir || mkdir $dir; - mkdir "$dir/hgs"; - modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; - mkdir "$dir/scripts"; - my $cmdfile="$dir/rerun-vest.sh"; - open CMD,'>',$cmdfile; - print CMD "cd ",&getcwd,"\n"; -# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. - my $cline=&cmdline."\n"; - print CMD $cline; - close CMD; - print STDERR $cline; - chmod(0755,$cmdfile); - unless (-e $initialWeights) { - print STDERR "Please specify an initial weights file with --initial-weights\n"; - print_help(); - exit; - } - check_call("cp $initialWeights $dir/weights.0"); - die "Can't find weights.0" unless (-e "$dir/weights.0"); - } - write_config(*STDERR); -} - - -# Generate initial files and values -check_call("cp $iniFile $newIniFile"); -$iniFile = $newIniFile; - -my $newsrc = "$dir/dev.input"; -enseg($srcFile, $newsrc); -$srcFile = $newsrc; -my $devSize = 0; -open F, "<$srcFile" or die "Can't read $srcFile: $!"; -while() { $devSize++; } -close F; - -unless($best_weights){ $best_weights = $weights; } -unless($projected_score){ $projected_score = 0.0; } -$seen_weights{$weights} = 1; - -my $random_seed = int(time / 1000); -my $lastWeightsFile; -my $lastPScore = 0; -# main optimization loop -while (1){ - print STDERR "\n\nITERATION $iteration\n==========\n"; - - if ($iteration > $max_iterations){ - print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; - last; - } - # iteration-specific files - my $runFile="$dir/run.raw.$iteration"; - my $onebestFile="$dir/1best.$iteration"; - my $logdir="$dir/logs.$iteration"; - my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; - my $scorerLog="$logdir/scorer.log.$iteration"; - check_call("mkdir -p $logdir"); - - - #decode - print STDERR "RUNNING DECODER AT "; - print STDERR unchecked_output("date"); - my $im1 = $iteration - 1; - my $weightsFile="$dir/weights.$im1"; - my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; - my $pcmd; - if ($use_make) { - $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; - } else { - $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; - } - my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; - print STDERR "COMMAND:\n$cmd\n"; - check_bash_call($cmd); - my $num_hgs; - my $num_topbest; - my $retries = 0; - while($retries < 5) { - $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); - $num_topbest = check_output("wc -l < $runFile"); - print STDERR "NUMBER OF HGs: $num_hgs\n"; - print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; - if($devSize == $num_hgs && $devSize == $num_topbest) { - last; - } else { - print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; - sleep(3); - } - $retries++; - } - die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); - my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); - chomp $dec_score; - print STDERR "DECODER SCORE: $dec_score\n"; - - # save space - check_call("gzip -f $runFile"); - check_call("gzip -f $decoderLog"); - - # run optimizer - print STDERR "RUNNING OPTIMIZER AT "; - print STDERR unchecked_output("date"); - my $mergeLog="$logdir/prune-merge.log.$iteration"; - - my $score = 0; - my $icc = 0; - my $inweights="$dir/weights.$im1"; - for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { - print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; - print STDERR unchecked_output("date"); - $icc++; - $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; - print STDERR "COMMAND:\n$cmd\n"; - check_call($cmd); - check_call("mkdir -p $dir/splag.$im1"); - $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; - print STDERR "COMMAND:\n$cmd\n"; - check_call($cmd); - opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; - my @shards = grep { /^mapinput\./ } readdir(DIR); - closedir DIR; - die "No shards!" unless scalar @shards > 0; - my $joblist = ""; - my $nmappers = 0; - my @mapoutputs = (); - @cleanupcmds = (); - my %o2i = (); - my $first_shard = 1; - my $mkfile; # only used with makefiles - my $mkfilename; - if ($use_make) { - $mkfilename = "$dir/splag.$im1/domap.mk"; - open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; - print $mkfile "all: $dir/splag.$im1/map.done\n\n"; - } - my @mkouts = (); # only used with makefiles - for my $shard (@shards) { - my $mapoutput = $shard; - my $client_name = $shard; - $client_name =~ s/mapinput.//; - $client_name = "vest.$client_name"; - $mapoutput =~ s/mapinput/mapoutput/; - push @mapoutputs, "$dir/splag.$im1/$mapoutput"; - $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; - if ($use_make) { - my $script_file = "$dir/scripts/map.$shard"; - open F, ">$script_file" or die "Can't write $script_file: $!"; - print F "#!/bin/bash\n"; - print F "$script\n"; - close F; - my $output = "$dir/splag.$im1/$mapoutput"; - push @mkouts, $output; - chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; - if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } - print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; - } else { - my $script_file = "$dir/scripts/map.$shard"; - open F, ">$script_file" or die "Can't write $script_file: $!"; - print F "$script\n"; - close F; - if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } - - $nmappers++; - my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; - my $jobid = check_output("$qcmd"); - chomp $jobid; - $jobid =~ s/^(\d+)(.*?)$/\1/g; - $jobid =~ s/^Your job (\d+) .*$/\1/; - push(@cleanupcmds, "qdel $jobid 2> /dev/null"); - print STDERR " $jobid"; - if ($joblist == "") { $joblist = $jobid; } - else {$joblist = $joblist . "\|" . $jobid; } - } - } - if ($use_make) { - print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; - close $mkfile; - my $mcmd = "make -j $jobs -f $mkfilename"; - print STDERR "\nExecuting: $mcmd\n"; - check_call($mcmd); - } else { - print STDERR "\nLaunched $nmappers mappers.\n"; - sleep 8; - print STDERR "Waiting for mappers to complete...\n"; - while ($nmappers > 0) { - sleep 5; - my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); - $nmappers = scalar @livejobs; - } - print STDERR "All mappers complete.\n"; - } - my $tol = 0; - my $til = 0; - for my $mo (@mapoutputs) { - my $olines = get_lines($mo); - my $ilines = get_lines($o2i{$mo}); - $tol += $olines; - $til += $ilines; - die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; - } - print STDERR "Results for $tol/$til lines\n"; - print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; - print STDERR unchecked_output("date"); - $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; - print STDERR "COMMAND:\n$cmd\n"; - check_bash_call($cmd); - $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; - # sort returns failure even when it doesn't fail for some reason - my $best=unchecked_output("$cmd"); chomp $best; - print STDERR "$best\n"; - my ($oa, $x, $xscore) = split /\|/, $best; - $score = $xscore; - print STDERR "PROJECTED SCORE: $score\n"; - if (abs($x) < $epsilon) { - print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; - last; - } - my $psd = $score - $last_score; - $last_score = $score; - if (abs($psd) < $epsilon) { - print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; - last; - } - my ($origin, $axis) = split /\s+/, $oa; - - my %ori = convert($origin); - my %axi = convert($axis); - - my $finalFile="$dir/weights.$im1-$opt_iter"; - open W, ">$finalFile" or die "Can't write: $finalFile: $!"; - my $norm = 0; - for my $k (sort keys %ori) { - my $dd = $ori{$k} + $axi{$k} * $x; - $norm += $dd * $dd; - } - $norm = sqrt($norm); - $norm = 1; - for my $k (sort keys %ori) { - my $v = ($ori{$k} + $axi{$k} * $x) / $norm; - print W "$k $v\n"; - } - check_call("rm $dir/splag.$im1/*"); - $inweights = $finalFile; - } - $lastWeightsFile = "$dir/weights.$iteration"; - check_call("cp $inweights $lastWeightsFile"); - if ($icc < 2) { - print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; - last; - } - $lastPScore = $score; - $iteration++; - print STDERR "\n==========\n"; -} - -print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; - -print STDOUT "$lastWeightsFile\n"; - -exit 0; - -sub normalize_weights { - my ($rfn, $rpts, $feat) = @_; - my @feat_names = @$rfn; - my @pts = @$rpts; - my $z = 1.0; - for (my $i=0; $i < scalar @feat_names; $i++) { - if ($feat_names[$i] eq $feat) { - $z = $pts[$i]; - last; - } - } - for (my $i=0; $i < scalar @feat_names; $i++) { - $pts[$i] /= $z; - } - print STDERR " NORM WEIGHTS: @pts\n"; - return @pts; -} - -sub get_lines { - my $fn = shift @_; - open FL, "<$fn" or die "Couldn't read $fn: $!"; - my $lc = 0; - while() { $lc++; } - return $lc; -} - -sub get_comma_sep_refs { - my ($r,$p) = @_; - my $o = check_output("echo $p"); - chomp $o; - my @files = split /\s+/, $o; - return "-$r " . join(" -$r ", @files); -} - -sub read_weights_file { - my ($file) = @_; - open F, "<$file" or die "Couldn't read $file: $!"; - my @r = (); - my $pm = -1; - while() { - next if /^#/; - next if /^\s*$/; - chomp; - if (/^(.+)\s+(.+)$/) { - my $m = $1; - my $w = $2; - die "Weights out of order: $m <= $pm" unless $m > $pm; - push @r, $w; - } else { - warn "Unexpected feature name in weight file: $_"; - } - } - close F; - return join ' ', @r; -} - -# subs -sub write_config { - my $fh = shift; - my $cleanup = "yes"; - if ($disable_clean) {$cleanup = "no";} - - print $fh "\n"; - print $fh "DECODER: $decoder\n"; - print $fh "INI FILE: $iniFile\n"; - print $fh "WORKING DIR: $dir\n"; - print $fh "SOURCE (DEV): $srcFile\n"; - print $fh "REFS (DEV): $refFiles\n"; - print $fh "EVAL METRIC: $metric\n"; - print $fh "START ITERATION: $iteration\n"; - print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "PARALLEL JOBS: $jobs\n"; - print $fh "HEAD NODE: $host\n"; - print $fh "PMEM (DECODING): $pmem\n"; - print $fh "CLEANUP: $cleanup\n"; - print $fh "INITIAL WEIGHTS: $initialWeights\n"; -} - -sub update_weights_file { - my ($neww, $rfn, $rpts) = @_; - my @feats = @$rfn; - my @pts = @$rpts; - my $num_feats = scalar @feats; - my $num_pts = scalar @pts; - die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; - open G, ">$neww" or die; - for (my $i = 0; $i < $num_feats; $i++) { - my $f = $feats[$i]; - my $lambda = $pts[$i]; - print G "$f $lambda\n"; - } - close G; -} - -sub enseg { - my $src = shift; - my $newsrc = shift; - open(SRC, $src); - open(NEWSRC, ">$newsrc"); - my $i=0; - while (my $line=){ - chomp $line; - if ($line =~ /^\s* tags, you must include a zero-based id attribute"; - } - } else { - print NEWSRC "$line\n"; - } - $i++; - } - close SRC; - close NEWSRC; -} - -sub print_help { - - my $executable = check_output("basename $0"); chomp $executable; - print << "Help"; - -Usage: $executable [options] - - $executable [options] - Runs a complete MERT optimization using the decoder configuration - in . Required options are --weights, --source-file, and - --ref-files. - -Options: - - --help - Print this message and exit. - - --max-iterations - Maximum number of iterations to run. If not specified, defaults - to 10. - - --pass-suffix - If the decoder is doing multi-pass decoding, the pass suffix "2", - "3", etc., is used to control what iteration of weights is set. - - --ref-files - Dev set ref files. This option takes only a single string argument. - To use multiple files (including file globbing), this argument should - be quoted. - - --metric - Metric to optimize. - Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - - --normalize - After each iteration, rescale all feature weights such that feature- - name has a weight of 1.0. - - --rand-directions - MERT will attempt to optimize along all of the principle directions, - set this parameter to explore other directions. Defaults to 5. - - --source-file - Dev set source file. - - --weights - A file specifying initial feature weights. The format is - FeatureName_1 value1 - FeatureName_2 value2 - **All and only the weights listed in will be optimized!** - - --workdir - Directory for intermediate and output files. If not specified, the - name is derived from the ini filename. Assuming that the ini - filename begins with the decoder name and ends with ini, the default - name of the working directory is inferred from the middle part of - the filename. E.g. an ini file named decoder.foo.ini would have - a default working directory name foo. - -Job control options: - - --jobs - Number of decoder processes to run in parallel. [default=$default_jobs] - - --qsub - Use qsub to run jobs in parallel (qsub must be configured in - environment/LocalEnvironment.pm) - - --pmem - Amount of physical memory requested for parallel decoding jobs - (used with qsub requests only) - -Help -} - -sub convert { - my ($str) = @_; - my @ps = split /;/, $str; - my %dict = (); - for my $p (@ps) { - my ($k, $v) = split /=/, $p; - $dict{$k} = $v; - } - return %dict; -} - - - -sub cmdline { - return join ' ',($0,@ORIG_ARGV); -} - -#buggy: last arg gets quoted sometimes? -my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; -my $shell_escape_in_quote=qr{[\\"\$`!]}; - -sub escape_shell { - my ($arg)=@_; - return undef unless defined $arg; - if ($arg =~ /$is_shell_special/) { - $arg =~ s/($shell_escape_in_quote)/\\$1/g; - return "\"$arg\""; - } - return $arg; -} - -sub escaped_shell_args { - return map {local $_=$_;chomp;escape_shell($_)} @_; -} - -sub escaped_shell_args_str { - return join ' ',&escaped_shell_args(@_); -} - -sub escaped_cmdline { - return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); -} diff --git a/vest/error_surface.cc b/vest/error_surface.cc deleted file mode 100644 index 515b67f8..00000000 --- a/vest/error_surface.cc +++ /dev/null @@ -1,42 +0,0 @@ -#include "error_surface.h" - -#include -#include - -using namespace std; - -ErrorSurface::~ErrorSurface() {} - -void ErrorSurface::Serialize(std::string* out) const { - const int segments = this->size(); - ostringstream os(ios::binary); - os.write((const char*)&segments,sizeof(segments)); - for (int i = 0; i < segments; ++i) { - const ErrorSegment& cur = (*this)[i]; - string senc; - cur.delta.Encode(&senc); - assert(senc.size() < 1024); - unsigned char len = senc.size(); - os.write((const char*)&cur.x, sizeof(cur.x)); - os.write((const char*)&len, sizeof(len)); - os.write((const char*)&senc[0], len); - } - *out = os.str(); -} - -void ErrorSurface::Deserialize(const std::string& in) { - istringstream is(in, ios::binary); - int segments; - is.read((char*)&segments, sizeof(segments)); - this->resize(segments); - for (int i = 0; i < segments; ++i) { - ErrorSegment& cur = (*this)[i]; - unsigned char len; - is.read((char*)&cur.x, sizeof(cur.x)); - is.read((char*)&len, sizeof(len)); - string senc(len, '\0'); assert(senc.size() == len); - is.read((char*)&senc[0], len); - cur.delta = SufficientStats(senc); - } -} - diff --git a/vest/error_surface.h b/vest/error_surface.h deleted file mode 100644 index bb65847b..00000000 --- a/vest/error_surface.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _ERROR_SURFACE_H_ -#define _ERROR_SURFACE_H_ - -#include -#include - -#include "ns.h" - -class Score; - -struct ErrorSegment { - double x; - SufficientStats delta; - ErrorSegment() : x(0), delta() {} -}; - -class ErrorSurface : public std::vector { - public: - ~ErrorSurface(); - void Serialize(std::string* out) const; - void Deserialize(const std::string& in); -}; - -#endif diff --git a/vest/libcall.pl b/vest/libcall.pl deleted file mode 100644 index c7d0f128..00000000 --- a/vest/libcall.pl +++ /dev/null @@ -1,71 +0,0 @@ -use IPC::Open3; -use Symbol qw(gensym); - -$DUMMY_STDERR = gensym(); -$DUMMY_STDIN = gensym(); - -# Run the command and ignore failures -sub unchecked_call { - system("@_") -} - -# Run the command and return its output, if any ignoring failures -sub unchecked_output { - return `@_` -} - -# WARNING: Do not use this for commands that will return large amounts -# of stdout or stderr -- they might block indefinitely -sub check_output { - print STDERR "Executing and gathering output: @_\n"; - - my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); - my $proc_output = ""; - while( ) { - $proc_output .= $_; - } - waitpid($pid, 0); - # TODO: Grab signal that the process died from - my $child_exit_status = $? >> 8; - if($child_exit_status == 0) { - return $proc_output; - } else { - print STDERR "ERROR: Execution of @_ failed.\n"; - exit(1); - } -} - -# Based on Moses' safesystem sub -sub check_call { - print STDERR "Executing: @_\n"; - system(@_); - my $exitcode = $? >> 8; - if($exitcode == 0) { - return 0; - } elsif ($? == -1) { - print STDERR "ERROR: Failed to execute: @_\n $!\n"; - exit(1); - - } elsif ($? & 127) { - printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit(1); - - } else { - print STDERR "Failed with exit code: $exitcode\n" if $exitcode; - exit($exitcode); - } -} - -sub check_bash_call { - my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); - check_call(@args); -} - -sub check_bash_output { - my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); - return check_output(@args); -} - -# perl module weirdness... -return 1; diff --git a/vest/line_mediator.pl b/vest/line_mediator.pl deleted file mode 100755 index bc2bb24c..00000000 --- a/vest/line_mediator.pl +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/perl -w -#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication - -# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) - -#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. - -use strict; -use IPC::Open2; -use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); - -my $quiet=!$ENV{DEBUG}; -$quiet=1 if $ENV{QUIET}; -sub info { - local $,=' '; - print STDERR @_ unless $quiet; -} - -my $mode='CROSS'; -my $ser='DIRECT'; -$mode='PIPE' if $ENV{PIPE}; -$mode='SNAKE' if $ENV{SNAKE}; -$mode='CROSS' if $ENV{CROSS}; -$ser='SERIAL' if $ENV{SERIAL}; -$ser='DIRECT' if $ENV{DIRECT}; -$ser='SERIAL' if $mode eq 'SNAKE'; -info("mode: $mode\n"); -info("connection: $ser\n"); - - -my @c1; -if (scalar @ARGV) { - do { - push @c1,shift - } while scalar @ARGV && $c1[$#c1] ne '--'; -} -pop @c1; -my @c2=@ARGV; -@ARGV=(); -(scalar @c1 && scalar @c2) || die qq{ -usage: $0 cmd1 args -- cmd2 args -all options are environment variables. -DEBUG=1 env var enables debugging output. -CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). -SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. -if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. -if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. -DIRECT=1 (default) will override SERIAL=1. -CROSS=1 (default) will override SNAKE or PIPE. -}; - -info("1 cmd:",@c1,"\n"); -info("2 cmd:",@c2,"\n"); - -sub lineto { - select $_[0]; - $|=1; - shift; - print @_; -} - -if ($ser eq 'SERIAL') { - my ($R1,$W1,$R2,$W2); - my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. - my $c2p=open2($R2,$W2,@c2); - if ($mode eq 'CROSS') { - while(<$R1>) { - info("1:",$_); - lineto($W2,$_); - last unless defined ($_=<$R2>); - info("1|2:",$_); - lineto($W1,$_); - } - } else { - my $snake=$mode eq 'SNAKE'; - while() { - info("IN:",$_); - lineto($W1,$_); - last unless defined ($_=<$R1>); - info("IN|1:",$_); - lineto($W2,$_); - last unless defined ($_=<$R2>); - info("IN|1|2:",$_); - if ($snake) { - lineto($W1,$_); - last unless defined ($_=<$R1>); - info("IN|1|2|1:",$_); - } - lineto(*STDOUT,$_); - } - } -} else { - info("DIRECT mode\n"); - my @rw1=POSIX::pipe(); - my @rw2=POSIX::pipe(); - my $pid=undef; - $SIG{CHLD} = sub { wait }; - while (not defined ($pid=fork())) { - sleep 1; - } - my $pipe = $mode eq 'PIPE'; - unless ($pipe) { - POSIX::close(STDOUT_FILENO); - POSIX::close(STDIN_FILENO); - } - if ($pid) { - POSIX::dup2($rw1[1],STDOUT_FILENO); - POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; - exec @c1; - } else { - POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; - POSIX::dup2($rw1[0],STDIN_FILENO); - exec @c2; - } - while (wait()!=-1) {} -} diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc deleted file mode 100644 index 49443fbe..00000000 --- a/vest/line_optimizer.cc +++ /dev/null @@ -1,111 +0,0 @@ -#include "line_optimizer.h" - -#include -#include - -#include "sparse_vector.h" -#include "ns.h" - -using namespace std; - -typedef ErrorSurface::const_iterator ErrorIter; - -// sort by increasing x-ints -struct IntervalComp { - bool operator() (const ErrorIter& a, const ErrorIter& b) const { - return a->x < b->x; - } -}; - -double LineOptimizer::LineOptimize( - const EvaluationMetric* metric, - const vector& surfaces, - const LineOptimizer::ScoreType type, - float* best_score, - const double epsilon) { - // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl; - vector all_ints; - for (vector::const_iterator i = surfaces.begin(); - i != surfaces.end(); ++i) { - const ErrorSurface& surface = *i; - for (ErrorIter j = surface.begin(); j != surface.end(); ++j) - all_ints.push_back(j); - } - sort(all_ints.begin(), all_ints.end(), IntervalComp()); - double last_boundary = all_ints.front()->x; - SufficientStats acc; - float& cur_best_score = *best_score; - cur_best_score = (type == MAXIMIZE_SCORE ? - -numeric_limits::max() : numeric_limits::max()); - bool left_edge = true; - double pos = numeric_limits::quiet_NaN(); - for (vector::iterator i = all_ints.begin(); - i != all_ints.end(); ++i) { - const ErrorSegment& seg = **i; - if (seg.x - last_boundary > epsilon) { - float sco = metric->ComputeScore(acc); - if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || - (type == MINIMIZE_SCORE && sco < cur_best_score) ) { - cur_best_score = sco; - if (left_edge) { - pos = seg.x - 0.1; - left_edge = false; - } else { - pos = last_boundary + (seg.x - last_boundary) / 2; - } - //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; - } - // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; - // cerr << "---- s=" << sco << "\n"; - last_boundary = seg.x; - } - // cerr << "x-boundary=" << seg.x << "\n"; - //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; - //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; - acc += seg.delta; - } - float sco = metric->ComputeScore(acc); - if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || - (type == MINIMIZE_SCORE && sco < cur_best_score) ) { - cur_best_score = sco; - if (left_edge) { - pos = 0; - } else { - pos = last_boundary + 1000.0; - } - } - return pos; -} - -void LineOptimizer::RandomUnitVector(const vector& features_to_optimize, - SparseVector* axis, - RandomNumberGenerator* rng) { - axis->clear(); - for (int i = 0; i < features_to_optimize.size(); ++i) - axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); - (*axis) /= axis->l2norm(); -} - -void LineOptimizer::CreateOptimizationDirections( - const vector& features_to_optimize, - int additional_random_directions, - RandomNumberGenerator* rng, - vector >* dirs - , bool include_orthogonal - ) { - dirs->clear(); - typedef SparseVector Dir; - vector &out=*dirs; - int i=0; - if (include_orthogonal) - for (;i - -#include "sparse_vector.h" -#include "error_surface.h" -#include "sampler.h" - -class EvaluationMetric; -class Weights; - -struct LineOptimizer { - - // use MINIMIZE_SCORE for things like TER, WER - // MAXIMIZE_SCORE for things like BLEU - enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; - - // merge all the error surfaces together into a global - // error surface and find (the middle of) the best segment - static double LineOptimize( - const EvaluationMetric* metric, - const std::vector& envs, - const LineOptimizer::ScoreType type, - float* best_score, - const double epsilon = 1.0/65536.0); - - // return a random vector of length 1 where all dimensions - // not listed in dimensions will be 0. - static void RandomUnitVector(const std::vector& dimensions, - SparseVector* axis, - RandomNumberGenerator* rng); - - // generate a list of directions to optimize; the list will - // contain the orthogonal vectors corresponding to the dimensions in - // primary and then additional_random_directions directions in those - // dimensions as well. All vectors will be length 1. - static void CreateOptimizationDirections( - const std::vector& primary, - int additional_random_directions, - RandomNumberGenerator* rng, - std::vector >* dirs - , bool include_primary=true - ); - -}; - -#endif diff --git a/vest/lo_test.cc b/vest/lo_test.cc deleted file mode 100644 index a67f65e1..00000000 --- a/vest/lo_test.cc +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "ns.h" -#include "ns_docscorer.h" -#include "ces.h" -#include "fdict.h" -#include "hg.h" -#include "kbest.h" -#include "hg_io.h" -#include "filelib.h" -#include "inside_outside.h" -#include "viterbi.h" -#include "viterbi_envelope.h" -#include "line_optimizer.h" - -using namespace std; -using boost::shared_ptr; - -class OptTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -const char* ref11 = "australia reopens embassy in manila"; -const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; -const char* ref21 = "australia reopened manila embassy"; -const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; -const char* ref31 = "australia to reopen embassy in manila"; -const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; -const char* ref41 = "australia to re - open its embassy to manila"; -const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; - -TEST_F(OptTest, TestCheckNaN) { - double x = 0; - double y = 0; - double z = x / y; - EXPECT_EQ(true, isnan(z)); -} - -TEST_F(OptTest,TestViterbiEnvelope) { - shared_ptr a1(new Segment(-1, 0)); - shared_ptr b1(new Segment(1, 0)); - shared_ptr a2(new Segment(-1, 1)); - shared_ptr b2(new Segment(1, -1)); - vector > sa; sa.push_back(a1); sa.push_back(b1); - vector > sb; sb.push_back(a2); sb.push_back(b2); - ViterbiEnvelope a(sa); - cerr << a << endl; - ViterbiEnvelope b(sb); - ViterbiEnvelope c = a; - c *= b; - cerr << a << " (*) " << b << " = " << c << endl; - EXPECT_EQ(3, c.size()); -} - -TEST_F(OptTest,TestViterbiEnvelopeInside) { - const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z [1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; - Hypergraph hg; - istringstream instr(json); - HypergraphIO::ReadFromJSON(&instr, &hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - wts.set_value(FD::Convert("f2"), 1.0); - hg.Reweight(wts); - vector, prob_t> > list; - std::vector > features; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; - } - SparseVector dir; dir.set_value(FD::Convert("f1"), 1.0); - ViterbiEnvelopeWeightFunction wf(wts, dir); - ViterbiEnvelope env = Inside(hg, NULL, wf); - cerr << env << endl; - const vector >& segs = env.GetSortedSegs(); - dir *= segs[1]->x; - wts += dir; - hg.Reweight(wts); - KBest::KBestDerivations, ESentenceTraversal> kbest2(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest2.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; - } - for (int i = 0; i < segs.size(); ++i) { - cerr << "seg=" << i << endl; - vector trans; - segs[i]->ConstructTranslation(&trans); - cerr << TD::GetString(trans) << endl; - } -} - -TEST_F(OptTest, TestS1) { - int fPhraseModel_0 = FD::Convert("PhraseModel_0"); - int fPhraseModel_1 = FD::Convert("PhraseModel_1"); - int fPhraseModel_2 = FD::Convert("PhraseModel_2"); - int fLanguageModel = FD::Convert("LanguageModel"); - int fWordPenalty = FD::Convert("WordPenalty"); - int fPassThrough = FD::Convert("PassThrough"); - SparseVector wts; - wts.set_value(fWordPenalty, 4.25); - wts.set_value(fLanguageModel, -1.1165); - wts.set_value(fPhraseModel_0, -0.96); - wts.set_value(fPhraseModel_1, -0.65); - wts.set_value(fPhraseModel_2, -0.77); - wts.set_value(fPassThrough, -10.0); - - vector to_optimize; - to_optimize.push_back(fWordPenalty); - to_optimize.push_back(fLanguageModel); - to_optimize.push_back(fPhraseModel_0); - to_optimize.push_back(fPhraseModel_1); - to_optimize.push_back(fPhraseModel_2); - - Hypergraph hg; - ReadFile rf("./test_data/0.json.gz"); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - hg.Reweight(wts); - - Hypergraph hg2; - ReadFile rf2("./test_data/1.json.gz"); - HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); - hg2.Reweight(wts); - - vector > refs1(4); - TD::ConvertSentence(ref11, &refs1[0]); - TD::ConvertSentence(ref21, &refs1[1]); - TD::ConvertSentence(ref31, &refs1[2]); - TD::ConvertSentence(ref41, &refs1[3]); - vector > refs2(4); - TD::ConvertSentence(ref12, &refs2[0]); - TD::ConvertSentence(ref22, &refs2[1]); - TD::ConvertSentence(ref32, &refs2[2]); - TD::ConvertSentence(ref42, &refs2[3]); - vector envs(2); - - RandomNumberGenerator rng; - - vector > axes; // directions to search - LineOptimizer::CreateOptimizationDirections( - to_optimize, - 10, - &rng, - &axes); - assert(axes.size() == 10 + to_optimize.size()); - for (int i = 0; i < axes.size(); ++i) - cerr << axes[i] << endl; - const SparseVector& axis = axes[0]; - - cerr << "Computing Viterbi envelope using inside algorithm...\n"; - cerr << "axis: " << axis << endl; - clock_t t_start=clock(); - ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction - envs[0] = Inside(hg, NULL, wf); - envs[1] = Inside(hg2, NULL, wf); - - vector es(2); - EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); - boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); - boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); - ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); - ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); - cerr << envs[0].size() << " " << envs[1].size() << endl; - cerr << es[0].size() << " " << es[1].size() << endl; - envs.clear(); - clock_t t_env=clock(); - float score; - double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); - clock_t t_opt=clock(); - cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; - EXPECT_FLOAT_EQ(0.48719698, score); - SparseVector res = axis; - res *= m; - res += wts; - cerr << "res: " << res << endl; - cerr << "ENVELOPE PROCESSING=" << (static_cast(t_env - t_start) / 1000.0) << endl; - cerr << " LINE OPTIMIZATION=" << (static_cast(t_opt - t_env) / 1000.0) << endl; - hg.Reweight(res); - hg2.Reweight(res); - vector t1,t2; - ViterbiESentence(hg, &t1); - ViterbiESentence(hg2, &t2); - cerr << TD::GetString(t1) << endl; - cerr << TD::GetString(t2) << endl; -} - -TEST_F(OptTest,TestZeroOrigin) { - const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| [1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; - Hypergraph hg; - istringstream instr(json); - HypergraphIO::ReadFromJSON(&instr, &hg); - SparseVector wts; - wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); - hg.Reweight(wts); - - vector, prob_t> > list; - std::vector > features; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; - } - - SparseVector axis; axis.set_value(FD::Convert("Glue"),1.0); - ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction - vector envs(1); - envs[0] = Inside(hg, NULL, wf); - - vector > mr(4); - TD::ConvertSentence("untitled", &mr[0]); - TD::ConvertSentence("with no title", &mr[1]); - TD::ConvertSentence("without a title", &mr[2]); - TD::ConvertSentence("without title", &mr[3]); - EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); - boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); - vector es(1); - ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc deleted file mode 100644 index 59d4f24f..00000000 --- a/vest/mr_vest_generate_mapper_input.cc +++ /dev/null @@ -1,78 +0,0 @@ -#include -#include - -#include -#include - -#include "filelib.h" -#include "weights.h" -#include "line_optimizer.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value(),"[REQD] Path to forest repository") - ("weights,w",po::value(),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -d N\n"; - flag = true; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w \n"; - flag = true; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r \n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - RandomNumberGenerator rng; - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - vector features; - SparseVector origin; - vector w; - Weights::InitFromFile(conf["weights"].as(), &w, &features); - Weights::InitSparseVector(w, &origin); - const string forest_repository = conf["forest_repository"].as(); - assert(DirectoryExists(forest_repository)); - if (conf.count("optimize_feature") > 0) - features=conf["optimize_feature"].as >(); - vector > directions; - vector fids(features.size()); - for (int i = 0; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - LineOptimizer::CreateOptimizationDirections( - fids, - conf["random_directions"].as(), - &rng, - &directions); - unsigned dev_set_size = conf["dev_set_size"].as(); - for (unsigned i = 0; i < dev_set_size; ++i) { - for (unsigned j = 0; j < directions.size(); ++j) { - cout << forest_repository << '/' << i << ".json.gz " << i << ' '; - print(cout, origin, "=", ";"); - cout << ' '; - print(cout, directions[j], "=", ";"); - cout << endl; - } - } - return 0; -} diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc deleted file mode 100644 index 7d9625bc..00000000 --- a/vest/mr_vest_map.cc +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include "ns.h" -#include "ns_docscorer.h" -#include "ces.h" -#include "filelib.h" -#include "stringlib.h" -#include "sparse_vector.h" -#include "viterbi_envelope.h" -#include "inside_outside.h" -#include "error_surface.h" -#include "b64tools.h" -#include "hg_io.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") - ("source,s",po::value(), "Source file (ignored, except for AER)") - ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") - ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (!conf->count("reference")) { - cerr << "Please specify one or more references using -r \n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -bool ReadSparseVectorString(const string& s, SparseVector* v) { -#if 0 - // this should work, but untested. - std::istringstream i(s); - i>>*v; -#else - vector fields; - Tokenize(s, ';', &fields); - if (fields.empty()) return false; - for (int i = 0; i < fields.size(); ++i) { - vector pair(2); - Tokenize(fields[i], '=', &pair); - if (pair.size() != 2) { - cerr << "Error parsing vector string: " << fields[i] << endl; - return false; - } - v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); - } - return true; -#endif -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string evaluation_metric = conf["evaluation_metric"].as(); - EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); - DocumentScorer ds(metric, conf["reference"].as >()); - cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; - Hypergraph hg; - string last_file; - ReadFile in_read(conf["input"].as()); - istream &in=*in_read.stream(); - while(in) { - string line; - getline(in, line); - if (line.empty()) continue; - istringstream is(line); - int sent_id; - string file, s_origin, s_direction; - // path-to-file (JSON) sent_ed starting-point search-direction - is >> file >> sent_id >> s_origin >> s_direction; - SparseVector origin; - ReadSparseVectorString(s_origin, &origin); - SparseVector direction; - ReadSparseVectorString(s_direction, &direction); - // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; - if (last_file != file) { - last_file = file; - ReadFile rf(file); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - } - ViterbiEnvelopeWeightFunction wf(origin, direction); - ViterbiEnvelope ve = Inside(hg, NULL, wf); - ErrorSurface es; - - ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg); - //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; - // cerr << "Error surface has " << es.size() << " segments\n"; - string val; - es.Serialize(&val); - cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; - B64::b64encode(val.c_str(), val.size(), &cout); - cout << endl << flush; - } - return 0; -} diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc deleted file mode 100644 index dda61f88..00000000 --- a/vest/mr_vest_reduce.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include "sparse_vector.h" -#include "error_surface.h" -#include "line_optimizer.h" -#include "b64tools.h" -#include "stringlib.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = conf->count("evaluation_metric") == 0; - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string evaluation_metric = conf["evaluation_metric"].as(); - LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (UppercaseString(evaluation_metric) == "TER") - opt_type = LineOptimizer::MINIMIZE_SCORE; - EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); - - vector esv; - string last_key, line, key, val; - while(getline(cin, line)) { - size_t ks = line.find("\t"); - assert(string::npos != ks); - assert(ks > 2); - key = line.substr(2, ks - 2); - val = line.substr(ks + 1); - if (key != last_key) { - if (!last_key.empty()) { - float score; - double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); - cout << last_key << "|" << x << "|" << score << endl; - } - last_key.swap(key); - esv.clear(); - } - if (val.size() % 4 != 0) { - cerr << "B64 encoding error 1! Skipping.\n"; - continue; - } - string encoded(val.size() / 4 * 3, '\0'); - if (!B64::b64decode(reinterpret_cast(&val[0]), val.size(), &encoded[0], encoded.size())) { - cerr << "B64 encoding error 2! Skipping.\n"; - continue; - } - esv.push_back(ErrorSurface()); - esv.back().Deserialize(encoded); - } - if (!esv.empty()) { - float score; - double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); - cout << last_key << "|" << x << "|" << score << endl; - } - return 0; -} diff --git a/vest/parallelize.pl b/vest/parallelize.pl deleted file mode 100755 index 7d0365cc..00000000 --- a/vest/parallelize.pl +++ /dev/null @@ -1,423 +0,0 @@ -#!/usr/bin/env perl - -# Author: Adam Lopez -# -# This script takes a command that processes input -# from stdin one-line-at-time, and parallelizes it -# on the cluster using David Chiang's sentserver/ -# sentclient architecture. -# -# Prerequisites: the command *must* read each line -# without waiting for subsequent lines of input -# (for instance, a command which must read all lines -# of input before processing will not work) and -# return it to the output *without* buffering -# multiple lines. - -#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder - -#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; - -use Cwd qw/ abs_path cwd getcwd /; -use File::Temp qw/ tempfile /; -use Getopt::Long; -use IPC::Open2; -use strict; -use POSIX ":sys_wait_h"; - -use File::Basename; -my $myDir = dirname(__FILE__); -print STDERR __FILE__." -> $myDir\n"; -push(@INC, $myDir); -require "libcall.pl"; - -my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines -my $recycle_clients; # spawn new clients when previous ones terminate -my $stay_alive; # dont let server die when having zero clients -my $joblist = ""; -my $errordir=""; -my $multiline; -my @files_to_stage; -my $numnodes = 8; -my $user = $ENV{"USER"}; -my $pmem = "9g"; -my $basep=50300; -my $randp=300; -my $tryp=50; -my $no_which; -my $no_cd; - -my $DEBUG=$ENV{DEBUG}; -print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; -my $verbose = 1; -sub verbose { - if ($verbose) { - print STDERR @_,"\n"; - } -} -sub debug { - if ($DEBUG) { - my ($package, $filename, $line) = caller; - print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; - } -} -my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; -my $shell_escape_in_quote=qr.[\\"\$`!].; -sub escape_shell { - my ($arg)=@_; - return undef unless defined $arg; - return '""' unless $arg; - if ($arg =~ /$is_shell_special/) { - $arg =~ s/($shell_escape_in_quote)/\\$1/g; - return "\"$arg\""; - } - return $arg; -} -sub preview_files { - my ($l,$skipempty,$footer,$n)=@_; - $n=$tailn unless defined $n; - my @f=grep { ! ($skipempty && -z $_) } @$l; - my $fn=join(' ',map {escape_shell($_)} @f); - my $cmd="tail -n $n $fn"; - unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); -} -sub prefix_dirname($) { - #like `dirname but if ends in / then return the whole thing - local ($_)=@_; - if (/\/$/) { - $_; - } else { - s#/[^/]$##; - $_ ? $_ : ''; - } -} -sub ensure_final_slash($) { - local ($_)=@_; - m#/$# ? $_ : ($_."/"); -} -sub extend_path($$;$$) { - my ($base,$ext,$mkdir,$baseisdir)=@_; - if (-d $base) { - $base.="/"; - } else { - my $dir; - if ($baseisdir) { - $dir=$base; - $base.='/' unless $base =~ /\/$/; - } else { - $dir=prefix_dirname($base); - } - my @cmd=("/bin/mkdir","-p",$dir); - check_call(@cmd) if $mkdir; - } - return $base.$ext; -} - -my $abscwd=abs_path(&getcwd); -sub print_help; - -my $use_fork; -my @pids; - -# Process command-line options -unless (GetOptions( - "stay-alive" => \$stay_alive, - "recycle-clients" => \$recycle_clients, - "error-dir=s" => \$errordir, - "multi-line" => \$multiline, - "file=s" => \@files_to_stage, - "use-fork" => \$use_fork, - "verbose" => \$verbose, - "jobs=i" => \$numnodes, - "pmem=s" => \$pmem, - "baseport=i" => \$basep, -# "iport=i" => \$randp, #for short name -i - "no-which!" => \$no_which, - "no-cd!" => \$no_cd, - "tailn=s" => \$tailn, -) && scalar @ARGV){ - print_help(); - die "bad options."; -} - -my $cmd = ""; -my $prog=shift; -if ($no_which) { - $cmd=$prog; -} else { - $cmd=check_output("which $prog"); - chomp $cmd; - die "$prog not found - $cmd" unless $cmd; -} -#$cmd=abs_path($cmd); -for my $arg (@ARGV) { - $cmd .= " ".escape_shell($arg); -} -die "Please specify a command to parallelize\n" if $cmd eq ''; - -my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n"); - -my $executable = $cmd; -$executable =~ s/^\s*(\S+)($|\s.*)/$1/; -$executable=check_output("basename $executable"); -chomp $executable; - - -print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; - -# create -e dir and save .sh -use File::Temp qw/tempdir/; -unless ($errordir) { - $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); -} -if ($errordir) { - my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); - -d $errordir || die "should have created -e dir $errordir"; - open SF,">",$scriptfile || die; - print SF "$cdcmd$cmd\n"; - close SF; - chmod 0755,$scriptfile; - $errordir=abs_path($errordir); - &verbose("-e dir: $errordir"); -} - -# set cleanup handler -my @cleanup_cmds; -sub cleanup; -sub cleanup_and_die; -$SIG{INT} = "cleanup_and_die"; -$SIG{TERM} = "cleanup_and_die"; -$SIG{HUP} = "cleanup_and_die"; - -# other subs: -sub numof_live_jobs; -sub launch_job_on_node; - - -# vars -my $mydir = check_output("dirname $0"); chomp $mydir; -my $sentserver = "$mydir/sentserver"; -my $sentclient = "$mydir/sentclient"; -my $host = check_output("hostname"); -chomp $host; - - -# find open port -srand; -my $port = 50300+int(rand($randp)); -my $endp=$port+$tryp; -sub listening_port_lines { - my $quiet=$verbose?'':'2>/dev/null'; - return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); -} -my $netstat=&listening_port_lines; - -if ($verbose){ print STDERR "Testing port $port...";} - -while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ - if ($verbose){ print STDERR "port is busy\n";} - $port++; - if ($port > $endp){ - die "Unable to find open port\n"; - } - if ($verbose){ print STDERR "Testing port $port... "; } -} -if ($verbose){ - print STDERR "port $port is available\n"; -} - -my $key = int(rand()*1000000); - -my $multiflag = ""; -if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } -my $stay_alive_flag = ""; -if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } - -my $node_count = 0; -my $script = ""; -# fork == one thread runs the sentserver, while the -# other spawns the sentclient commands. -my $pid = fork; -if ($pid == 0) { # child - sleep 8; # give other thread time to start sentserver - $script = "$cdcmd$sentclient $host:$port:$key $cmd"; - - if ($verbose){ - print STDERR "Client script:\n====\n"; - print STDERR $script; - print STDERR "====\n"; - } - for (my $jobn=0; $jobn<$numnodes; $jobn++){ - launch_job(); - } - if ($recycle_clients) { - my $ret; - my $livejobs; - while (1) { - $ret = waitpid($pid, WNOHANG); - #print STDERR "waitpid $pid ret = $ret \n"; - last if ($ret != 0); - $livejobs = numof_live_jobs(); - if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j - print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; - launch_job(); - } else { - sleep 15; - } - } - } - print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n"; - for my $p (@pids) { - waitpid($p, 0); - } -} else { -# my $todo = "$sentserver -k $key $multiflag $port "; - my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; - if ($verbose){ print STDERR "Running: $todo\n"; } - check_call($todo); - print STDERR "Call to $sentserver returned.\n"; - cleanup(); - exit(0); -} - -sub numof_live_jobs { - if ($use_fork) { - die "not implemented"; - } else { - # We can probably continue decoding if the qstat error is only temporary - my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); - return ($#livejobs + 1); - } -} -my (@errors,@outs,@cmds); - -sub launch_job { - if ($use_fork) { return launch_job_fork(); } - my $errorfile = "/dev/null"; - my $outfile = "/dev/null"; - $node_count++; - my $clientname = $executable; - $clientname =~ s/^(.{4}).*$/$1/; - $clientname = "$clientname.$node_count"; - if ($errordir){ - $errorfile = "$errordir/$clientname.ER"; - $outfile = "$errordir/$clientname.OU"; - push @errors,$errorfile; - push @outs,$outfile; - } - my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; - push @cmds,$todo; - - print STDERR "Running: $todo\n"; - local(*QOUT, *QIN); - open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; - print QIN $script; - close QIN; - while (my $jobid=){ - chomp $jobid; - if ($verbose){ print STDERR "Launched client job: $jobid"; } - $jobid =~ s/^(\d+)(.*?)$/\1/g; - $jobid =~ s/^Your job (\d+) .*$/\1/; - print STDERR " short job id $jobid\n"; - if ($verbose){ - print STDERR "cd: $abscwd\n"; - print STDERR "cmd: $cmd\n"; - } - if ($joblist == "") { $joblist = $jobid; } - else {$joblist = $joblist . "\|" . $jobid; } - my $cleanfn="qdel $jobid 2> /dev/null"; - push(@cleanup_cmds, $cleanfn); - } - close QOUT; -} - -sub launch_job_fork { - my $errorfile = "/dev/null"; - my $outfile = "/dev/null"; - $node_count++; - my $clientname = $executable; - $clientname =~ s/^(.{4}).*$/$1/; - $clientname = "$clientname.$node_count"; - if ($errordir){ - $errorfile = "$errordir/$clientname.ER"; - $outfile = "$errordir/$clientname.OU"; - push @errors,$errorfile; - push @outs,$outfile; - } - my $pid = fork; - if ($pid == 0) { - my ($fh, $scr_name) = get_temp_script(); - print $fh $script; - close $fh; - my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; - print STDERR "EXEC: $todo\n"; - my $out = check_output("$todo"); - unlink $scr_name or warn "Failed to remove $scr_name"; - exit 0; - } else { - push @pids, $pid; - } -} - -sub get_temp_script { - my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh'); - return ($fh, $filename); -} - -sub cleanup_and_die { - cleanup(); - die "\n"; -} - -sub cleanup { - print STDERR "Cleaning up...\n"; - for $cmd (@cleanup_cmds){ - print STDERR " Cleanup command: $cmd\n"; - eval $cmd; - } - print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; - print STDERR "errors:\n",preview_files(\@errors,1),"\n"; - print STDERR "cmd:\n",$cmd,"\n"; - print STDERR " cat $errordir/*.ER\nfor logs.\n"; - print STDERR "Cleanup finished.\n"; -} - -sub print_help -{ - my $name = check_output("basename $0"); chomp $name; - print << "Help"; - -usage: $name [options] - - Automatic black-box parallelization of commands. - -options: - - --use-fork - Instead of using qsub, use fork. - - -e, --error-dir - Retain output files from jobs in , rather - than silently deleting them. - - -m, --multi-line - Expect that command may produce multiple output - lines for a single input line. $name makes a - reasonable attempt to obtain all output before - processing additional inputs. However, use of this - option is inherently unsafe. - - -v, --verbose - Print diagnostic informatoin on stderr. - - -j, --jobs - Number of jobs to use. - - -p, --pmem - pmem setting for each job. - -Help -} diff --git a/vest/sentclient.c b/vest/sentclient.c deleted file mode 100644 index 91d994ab..00000000 --- a/vest/sentclient.c +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sentserver.h" - -int main (int argc, char *argv[]) { - int sock, port; - char *s, *key; - struct hostent *hp; - struct sockaddr_in server; - int errors = 0; - - if (argc < 3) { - fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n"); - exit(1); - } - - s = strchr(argv[1], ':'); - key = NULL; - - if (s == NULL) { - port = DEFAULT_PORT; - } else { - *s = '\0'; - s+=1; - /* dumb hack */ - key = strchr(s, ':'); - if (key != NULL){ - *key = '\0'; - key += 1; - } - port = atoi(s); - } - - sock = socket(AF_INET, SOCK_STREAM, 0); - - hp = gethostbyname(argv[1]); - if (hp == NULL) { - fprintf(stderr, "unknown host %s\n", argv[1]); - exit(1); - } - - bzero((char *)&server, sizeof(server)); - bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); - server.sin_family = hp->h_addrtype; - server.sin_port = htons(port); - - while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { - perror("connect()"); - sleep(1); - errors++; - if (errors > 5) - exit(1); - } - - close(0); - close(1); - dup2(sock, 0); - dup2(sock, 1); - - if (key != NULL){ - write(1, key, strlen(key)); - write(1, "\n", 1); - } - - execvp(argv[2], argv+2); - return 0; -} diff --git a/vest/sentserver.c b/vest/sentserver.c deleted file mode 100644 index c20b4fa6..00000000 --- a/vest/sentserver.c +++ /dev/null @@ -1,515 +0,0 @@ -/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sentserver.h" - -#define MAX_CLIENTS 64 - -struct clientinfo { - int s; - struct sockaddr_in sin; -}; - -struct line { - int id; - char *s; - int status; - struct line *next; -} *head, **ptail; - -int n_sent = 0, n_received=0, n_flushed=0; - -#define STATUS_RUNNING 0 -#define STATUS_ABORTED 1 -#define STATUS_FINISHED 2 - -pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER; - -int n_clients = 0; -int s; -int expect_multiline_output = 0; -int log_mutex = 0; -int stay_alive = 0; /* dont panic and die with zero clients */ - -void queue_finish(struct line *node, char *s, int fid); -char * read_line(int fd, int multiline); -void done (int code); - -struct line * queue_get(int fid) { - struct line *cur; - char *s, *synch; - - if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid); - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - - /* First, check for aborted sentences. */ - - if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid); - for (cur = head; cur != NULL; cur = cur->next) { - if (cur->status == STATUS_ABORTED) { - cur->status = STATUS_RUNNING; - - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - - return cur; - } - } - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - - /* Otherwise, read a new one. */ - if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); - if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); - pthread_mutex_lock(&input_mutex); - s = read_line(0,0); - - while (s) { - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); - pthread_mutex_unlock(&input_mutex); - - cur = malloc(sizeof (struct line)); - cur->id = n_sent; - cur->s = s; - cur->next = NULL; - - *ptail = cur; - ptail = &cur->next; - - n_sent++; - - if (strcmp(s,"===SYNCH===\n")==0){ - fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid); - // Note: queue_finish calls free(cur->s). - // Therefore we need to create a new string here. - synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char)); - synch = strcpy(synch, s); - - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - queue_finish(cur, synch, fid); /* handles its own lock */ - - if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); - if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); - pthread_mutex_lock(&input_mutex); - - s = read_line(0,0); - } else { - if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid); - cur->status = STATUS_RUNNING; - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - return cur; - } - } - - if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); - pthread_mutex_unlock(&input_mutex); - /* Only way to reach this point: no more output */ - - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - if (head == NULL) { - fprintf(stderr, "Reached end of file. Exiting.\n"); - done(0); - } else - ptail = NULL; /* This serves as a signal that there is no more input */ - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - - return NULL; -} - -void queue_panic() { - struct line *next; - while (head && head->status == STATUS_FINISHED) { - /* Write out finished sentences */ - if (head->status == STATUS_FINISHED) { - fputs(head->s, stdout); - fflush(stdout); - } - /* Write out blank line for unfinished sentences */ - if (head->status == STATUS_ABORTED) { - fputs("\n", stdout); - fflush(stdout); - } - /* By defition, there cannot be any RUNNING sentences, since - function is only called when n_clients == 0 */ - free(head->s); - next = head->next; - free(head); - head = next; - n_flushed++; - } - fclose(stdout); - fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n"); - done(1); -} - -void queue_abort(struct line *node, int fid) { - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - node->status = STATUS_ABORTED; - if (n_clients == 0) { - if (stay_alive) { - fprintf(stderr, "Warning! No live clients detected! Staying alive, will retry soon.\n"); - } else { - queue_panic(); - } - } - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); -} - - -void queue_print() { - struct line *cur; - - fprintf(stderr, " Queue\n"); - - for (cur = head; cur != NULL; cur = cur->next) { - switch(cur->status) { - case STATUS_RUNNING: - fprintf(stderr, " %d running ", cur->id); break; - case STATUS_ABORTED: - fprintf(stderr, " %d aborted ", cur->id); break; - case STATUS_FINISHED: - fprintf(stderr, " %d finished ", cur->id); break; - - } - fprintf(stderr, "\n"); - //fprintf(stderr, cur->s); - } -} - -void queue_finish(struct line *node, char *s, int fid) { - struct line *next; - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - - free(node->s); - node->s = s; - node->status = STATUS_FINISHED; - n_received++; - - /* Flush out finished nodes */ - while (head && head->status == STATUS_FINISHED) { - - if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id); - - fputs(head->s, stdout); - fflush(stdout); - if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id); - free(head->s); - - next = head->next; - free(head); - - head = next; - - n_flushed++; - - if (head == NULL) { /* empty queue */ - if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */ - fprintf(stderr, "All sentences finished. Exiting.\n"); - done(0); - } else /* ptail pointed at something which was just popped off the stack -- reset to head*/ - ptail = &head; - } - } - - if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id); - fflush(stdout); - fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed); - - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - -} - -char * read_line(int fd, int multiline) { - int size = 80; - char errorbuf[100]; - char *s = malloc(size+2); - int result, errors=0; - int i = 0; - - result = read(fd, s+i, 1); - - while (1) { - if (result < 0) { - perror("read()"); - sprintf(errorbuf, "Error code: %d\n", errno); - fprintf(stderr, errorbuf); - errors++; - if (errors > 5) { - free(s); - return NULL; - } else { - sleep(1); /* retry after delay */ - } - } else if (result == 0) { - break; - } else if (multiline==0 && s[i] == '\n') { - break; - } else { - if (s[i] == '\n'){ - /* if we've reached this point, - then multiline must be 1, and we're - going to poll the fd for an additional - line of data. The basic design is to - run a select on the filedescriptor fd. - Select will return under two conditions: - if there is data on the fd, or if a - timeout is reached. We'll select on this - fd. If select returns because there's data - ready, keep going; else assume there's no - more and return the data we already have. - */ - - fd_set set; - FD_ZERO(&set); - FD_SET(fd, &set); - - struct timeval timeout; - timeout.tv_sec = 3; // number of seconds for timeout - timeout.tv_usec = 0; - - int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); - if (ready<1){ - break; // no more data, stop looping - } - } - i++; - - if (i == size) { - size = size*2; - s = realloc(s, size+2); - } - } - - result = read(fd, s+i, 1); - } - - if (result == 0 && i == 0) { /* end of file */ - free(s); - return NULL; - } - - s[i] = '\n'; - s[i+1] = '\0'; - - return s; -} - -void * new_client(void *arg) { - struct clientinfo *client = (struct clientinfo *)arg; - struct line *cur; - int result; - char *s; - char errorbuf[100]; - - pthread_mutex_lock(&clients_mutex); - n_clients++; - pthread_mutex_unlock(&clients_mutex); - - fprintf(stderr, "Client connected (%d connected)\n", n_clients); - - for (;;) { - - cur = queue_get(client->s); - - if (cur) { - /* fprintf(stderr, "Sending to client: %s", cur->s); */ - fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s); - result = write(client->s, cur->s, strlen(cur->s)); - if (result < strlen(cur->s)){ - perror("write()"); - sprintf(errorbuf, "Error code: %d\n", errno); - fprintf(stderr, errorbuf); - - pthread_mutex_lock(&clients_mutex); - n_clients--; - pthread_mutex_unlock(&clients_mutex); - - fprintf(stderr, "Client died (%d connected)\n", n_clients); - queue_abort(cur, client->s); - - close(client->s); - free(client); - - pthread_exit(NULL); - } - } else { - close(client->s); - pthread_mutex_lock(&clients_mutex); - n_clients--; - pthread_mutex_unlock(&clients_mutex); - fprintf(stderr, "Client dismissed (%d connected)\n", n_clients); - pthread_exit(NULL); - } - - s = read_line(client->s,expect_multiline_output); - if (s) { - /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */ - fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id); -// queue_print(); - queue_finish(cur, s, client->s); - } else { - pthread_mutex_lock(&clients_mutex); - n_clients--; - pthread_mutex_unlock(&clients_mutex); - - fprintf(stderr, "Client died (%d connected)\n", n_clients); - queue_abort(cur, client->s); - - close(client->s); - free(client); - - pthread_exit(NULL); - } - - } - return 0; -} - -void done (int code) { - close(s); - exit(code); -} - - - -int main (int argc, char *argv[]) { - struct sockaddr_in sin, from; - int g; - socklen_t len; - struct clientinfo *client; - int port; - int opt; - int errors = 0; - int argi; - char *key = NULL, *client_key; - int use_key = 0; - /* the key stuff here doesn't provide any - real measure of security, it's mainly to keep - jobs from bumping into each other. */ - - pthread_t tid; - port = DEFAULT_PORT; - - for (argi=1; argi < argc; argi++){ - if (strcmp(argv[argi], "-m")==0){ - expect_multiline_output = 1; - } else if (strcmp(argv[argi], "-k")==0){ - argi++; - if (argi == argc){ - fprintf(stderr, "Key must be specified after -k\n"); - exit(1); - } - key = argv[argi]; - use_key = 1; - } else if (strcmp(argv[argi], "--stay-alive")==0){ - stay_alive = 1; /* dont panic and die with zero clients */ - } else { - port = atoi(argv[argi]); - } - } - - /* Initialize data structures */ - head = NULL; - ptail = &head; - - /* Set up listener */ - s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - opt = 1; - setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); - - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = htons(port); - while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { - perror("bind()"); - sleep(1); - errors++; - if (errors > 100) - exit(1); - } - - len = sizeof(sin); - getsockname(s, (struct sockaddr *) &sin, &len); - - fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port)); - - while (listen(s, MAX_CLIENTS) < 0) { - perror("listen()"); - sleep(1); - errors++; - if (errors > 100) - exit(1); - } - - for (;;) { - len = sizeof(from); - g = accept(s, (struct sockaddr *)&from, &len); - if (g < 0) { - perror("accept()"); - sleep(1); - continue; - } - client = malloc(sizeof(struct clientinfo)); - client->s = g; - bcopy(&from, &client->sin, len); - - if (use_key){ - fd_set set; - FD_ZERO(&set); - FD_SET(client->s, &set); - - struct timeval timeout; - timeout.tv_sec = 3; // number of seconds for timeout - timeout.tv_usec = 0; - - int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); - if (ready<1){ - fprintf(stderr, "Prospective client failed to respond with correct key.\n"); - close(client->s); - free(client); - } else { - client_key = read_line(client->s,0); - client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */ - if (strcmp(key, client_key)==0){ - pthread_create(&tid, NULL, new_client, client); - } else { - fprintf(stderr, "Prospective client failed to respond with correct key.\n"); - close(client->s); - free(client); - } - free(client_key); - } - } else { - pthread_create(&tid, NULL, new_client, client); - } - } - -} - - - diff --git a/vest/sentserver.h b/vest/sentserver.h deleted file mode 100644 index cd17a546..00000000 --- a/vest/sentserver.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef SENTSERVER_H -#define SENTSERVER_H - -#define DEFAULT_PORT 50000 - -#endif diff --git a/vest/tac.pl b/vest/tac.pl deleted file mode 100755 index 9fb525c1..00000000 --- a/vest/tac.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { - chomp; - $|=1; - print (scalar reverse($_)); - print "\n"; -} diff --git a/vest/test_aer/README b/vest/test_aer/README deleted file mode 100644 index 819b2e32..00000000 --- a/vest/test_aer/README +++ /dev/null @@ -1,8 +0,0 @@ -To run the test: - -../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights - -This will optimize the parameters of the tiny lexical translation model -so as to minimize the AER of the Viterbi alignment on the development -set in corpus.src according to the reference alignments in ref.0. - diff --git a/vest/test_aer/cdec.ini b/vest/test_aer/cdec.ini deleted file mode 100644 index 08187848..00000000 --- a/vest/test_aer/cdec.ini +++ /dev/null @@ -1,3 +0,0 @@ -formalism=lextrans -grammar=grammar -aligner=true diff --git a/vest/test_aer/corpus.src b/vest/test_aer/corpus.src deleted file mode 100644 index 31b23971..00000000 --- a/vest/test_aer/corpus.src +++ /dev/null @@ -1,3 +0,0 @@ -el gato negro ||| the black cat -el gato ||| the cat -el libro ||| the book diff --git a/vest/test_aer/grammar b/vest/test_aer/grammar deleted file mode 100644 index 9d857824..00000000 --- a/vest/test_aer/grammar +++ /dev/null @@ -1,12 +0,0 @@ -el ||| cat ||| F1=1 -el ||| the ||| F2=1 -el ||| black ||| F3=1 -el ||| book ||| F11=1 -gato ||| cat ||| F4=1 NN=1 -gato ||| black ||| F5=1 -gato ||| the ||| F6=1 -negro ||| the ||| F7=1 -negro ||| cat ||| F8=1 -negro ||| black ||| F9=1 -libro ||| the ||| F10=1 -libro ||| book ||| F12=1 NN=1 diff --git a/vest/test_aer/ref.0 b/vest/test_aer/ref.0 deleted file mode 100644 index 734a9c5b..00000000 --- a/vest/test_aer/ref.0 +++ /dev/null @@ -1,3 +0,0 @@ -0-0 1-2 2-1 -0-0 1-1 -0-0 1-1 diff --git a/vest/test_aer/weights b/vest/test_aer/weights deleted file mode 100644 index afc9282e..00000000 --- a/vest/test_aer/weights +++ /dev/null @@ -1,13 +0,0 @@ -F1 0.1 -F2 -.5980815 -F3 0.24235 -F4 0.625 -F5 0.4514 -F6 0.112316 -F7 -0.123415 -F8 -0.25390285 -F9 -0.23852 -F10 0.646 -F11 0.413141 -F12 0.343216 -NN -0.1215 diff --git a/vest/test_data/0.json.gz b/vest/test_data/0.json.gz deleted file mode 100644 index 30f8dd77..00000000 Binary files a/vest/test_data/0.json.gz and /dev/null differ diff --git a/vest/test_data/1.json.gz b/vest/test_data/1.json.gz deleted file mode 100644 index c82cc179..00000000 Binary files a/vest/test_data/1.json.gz and /dev/null differ diff --git a/vest/test_data/c2e.txt.0 b/vest/test_data/c2e.txt.0 deleted file mode 100644 index 12c4abe9..00000000 --- a/vest/test_data/c2e.txt.0 +++ /dev/null @@ -1,2 +0,0 @@ -australia reopens embassy in manila -( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.1 b/vest/test_data/c2e.txt.1 deleted file mode 100644 index 4ac12df1..00000000 --- a/vest/test_data/c2e.txt.1 +++ /dev/null @@ -1,2 +0,0 @@ -australia reopened manila embassy -( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.2 b/vest/test_data/c2e.txt.2 deleted file mode 100644 index 2f67b72f..00000000 --- a/vest/test_data/c2e.txt.2 +++ /dev/null @@ -1,2 +0,0 @@ -australia to reopen embassy in manila -( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/vest/test_data/c2e.txt.3 b/vest/test_data/c2e.txt.3 deleted file mode 100644 index 5483cef6..00000000 --- a/vest/test_data/c2e.txt.3 +++ /dev/null @@ -1,2 +0,0 @@ -australia to re - open its embassy to manila -( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . diff --git a/vest/test_data/re.txt.0 b/vest/test_data/re.txt.0 deleted file mode 100644 index 86eff087..00000000 --- a/vest/test_data/re.txt.0 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan states turkey to reject any pressures to urge it to recognize cyprus -ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . -erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . -we will discuss this dossier in the course of membership negotiations . " -he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/vest/test_data/re.txt.1 b/vest/test_data/re.txt.1 deleted file mode 100644 index 2140f198..00000000 --- a/vest/test_data/re.txt.1 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan confirms turkey will resist any pressure to recognize cyprus -ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . -erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . -we shall discuss this issue in the course of the membership negotiations . " -he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/vest/test_data/re.txt.2 b/vest/test_data/re.txt.2 deleted file mode 100644 index 94e46286..00000000 --- a/vest/test_data/re.txt.2 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus -ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . -erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . -we shall discuss this dossier during the negotiations on joining . " -and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/vest/test_data/re.txt.3 b/vest/test_data/re.txt.3 deleted file mode 100644 index f87c3308..00000000 --- a/vest/test_data/re.txt.3 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan stresses that turkey will reject all pressures to force it to recognize cyprus -ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . -erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . -we will discuss this file during the negotiations on joining . " -he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/vest/viterbi_envelope.cc b/vest/viterbi_envelope.cc deleted file mode 100644 index 9fcf75a0..00000000 --- a/vest/viterbi_envelope.cc +++ /dev/null @@ -1,177 +0,0 @@ -#include "viterbi_envelope.h" - -#include -#include - -using namespace std; -using boost::shared_ptr; - -ostream& operator<<(ostream& os, const ViterbiEnvelope& env) { - os << '<'; - const vector >& segs = env.GetSortedSegs(); - for (int i = 0; i < segs.size(); ++i) - os << (i==0 ? "" : "|") << "x=" << segs[i]->x << ",b=" << segs[i]->b << ",m=" << segs[i]->m << ",p1=" << segs[i]->p1 << ",p2=" << segs[i]->p2; - return os << '>'; -} - -ViterbiEnvelope::ViterbiEnvelope(int i) { - if (i == 0) { - // do nothing - <> - } else if (i == 1) { - segs.push_back(shared_ptr(new Segment(0, 0, 0, shared_ptr(), shared_ptr()))); - assert(this->IsMultiplicativeIdentity()); - } else { - cerr << "Only can create ViterbiEnvelope semiring 0 and 1 with this constructor!\n"; - abort(); - } -} - -struct SlopeCompare { - bool operator() (const shared_ptr& a, const shared_ptr& b) const { - return a->m < b->m; - } -}; - -const ViterbiEnvelope& ViterbiEnvelope::operator+=(const ViterbiEnvelope& other) { - if (!other.is_sorted) other.Sort(); - if (segs.empty()) { - segs = other.segs; - return *this; - } - is_sorted = false; - int j = segs.size(); - segs.resize(segs.size() + other.segs.size()); - for (int i = 0; i < other.segs.size(); ++i) - segs[j++] = other.segs[i]; - assert(j == segs.size()); - return *this; -} - -void ViterbiEnvelope::Sort() const { - sort(segs.begin(), segs.end(), SlopeCompare()); - const int k = segs.size(); - int j = 0; - for (int i = 0; i < k; ++i) { - Segment l = *segs[i]; - l.x = kMinusInfinity; - // cerr << "m=" << l.m << endl; - if (0 < j) { - if (segs[j-1]->m == l.m) { // lines are parallel - if (l.b <= segs[j-1]->b) continue; - --j; - } - while(0 < j) { - l.x = (l.b - segs[j-1]->b) / (segs[j-1]->m - l.m); - if (segs[j-1]->x < l.x) break; - --j; - } - if (0 == j) l.x = kMinusInfinity; - } - *segs[j++] = l; - } - segs.resize(j); - is_sorted = true; -} - -const ViterbiEnvelope& ViterbiEnvelope::operator*=(const ViterbiEnvelope& other) { - if (other.IsMultiplicativeIdentity()) { return *this; } - if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } - - if (!is_sorted) Sort(); - if (!other.is_sorted) other.Sort(); - - if (this->IsEdgeEnvelope()) { -// if (other.size() > 1) -// cerr << *this << " (TIMES) " << other << endl; - shared_ptr edge_parent = segs[0]; - const double& edge_b = edge_parent->b; - const double& edge_m = edge_parent->m; - segs.clear(); - for (int i = 0; i < other.segs.size(); ++i) { - const Segment& seg = *other.segs[i]; - const double m = seg.m + edge_m; - const double b = seg.b + edge_b; - const double& x = seg.x; // x's don't change with * - segs.push_back(shared_ptr(new Segment(x, m, b, edge_parent, other.segs[i]))); - assert(segs.back()->p1->edge); - } -// if (other.size() > 1) -// cerr << " = " << *this << endl; - } else { - vector > new_segs; - int this_i = 0; - int other_i = 0; - const int this_size = segs.size(); - const int other_size = other.segs.size(); - double cur_x = kMinusInfinity; // moves from left to right across the - // real numbers, stopping for all inter- - // sections - double this_next_val = (1 < this_size ? segs[1]->x : kPlusInfinity); - double other_next_val = (1 < other_size ? other.segs[1]->x : kPlusInfinity); - while (this_i < this_size && other_i < other_size) { - const Segment& this_seg = *segs[this_i]; - const Segment& other_seg= *other.segs[other_i]; - const double m = this_seg.m + other_seg.m; - const double b = this_seg.b + other_seg.b; - - new_segs.push_back(shared_ptr(new Segment(cur_x, m, b, segs[this_i], other.segs[other_i]))); - int comp = 0; - if (this_next_val < other_next_val) comp = -1; else - if (this_next_val > other_next_val) comp = 1; - if (0 == comp) { // the next values are equal, advance both indices - ++this_i; - ++other_i; - cur_x = this_next_val; // could be other_next_val (they're equal!) - this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); - other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); - } else { // advance the i with the lower x, update cur_x - if (-1 == comp) { - ++this_i; - cur_x = this_next_val; - this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); - } else { - ++other_i; - cur_x = other_next_val; - other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); - } - } - } - segs.swap(new_segs); - } - //cerr << "Multiply: result=" << (*this) << endl; - return *this; -} - -// recursively construct translation -void Segment::ConstructTranslation(vector* trans) const { - const Segment* cur = this; - vector > ant_trans; - while(!cur->edge) { - ant_trans.resize(ant_trans.size() + 1); - cur->p2->ConstructTranslation(&ant_trans.back()); - cur = cur->p1.get(); - } - size_t ant_size = ant_trans.size(); - vector*> pants(ant_size); - assert(ant_size == cur->edge->tail_nodes_.size()); - --ant_size; - for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; - cur->edge->rule_->ESubstitute(pants, trans); -} - -void Segment::CollectEdgesUsed(std::vector* edges_used) const { - if (edge) { - assert(edge->id_ < edges_used->size()); - (*edges_used)[edge->id_] = true; - } - if (p1) p1->CollectEdgesUsed(edges_used); - if (p2) p2->CollectEdgesUsed(edges_used); -} - -ViterbiEnvelope ViterbiEnvelopeWeightFunction::operator()(const Hypergraph::Edge& e) const { - const double m = direction.dot(e.feature_values_); - const double b = origin.dot(e.feature_values_); - Segment* seg = new Segment(m, b, e); - return ViterbiEnvelope(1, seg); -} - diff --git a/vest/viterbi_envelope.h b/vest/viterbi_envelope.h deleted file mode 100644 index 60ad82d8..00000000 --- a/vest/viterbi_envelope.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef _VITERBI_ENVELOPE_H_ -#define _VITERBI_ENVELOPE_H_ - -#include -#include -#include - -#include "hg.h" -#include "sparse_vector.h" - -static const double kMinusInfinity = -std::numeric_limits::infinity(); -static const double kPlusInfinity = std::numeric_limits::infinity(); - -struct Segment { - Segment() : x(), m(), b(), edge() {} - Segment(double _m, double _b) : - x(kMinusInfinity), m(_m), b(_b), edge() {} - Segment(double _x, double _m, double _b, const boost::shared_ptr& p1_, const boost::shared_ptr& p2_) : - x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} - Segment(double _m, double _b, const Hypergraph::Edge& edge) : - x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} - - double x; // x intersection with previous segment in env, or -inf if none - double m; // this line's slope - double b; // intercept with y-axis - - // we keep a pointer to the "parents" of this segment so we can reconstruct - // the Viterbi translation corresponding to this segment - boost::shared_ptr p1; - boost::shared_ptr p2; - - // only Segments created from an edge using the ViterbiEnvelopeWeightFunction - // have rules - // TRulePtr rule; - const Hypergraph::Edge* edge; - - // recursively recover the Viterbi translation that will result from setting - // the weights to origin + axis * x, where x is any value from this->x up - // until the next largest x in the containing ViterbiEnvelope - void ConstructTranslation(std::vector* trans) const; - void CollectEdgesUsed(std::vector* edges_used) const; -}; - -// this is the semiring value type, -// it defines constructors for 0, 1, and the operations + and * -struct ViterbiEnvelope { - // create semiring zero - ViterbiEnvelope() : is_sorted(true) {} // zero - // for debugging: - ViterbiEnvelope(const std::vector >& s) : segs(s) { Sort(); } - // create semiring 1 or 0 - explicit ViterbiEnvelope(int i); - ViterbiEnvelope(int n, Segment* seg) : is_sorted(true), segs(n, boost::shared_ptr(seg)) {} - const ViterbiEnvelope& operator+=(const ViterbiEnvelope& other); - const ViterbiEnvelope& operator*=(const ViterbiEnvelope& other); - bool IsMultiplicativeIdentity() const { - return size() == 1 && (segs[0]->b == 0.0 && segs[0]->m == 0.0) && (!segs[0]->edge) && (!segs[0]->p1) && (!segs[0]->p2); } - const std::vector >& GetSortedSegs() const { - if (!is_sorted) Sort(); - return segs; - } - size_t size() const { return segs.size(); } - - private: - bool IsEdgeEnvelope() const { - return segs.size() == 1 && segs[0]->edge; } - void Sort() const; - mutable bool is_sorted; - mutable std::vector > segs; -}; -std::ostream& operator<<(std::ostream& os, const ViterbiEnvelope& env); - -struct ViterbiEnvelopeWeightFunction { - ViterbiEnvelopeWeightFunction(const SparseVector& ori, - const SparseVector& dir) : origin(ori), direction(dir) {} - ViterbiEnvelope operator()(const Hypergraph::Edge& e) const; - const SparseVector origin; - const SparseVector direction; -}; - -#endif -- cgit v1.2.3