From 0da1f6de1b33bbff5cb99b1938bb07d050479f10 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 14 Dec 2011 21:02:50 -0800 Subject: random incomplete metric stuff, including string subsequence kernel impl --- mteval/ns.cc | 241 +++++++++++++++++++ mteval/ns.h | 106 +++++++++ mteval/ns_ter.cc | 551 +++++++++++++++++++++++++++++++++++++++++++ mteval/ns_ter.h | 18 ++ mteval/scorer_test.cc | 46 ++++ utils/kernel_string_subseq.h | 51 ++++ 6 files changed, 1013 insertions(+) create mode 100644 mteval/ns.cc create mode 100644 mteval/ns.h create mode 100644 mteval/ns_ter.cc create mode 100644 mteval/ns_ter.h create mode 100644 utils/kernel_string_subseq.h diff --git a/mteval/ns.cc b/mteval/ns.cc new file mode 100644 index 00000000..1045a51f --- /dev/null +++ b/mteval/ns.cc @@ -0,0 +1,241 @@ +#include "ns.h" +#include "ns_ter.h" + +#include +#include +#include +#include +#include + +using namespace std; +using boost::shared_ptr; + +map EvaluationMetric::instances_; + +SegmentEvaluator::~SegmentEvaluator() {} +EvaluationMetric::~EvaluationMetric() {} + +struct DefaultSegmentEvaluator : public SegmentEvaluator { + DefaultSegmentEvaluator(const vector >& refs, const EvaluationMetric* em) : refs_(refs), em_(em) {} + void Evaluate(const vector& hyp, SufficientStats* out) const { + em_->ComputeSufficientStatistics(hyp, refs_, out); + } + const vector > refs_; + const EvaluationMetric* em_; +}; + +shared_ptr EvaluationMetric::CreateSegmentEvaluator(const vector >& refs) const { + return shared_ptr(new DefaultSegmentEvaluator(refs, this)); +} + +void EvaluationMetric::ComputeSufficientStatistics(const vector&, + const vector >&, + SufficientStats*) const { + cerr << "Base class ComputeSufficientStatistics should not be called.\n"; + abort(); +} + +enum BleuType { IBM, Koehn, NIST }; +template +struct BleuSegmentEvaluator : public SegmentEvaluator { + BleuSegmentEvaluator(const vector >& refs, const EvaluationMetric* em) : evaluation_metric(em) { + assert(refs.size() > 0); + float tot = 0; + int smallest = 9999999; + for (vector >::const_iterator ci = refs.begin(); + ci != refs.end(); ++ci) { + lengths_.push_back(ci->size()); + tot += lengths_.back(); + if (lengths_.back() < smallest) smallest = lengths_.back(); + CountRef(*ci); + } + if (BrevityType == Koehn) + lengths_[0] = tot / refs.size(); + if (BrevityType == NIST) + lengths_[0] = smallest; + } + + void Evaluate(const vector& hyp, SufficientStats* out) const { + out->fields.resize(N + N + 2); + out->evaluation_metric = evaluation_metric; + for (unsigned i = 0; i < N+N+2; ++i) out->fields[i] = 0; + + ComputeNgramStats(hyp, &out->fields[0], &out->fields[N], true); + float& hyp_len = out->fields[2*N]; + float& ref_len = out->fields[2*N + 1]; + hyp_len = hyp.size(); + ref_len = lengths_[0]; + if (lengths_.size() > 1 && BrevityType == IBM) { + float bestd = 2000000; + float hl = hyp.size(); + float bl = -1; + for (vector::const_iterator ci = lengths_.begin(); ci != lengths_.end(); ++ci) { + if (fabs(*ci - hl) < bestd) { + bestd = fabs(*ci - hl); + bl = *ci; + } + } + ref_len = bl; + } + } + + struct NGramCompare { + int operator() (const vector& a, const vector& b) { + const size_t as = a.size(); + const size_t bs = b.size(); + const size_t s = (as < bs ? 
as : bs); + for (size_t i = 0; i < s; ++i) { + int d = a[i] - b[i]; + if (d < 0) return true; + if (d > 0) return false; + } + return as < bs; + } + }; + typedef map, pair, NGramCompare> NGramCountMap; + + void CountRef(const vector& ref) { + NGramCountMap tc; + vector ngram(N); + int s = ref.size(); + for (int j=0; j& p = ngrams_[i->first]; + if (p.first < i->second.first) + p = i->second; + } + } + + void ComputeNgramStats(const vector& sent, + float* correct, // N elements reserved + float* hyp, // N elements reserved + bool clip_counts = true) const { + vector ngram(N); + *correct *= 0; + *hyp *= 0; + int s = sent.size(); + for (int j=0; j& p = ngrams_[ngram]; + if(clip_counts){ + if (p.second < p.first) { + ++p.second; + correct[i-1]++; + } + } else { + ++p.second; + correct[i-1]++; + } + // if the 1 gram isn't found, don't try to match don't need to match any 2- 3- .. grams: + if (!p.first) { + for (; i<=k; ++i) + hyp[i-1]++; + } else { + hyp[i-1]++; + } + } + } + } + + const EvaluationMetric* evaluation_metric; + vector lengths_; + mutable NGramCountMap ngrams_; +}; + +template +struct BleuMetric : public EvaluationMetric { + BleuMetric() : EvaluationMetric("IBM_BLEU") {} + float ComputeScore(const SufficientStats& stats) const { + float log_bleu = 0; + int count = 0; + for (int i = 0; i < N; ++i) { + if (stats.fields[i+N] > 0) { + float cor_count = stats.fields[i]; // correct_ngram_hit_counts[i]; + // smooth bleu + if (!cor_count) { cor_count = 0.01; } + float lprec = log(cor_count) - log(stats.fields[i+N]); // log(hyp_ngram_counts[i]); + // if (precs) precs->push_back(exp(lprec)); + log_bleu += lprec; + ++count; + } + } + log_bleu /= count; + float lbp = 0.0; + const float& hyp_len = stats.fields[2*N]; + const float& ref_len = stats.fields[2*N + 1]; + if (hyp_len < ref_len) + lbp = (hyp_len - ref_len) / hyp_len; + log_bleu += lbp; + //if (bp) *bp = exp(lbp); + return exp(log_bleu); + } + shared_ptr CreateSegmentEvaluator(const vector >& refs) const { + return shared_ptr(new BleuSegmentEvaluator(refs, this)); + } +}; + +EvaluationMetric* EvaluationMetric::Instance(const string& metric_id) { + static bool is_first = true; + if (is_first) { + instances_["NULL"] = NULL; + is_first = false; + } + + map::iterator it = instances_.find(metric_id); + if (it == instances_.end()) { + EvaluationMetric* m = NULL; + if (metric_id == "IBM_BLEU") { + m = new BleuMetric<4, IBM>; + } else if (metric_id == "NIST_BLEU") { + m = new BleuMetric<4, NIST>; + } else if (metric_id == "Koehn_BLEU") { + m = new BleuMetric<4, Koehn>; + } else if (metric_id == "TER") { + m = new TERMetric; + } else { + cerr << "Implement please: " << metric_id << endl; + abort(); + } + if (m->MetricId() != metric_id) { + cerr << "Registry error: " << metric_id << " vs. 
" << m->MetricId() << endl; + abort(); + } + return instances_[metric_id] = m; + } else { + return it->second; + } +} + +SufficientStats::SufficientStats(const string& encoded) { + istringstream is(encoded); + string type; + is >> type; + evaluation_metric = EvaluationMetric::Instance(type); + float val; + while(is >> val) + fields.push_back(val); +} + +void SufficientStats::Encode(string* out) const { + ostringstream os; + if (evaluation_metric) + os << evaluation_metric->MetricId(); + else + os << "NULL"; + for (unsigned i = 0; i < fields.size(); ++i) + os << ' ' << fields[i]; + *out = os.str(); +} + diff --git a/mteval/ns.h b/mteval/ns.h new file mode 100644 index 00000000..f19b7509 --- /dev/null +++ b/mteval/ns.h @@ -0,0 +1,106 @@ +#ifndef _NS_H_ +#define _NS_H_ + +#include +#include +#include +#include +#include "wordid.h" + +class EvaluationMetric; + +class SufficientStats { + public: + SufficientStats() : evaluation_metric() {} + explicit SufficientStats(const std::string& encoded); + explicit SufficientStats(const EvaluationMetric* s) : evaluation_metric(s) {} + SufficientStats(const EvaluationMetric* s, const std::vector& f) : + evaluation_metric(s), fields(f) {} + + SufficientStats& operator+=(const SufficientStats& delta) { + if (delta.evaluation_metric) evaluation_metric = delta.evaluation_metric; + if (fields.size() != delta.fields.size()) + fields.resize(std::max(fields.size(), delta.fields.size())); + for (unsigned i = 0; i < delta.fields.size(); ++i) + fields[i] += delta.fields[i]; + return *this; + } + SufficientStats& operator-=(const SufficientStats& delta) { + if (delta.evaluation_metric) evaluation_metric = delta.evaluation_metric; + if (fields.size() != delta.fields.size()) + fields.resize(std::max(fields.size(), delta.fields.size())); + for (unsigned i = 0; i < delta.fields.size(); ++i) + fields[i] -= delta.fields[i]; + return *this; + } + SufficientStats& operator*=(const double& scalar) { + for (unsigned i = 0; i < fields.size(); ++i) + fields[i] *= scalar; + return *this; + } + SufficientStats& operator/=(const double& scalar) { + for (unsigned i = 0; i < fields.size(); ++i) + fields[i] /= scalar; + return *this; + } + bool operator==(const SufficientStats& other) const { + return other.fields == fields; + } + size_t size() const { return fields.size(); } + float operator[](size_t i) const { + if (i < fields.size()) return fields[i]; + return 0; + } + void Encode(std::string* out) const; + + const EvaluationMetric* evaluation_metric; + std::vector fields; +}; + +inline const SufficientStats& operator+(const SufficientStats& a, const SufficientStats& b) { + SufficientStats res(a); + return res += b; +} + +inline const SufficientStats& operator-(const SufficientStats& a, const SufficientStats& b) { + SufficientStats res(a); + return res -= b; +} + +struct SegmentEvaluator { + virtual ~SegmentEvaluator(); + virtual void Evaluate(const std::vector& hyp, SufficientStats* out) const = 0; +}; + +// Instructions for implementing a new metric +// Override MetricId() and give the metric a unique string name (no spaces) +// To Instance(), add something that creates the metric +// Implement ONE of the following: +// 1) void ComputeSufficientStatistics(const std::vector >& refs, SufficientStats* out) const; +// 2) a new SegmentEvaluator class AND CreateSegmentEvaluator(const std::vector >& refs) const; +// The later (#2) is only used when it is necessary to precompute per-segment data from a set of refs +// Implement ComputeScore(const SufficientStats& stats) const; +class 
EvaluationMetric { + public: + static EvaluationMetric* Instance(const std::string& metric_id = "IBM_BLEU"); + + protected: + EvaluationMetric(const std::string& id) : name_(id) {} + virtual ~EvaluationMetric(); + + public: + const std::string& MetricId() const { return name_; } + + virtual float ComputeScore(const SufficientStats& stats) const = 0; + virtual boost::shared_ptr CreateSegmentEvaluator(const std::vector >& refs) const; + virtual void ComputeSufficientStatistics(const std::vector& hyp, + const std::vector >& refs, + SufficientStats* out) const; + + private: + static std::map instances_; + const std::string name_; +}; + +#endif + diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc new file mode 100644 index 00000000..14dc6e49 --- /dev/null +++ b/mteval/ns_ter.cc @@ -0,0 +1,551 @@ +#include "ns_ter.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "tdict.h" + +static const bool ter_use_average_ref_len = true; +static const int ter_short_circuit_long_sentences = -1; + +static const unsigned kINSERTIONS = 0; +static const unsigned kDELETIONS = 1; +static const unsigned kSUBSTITUTIONS = 2; +static const unsigned kSHIFTS = 3; +static const unsigned kREF_WORDCOUNT = 4; +static const unsigned kDUMMY_LAST_ENTRY = 5; + +using namespace std; +using namespace std::tr1; + +#if 0 + +struct COSTS { + static const float substitution; + static const float deletion; + static const float insertion; + static const float shift; +}; +const float COSTS::substitution = 1.0f; +const float COSTS::deletion = 1.0f; +const float COSTS::insertion = 1.0f; +const float COSTS::shift = 1.0f; + +static const int MAX_SHIFT_SIZE = 10; +static const int MAX_SHIFT_DIST = 50; + +struct Shift { + unsigned int d_; + Shift() : d_() {} + Shift(int b, int e, int m) : d_() { + begin(b); + end(e); + moveto(m); + } + inline int begin() const { + return d_ & 0x3ff; + } + inline int end() const { + return (d_ >> 10) & 0x3ff; + } + inline int moveto() const { + int m = (d_ >> 20) & 0x7ff; + if (m > 1024) { m -= 1024; m *= -1; } + return m; + } + inline void begin(int b) { + d_ &= 0xfffffc00u; + d_ |= (b & 0x3ff); + } + inline void end(int e) { + d_ &= 0xfff003ffu; + d_ |= (e & 0x3ff) << 10; + } + inline void moveto(int m) { + bool neg = (m < 0); + if (neg) { m *= -1; m += 1024; } + d_ &= 0xfffff; + d_ |= (m & 0x7ff) << 20; + } +}; + +class TERScorerImpl { + + public: + enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; + + explicit TERScorerImpl(const vector& ref) : ref_(ref) { + for (int i = 0; i < ref.size(); ++i) + rwexists_.insert(ref[i]); + } + + float Calculate(const vector& hyp, int* subs, int* ins, int* dels, int* shifts) const { + return CalculateAllShifts(hyp, subs, ins, dels, shifts); + } + + inline int GetRefLength() const { + return ref_.size(); + } + + private: + vector ref_; + set rwexists_; + + typedef unordered_map, set, boost::hash > > NgramToIntsMap; + mutable NgramToIntsMap nmap_; + + static float MinimumEditDistance( + const vector& hyp, + const vector& ref, + vector* path) { + vector > bmat(hyp.size() + 1, vector(ref.size() + 1, MATCH)); + vector > cmat(hyp.size() + 1, vector(ref.size() + 1, 0)); + for (int i = 0; i <= hyp.size(); ++i) + cmat[i][0] = i; + for (int j = 0; j <= ref.size(); ++j) + cmat[0][j] = j; + for (int i = 1; i <= hyp.size(); ++i) { + const WordID& hw = hyp[i-1]; + for (int j = 1; j <= ref.size(); ++j) { + const WordID& rw = ref[j-1]; + float& cur_c = cmat[i][j]; + TransType& cur_b = bmat[i][j]; + + if (rw == hw) { 
+ cur_c = cmat[i-1][j-1]; + cur_b = MATCH; + } else { + cur_c = cmat[i-1][j-1] + COSTS::substitution; + cur_b = SUBSTITUTION; + } + float cwoi = cmat[i-1][j]; + if (cur_c > cwoi + COSTS::insertion) { + cur_c = cwoi + COSTS::insertion; + cur_b = INSERTION; + } + float cwod = cmat[i][j-1]; + if (cur_c > cwod + COSTS::deletion) { + cur_c = cwod + COSTS::deletion; + cur_b = DELETION; + } + } + } + + // trace back along the best path and record the transition types + path->clear(); + int i = hyp.size(); + int j = ref.size(); + while (i > 0 || j > 0) { + if (j == 0) { + --i; + path->push_back(INSERTION); + } else if (i == 0) { + --j; + path->push_back(DELETION); + } else { + TransType t = bmat[i][j]; + path->push_back(t); + switch (t) { + case SUBSTITUTION: + case MATCH: + --i; --j; break; + case INSERTION: + --i; break; + case DELETION: + --j; break; + } + } + } + reverse(path->begin(), path->end()); + return cmat[hyp.size()][ref.size()]; + } + + void BuildWordMatches(const vector& hyp, NgramToIntsMap* nmap) const { + nmap->clear(); + set exists_both; + for (int i = 0; i < hyp.size(); ++i) + if (rwexists_.find(hyp[i]) != rwexists_.end()) + exists_both.insert(hyp[i]); + for (int start=0; start cp; + int mlen = min(MAX_SHIFT_SIZE, static_cast(ref_.size() - start)); + for (int len=0; len& in, + int start, int end, int moveto, vector* out) { + // cerr << "ps: " << start << " " << end << " " << moveto << endl; + out->clear(); + if (moveto == -1) { + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto < start) { + for (int i = 0; i <= moveto; ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i < in.size(); ++i) + out->push_back(in[i]); + } else if (moveto > end) { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; i <= moveto; ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = moveto+1; i < in.size(); ++i) + out->push_back(in[i]); + } else { + for (int i = 0; i < start; ++i) + out->push_back(in[i]); + for (int i = end+1; (i < in.size()) && (i <= end + (moveto - start)); ++i) + out->push_back(in[i]); + for (int i = start; i <= end; ++i) + out->push_back(in[i]); + for (int i = (end + (moveto - start))+1; i < in.size(); ++i) + out->push_back(in[i]); + } + if (out->size() != in.size()) { + cerr << "ps: " << start << " " << end << " " << moveto << endl; + cerr << "in=" << TD::GetString(in) << endl; + cerr << "out=" << TD::GetString(*out) << endl; + } + assert(out->size() == in.size()); + // cerr << "ps: " << TD::GetString(*out) << endl; + } + + void GetAllPossibleShifts(const vector& hyp, + const vector& ralign, + const vector& herr, + const vector& rerr, + const int min_size, + vector >* shifts) const { + for (int start = 0; start < hyp.size(); ++start) { + vector cp(1, hyp[start]); + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) continue; + bool ok = false; + int moveto; + for (set::iterator i = niter->second.begin(); i != niter->second.end(); ++i) { + moveto = *i; + int rm = ralign[moveto]; + ok = (start != rm && + (rm - start) < MAX_SHIFT_DIST && + (start - rm - 1) < MAX_SHIFT_DIST); + if (ok) break; + } + if (!ok) continue; + cp.clear(); + for (int end = start + min_size - 1; + ok && end < hyp.size() && end 
< (start + MAX_SHIFT_SIZE); ++end) { + cp.push_back(hyp[end]); + vector& sshifts = (*shifts)[end - start]; + ok = false; + NgramToIntsMap::iterator niter = nmap_.find(cp); + if (niter == nmap_.end()) break; + bool any_herr = false; + for (int i = start; i <= end && !any_herr; ++i) + any_herr = herr[i]; + if (!any_herr) { + ok = true; + continue; + } + for (set::iterator mi = niter->second.begin(); + mi != niter->second.end(); ++mi) { + int moveto = *mi; + int rm = ralign[moveto]; + if (! ((rm != start) && + ((rm < start) || (rm > end)) && + (rm - start <= MAX_SHIFT_DIST) && + ((start - rm - 1) <= MAX_SHIFT_DIST))) continue; + ok = true; + bool any_rerr = false; + for (int i = 0; (i <= end - start) && (!any_rerr); ++i) + any_rerr = rerr[moveto+i]; + if (!any_rerr) continue; + for (int roff = 0; roff <= (end - start); ++roff) { + int rmr = ralign[moveto+roff]; + if ((start != rmr) && ((roff == 0) || (rmr != ralign[moveto]))) + sshifts.push_back(Shift(start, end, moveto + roff)); + } + } + } + } + } + + bool CalculateBestShift(const vector& cur, + const vector& hyp, + float curerr, + const vector& path, + vector* new_hyp, + float* newerr, + vector* new_path) const { + vector herr, rerr; + vector ralign; + int hpos = -1; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case MATCH: + ++hpos; + herr.push_back(false); + rerr.push_back(false); + ralign.push_back(hpos); + break; + case SUBSTITUTION: + ++hpos; + herr.push_back(true); + rerr.push_back(true); + ralign.push_back(hpos); + break; + case INSERTION: + ++hpos; + herr.push_back(true); + break; + case DELETION: + rerr.push_back(true); + ralign.push_back(hpos); + break; + } + } +#if 0 + cerr << "RALIGN: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << ralign[i] << " "; + cerr << endl; + cerr << "RERR: "; + for (int i = 0; i < rerr.size(); ++i) + cerr << (bool)rerr[i] << " "; + cerr << endl; + cerr << "HERR: "; + for (int i = 0; i < herr.size(); ++i) + cerr << (bool)herr[i] << " "; + cerr << endl; +#endif + + vector > shifts(MAX_SHIFT_SIZE + 1); + GetAllPossibleShifts(cur, ralign, herr, rerr, 1, &shifts); + float cur_best_shift_cost = 0; + *newerr = curerr; + vector cur_best_path; + vector cur_best_hyp; + + bool res = false; + for (int i = shifts.size() - 1; i >=0; --i) { + float curfix = curerr - (cur_best_shift_cost + *newerr); + float maxfix = 2.0f * (1 + i) - COSTS::shift; + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) break; + for (int j = 0; j < shifts[i].size(); ++j) { + const Shift& s = shifts[i][j]; + curfix = curerr - (cur_best_shift_cost + *newerr); + maxfix = 2.0f * (1 + i) - COSTS::shift; // TODO remove? + if ((curfix > maxfix) || ((cur_best_shift_cost == 0) && (curfix == maxfix))) continue; + vector shifted(cur.size()); + PerformShift(cur, s.begin(), s.end(), ralign[s.moveto()], &shifted); + vector try_path; + float try_cost = MinimumEditDistance(shifted, ref_, &try_path); + float gain = (*newerr + cur_best_shift_cost) - (try_cost + COSTS::shift); + if (gain > 0.0f || ((cur_best_shift_cost == 0.0f) && (gain == 0.0f))) { + *newerr = try_cost; + cur_best_shift_cost = COSTS::shift; + new_path->swap(try_path); + new_hyp->swap(shifted); + res = true; + // cerr << "Found better shift " << s.begin() << "..." 
<< s.end() << " moveto " << s.moveto() << endl; + } + } + } + + return res; + } + + static void GetPathStats(const vector& path, int* subs, int* ins, int* dels) { + *subs = *ins = *dels = 0; + for (int i = 0; i < path.size(); ++i) { + switch (path[i]) { + case SUBSTITUTION: + ++(*subs); + case MATCH: + break; + case INSERTION: + ++(*ins); break; + case DELETION: + ++(*dels); break; + } + } + } + + float CalculateAllShifts(const vector& hyp, + int* subs, int* ins, int* dels, int* shifts) const { + BuildWordMatches(hyp, &nmap_); + vector path; + float med_cost = MinimumEditDistance(hyp, ref_, &path); + float edits = 0; + vector cur = hyp; + *shifts = 0; + if (ter_short_circuit_long_sentences < 0 || + ref_.size() < ter_short_circuit_long_sentences) { + while (true) { + vector new_hyp; + vector new_path; + float new_med_cost; + if (!CalculateBestShift(cur, hyp, med_cost, path, &new_hyp, &new_med_cost, &new_path)) + break; + edits += COSTS::shift; + ++(*shifts); + med_cost = new_med_cost; + path.swap(new_path); + cur.swap(new_hyp); + } + } + GetPathStats(path, subs, ins, dels); + return med_cost + edits; + } +}; + +class TERScore : public ScoreBase { + friend class TERScorer; + + public: + + TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} + float ComputePartialScore() const { return 0.0;} + float ComputeScore() const { + float edits = static_cast(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); + return edits / static_cast(stats[kREF_WORDCOUNT]); + } + void ScoreDetails(string* details) const; + void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} + void PlusEquals(const Score& delta, const float scale) { + if (scale==1) + stats += static_cast(delta).stats; + if (scale==-1) + stats -= static_cast(delta).stats; + throw std::runtime_error("TERScore::PlusEquals with scale != +-1"); + } + void PlusEquals(const Score& delta) { + stats += static_cast(delta).stats; + } + + ScoreP GetZero() const { + return ScoreP(new TERScore); + } + ScoreP GetOne() const { + return ScoreP(new TERScore); + } + void Subtract(const Score& rhs, Score* res) const { + static_cast(res)->stats = stats - static_cast(rhs).stats; + } + void Encode(std::string* out) const { + ostringstream os; + os << stats[kINSERTIONS] << ' ' + << stats[kDELETIONS] << ' ' + << stats[kSUBSTITUTIONS] << ' ' + << stats[kSHIFTS] << ' ' + << stats[kREF_WORDCOUNT]; + *out = os.str(); + } + bool IsAdditiveIdentity() const { + for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i) + if (stats[i] != 0) return false; + return true; + } + private: + valarray stats; +}; + +ScoreP TERScorer::ScoreFromString(const std::string& data) { + istringstream is(data); + TERScore* r = new TERScore; + is >> r->stats[TERScore::kINSERTIONS] + >> r->stats[TERScore::kDELETIONS] + >> r->stats[TERScore::kSUBSTITUTIONS] + >> r->stats[TERScore::kSHIFTS] + >> r->stats[TERScore::kREF_WORDCOUNT]; + return ScoreP(r); +} + +void TERScore::ScoreDetails(std::string* details) const { + char buf[200]; + sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", + ComputeScore() * 100.0f, + stats[kINSERTIONS], + stats[kDELETIONS], + stats[kSUBSTITUTIONS], + stats[kSHIFTS], + stats[kREF_WORDCOUNT]); + *details = buf; +} + +TERScorer::~TERScorer() { + for (vector::iterator i = impl_.begin(); i != impl_.end(); ++i) + delete *i; +} + +TERScorer::TERScorer(const vector >& refs) : impl_(refs.size()) { + for (int i = 0; i < refs.size(); ++i) + impl_[i] = new TERScorerImpl(refs[i]); +} + +ScoreP TERScorer::ScoreCCandidate(const vector& hyp) 
const { + return ScoreP(); +} + +ScoreP TERScorer::ScoreCandidate(const std::vector& hyp) const { + float best_score = numeric_limits::max(); + TERScore* res = new TERScore; + int avg_len = 0; + for (int i = 0; i < impl_.size(); ++i) + avg_len += impl_[i]->GetRefLength(); + avg_len /= impl_.size(); + for (int i = 0; i < impl_.size(); ++i) { + int subs, ins, dels, shifts; + float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts); + // cerr << "Component TER cost: " << score << endl; + if (score < best_score) { + res->stats[TERScore::kINSERTIONS] = ins; + res->stats[TERScore::kDELETIONS] = dels; + res->stats[TERScore::kSUBSTITUTIONS] = subs; + res->stats[TERScore::kSHIFTS] = shifts; + if (ter_use_average_ref_len) { + res->stats[TERScore::kREF_WORDCOUNT] = avg_len; + } else { + res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength(); + } + + best_score = score; + } + } + return ScoreP(res); +} +#endif + +void TERMetric::ComputeSufficientStatistics(const vector& hyp, + const vector >& refs, + SufficientStats* out) const { + out->fields.resize(kDUMMY_LAST_ENTRY); +} + +float TERMetric::ComputeScore(const SufficientStats& stats) const { + float edits = static_cast(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); + return edits / static_cast(stats[kREF_WORDCOUNT]); +} + diff --git a/mteval/ns_ter.h b/mteval/ns_ter.h new file mode 100644 index 00000000..bb90f95e --- /dev/null +++ b/mteval/ns_ter.h @@ -0,0 +1,18 @@ +#ifndef _NS_TER_H_ +#define _NS_TER_H_ + +#include "ns.h" + +class TERMetric : public EvaluationMetric { + friend class EvaluationMetric; + protected: + TERMetric() : EvaluationMetric("TER") {} + + public: + virtual void ComputeSufficientStatistics(const std::vector& hyp, + const std::vector >& refs, + SufficientStats* out) const; + virtual float ComputeScore(const SufficientStats& stats) const; +}; + +#endif diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc index a07a8c4b..09da250c 100644 --- a/mteval/scorer_test.cc +++ b/mteval/scorer_test.cc @@ -3,9 +3,11 @@ #include #include +#include "ns.h" #include "tdict.h" #include "scorer.h" #include "aer_scorer.h" +#include "kernel_string_subseq.h" using namespace std; @@ -175,6 +177,50 @@ TEST_F(ScorerTest, AERTest) { EXPECT_EQ(d2, details); } +TEST_F(ScorerTest, Kernel) { + for (int i = 1; i < 10; ++i) { + const float l = (i / 10.0); + float f = ssk<4>(refs0[0], hyp1, l) + + ssk<4>(refs0[1], hyp1, l) + + ssk<4>(refs0[2], hyp1, l) + + ssk<4>(refs0[3], hyp1, l); + float f2= ssk<4>(refs1[0], hyp2, l) + + ssk<4>(refs1[1], hyp2, l) + + ssk<4>(refs1[2], hyp2, l) + + ssk<4>(refs1[3], hyp2, l); + f /= 4; + f2 /= 4; + float f3= ssk<4>(refs0[0], hyp2, l) + + ssk<4>(refs0[1], hyp2, l) + + ssk<4>(refs0[2], hyp2, l) + + ssk<4>(refs0[3], hyp2, l); + float f4= ssk<4>(refs1[0], hyp1, l) + + ssk<4>(refs1[1], hyp1, l) + + ssk<4>(refs1[2], hyp1, l) + + ssk<4>(refs1[3], hyp1, l); + f3 += f4; + f3 /= 8; + cerr << "LAMBDA=" << l << "\t" << f << " " << f2 << "\tf=" << ((f + f2)/2 - f3) << " (bad=" << f3 << ")" << endl; + } +} + +TEST_F(ScorerTest, NewScoreAPI) { + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr e1 = metric->CreateSegmentEvaluator(refs0); + boost::shared_ptr e2 = metric->CreateSegmentEvaluator(refs1); + SufficientStats stats1; + e1->Evaluate(hyp2, &stats1); + SufficientStats stats2; + e2->Evaluate(hyp1, &stats2); + stats1 += stats2; + string ss; + stats1.Encode(&ss); + cerr << "SS: " << ss << endl; + cerr << metric->ComputeScore(stats1) << endl; + 
SufficientStats statse("IBM_BLEU 53 32 18 11 65 63 61 59 65 72"); + cerr << metric->ComputeScore(statse) << endl; +} + int main(int argc, char **argv) { testing::InitGoogleTest(&argc, argv); return RUN_ALL_TESTS(); diff --git a/utils/kernel_string_subseq.h b/utils/kernel_string_subseq.h new file mode 100644 index 00000000..516e8b89 --- /dev/null +++ b/utils/kernel_string_subseq.h @@ -0,0 +1,51 @@ +#ifndef _KERNEL_STRING_SUBSEQ_H_ +#define _KERNEL_STRING_SUBSEQ_H_ + +#include +#include +#include + +template +float ssk(const T* s, const size_t s_size, const T* t, const size_t t_size, const float lambda) { + assert(N > 0); + boost::multi_array kp(boost::extents[N + 1][s_size + 1][t_size + 1]); + const float l2 = lambda * lambda; + for (unsigned j = 0; j < s_size; ++j) + for (unsigned k = 0; k < t_size; ++k) + kp[0][j][k] = 1.0f; + for (unsigned i = 0; i < N; ++i) { + for (unsigned j = 0; j < s_size; ++j) { + float kpp = 0.0f; + for (unsigned k = 0; k < t_size; ++k) { + kpp = lambda * (kpp + lambda * (s[j]==t[k]) * kp[i][j][k]); + kp[i + 1][j + 1][k + 1] = lambda * kp[i + 1][j][k + 1] + kpp; + } + } + } + float kn = 0.0f; + for (int i = 0; i < N; ++i) + for (int j = 0; j < s_size; ++j) + for (int k = 0; k < t_size; ++k) + kn += l2 * (s[j] == t[k]) * kp[i][j][k]; + return kn; +} + +template +float ssk(const std::vector& s, const std::vector& t, const float lambda) { + float kst = ssk(&s[0], s.size(), &t[0], t.size(), lambda); + if (!kst) return 0.0f; + float kss = ssk(&s[0], s.size(), &s[0], s.size(), lambda); + float ktt = ssk(&t[0], t.size(), &t[0], t.size(), lambda); + return kst / std::sqrt(kss * ktt); +} + +template +float ssk(const std::string& s, const std::string& t, const float lambda) { + float kst = ssk(&s[0], s.size(), &t[0], t.size(), lambda); + if (!kst) return 0.0f; + float kss = ssk(&s[0], s.size(), &s[0], s.size(), lambda); + float ktt = ssk(&t[0], t.size(), &t[0], t.size(), lambda); + return kst / std::sqrt(kss * ktt); +} + +#endif -- cgit v1.2.3 From 2eb3bb96c6f780c477585b33273fc0c0d56c80e4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 20 Dec 2011 15:51:11 -0500 Subject: new scorer interface is implemented, but not used --- mteval/Makefile.am | 2 +- mteval/ns.cc | 67 ++++++++++++++++++++------ mteval/ns.h | 23 +++++---- mteval/ns_comb.cc | 87 +++++++++++++++++++++++++++++++++ mteval/ns_comb.h | 19 ++++++++ mteval/ns_ext.cc | 130 ++++++++++++++++++++++++++++++++++++++++++++++++++ mteval/ns_ext.h | 21 ++++++++ mteval/ns_ter.cc | 126 ++++++++++-------------------------------------- mteval/ns_ter.h | 1 + mteval/scorer_test.cc | 12 +++-- utils/stringlib.h | 7 +++ 11 files changed, 362 insertions(+), 133 deletions(-) create mode 100644 mteval/ns_comb.cc create mode 100644 mteval/ns_comb.h create mode 100644 mteval/ns_ext.cc create mode 100644 mteval/ns_ext.h diff --git a/mteval/Makefile.am b/mteval/Makefile.am index 95845090..6679d949 100644 --- a/mteval/Makefile.am +++ b/mteval/Makefile.am @@ -10,7 +10,7 @@ endif noinst_LIBRARIES = libmteval.a -libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc external_scorer.cc +libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc external_scorer.cc ns.cc ns_ter.cc ns_ext.cc ns_comb.cc fast_score_SOURCES = fast_score.cc fast_score_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a -lz diff --git a/mteval/ns.cc b/mteval/ns.cc index 1045a51f..6139757d 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -1,5 +1,7 @@ #include "ns.h" #include "ns_ter.h" +#include "ns_ext.h" +#include "ns_comb.h" #include #include @@ 
-7,6 +9,9 @@ #include #include +#include "tdict.h" +#include "stringlib.h" + using namespace std; using boost::shared_ptr; @@ -19,6 +24,7 @@ struct DefaultSegmentEvaluator : public SegmentEvaluator { DefaultSegmentEvaluator(const vector >& refs, const EvaluationMetric* em) : refs_(refs), em_(em) {} void Evaluate(const vector& hyp, SufficientStats* out) const { em_->ComputeSufficientStatistics(hyp, refs_, out); + out->id_ = em_->MetricId(); } const vector > refs_; const EvaluationMetric* em_; @@ -28,6 +34,11 @@ shared_ptr EvaluationMetric::CreateSegmentEvaluator(const vect return shared_ptr(new DefaultSegmentEvaluator(refs, this)); } +#define MAX_SS_VECTOR_SIZE 50 +unsigned EvaluationMetric::SufficientStatisticsVectorSize() const { + return MAX_SS_VECTOR_SIZE; +} + void EvaluationMetric::ComputeSufficientStatistics(const vector&, const vector >&, SufficientStats*) const { @@ -35,6 +46,12 @@ void EvaluationMetric::ComputeSufficientStatistics(const vector&, abort(); } +string EvaluationMetric::DetailedScore(const SufficientStats& stats) const { + ostringstream os; + os << MetricId() << "=" << ComputeScore(stats); + return os.str(); +} + enum BleuType { IBM, Koehn, NIST }; template struct BleuSegmentEvaluator : public SegmentEvaluator { @@ -57,7 +74,7 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { void Evaluate(const vector& hyp, SufficientStats* out) const { out->fields.resize(N + N + 2); - out->evaluation_metric = evaluation_metric; + out->id_ = evaluation_metric->MetricId(); for (unsigned i = 0; i < N+N+2; ++i) out->fields[i] = 0; ComputeNgramStats(hyp, &out->fields[0], &out->fields[N], true); @@ -157,7 +174,12 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { template struct BleuMetric : public EvaluationMetric { BleuMetric() : EvaluationMetric("IBM_BLEU") {} - float ComputeScore(const SufficientStats& stats) const { + unsigned SufficientStatisticsVectorSize() const { return N*2 + 2; } + shared_ptr CreateSegmentEvaluator(const vector >& refs) const { + return shared_ptr(new BleuSegmentEvaluator(refs, this)); + } + float ComputeBreakdown(const SufficientStats& stats, float* bp, vector* out) const { + if (out) { out->clear(); } float log_bleu = 0; int count = 0; for (int i = 0; i < N; ++i) { @@ -166,7 +188,7 @@ struct BleuMetric : public EvaluationMetric { // smooth bleu if (!cor_count) { cor_count = 0.01; } float lprec = log(cor_count) - log(stats.fields[i+N]); // log(hyp_ngram_counts[i]); - // if (precs) precs->push_back(exp(lprec)); + if (out) out->push_back(exp(lprec)); log_bleu += lprec; ++count; } @@ -178,32 +200,51 @@ struct BleuMetric : public EvaluationMetric { if (hyp_len < ref_len) lbp = (hyp_len - ref_len) / hyp_len; log_bleu += lbp; - //if (bp) *bp = exp(lbp); + if (bp) *bp = exp(lbp); return exp(log_bleu); } - shared_ptr CreateSegmentEvaluator(const vector >& refs) const { - return shared_ptr(new BleuSegmentEvaluator(refs, this)); + string DetailedScore(const SufficientStats& stats) const { + char buf[2000]; + vector precs(N); + float bp; + float bleu = ComputeBreakdown(stats, &bp, &precs); + sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", + bleu*100.0, + precs[0]*100.0, + precs[1]*100.0, + precs[2]*100.0, + precs[3]*100.0, + bp); + return buf; + } + float ComputeScore(const SufficientStats& stats) const { + return ComputeBreakdown(stats, NULL, NULL); } }; -EvaluationMetric* EvaluationMetric::Instance(const string& metric_id) { +EvaluationMetric* EvaluationMetric::Instance(const string& imetric_id) { static bool is_first = true; if (is_first) { 
instances_["NULL"] = NULL; is_first = false; } + const string metric_id = UppercaseString(imetric_id); map::iterator it = instances_.find(metric_id); if (it == instances_.end()) { EvaluationMetric* m = NULL; - if (metric_id == "IBM_BLEU") { + if (metric_id == "IBM_BLEU") { m = new BleuMetric<4, IBM>; } else if (metric_id == "NIST_BLEU") { m = new BleuMetric<4, NIST>; - } else if (metric_id == "Koehn_BLEU") { + } else if (metric_id == "KOEHN_BLEU") { m = new BleuMetric<4, Koehn>; } else if (metric_id == "TER") { m = new TERMetric; + } else if (metric_id == "METEOR") { + m = new ExternalMetric("METEOR", "java -Xmx1536m -jar /Users/cdyer/software/meteor/meteor-1.3.jar - - -mira -lower -t tune -l en"); + } else if (metric_id.find("COMB:") == 0) { + m = new CombinationMetric(metric_id); } else { cerr << "Implement please: " << metric_id << endl; abort(); @@ -220,9 +261,7 @@ EvaluationMetric* EvaluationMetric::Instance(const string& metric_id) { SufficientStats::SufficientStats(const string& encoded) { istringstream is(encoded); - string type; - is >> type; - evaluation_metric = EvaluationMetric::Instance(type); + is >> id_; float val; while(is >> val) fields.push_back(val); @@ -230,8 +269,8 @@ SufficientStats::SufficientStats(const string& encoded) { void SufficientStats::Encode(string* out) const { ostringstream os; - if (evaluation_metric) - os << evaluation_metric->MetricId(); + if (id_.size() > 0) + os << id_; else os << "NULL"; for (unsigned i = 0; i < fields.size(); ++i) diff --git a/mteval/ns.h b/mteval/ns.h index f19b7509..622265db 100644 --- a/mteval/ns.h +++ b/mteval/ns.h @@ -7,18 +7,15 @@ #include #include "wordid.h" -class EvaluationMetric; - class SufficientStats { public: - SufficientStats() : evaluation_metric() {} + SufficientStats() : id_() {} explicit SufficientStats(const std::string& encoded); - explicit SufficientStats(const EvaluationMetric* s) : evaluation_metric(s) {} - SufficientStats(const EvaluationMetric* s, const std::vector& f) : - evaluation_metric(s), fields(f) {} + SufficientStats(const std::string& mid, const std::vector& f) : + id_(mid), fields(f) {} SufficientStats& operator+=(const SufficientStats& delta) { - if (delta.evaluation_metric) evaluation_metric = delta.evaluation_metric; + if (id_.empty() && delta.id_.size()) id_ = delta.id_; if (fields.size() != delta.fields.size()) fields.resize(std::max(fields.size(), delta.fields.size())); for (unsigned i = 0; i < delta.fields.size(); ++i) @@ -26,7 +23,7 @@ class SufficientStats { return *this; } SufficientStats& operator-=(const SufficientStats& delta) { - if (delta.evaluation_metric) evaluation_metric = delta.evaluation_metric; + if (id_.empty() && delta.id_.size()) id_ = delta.id_; if (fields.size() != delta.fields.size()) fields.resize(std::max(fields.size(), delta.fields.size())); for (unsigned i = 0; i < delta.fields.size(); ++i) @@ -53,7 +50,7 @@ class SufficientStats { } void Encode(std::string* out) const; - const EvaluationMetric* evaluation_metric; + std::string id_; std::vector fields; }; @@ -73,13 +70,13 @@ struct SegmentEvaluator { }; // Instructions for implementing a new metric -// Override MetricId() and give the metric a unique string name (no spaces) // To Instance(), add something that creates the metric +// Implement ComputeScore(const SufficientStats& stats) const; // Implement ONE of the following: // 1) void ComputeSufficientStatistics(const std::vector >& refs, SufficientStats* out) const; // 2) a new SegmentEvaluator class AND CreateSegmentEvaluator(const std::vector >& refs) const; 
-// The later (#2) is only used when it is necessary to precompute per-segment data from a set of refs -// Implement ComputeScore(const SufficientStats& stats) const; +// [The later (#2) is only used when it is necessary to precompute per-segment data from a set of refs] +// OPTIONAL: Override SufficientStatisticsVectorSize() if it is easy to do so class EvaluationMetric { public: static EvaluationMetric* Instance(const std::string& metric_id = "IBM_BLEU"); @@ -91,7 +88,9 @@ class EvaluationMetric { public: const std::string& MetricId() const { return name_; } + virtual unsigned SufficientStatisticsVectorSize() const; virtual float ComputeScore(const SufficientStats& stats) const = 0; + virtual std::string DetailedScore(const SufficientStats& stats) const; virtual boost::shared_ptr CreateSegmentEvaluator(const std::vector >& refs) const; virtual void ComputeSufficientStatistics(const std::vector& hyp, const std::vector >& refs, diff --git a/mteval/ns_comb.cc b/mteval/ns_comb.cc new file mode 100644 index 00000000..41c634cd --- /dev/null +++ b/mteval/ns_comb.cc @@ -0,0 +1,87 @@ +#include "ns_comb.h" + +#include + +#include "stringlib.h" + +using namespace std; + +// e.g. COMB:IBM_BLEU=0.5;TER=0.5 +CombinationMetric::CombinationMetric(const std::string& cmd) : + EvaluationMetric(cmd), + total_size() { + if (cmd.find("COMB:") != 0 || cmd.size() < 9) { + cerr << "Error in combination metric specifier: " << cmd << endl; + exit(1); + } + string mix = cmd.substr(5); + vector comps; + Tokenize(cmd.substr(5), ';', &comps); + if(comps.size() < 2) { + cerr << "Error in combination metric specifier: " << cmd << endl; + exit(1); + } + vector cwpairs; + for (unsigned i = 0; i < comps.size(); ++i) { + Tokenize(comps[i], '=', &cwpairs); + if (cwpairs.size() != 2) { cerr << "Error in combination metric specifier: " << cmd << endl; exit(1); } + metrics.push_back(EvaluationMetric::Instance(cwpairs[0])); + coeffs.push_back(atof(cwpairs[1].c_str())); + offsets.push_back(total_size); + total_size += metrics.back()->SufficientStatisticsVectorSize(); + cerr << (i > 0 ? 
" + " : "( ") << coeffs.back() << " * " << cwpairs[0]; + } + cerr << " )\n"; +} + +struct CombinationSegmentEvaluator : public SegmentEvaluator { + CombinationSegmentEvaluator(const string& id, + const vector >& refs, + const vector& metrics, + const vector& offsets, + const unsigned ts) : id_(id), offsets_(offsets), total_size_(ts), component_evaluators_(metrics.size()) { + for (unsigned i = 0; i < metrics.size(); ++i) + component_evaluators_[i] = metrics[i]->CreateSegmentEvaluator(refs); + } + virtual void Evaluate(const std::vector& hyp, SufficientStats* out) const { + out->id_ = id_; + out->fields.resize(total_size_); + for (unsigned i = 0; i < component_evaluators_.size(); ++i) { + SufficientStats t; + component_evaluators_[i]->Evaluate(hyp, &t); + for (unsigned j = 0; j < t.fields.size(); ++j) { + unsigned op = j + offsets_[i]; + assert(op < out->fields.size()); + out->fields[op] = t[j]; + } + } + } + const string& id_; + const vector& offsets_; + const unsigned total_size_; + vector > component_evaluators_; +}; + +boost::shared_ptr CombinationMetric::CreateSegmentEvaluator(const std::vector >& refs) const { + boost::shared_ptr res; + res.reset(new CombinationSegmentEvaluator(MetricId(), refs, metrics, offsets, total_size)); + return res; +} + +float CombinationMetric::ComputeScore(const SufficientStats& stats) const { + float tot = 0; + for (unsigned i = 0; i < metrics.size(); ++i) { + SufficientStats t; + unsigned next = total_size; + if (i + 1 < offsets.size()) next = offsets[i+1]; + for (unsigned j = offsets[i]; j < next; ++j) + t.fields.push_back(stats[j]); + tot += metrics[i]->ComputeScore(t) * coeffs[i]; + } + return tot; +} + +unsigned CombinationMetric::SufficientStatisticsVectorSize() const { + return total_size; +} + diff --git a/mteval/ns_comb.h b/mteval/ns_comb.h new file mode 100644 index 00000000..140e7e6a --- /dev/null +++ b/mteval/ns_comb.h @@ -0,0 +1,19 @@ +#ifndef _NS_COMB_H_ +#define _NS_COMB_H_ + +#include "ns.h" + +class CombinationMetric : public EvaluationMetric { + public: + CombinationMetric(const std::string& cmd); + virtual boost::shared_ptr CreateSegmentEvaluator(const std::vector >& refs) const; + virtual float ComputeScore(const SufficientStats& stats) const; + virtual unsigned SufficientStatisticsVectorSize() const; + private: + std::vector metrics; + std::vector coeffs; + std::vector offsets; + unsigned total_size; +}; + +#endif diff --git a/mteval/ns_ext.cc b/mteval/ns_ext.cc new file mode 100644 index 00000000..956708af --- /dev/null +++ b/mteval/ns_ext.cc @@ -0,0 +1,130 @@ +#include "ns_ext.h" + +#include // popen +#include +#include +#include +#include +#include +#include + +#include "stringlib.h" +#include "tdict.h" + +using namespace std; + +struct NScoreServer { + NScoreServer(const std::string& cmd); + ~NScoreServer(); + + float ComputeScore(const std::vector& fields); + void Evaluate(const std::vector >& refs, const std::vector& hyp, std::vector* fields); + + private: + void RequestResponse(const std::string& request, std::string* response); + int p2c[2]; + int c2p[2]; +}; + +NScoreServer::NScoreServer(const string& cmd) { + cerr << "Invoking " << cmd << " ..." 
<< endl; + if (pipe(p2c) < 0) { perror("pipe"); exit(1); } + if (pipe(c2p) < 0) { perror("pipe"); exit(1); } + pid_t cpid = fork(); + if (cpid < 0) { perror("fork"); exit(1); } + if (cpid == 0) { // child + close(p2c[1]); + close(c2p[0]); + dup2(p2c[0], 0); + close(p2c[0]); + dup2(c2p[1], 1); + close(c2p[1]); + cerr << "Exec'ing from child " << cmd << endl; + vector vargs; + SplitOnWhitespace(cmd, &vargs); + const char** cargv = static_cast(malloc(sizeof(const char*) * vargs.size())); + for (unsigned i = 1; i < vargs.size(); ++i) cargv[i-1] = vargs[i].c_str(); + cargv[vargs.size() - 1] = NULL; + execvp(vargs[0].c_str(), (char* const*)cargv); + } else { // parent + close(c2p[1]); + close(p2c[0]); + } + string dummy; + RequestResponse("SCORE ||| Reference initialization string . ||| Testing initialization string .", &dummy); + assert(dummy.size() > 0); + cerr << "Connection established.\n"; +} + +NScoreServer::~NScoreServer() { + // TODO close stuff, join stuff +} + +float NScoreServer::ComputeScore(const vector& fields) { + ostringstream os; + os << "EVAL |||"; + for (unsigned i = 0; i < fields.size(); ++i) + os << ' ' << fields[i]; + string sres; + RequestResponse(os.str(), &sres); + return strtod(sres.c_str(), NULL); +} + +void NScoreServer::Evaluate(const vector >& refs, const vector& hyp, vector* fields) { + ostringstream os; + os << "SCORE"; + for (unsigned i = 0; i < refs.size(); ++i) { + os << " |||"; + for (unsigned j = 0; j < refs[i].size(); ++j) { + os << ' ' << TD::Convert(refs[i][j]); + } + } + os << " |||"; + for (unsigned i = 0; i < hyp.size(); ++i) { + os << ' ' << TD::Convert(hyp[i]); + } + string sres; + RequestResponse(os.str(), &sres); + istringstream is(sres); + float val; + fields->clear(); + while(is >> val) + fields->push_back(val); +} + +#define MAX_BUF 16000 + +void NScoreServer::RequestResponse(const string& request, string* response) { +// cerr << "@SERVER: " << request << endl; + string x = request + "\n"; + write(p2c[1], x.c_str(), x.size()); + char buf[MAX_BUF]; + size_t n = read(c2p[0], buf, MAX_BUF); + while (n < MAX_BUF && buf[n-1] != '\n') + n += read(c2p[0], &buf[n], MAX_BUF - n); + + buf[n-1] = 0; + if (n < 2) { + cerr << "Malformed response: " << buf << endl; + } + *response = Trim(buf, " \t\n"); +// cerr << "@RESPONSE: '" << *response << "'\n"; +} + +void ExternalMetric::ComputeSufficientStatistics(const std::vector& hyp, + const std::vector >& refs, + SufficientStats* out) const { + eval_server->Evaluate(refs, hyp, &out->fields); +} + +float ExternalMetric::ComputeScore(const SufficientStats& stats) const { + eval_server->ComputeScore(stats.fields); +} + +ExternalMetric::ExternalMetric(const string& metric_name, const std::string& command) : + EvaluationMetric(metric_name), + eval_server(new NScoreServer(command)) {} + +ExternalMetric::~ExternalMetric() { + delete eval_server; +} diff --git a/mteval/ns_ext.h b/mteval/ns_ext.h new file mode 100644 index 00000000..78badb2e --- /dev/null +++ b/mteval/ns_ext.h @@ -0,0 +1,21 @@ +#ifndef _NS_EXTERNAL_SCORER_H_ +#define _NS_EXTERNAL_SCORER_H_ + +#include "ns.h" + +struct NScoreServer; +class ExternalMetric : public EvaluationMetric { + public: + ExternalMetric(const std::string& metricid, const std::string& command); + ~ExternalMetric(); + + virtual void ComputeSufficientStatistics(const std::vector& hyp, + const std::vector >& refs, + SufficientStats* out) const; + virtual float ComputeScore(const SufficientStats& stats) const; + + protected: + NScoreServer* eval_server; +}; + +#endif diff --git 
a/mteval/ns_ter.cc b/mteval/ns_ter.cc index 14dc6e49..8c969e58 100644 --- a/mteval/ns_ter.cc +++ b/mteval/ns_ter.cc @@ -1,15 +1,11 @@ #include "ns_ter.h" -#include #include #include #include -#include #include #include -#include #include -#include #include "tdict.h" static const bool ter_use_average_ref_len = true; @@ -25,7 +21,7 @@ static const unsigned kDUMMY_LAST_ENTRY = 5; using namespace std; using namespace std::tr1; -#if 0 +namespace NewScorer { struct COSTS { static const float substitution; @@ -82,7 +78,7 @@ class TERScorerImpl { enum TransType { MATCH, SUBSTITUTION, INSERTION, DELETION }; explicit TERScorerImpl(const vector& ref) : ref_(ref) { - for (int i = 0; i < ref.size(); ++i) + for (unsigned i = 0; i < ref.size(); ++i) rwexists_.insert(ref[i]); } @@ -95,7 +91,7 @@ class TERScorerImpl { } private: - vector ref_; + const vector& ref_; set rwexists_; typedef unordered_map, set, boost::hash > > NgramToIntsMap; @@ -421,68 +417,7 @@ class TERScorerImpl { } }; -class TERScore : public ScoreBase { - friend class TERScorer; - - public: - - TERScore() : stats(0,kDUMMY_LAST_ENTRY) {} - float ComputePartialScore() const { return 0.0;} - float ComputeScore() const { - float edits = static_cast(stats[kINSERTIONS] + stats[kDELETIONS] + stats[kSUBSTITUTIONS] + stats[kSHIFTS]); - return edits / static_cast(stats[kREF_WORDCOUNT]); - } - void ScoreDetails(string* details) const; - void PlusPartialEquals(const Score& rhs, int oracle_e_cover, int oracle_f_cover, int src_len){} - void PlusEquals(const Score& delta, const float scale) { - if (scale==1) - stats += static_cast(delta).stats; - if (scale==-1) - stats -= static_cast(delta).stats; - throw std::runtime_error("TERScore::PlusEquals with scale != +-1"); - } - void PlusEquals(const Score& delta) { - stats += static_cast(delta).stats; - } - - ScoreP GetZero() const { - return ScoreP(new TERScore); - } - ScoreP GetOne() const { - return ScoreP(new TERScore); - } - void Subtract(const Score& rhs, Score* res) const { - static_cast(res)->stats = stats - static_cast(rhs).stats; - } - void Encode(std::string* out) const { - ostringstream os; - os << stats[kINSERTIONS] << ' ' - << stats[kDELETIONS] << ' ' - << stats[kSUBSTITUTIONS] << ' ' - << stats[kSHIFTS] << ' ' - << stats[kREF_WORDCOUNT]; - *out = os.str(); - } - bool IsAdditiveIdentity() const { - for (int i = 0; i < kDUMMY_LAST_ENTRY; ++i) - if (stats[i] != 0) return false; - return true; - } - private: - valarray stats; -}; - -ScoreP TERScorer::ScoreFromString(const std::string& data) { - istringstream is(data); - TERScore* r = new TERScore; - is >> r->stats[TERScore::kINSERTIONS] - >> r->stats[TERScore::kDELETIONS] - >> r->stats[TERScore::kSUBSTITUTIONS] - >> r->stats[TERScore::kSHIFTS] - >> r->stats[TERScore::kREF_WORDCOUNT]; - return ScoreP(r); -} - +#if 0 void TERScore::ScoreDetails(std::string* details) const { char buf[200]; sprintf(buf, "TER = %.2f, %3d|%3d|%3d|%3d (len=%d)", @@ -494,54 +429,43 @@ void TERScore::ScoreDetails(std::string* details) const { stats[kREF_WORDCOUNT]); *details = buf; } +#endif -TERScorer::~TERScorer() { - for (vector::iterator i = impl_.begin(); i != impl_.end(); ++i) - delete *i; -} +} // namespace NewScorer -TERScorer::TERScorer(const vector >& refs) : impl_(refs.size()) { +void TERMetric::ComputeSufficientStatistics(const vector& hyp, + const vector >& refs, + SufficientStats* out) const { + out->fields.resize(kDUMMY_LAST_ENTRY); + float best_score = numeric_limits::max(); + unsigned avg_len = 0; for (int i = 0; i < refs.size(); ++i) - impl_[i] = new 
TERScorerImpl(refs[i]); -} + avg_len += refs[i].size(); + avg_len /= refs.size(); -ScoreP TERScorer::ScoreCCandidate(const vector& hyp) const { - return ScoreP(); -} - -ScoreP TERScorer::ScoreCandidate(const std::vector& hyp) const { - float best_score = numeric_limits::max(); - TERScore* res = new TERScore; - int avg_len = 0; - for (int i = 0; i < impl_.size(); ++i) - avg_len += impl_[i]->GetRefLength(); - avg_len /= impl_.size(); - for (int i = 0; i < impl_.size(); ++i) { + for (int i = 0; i < refs.size(); ++i) { int subs, ins, dels, shifts; - float score = impl_[i]->Calculate(hyp, &subs, &ins, &dels, &shifts); + NewScorer::TERScorerImpl ter(refs[i]); + float score = ter.Calculate(hyp, &subs, &ins, &dels, &shifts); // cerr << "Component TER cost: " << score << endl; if (score < best_score) { - res->stats[TERScore::kINSERTIONS] = ins; - res->stats[TERScore::kDELETIONS] = dels; - res->stats[TERScore::kSUBSTITUTIONS] = subs; - res->stats[TERScore::kSHIFTS] = shifts; + out->fields[kINSERTIONS] = ins; + out->fields[kDELETIONS] = dels; + out->fields[kSUBSTITUTIONS] = subs; + out->fields[kSHIFTS] = shifts; if (ter_use_average_ref_len) { - res->stats[TERScore::kREF_WORDCOUNT] = avg_len; + out->fields[kREF_WORDCOUNT] = avg_len; } else { - res->stats[TERScore::kREF_WORDCOUNT] = impl_[i]->GetRefLength(); + out->fields[kREF_WORDCOUNT] = refs[i].size(); } best_score = score; } } - return ScoreP(res); } -#endif -void TERMetric::ComputeSufficientStatistics(const vector& hyp, - const vector >& refs, - SufficientStats* out) const { - out->fields.resize(kDUMMY_LAST_ENTRY); +unsigned TERMetric::SufficientStatisticsVectorSize() const { + return kDUMMY_LAST_ENTRY; } float TERMetric::ComputeScore(const SufficientStats& stats) const { diff --git a/mteval/ns_ter.h b/mteval/ns_ter.h index bb90f95e..6c020cfa 100644 --- a/mteval/ns_ter.h +++ b/mteval/ns_ter.h @@ -9,6 +9,7 @@ class TERMetric : public EvaluationMetric { TERMetric() : EvaluationMetric("TER") {} public: + virtual unsigned SufficientStatisticsVectorSize() const; virtual void ComputeSufficientStatistics(const std::vector& hyp, const std::vector >& refs, SufficientStats* out) const; diff --git a/mteval/scorer_test.cc b/mteval/scorer_test.cc index 09da250c..73159557 100644 --- a/mteval/scorer_test.cc +++ b/mteval/scorer_test.cc @@ -205,20 +205,22 @@ TEST_F(ScorerTest, Kernel) { } TEST_F(ScorerTest, NewScoreAPI) { - EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + //EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + //EvaluationMetric* metric = EvaluationMetric::Instance("METEOR"); + EvaluationMetric* metric = EvaluationMetric::Instance("COMB:IBM_BLEU=0.5;TER=-0.5"); boost::shared_ptr e1 = metric->CreateSegmentEvaluator(refs0); boost::shared_ptr e2 = metric->CreateSegmentEvaluator(refs1); SufficientStats stats1; - e1->Evaluate(hyp2, &stats1); + e1->Evaluate(hyp1, &stats1); SufficientStats stats2; - e2->Evaluate(hyp1, &stats2); + e2->Evaluate(hyp2, &stats2); stats1 += stats2; string ss; stats1.Encode(&ss); cerr << "SS: " << ss << endl; cerr << metric->ComputeScore(stats1) << endl; - SufficientStats statse("IBM_BLEU 53 32 18 11 65 63 61 59 65 72"); - cerr << metric->ComputeScore(statse) << endl; + //SufficientStats statse("IBM_BLEU 53 32 18 11 65 63 61 59 65 72"); + //cerr << metric->ComputeScore(statse) << endl; } int main(int argc, char **argv) { diff --git a/utils/stringlib.h b/utils/stringlib.h index cafbdac3..f457e1e4 100644 --- a/utils/stringlib.h +++ b/utils/stringlib.h @@ -125,6 +125,13 @@ inline std::string 
LowercaseString(const std::string& in) { return res; } +inline std::string UppercaseString(const std::string& in) { + std::string res(in.size(),' '); + for (int i = 0; i < in.size(); ++i) + res[i] = toupper(in[i]); + return res; +} + inline int CountSubstrings(const std::string& str, const std::string& sub) { size_t p = 0; int res = 0; -- cgit v1.2.3 From e4c5e87db2139aa0f8655b063da7d8b5199cb46d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 20 Dec 2011 18:34:14 -0500 Subject: migrate fast_score to the new API --- mteval/Makefile.am | 2 +- mteval/fast_score.cc | 40 +++++++++++++++++++++++----------------- mteval/ns.cc | 5 +++-- mteval/ns_ter.cc | 12 ++++++++++++ mteval/ns_ter.h | 1 + pro-train/dist-pro.pl | 2 +- vest/dist-vest.pl | 2 +- 7 files changed, 42 insertions(+), 22 deletions(-) diff --git a/mteval/Makefile.am b/mteval/Makefile.am index 6679d949..e7126675 100644 --- a/mteval/Makefile.am +++ b/mteval/Makefile.am @@ -10,7 +10,7 @@ endif noinst_LIBRARIES = libmteval.a -libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc external_scorer.cc ns.cc ns_ter.cc ns_ext.cc ns_comb.cc +libmteval_a_SOURCES = ter.cc comb_scorer.cc aer_scorer.cc scorer.cc external_scorer.cc ns.cc ns_ter.cc ns_ext.cc ns_comb.cc ns_docscorer.cc fast_score_SOURCES = fast_score.cc fast_score_LDADD = libmteval.a $(top_srcdir)/utils/libutils.a -lz diff --git a/mteval/fast_score.cc b/mteval/fast_score.cc index 5ee264a6..a271ccc5 100644 --- a/mteval/fast_score.cc +++ b/mteval/fast_score.cc @@ -4,9 +4,11 @@ #include #include +#include "stringlib.h" #include "filelib.h" #include "tdict.h" -#include "scorer.h" +#include "ns.h" +#include "ns_docscorer.h" using namespace std; namespace po = boost::program_options; @@ -14,8 +16,8 @@ namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("reference,r",po::value >(), "[REQD] Reference translation(s) (tokenized text file)") - ("loss_function,l",po::value()->default_value("ibm_bleu"), "Scoring metric (ibm_bleu, nist_bleu, koehn_bleu, ter, combi)") + ("reference,r",po::value >(), "[1 or more required] Reference translation(s) in tokenized text files") + ("evaluation_metric,m",po::value()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") ("in_file,i", po::value()->default_value("-"), "Input file") ("help,h", "Help"); po::options_description dcmdline_options; @@ -35,24 +37,29 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as >(), ""); + string loss_function = UppercaseString(conf["evaluation_metric"].as()); + if (loss_function == "COMBI") { + cerr << "WARNING: 'combi' metric is no longer supported, switching to 'COMB:TER=-0.5;IBM_BLEU=0.5'\n"; + loss_function = "COMB:TER=-0.5;IBM_BLEU=0.5"; + } else if (loss_function == "BLEU") { + cerr << "WARNING: 'BLEU' is ambiguous, assuming 'IBM_BLEU'\n"; + loss_function = "IBM_BLEU"; + } + EvaluationMetric* metric = EvaluationMetric::Instance(loss_function); + DocumentScorer ds(metric, conf["reference"].as >()); cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; ReadFile rf(conf["in_file"].as()); - ScoreP acc; + SufficientStats acc; 
istream& in = *rf.stream(); int lc = 0; - while(in) { - string line; - getline(in, line); - if (line.empty() && !in) break; + string line; + while(getline(in, line)) { vector sent; TD::ConvertSentence(line, &sent); - ScoreP sentscore = ds[lc]->ScoreCandidate(sent); - if (!acc) { acc = sentscore->GetZero(); } - acc->PlusEquals(*sentscore); + SufficientStats t; + ds[lc]->Evaluate(sent, &t); + acc += t; ++lc; } assert(lc > 0); @@ -63,9 +70,8 @@ int main(int argc, char** argv) { if (lc != ds.size()) cerr << "Fewer sentences in hyp (" << lc << ") than refs (" << ds.size() << "): scoring partial set!\n"; - float score = acc->ComputeScore(); - string details; - acc->ScoreDetails(&details); + float score = metric->ComputeScore(acc); + const string details = metric->DetailedScore(acc); cerr << details << endl; cout << score << endl; return 0; diff --git a/mteval/ns.cc b/mteval/ns.cc index 6139757d..1018319d 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -173,7 +173,7 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { template struct BleuMetric : public EvaluationMetric { - BleuMetric() : EvaluationMetric("IBM_BLEU") {} + BleuMetric() : EvaluationMetric(BrevityType == IBM ? "IBM_BLEU" : (BrevityType == Koehn ? "KOEHN_BLEU" : "NIST_BLEU")) {} unsigned SufficientStatisticsVectorSize() const { return N*2 + 2; } shared_ptr CreateSegmentEvaluator(const vector >& refs) const { return shared_ptr(new BleuSegmentEvaluator(refs, this)); @@ -208,7 +208,8 @@ struct BleuMetric : public EvaluationMetric { vector precs(N); float bp; float bleu = ComputeBreakdown(stats, &bp, &precs); - sprintf(buf, "BLEU = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", + sprintf(buf, "%s = %.2f, %.1f|%.1f|%.1f|%.1f (brev=%.3f)", + MetricId().c_str(), bleu*100.0, precs[0]*100.0, precs[1]*100.0, diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc index 8c969e58..f75acf1d 100644 --- a/mteval/ns_ter.cc +++ b/mteval/ns_ter.cc @@ -473,3 +473,15 @@ float TERMetric::ComputeScore(const SufficientStats& stats) const { return edits / static_cast(stats[kREF_WORDCOUNT]); } +string TERMetric::DetailedScore(const SufficientStats& stats) const { + char buf[200]; + sprintf(buf, "TER = %.2f, %3.f|%3.f|%3.f|%3.f (len=%3.f)", + ComputeScore(stats) * 100.0f, + stats[kINSERTIONS], + stats[kDELETIONS], + stats[kSUBSTITUTIONS], + stats[kSHIFTS], + stats[kREF_WORDCOUNT]); + return buf; +} + diff --git a/mteval/ns_ter.h b/mteval/ns_ter.h index 6c020cfa..3190fc1b 100644 --- a/mteval/ns_ter.h +++ b/mteval/ns_ter.h @@ -10,6 +10,7 @@ class TERMetric : public EvaluationMetric { public: virtual unsigned SufficientStatisticsVectorSize() const; + virtual std::string DetailedScore(const SufficientStats& stats) const; virtual void ComputeSufficientStatistics(const std::vector& hyp, const std::vector >& refs, SufficientStats* out) const; diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index 5db053de..ba9cdc06 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -288,7 +288,7 @@ while (1){ $retries++; } die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); - my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); chomp $dec_score; print STDERR "DECODER SCORE: $dec_score\n"; diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 11e791c1..c382a972 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -308,7 +308,7 @@ while (1){ $retries++; } die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); - my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -l $metric"); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); chomp $dec_score; print STDERR "DECODER SCORE: $dec_score\n"; -- cgit v1.2.3 From d021894c27ffea13decf4e64e9bee428ffc85013 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 20 Dec 2011 23:37:25 +0000 Subject: new headers --- mteval/ns.cc | 1 + mteval/ns_ter.cc | 1 + 2 files changed, 2 insertions(+) diff --git a/mteval/ns.cc b/mteval/ns.cc index 1018319d..68c8deaa 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -3,6 +3,7 @@ #include "ns_ext.h" #include "ns_comb.h" +#include #include #include #include diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc index f75acf1d..91a17f0d 100644 --- a/mteval/ns_ter.cc +++ b/mteval/ns_ter.cc @@ -1,5 +1,6 @@ #include "ns_ter.h" +#include #include #include #include -- cgit v1.2.3 From da92444f09b7e04f3cfa4d461aef47c6b59827e2 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 20 Dec 2011 18:37:43 -0500 Subject: new doc scorer --- mteval/ns_docscorer.cc | 60 ++++++++++++++++++++++++++++++++++++++++++++++++++ mteval/ns_docscorer.h | 31 ++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 mteval/ns_docscorer.cc create mode 100644 mteval/ns_docscorer.h diff --git a/mteval/ns_docscorer.cc b/mteval/ns_docscorer.cc new file mode 100644 index 00000000..28a2fd09 --- /dev/null +++ b/mteval/ns_docscorer.cc @@ -0,0 +1,60 @@ +#include "ns_docscorer.h" + +#include +#include + +#include "tdict.h" +#include "filelib.h" +#include "ns.h" + +using namespace std; + +DocumentScorer::~DocumentScorer() {} + +void DocumentScorer::Init(const EvaluationMetric* metric, + const vector& ref_files, + const string& src_file, + bool verbose) { + scorers_.clear(); + cerr << "Loading references (" << ref_files.size() << " files)\n"; + assert(src_file.empty()); + std::vector ifs(ref_files.begin(),ref_files.end()); + for (int i=0; i < ref_files.size(); ++i) ifs[i].Init(ref_files[i]); + char buf[64000]; + bool expect_eof = false; + int line=0; + while (ifs[0].get()) { + vector > refs(ref_files.size()); + for (int i=0; i < ref_files.size(); ++i) { + istream &in=ifs[i].get(); + if (in.eof()) break; + in.getline(buf, 64000); + refs[i].clear(); + if (strlen(buf) == 0) { + if (in.eof()) { + if (!expect_eof) { + assert(i == 0); + expect_eof = true; + } + break; + } + } else { + TD::ConvertSentence(buf, &refs[i]); + assert(!refs[i].empty()); + } + assert(!expect_eof); + } + if (!expect_eof) { + string src_line; + //if (srcrf) { + // getline(srcrf.get(), src_line); + // map dummy; + // ProcessAndStripSGML(&src_line, &dummy); + //} + scorers_.push_back(metric->CreateSegmentEvaluator(refs)); + ++line; + } + } + cerr << "Loaded reference translations for " << scorers_.size() << " sentences.\n"; +} + diff --git a/mteval/ns_docscorer.h 
b/mteval/ns_docscorer.h new file mode 100644 index 00000000..170ac627 --- /dev/null +++ b/mteval/ns_docscorer.h @@ -0,0 +1,31 @@ +#ifndef _NS_DOC_SCORER_H_ +#define _NS_DOC_SCORER_H_ + +#include +#include +#include + +struct EvaluationMetric; +struct SegmentEvaluator; +class DocumentScorer { + public: + ~DocumentScorer(); + DocumentScorer() { } + DocumentScorer(const EvaluationMetric* metric, + const std::vector& ref_files, + const std::string& src_file = "", + bool verbose=false) { + Init(metric,ref_files,src_file,verbose); + } + void Init(const EvaluationMetric* metric, + const std::vector& ref_files, + const std::string& src_file = "", + bool verbose=false); + + int size() const { return scorers_.size(); } + const SegmentEvaluator* operator[](size_t i) const { return scorers_[i].get(); } + private: + std::vector > scorers_; +}; + +#endif -- cgit v1.2.3 From aac3ef3e3fdf636406fc61a40096cee6381e5461 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:08:30 -0500 Subject: lexical alignment samplers --- gi/pf/Makefile.am | 13 +- gi/pf/align-lexonly.cc | 356 +++++++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 26 ++++ gi/pf/base_measures.h | 50 ++++++- gi/pf/itg.cc | 98 +++++++++++--- gi/pf/unigrams.cc | 80 +++++++++++ gi/pf/unigrams.h | 69 ++++++++++ 7 files changed, 668 insertions(+), 24 deletions(-) create mode 100644 gi/pf/align-lexonly.cc create mode 100644 gi/pf/unigrams.cc create mode 100644 gi/pf/unigrams.h diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 42758939..7c8e89d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,14 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc +libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc + +align_lexonly_SOURCES = align-lexonly.cc itg_SOURCES = itg.cc +condnaive_SOURCES = condnaive.cc + dpnaive_SOURCES = dpnaive.cc pfdist_SOURCES = pfdist.cc @@ -17,5 +21,6 @@ brat_SOURCES = brat.cc pfbrat_SOURCES = pfbrat.cc -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops -I$(top_srcdir)/utils $(GTEST_CPPFLAGS) -I$(top_srcdir)/decoder -I$(top_srcdir)/klm + +AM_LDFLAGS = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a $(top_srcdir)/utils/libutils.a -lz diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc new file mode 100644 index 00000000..91a3cfcf --- /dev/null +++ b/gi/pf/align-lexonly.cc @@ -0,0 +1,356 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" +#include "ngram_base.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of 
samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalUnigramBase { + explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {} + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + prob_t p = prob_t::One(); + prob_t q; + for (unsigned i = 0; i < rule.e_.size(); ++i) { + q.logeq(r.logprob(rule.e_[i], log(u0))); + p *= q; + } + q.logeq(r.logprob(TD::Convert(""), log(u0))); + p *= q; + return p; + } + + void Increment(const TRule& rule) { + for (unsigned i = 0; i < rule.e_.size(); ++i) + r.increment(rule.e_[i]); + r.increment(TD::Convert("")); + } + + void Decrement(const TRule& rule) { + for (unsigned i = 0; i < rule.e_.size(); ++i) + r.decrement(rule.e_[i]); + r.decrement(TD::Convert("")); + } + + CCRP_NoTable r; + prob_t u0; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_))); + return p; + } + + void Increment(const TRule& rule) { + if (r.increment(rule.e_)) { + prob_t p; p.logeq(logp0(rule.e_)); + base *= p; + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_)) { + prob_t p; p.logeq(logp0(rule.e_)); + base /= p; + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl; + for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; + } + + prob_t base; + CCRP_NoTable > r; + const double u0; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for 
(unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + cerr << " LLH_prev = " << Likelihood() << flush; + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << "\tLLH_post = " << Likelihood() << endl; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r)) + up0.Increment(r); + } + } + cerr << " LLH = " << tmodel.Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + BasicLexicalAlignment x(letters, letset.size(), &corpus); + x.InitializeRandom(); + const unsigned samples = conf["samples"].as(); + for (int i = 0; i < samples; ++i) { + for (int j = 431; j < 433; ++j) Debug(corpus[j]); + cerr << i << "\t" << x.tmodel.r.size() << "\t"; + if (i % 10 == 0) x.ResampleHyperparemeters(); + x.ResampleCorpus(); + if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + //ModelAndData posterior(x, &corpus, vocabe, vocabf); + x.tmodel.Summary(); + x.up0.Summary(); + + //posterior.Sample(); + + return 0; +} diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 8adb37d7..97b4e698 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_measures.cc @@ 
-6,6 +6,32 @@ using namespace std; +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform + return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= kUNIFORM_TARGET; // draw e_i ~Uniform + return p; +} + void Model1::LoadModel1(const string& fname) { cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; ReadFile rf(fname); diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index 7ce7e2e6..fbd1c3ad 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -7,6 +7,7 @@ #include #include +#include "unigrams.h" #include "trule.h" #include "prob.h" #include "tdict.h" @@ -49,6 +50,51 @@ struct Model1 { std::vector > ttable; }; +struct CompletelyUniformBase { + explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule&) const { + return kUNIFORM; + } + const prob_t kUNIFORM; +}; + +struct UnigramWordBase { + explicit UnigramWordBase(const std::string& fname) : un(fname) {} + prob_t operator()(const TRule& r) const { + return un(r.e_); + } + const UnigramWordModel un; +}; + +struct PhraseConditionalUninformativeBase { + explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { + explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const UnigramModel u; +}; + struct PhraseConditionalBase { explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : model1(m1), @@ -83,7 +129,7 @@ struct PhraseJointBase { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } @@ -113,7 +159,7 @@ struct PhraseJointBase_BiDir { assert(vocab_e_size > 0); } - // return p0 of rule.e_ | rule.f_ + // return p0 of rule.e_ , rule.f_ prob_t operator()(const TRule& rule) const { return p0(rule.f_, rule.e_, 0, 0); } diff --git a/gi/pf/itg.cc b/gi/pf/itg.cc index ac3c16a3..a38fe672 100644 --- a/gi/pf/itg.cc +++ b/gi/pf/itg.cc @@ -27,10 +27,67 @@ ostream& operator<<(ostream& os, const vector& p) { return os 
<< ']'; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} +struct UnigramModel { + explicit UnigramModel(const string& fname, unsigned vocab_size, double p0null = 0.05) : + use_uniform_(fname.size() == 0), + p0null_(p0null), + uniform_((1.0 - p0null) / vocab_size), + probs_(TD::NumWords() + 1) { + if (fname.size() > 0) LoadUnigrams(fname); + probs_[0] = p0null_; + } + +// +// \data\ +// ngram 1=9295 +// +// \1-grams: +// -3.191193 " + + void LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + const prob_t pnon_null(1.0 - p0null_.as_float()); + if (w < probs_.size()) probs_[w].logeq(p * log(10) + log(pnon_null)); else abort(); + } + } + + const prob_t& operator()(const WordID& w) const { + if (!w) return p0null_; + if (use_uniform_) return uniform_; + return probs_[w]; + } + + const bool use_uniform_; + const prob_t p0null_; + const prob_t uniform_; + vector probs_; +}; struct Model1 { explicit Model1(const string& fname) : @@ -89,11 +146,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("samples,s",po::value()->default_value(1000),"Number of samples") ("particles,p",po::value()->default_value(25),"Number of particles") ("input,i",po::value(),"Read parallel data from") - ("max_src_phrase",po::value()->default_value(7),"Maximum length of source language phrases") - ("max_trg_phrase",po::value()->default_value(7),"Maximum length of target language phrases") ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") ("inverse_model1,M",po::value(),"Inverse Model 1 parameters (used in backward estimate)") ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with uniform target distribution") + ("src_unigram,u",po::value()->default_value(""),"Source unigram distribution; empty for uniform") + ("trg_unigram,U",po::value()->default_value(""),"Target unigram distribution; empty for uniform") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -165,11 +222,11 @@ void ReadParallelCorpus(const string& filename, int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const size_t kMAX_TRG_PHRASE = conf["max_trg_phrase"].as(); - const size_t kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); const unsigned particles = conf["particles"].as(); const unsigned samples = conf["samples"].as(); - + TD::Convert(""); + TD::Convert(""); + TD::Convert(""); if (!conf.count("model1")) { cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; return 1; @@ -188,23 +245,28 @@ int main(int argc, char** argv) { cerr << "F-corpus size: " << corpusf.size() << " sentences\t (" << vocabf.size() 
<< " word types)\n"; cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; assert(corpusf.size() == corpuse.size()); + UnigramModel src_unigram(conf["src_unigram"].as(), vocabf.size()); + UnigramModel trg_unigram(conf["trg_unigram"].as(), vocabe.size()); + const prob_t kHALF(0.5); + const string kEMPTY = "NULL"; const int kLHS = -TD::Convert("X"); Model1 m1(conf["model1"].as()); Model1 invm1(conf["inverse_model1"].as()); for (int si = 0; si < conf["samples"].as(); ++si) { cerr << '.' << flush; for (int ci = 0; ci < corpusf.size(); ++ci) { - const vector& src = corpusf[ci]; const vector& trg = corpuse[ci]; - for (int i = 0; i < src.size(); ++i) { - for (int j = 0; j < trg.size(); ++j) { - const int eff_max_src = min(src.size() - i, kMAX_SRC_PHRASE); - for (int k = 0; k < eff_max_src; ++k) { - const int eff_max_trg = (k == 0 ? 1 : min(trg.size() - j, kMAX_TRG_PHRASE)); - for (int l = 0; l < eff_max_trg; ++l) { - } - } + const vector& src = corpusf[ci]; + for (int i = 0; i <= trg.size(); ++i) { + const WordID e_i = i > 0 ? trg[i-1] : 0; + for (int j = 0; j <= src.size(); ++j) { + const WordID f_j = j > 0 ? src[j-1] : 0; + if (e_i == 0 && f_j == 0) continue; + prob_t je = kHALF * src_unigram(f_j) * m1(f_j,e_i) + kHALF * trg_unigram(e_i) * invm1(e_i,f_j); + cerr << "p( " << (e_i ? TD::Convert(e_i) : kEMPTY) << " , " << (f_j ? TD::Convert(f_j) : kEMPTY) << " ) = " << log(je) << endl; + if (e_i && f_j) + cout << "[X] ||| " << TD::Convert(f_j) << " ||| " << TD::Convert(e_i) << " ||| LogProb=" << log(je) << endl; } } } diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc new file mode 100644 index 00000000..40829775 --- /dev/null +++ b/gi/pf/unigrams.cc @@ -0,0 +1,80 @@ +#include "unigrams.h" + +#include +#include + +#include "stringlib.h" +#include "filelib.h" + +using namespace std; + +void UnigramModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; + } +} + +void UnigramWordModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." 
<< endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + size_t cur = pos + 1; + vector w; + while (cur < line.size()) { + const size_t len = UTF8Len(line[cur]); + w.push_back(TD::Convert(line.substr(cur, len))); + cur += len; + } + line[pos] = 0; + float p = atof(&line[0]); + probs_[w].logeq(p * log(10.0)); + } +} + diff --git a/gi/pf/unigrams.h b/gi/pf/unigrams.h new file mode 100644 index 00000000..1660d1ed --- /dev/null +++ b/gi/pf/unigrams.h @@ -0,0 +1,69 @@ +#ifndef _UNIGRAMS_H_ +#define _UNIGRAMS_H_ + +#include +#include +#include +#include + +#include "wordid.h" +#include "prob.h" +#include "tdict.h" + +struct UnigramModel { + explicit UnigramModel(const std::string& fname, unsigned vocab_size) : + use_uniform_(fname.size() == 0), + uniform_(1.0 / vocab_size), + probs_() { + if (fname.size() > 0) { + probs_.resize(TD::NumWords() + 1); + LoadUnigrams(fname); + } + } + + const prob_t& operator()(const WordID& w) const { + assert(w); + if (use_uniform_) return uniform_; + return probs_[w]; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + std::vector probs_; +}; + + +// reads an ARPA unigram file and converts words like 'cat' into a string 'c a t' +struct UnigramWordModel { + explicit UnigramWordModel(const std::string& fname) : + use_uniform_(false), + uniform_(1.0), + probs_() { + LoadUnigrams(fname); + } + + explicit UnigramWordModel(const unsigned vocab_size) : + use_uniform_(true), + uniform_(1.0 / vocab_size), + probs_() {} + + const prob_t& operator()(const std::vector& s) const { + if (use_uniform_) return uniform_; + const VectorProbHash::const_iterator it = probs_.find(s); + assert(it != probs_.end()); + return it->second; + } + + private: + void LoadUnigrams(const std::string& fname); + + const bool use_uniform_; + const prob_t uniform_; + typedef std::tr1::unordered_map, prob_t, boost::hash > > VectorProbHash; + VectorProbHash probs_; +}; + +#endif -- cgit v1.2.3 From a4c69d2a8ef5a39c2ebc0e3a1307801c5288be8e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:09:14 -0500 Subject: ngram base dist --- gi/pf/ngram_base.cc | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++++ gi/pf/ngram_base.h | 25 +++++++++++++++++++ 2 files changed, 94 insertions(+) create mode 100644 gi/pf/ngram_base.cc create mode 100644 gi/pf/ngram_base.h diff --git a/gi/pf/ngram_base.cc b/gi/pf/ngram_base.cc new file mode 100644 index 00000000..1299f06f --- /dev/null +++ b/gi/pf/ngram_base.cc @@ -0,0 +1,69 @@ +#include "ngram_base.h" + +#include "lm/model.hh" +#include "tdict.h" + +using namespace std; + +namespace { +struct GICSVMapper : public lm::EnumerateVocab { + GICSVMapper(vector* out) : out_(out), kLM_UNKNOWN_TOKEN(0) { out_->clear(); } + void Add(lm::WordIndex index, const StringPiece &str) { + const WordID cdec_id = TD::Convert(str.as_string()); + if (cdec_id >= out_->size()) + out_->resize(cdec_id + 1, kLM_UNKNOWN_TOKEN); + (*out_)[cdec_id] = index; + } + 
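+  // mapping from cdec WordID to KenLM vocabulary index; ids the LM never enumerates stay at the unknown-token index (0)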
vector* out_; + const lm::WordIndex kLM_UNKNOWN_TOKEN; +}; +} + +struct FixedNgramBaseImpl { + FixedNgramBaseImpl(const string& param) { + GICSVMapper vm(&cdec2klm_map_); + lm::ngram::Config conf; + conf.enumerate_vocab = &vm; + cerr << "Reading character LM from " << param << endl; + model = new lm::ngram::ProbingModel(param.c_str(), conf); + order = model->Order(); + kEOS = MapWord(TD::Convert("")); + assert(kEOS > 0); + } + + lm::WordIndex MapWord(const WordID w) const { + if (w < cdec2klm_map_.size()) return cdec2klm_map_[w]; + return 0; + } + + ~FixedNgramBaseImpl() { delete model; } + + prob_t StringProbability(const vector& s) const { + lm::ngram::State state = model->BeginSentenceState(); + double prob = 0; + for (unsigned i = 0; i < s.size(); ++i) { + const lm::ngram::State scopy(state); + prob += model->Score(scopy, MapWord(s[i]), state); + } + const lm::ngram::State scopy(state); + prob += model->Score(scopy, kEOS, state); + prob_t p; p.logeq(prob * log(10)); + return p; + } + + lm::ngram::ProbingModel* model; + unsigned order; + vector cdec2klm_map_; + lm::WordIndex kEOS; +}; + +FixedNgramBase::~FixedNgramBase() { delete impl; } + +FixedNgramBase::FixedNgramBase(const string& lmfname) { + impl = new FixedNgramBaseImpl(lmfname); +} + +prob_t FixedNgramBase::StringProbability(const vector& s) const { + return impl->StringProbability(s); +} + diff --git a/gi/pf/ngram_base.h b/gi/pf/ngram_base.h new file mode 100644 index 00000000..4ea999f3 --- /dev/null +++ b/gi/pf/ngram_base.h @@ -0,0 +1,25 @@ +#ifndef _NGRAM_BASE_H_ +#define _NGRAM_BASE_H_ + +#include +#include +#include "trule.h" +#include "wordid.h" +#include "prob.h" + +struct FixedNgramBaseImpl; +struct FixedNgramBase { + FixedNgramBase(const std::string& lmfname); + ~FixedNgramBase(); + prob_t StringProbability(const std::vector& s) const; + + prob_t operator()(const TRule& rule) const { + return StringProbability(rule.e_); + } + + private: + FixedNgramBaseImpl* impl; + +}; + +#endif -- cgit v1.2.3 From c430c3b3f7b22319001b2ddb1343d8a7506e9f81 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:10:24 -0500 Subject: forgotten --- gi/pf/condnaive.cc | 298 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 298 insertions(+) create mode 100644 gi/pf/condnaive.cc diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc new file mode 100644 index 00000000..52ddbbfe --- /dev/null +++ b/gi/pf/condnaive.cc @@ -0,0 +1,298 @@ +#include +#include +#include + +#include +#include +#include + +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp_nt.h" +#include "corpus.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +static unsigned kMAX_SRC_PHRASE; +static unsigned kMAX_TRG_PHRASE; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_src_phrase",po::value()->default_value(4),"Maximum length of source language phrases") + ("max_trg_phrase",po::value()->default_value(4),"Maximum length of target language phrases") + ("model1,m",po::value(),"Model 1 parameters (used in base distribution)") + ("model1_interpolation_weight",po::value()->default_value(0.95),"Mixing proportion of model 1 with 
uniform target distribution") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct ModelAndData { + explicit ModelAndData(ConditionalParallelSegementationModel& m, const vector >& ce, const vector >& cf, const set& ve, const set& vf) : + model(m), + rng(&*prng), + corpuse(ce), + corpusf(cf), + vocabe(ve), + vocabf(vf), + mh_samples(), + mh_rejects(), + kX(-TD::Convert("X")), + derivations(corpuse.size()) {} + + void ResampleHyperparameters() { + } + + void InstantiateRule(const pair& from, + const pair& to, + const vector& sentf, + const vector& sente, + TRule* rule) const { + rule->f_.clear(); + rule->e_.clear(); + rule->lhs_ = kX; + for (short i = from.first; i < to.first; ++i) + rule->f_.push_back(sentf[i]); + for (short i = from.second; i < to.second; ++i) + rule->e_.push_back(sente[i]); + } + + void DecrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.DecrementRule(x); + model.DecrementAlign(x.f_.size()); + } + } + + void PrintDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + cerr << i << '/' << (d.size() - 1) << ": " << x << endl; + } + } + + void IncrementDerivation(const vector >& d, const vector& sentf, const vector& sente) { + if (d.size() < 2) return; + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + model.IncrementRule(x); + model.IncrementAlign(x.f_.size()); + } + } + + prob_t Likelihood() const { + return model.Likelihood(); + } + + prob_t DerivationProposalProbability(const vector >& d, const vector& sentf, const vector& sente) const { + prob_t p = prob_t::One(); + TRule x; + for (int i = 1; i < d.size(); ++i) { + InstantiateRule(d[i], d[i-1], sentf, sente, &x); + p *= model.RuleProbability(x); + p *= model.AlignProbability(x.f_.size()); + } + return p; + } + + void Sample(); + + ConditionalParallelSegementationModel& model; + MT19937* rng; + const vector >& corpuse, corpusf; + const set& vocabe, vocabf; + unsigned mh_samples, mh_rejects; + const int kX; + vector > > derivations; +}; + +void ModelAndData::Sample() { + unsigned MAXK = kMAX_SRC_PHRASE; + unsigned MAXL = kMAX_TRG_PHRASE; + TRule x; + x.lhs_ = -TD::Convert("X"); + + for (int samples = 0; samples < 1000; ++samples) { + if (samples % 1 == 0 && samples > 0) { + //ResampleHyperparameters(); + cerr << " [" << samples << " LLH=" << log(Likelihood()) << " MH=" << ((double)mh_rejects / mh_samples) << "]\n"; + for (int i = 0; i < 10; ++i) { + cerr << "SENTENCE: " << TD::GetString(corpusf[i]) << " ||| " << TD::GetString(corpuse[i]) << endl; + PrintDerivation(derivations[i], 
corpusf[i], corpuse[i]); + } + static TRule xx("[X] ||| w n ||| s h ||| X=0"); + const CCRP_NoTable& dcrp = model.tmodel.r.find(xx.f_)->second; + for (CCRP_NoTable::const_iterator it = dcrp.begin(); it != dcrp.end(); ++it) { + cerr << "\t" << it->second << "\t" << it->first << endl; + } + } + cerr << '.' << flush; + for (int s = 0; s < corpuse.size(); ++s) { + const vector& sentf = corpusf[s]; + const vector& sente = corpuse[s]; +// cerr << " CUSTOMERS: " << rules.num_customers() << endl; +// cerr << "SENTENCE: " << TD::GetString(sentf) << " ||| " << TD::GetString(sente) << endl; + + vector >& deriv = derivations[s]; + const prob_t p_cur = Likelihood(); + DecrementDerivation(deriv, sentf, sente); + + boost::multi_array a(boost::extents[sentf.size() + 1][sente.size() + 1]); + boost::multi_array trans(boost::extents[sentf.size() + 1][sente.size() + 1][MAXK][MAXL]); + a[0][0] = prob_t::One(); + for (int i = 0; i < sentf.size(); ++i) { + for (int j = 0; j < sente.size(); ++j) { + const prob_t src_a = a[i][j]; + x.f_.clear(); + for (int k = 1; k <= MAXK; ++k) { + if (i + k > sentf.size()) break; + x.f_.push_back(sentf[i + k - 1]); + x.e_.clear(); + const prob_t p_span = model.AlignProbability(k); // prob of consuming this much source + for (int l = 1; l <= MAXL; ++l) { + if (j + l > sente.size()) break; + x.e_.push_back(sente[j + l - 1]); + trans[i][j][k - 1][l - 1] = model.RuleProbability(x) * p_span; + a[i + k][j + l] += src_a * trans[i][j][k - 1][l - 1]; + } + } + } + } +// cerr << "Inside: " << log(a[sentf.size()][sente.size()]) << endl; + const prob_t q_cur = DerivationProposalProbability(deriv, sentf, sente); + + vector > newderiv; + int cur_i = sentf.size(); + int cur_j = sente.size(); + while(cur_i > 0 && cur_j > 0) { + newderiv.push_back(pair(cur_i, cur_j)); +// cerr << "NODE: (" << cur_i << "," << cur_j << ")\n"; + SampleSet ss; + vector > nexts; + for (int k = 1; k <= MAXK; ++k) { + const int hyp_i = cur_i - k; + if (hyp_i < 0) break; + for (int l = 1; l <= MAXL; ++l) { + const int hyp_j = cur_j - l; + if (hyp_j < 0) break; + const prob_t& inside = a[hyp_i][hyp_j]; + if (inside == prob_t::Zero()) continue; + const prob_t& transp = trans[hyp_i][hyp_j][k - 1][l - 1]; + if (transp == prob_t::Zero()) continue; + const prob_t p = inside * transp; + ss.add(p); + nexts.push_back(pair(hyp_i, hyp_j)); +// cerr << " (" << hyp_i << "," << hyp_j << ") <--- " << log(p) << endl; + } + } +// cerr << " sample set has " << nexts.size() << " elements.\n"; + const int selected = rng->SelectSample(ss); + cur_i = nexts[selected].first; + cur_j = nexts[selected].second; + } + newderiv.push_back(pair(0,0)); + const prob_t q_new = DerivationProposalProbability(newderiv, sentf, sente); + IncrementDerivation(newderiv, sentf, sente); +// cerr << "SANITY: " << q_new << " " <(); + kMAX_SRC_PHRASE = conf["max_src_phrase"].as(); + + if (!conf.count("model1")) { + cerr << argv[0] << "Please use --model1 to specify model 1 parameters\n"; + return 1; + } + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == 
corpuse.size()); + + Model1 m1(conf["model1"].as()); + + PhraseConditionalBase pcb0(m1, conf["model1_interpolation_weight"].as(), vocabe.size()); + ConditionalParallelSegementationModel x(pcb0); + + ModelAndData posterior(x, corpuse, corpusf, vocabe, vocabf); + posterior.Sample(); + + TRule r1("[X] ||| x ||| l e ||| X=0"); + TRule r2("[X] ||| A ||| a d ||| X=0"); + TRule r3("[X] ||| n ||| e r ||| X=0"); + TRule r4("[X] ||| x A n ||| b l a g ||| X=0"); + + PhraseConditionalUninformativeBase u0(vocabe.size()); + + cerr << (pcb0(r1)*pcb0(r2)*pcb0(r3)) << endl; + cerr << (u0(r4)) << endl; + + return 0; +} + -- cgit v1.2.3 From 665badbdcc755183aa83414f6e86987f4d017393 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:10:36 -0500 Subject: foo --- gi/pf/conditional_pseg.h | 155 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 gi/pf/conditional_pseg.h diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h new file mode 100644 index 00000000..edcdc813 --- /dev/null +++ b/gi/pf/conditional_pseg.h @@ -0,0 +1,155 @@ +#ifndef _CONDITIONAL_PSEG_H_ +#define _CONDITIONAL_PSEG_H_ + +#include +#include +#include +#include + +#include "prob.h" +#include "ccrp_nt.h" +#include "trule.h" +#include "base_measures.h" +#include "tdict.h" + +template +struct ConditionalTranslationModel { + explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.concentration() << ") --------------------------" << std::endl; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable(1.0, 1.0, 8.0))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p.logeq(it->second.logprob(rule, log(rp0(rule)))); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map, + CCRP_NoTable, + boost::hash > > RuleModelHash; 
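+  // one restaurant per source phrase: rules are grouped under their source side f_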
+ RuleModelHash r; +}; + +template +struct ConditionalParallelSegementationModel { + explicit ConditionalParallelSegementationModel(ConditionalBaseMeasure& rcp0) : + tmodel(rcp0), base(prob_t::One()), aligns(1,1) {} + + ConditionalTranslationModel tmodel; + + void DecrementRule(const TRule& rule) { + tmodel.DecrementRule(rule); + } + + void IncrementRule(const TRule& rule) { + tmodel.IncrementRule(rule); + } + + void IncrementRulesAndAlignments(const std::vector& rules) { + tmodel.IncrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + IncrementAlign(rules[i]->f_.size()); + } + } + + void DecrementRulesAndAlignments(const std::vector& rules) { + tmodel.DecrementRules(rules); + for (int i = 0; i < rules.size(); ++i) { + DecrementAlign(rules[i]->f_.size()); + } + } + + prob_t RuleProbability(const TRule& rule) const { + return tmodel.RuleProbability(rule); + } + + void IncrementAlign(unsigned span) { + if (aligns.increment(span)) { + // TODO + } + } + + void DecrementAlign(unsigned span) { + if (aligns.decrement(span)) { + // TODO + } + } + + prob_t AlignProbability(unsigned span) const { + prob_t p; + p.logeq(aligns.logprob(span, log_poisson(span, 1.0))); + return p; + } + + prob_t Likelihood() const { + prob_t p; p.logeq(aligns.log_crp_prob()); + p *= base; + p *= tmodel.Likelihood(); + return p; + } + + prob_t base; + CCRP_NoTable aligns; +}; + +#endif + -- cgit v1.2.3 From 5ea87bf5487f0bd9fef7385eb1812b0601b57a6e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 23:02:50 -0500 Subject: remove broken prior, add logging --- gi/pf/align-lexonly.cc | 36 ++---------------------------------- 1 file changed, 2 insertions(+), 34 deletions(-) diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 91a3cfcf..7e48b25a 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -66,41 +66,9 @@ struct AlignedSentencePair { Array2D posterior; }; -struct HierarchicalUnigramBase { - explicit HierarchicalUnigramBase(const unsigned vocab_e_size) : r(5,5), u0(1.0 / vocab_e_size) {} - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - prob_t p = prob_t::One(); - prob_t q; - for (unsigned i = 0; i < rule.e_.size(); ++i) { - q.logeq(r.logprob(rule.e_[i], log(u0))); - p *= q; - } - q.logeq(r.logprob(TD::Convert(""), log(u0))); - p *= q; - return p; - } - - void Increment(const TRule& rule) { - for (unsigned i = 0; i < rule.e_.size(); ++i) - r.increment(rule.e_[i]); - r.increment(TD::Convert("")); - } - - void Decrement(const TRule& rule) { - for (unsigned i = 0; i < rule.e_.size(); ++i) - r.decrement(rule.e_[i]); - r.decrement(TD::Convert("")); - } - - CCRP_NoTable r; - prob_t u0; -}; - struct HierarchicalWordBase { explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(15,15), u0(-log(vocab_e_size)) {} + base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {} void ResampleHyperparameters(MT19937* rng) { r.resample_hyperparameters(rng); @@ -137,7 +105,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.concentration() << ')' << endl; for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; } -- cgit v1.2.3 From 71da76d47d5a6f988b56b5641f7296249cb85124 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 30 Dec 2011 19:23:32 +0000 Subject: logging corpus errors --- 
gi/pf/corpus.cc | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/gi/pf/corpus.cc b/gi/pf/corpus.cc index a408e7cf..cb6e4ed7 100644 --- a/gi/pf/corpus.cc +++ b/gi/pf/corpus.cc @@ -24,11 +24,11 @@ void ReadParallelCorpus(const string& filename, istream* in = rf.stream(); assert(*in); string line; + unsigned lc = 0; const WordID kDIV = TD::Convert("|||"); vector tmp; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; + while(getline(*in, line)) { + ++lc; e->push_back(vector()); f->push_back(vector()); vector& le = e->back(); @@ -39,12 +39,17 @@ void ReadParallelCorpus(const string& filename, for (unsigned i = 0; i < tmp.size(); ++i) { const int cur = tmp[i]; if (isf) { - if (kDIV == cur) { isf = false; } else { + if (kDIV == cur) { + isf = false; + } else { lf.push_back(cur); vocab_f->insert(cur); } } else { - assert(cur != kDIV); + if (cur == kDIV) { + cerr << "ERROR in " << lc << ": " << line << endl << endl; + abort(); + } le.push_back(cur); vocab_e->insert(cur); } -- cgit v1.2.3 From 134228d946a3f119e88f23a5315fa7849d498ee4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 31 Dec 2011 17:54:38 +0000 Subject: last change before adding wood&teh stuff --- gi/pf/align-lexonly.cc | 6 ++++-- gi/pf/base_measures.h | 11 +++++++++++ 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 7e48b25a..e9f1e7b6 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -117,10 +117,12 @@ struct HierarchicalWordBase { struct BasicLexicalAlignment { explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, const unsigned letters_e, vector* corp) : letters(lets), corpus(*corp), + //up0(words_e), //up0("en.chars.1gram", letters_e), //up0("en.words.1gram"), up0(letters_e), @@ -302,11 +304,11 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - BasicLexicalAlignment x(letters, letset.size(), &corpus); + BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); x.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { - for (int j = 431; j < 433; ++j) Debug(corpus[j]); + for (int j = 4995; j < 4997; ++j) Debug(corpus[j]); cerr << i << "\t" << x.tmodel.r.size() << "\t"; if (i % 10 == 0) x.ResampleHyperparemeters(); x.ResampleCorpus(); diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index fbd1c3ad..a4e9ac28 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -11,6 +11,7 @@ #include "trule.h" #include "prob.h" #include "tdict.h" +#include "sampler.h" inline double log_poisson(unsigned x, const double& lambda) { assert(lambda > 0.0); @@ -55,6 +56,11 @@ struct CompletelyUniformBase { prob_t operator()(const TRule&) const { return kUNIFORM; } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } const prob_t kUNIFORM; }; @@ -79,6 +85,11 @@ struct PhraseConditionalUninformativeBase { prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } const prob_t kUNIFORM_TARGET; }; -- cgit v1.2.3 From a144fb07effc59a3aa269d7fd5f3d0ab9dfe5e54 Mon Sep 17 00:00:00 2001 
From: Chris Dyer Date: Tue, 3 Jan 2012 16:59:11 -0500 Subject: multi-floor chinese restaurant described by wood&teh (2009) --- utils/Makefile.am | 6 +- utils/mfcr.h | 354 +++++++++++++++++++++++++++++++++++++++++++++++++++++ utils/mfcr_test.cc | 72 +++++++++++ 3 files changed, 430 insertions(+), 2 deletions(-) create mode 100644 utils/mfcr.h create mode 100644 utils/mfcr_test.cc diff --git a/utils/Makefile.am b/utils/Makefile.am index df667655..3e559c75 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,8 +1,8 @@ bin_PROGRAMS = reconstruct_weights -noinst_PROGRAMS = ts phmt -TESTS = ts phmt +noinst_PROGRAMS = ts phmt mfcr_test +TESTS = ts phmt mfcr_test if HAVE_GTEST noinst_PROGRAMS += \ @@ -40,6 +40,8 @@ phmt_SOURCES = phmt.cc ts_SOURCES = ts.cc dict_test_SOURCES = dict_test.cc dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +mfcr_test_SOURCES = mfcr_test.cc +mfcr_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) weights_test_SOURCES = weights_test.cc weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) logval_test_SOURCES = logval_test.cc diff --git a/utils/mfcr.h b/utils/mfcr.h new file mode 100644 index 00000000..3eb133fc --- /dev/null +++ b/utils/mfcr.h @@ -0,0 +1,354 @@ +#ifndef _MFCR_H_ +#define _MFCR_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "sampler.h" +#include "slice_sampler.h" + +struct TableCount { + TableCount() : count(), floor() {} + TableCount(int c, int f) : count(c), floor(f) { + assert(f >= 0); + } + int count; // count or delta (may be 0, <0, or >0) + unsigned char floor; // from which floor? +}; + +std::ostream& operator<<(std::ostream& o, const TableCount& tc) { + return o << "[c=" << tc.count << " floor=" << static_cast(tc.floor) << ']'; +} + +// Multi-Floor Chinese Restaurant as proposed by Wood & Teh (AISTATS, 2009) to simulate +// graphical Pitman-Yor processes. +// http://jmlr.csail.mit.edu/proceedings/papers/v5/wood09a/wood09a.pdf +// +// Implementation is based on Blunsom, Cohn, Goldwater, & Johnson (ACL 2009) and code +// referenced therein. 
+// http://www.aclweb.org/anthology/P/P09/P09-2085.pdf +// +template > +class MFCR { + public: + + MFCR(unsigned num_floors, double d, double alpha) : + num_floors_(num_floors), + num_tables_(), + num_customers_(), + d_(d), + alpha_(alpha), + d_prior_alpha_(std::numeric_limits::quiet_NaN()), + d_prior_beta_(std::numeric_limits::quiet_NaN()), + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} + + MFCR(unsigned num_floors, double d_alpha, double d_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) : + num_floors_(num_floors), + num_tables_(), + num_customers_(), + d_(d), + alpha_(alpha), + d_prior_alpha_(d_alpha), + d_prior_beta_(d_beta), + alpha_prior_shape_(alpha_shape), + alpha_prior_rate_(alpha_rate) {} + + double d() const { return d_; } + double alpha() const { return alpha_; } + + bool has_d_prior() const { + return !std::isnan(d_prior_alpha_); + } + + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); + } + + void clear() { + num_tables_ = 0; + num_customers_ = 0; + dish_locs_.clear(); + } + + unsigned num_tables() const { + return num_tables_; + } + + unsigned num_tables(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->second.table_counts_.size(); + } + + // this is not terribly efficient but it should not typically be necessary to execute this query + unsigned num_tables(const Dish& dish, const unsigned floor) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + unsigned c = 0; + for (typename std::list::const_iterator i = it->second.table_counts_.begin(); + i != it->second.table_counts_.end(); ++i) { + if (i->floor == floor) ++c; + } + return c; + } + + unsigned num_customers() const { + return num_customers_; + } + + unsigned num_customers(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->total_dish_count_; + } + + // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor + TableCount increment(const Dish& dish, const std::vector& p0s, const std::vector& lambdas, MT19937* rng) { + assert(p0s.size() == num_floors_); + assert(lambdas.size() == num_floors_); + + DishLocations& loc = dish_locs_[dish]; + // marg_p0 = marginal probability of opening a new table on any floor with label dish + const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); + assert(marg_p0 <= 1.0); + int floor = -1; + bool share_table = false; + if (loc.total_dish_count_) { + const double p_empty = (alpha_ + num_tables_ * d_) * marg_p0; + const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * d_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * d_); + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= ti->count - d_; + if (r <= 0.0) { + ++ti->count; + floor = ti->floor; + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { // sit at currently empty table -- must sample what floor + double r = rng->next() * marg_p0; + for (unsigned i = 0; i < 
p0s.size(); ++i) { + r -= p0s[i] * lambdas[i]; + if (r <= 0.0) { + floor = i; + break; + } + } + assert(floor >= 0); + loc.table_counts_.push_back(TableCount(1, floor)); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? TableCount(0, floor) : TableCount(1, floor)); + } + + // returns first = -1 or 0, indicating whether a table was closed, and on what floor (second) + TableCount decrement(const Dish& dish, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + assert(loc.total_dish_count_); + int floor = -1; + int delta = 0; + if (loc.total_dish_count_ == 1) { + floor = loc.table_counts_.front().floor; + dish_locs_.erase(dish); + --num_tables_; + --num_customers_; + delta = -1; + } else { + // sample customer to remove UNIFORMLY. that is, do NOT use the d + // here. if you do, it will introduce (unwanted) bias! + double r = rng->next() * loc.total_dish_count_; + --loc.total_dish_count_; + --num_customers_; + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= ti->count; + if (r <= 0.0) { + floor = ti->floor; + if ((--ti->count) == 0) { + --num_tables_; + delta = -1; + loc.table_counts_.erase(ti); + } + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } + return TableCount(delta, floor); + } + + double prob(const Dish& dish, const std::vector& p0s, const std::vector& lambdas) const { + assert(p0s.size() == num_floors_); + assert(lambdas.size() == num_floors_); + const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); + assert(marg_p0 <= 1.0); + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + const double r = num_tables_ * d_ + alpha_; + if (it == dish_locs_.end()) { + return r * marg_p0 / (num_customers_ + alpha_); + } else { + return (it->second.total_dish_count_ - d_ * it->second.table_counts_.size() + r * marg_p0) / + (num_customers_ + alpha_); + } + } + + double log_crp_prob() const { + return log_crp_prob(d_, alpha_); + } + + static double log_beta_density(const double& x, const double& alpha, const double& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); + return lp; + } + + static double log_gamma_density(const double& x, const double& shape, const double& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); + return lp; + } + + // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process + // does not include draws from G_w's + double log_crp_prob(const double& d, const double& alpha) const { + double lp = 0.0; + if (has_d_prior()) + lp = log_beta_density(d, d_prior_alpha_, d_prior_beta_); + if (has_alpha_prior()) + lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + assert(lp <= 0.0); + if (num_customers_) { + if (d > 0.0) { + const double r = lgamma(1.0 - d); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + num_tables_ * log(d) + lgamma(alpha / d + num_tables_) + - lgamma(alpha / d); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != 
cur.table_counts_.end(); ++ti) { + lp += lgamma(ti->count - d) - r; + } + } + } else { + assert(!"not implemented yet"); + } + } + assert(std::isfinite(lp)); + return lp; + } + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + assert(has_d_prior() || has_alpha_prior()); + DiscountResampler dr(*this); + ConcentrationResampler cr(*this); + for (int iter = 0; iter < nloop; ++iter) { + if (has_alpha_prior()) { + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + if (has_d_prior()) { + d_ = slice_sampler1d(dr, d_, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + } + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + + struct DiscountResampler { + DiscountResampler(const MFCR& crp) : crp_(crp) {} + const MFCR& crp_; + double operator()(const double& proposed_d) const { + return crp_.log_crp_prob(proposed_d, crp_.alpha_); + } + }; + + struct ConcentrationResampler { + ConcentrationResampler(const MFCR& crp) : crp_(crp) {} + const MFCR& crp_; + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(crp_.d_, proposed_alpha); + } + }; + + struct DishLocations { + DishLocations() : total_dish_count_() {} + unsigned total_dish_count_; // customers at all tables with this dish + std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want + // .size() is the number of tables for this dish + }; + + void Print(std::ostream* out) const { + (*out) << "MFCR(d=" << d_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; + for (typename std::list::const_iterator i = it->second.table_counts_.begin(); + i != it->second.table_counts_.end(); ++i) { + (*out) << " " << *i; + } + (*out) << std::endl; + } + } + + typedef typename std::tr1::unordered_map::const_iterator const_iterator; + const_iterator begin() const { + return dish_locs_.begin(); + } + const_iterator end() const { + return dish_locs_.end(); + } + + unsigned num_floors_; + unsigned num_tables_; + unsigned num_customers_; + std::tr1::unordered_map dish_locs_; + + double d_; + double alpha_; + + // optional beta prior on d_ (NaN if no prior) + double d_prior_alpha_; + double d_prior_beta_; + + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; +}; + +template +std::ostream& operator<<(std::ostream& o, const MFCR& c) { + c.Print(&o); + return o; +} + +#endif diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc new file mode 100644 index 00000000..7c45a37c --- /dev/null +++ b/utils/mfcr_test.cc @@ -0,0 +1,72 @@ +#include "mfcr.h" + +#include +#include +#include + +#include "sampler.h" + +using namespace std; + +void test_exch(MT19937* rng) { + MFCR crp(2, 0.5, 3.0); + vector lambdas(2); + vector p0s(2); + lambdas[0] = 0.2; + lambdas[1] = 0.8; + p0s[0] = 1.0; + p0s[1] = 1.0; + + double tot = 0; + double tot2 = 0; + double xt = 0; + int cust = 10; + vector hist(cust + 1, 0), hist2(cust + 1, 0); + for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); } + const int samples = 100000; + const bool simulate = true; + for (int k = 0; k < samples; ++k) 
{ + if (!simulate) { + crp.clear(); + for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); } + } else { + int da = rng->next() * cust; + bool a = rng->next() < 0.45; + if (a) { + for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } + xt += 1.0; + } else { + for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } + for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); } + } + } + int c = crp.num_tables(1); + ++hist[c]; + tot += c; + int c2 = crp.num_tables(1,0); // tables on floor 0 with dish 1 + ++hist2[c2]; + tot2 += c2; + } + cerr << cust << " = " << crp.num_customers() << endl; + cerr << "P(a) = " << (xt / samples) << endl; + cerr << "E[num tables] = " << (tot / samples) << endl; + double error = fabs((tot / samples) - 6.894); + cerr << " error = " << error << endl; + for (int i = 1; i <= cust; ++i) + cerr << i << ' ' << (hist[i]) << endl; + cerr << "E[num tables on floor 0] = " << (tot2 / samples) << endl; + double error2 = fabs((tot2 / samples) - 1.379); + cerr << " error2 = " << error2 << endl; + for (int i = 1; i <= cust; ++i) + cerr << i << ' ' << (hist2[i]) << endl; + assert(error < 0.05); // these can fail with very low probability + assert(error2 < 0.05); +}; + +int main(int argc, char** argv) { + MT19937 rng; + test_exch(&rng); + return 0; +} + -- cgit v1.2.3 From d156a65ac638b574abfabfd78c949e122faada5d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 11 Jan 2012 01:22:54 -0500 Subject: script to pull out candidate transliterations from a word-aligned parallel corpus --- gi/pf/guess-translits.pl | 71 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100755 gi/pf/guess-translits.pl diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl new file mode 100755 index 00000000..ab737121 --- /dev/null +++ b/gi/pf/guess-translits.pl @@ -0,0 +1,71 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +my $MIN_PMI = -3; + +my %fs; +my %es; +my %ef; + +die "Usage: $0 < input.utf8.txt\n" if scalar @ARGV > 0; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); +binmode(STDERR,":utf8"); + +my $tot = 0; +print STDERR "Reading alignments from STDIN ...\n"; +while() { + chomp; + my ($fsent, $esent, $alsent) = split / \|\|\| /; + die "Format should be 'foreign sentence ||| english sentence ||| 0-0 1-1 ...'\n" unless defined $fsent && defined $esent && defined $alsent; + + my @fws = split /\s+/, $fsent; + my @ews = split /\s+/, $esent; + my @as = split /\s+/, $alsent; + my %a2b; + my %b2a; + for my $ap (@as) { + my ($a,$b) = split /-/, $ap; + $a2b{$a}->{$b} = 1; + $b2a{$b}->{$a} = 1; + } + for my $a (keys %a2b) { + my $bref = $a2b{$a}; + next unless scalar keys %$bref < 2; + my $b = (keys %$bref)[0]; + next unless scalar keys %{$b2a{$b}} < 2; + my $f = $fws[$a]; + next unless defined $f; + next unless length($f) > 3; + my $e = $ews[$b]; + next unless defined $e; + next unless length($e) > 3; + + $ef{$f}->{$e}++; + $es{$e}++; + $fs{$f}++; + $tot++; + } +} +my $ltot = log($tot); +my $num = 0; +print STDERR "Extracting pairs for PMI > $MIN_PMI ...\n"; +for my $f (keys %fs) { + my $logf = log($fs{$f}); + my $esref = $ef{$f}; + for my $e (keys %$esref) { + my $loge = log($es{$e}); + my $ef = $esref->{$e}; + my $logef = log($ef); + my $pmi = $logef - ($loge + $logf); + next if $pmi < $MIN_PMI; + my @flets = split //, $f; + my @elets = split //, $e; + print "@flets ||| @elets\n"; + $num++; + } +} +print STDERR "Extracted $num pairs.\n"; +print 
STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n"; -- cgit v1.2.3 From f0bdd4de6455855d705d9056deb2e90c999dc740 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 20 Jan 2012 15:35:47 -0500 Subject: 'pseudo model 2' that strictly favors a diagonal, with tunable parameters for p(null) and how sharp/flat the alignment distribution is around the diagonal --- training/model1.cc | 39 ++++++++++++++++++++++++++++++++++++--- word-aligner/stemmers/ar.pl | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 3 deletions(-) create mode 100755 word-aligner/stemmers/ar.pl diff --git a/training/model1.cc b/training/model1.cc index b9590ece..346c0033 100644 --- a/training/model1.cc +++ b/training/model1.cc @@ -20,6 +20,10 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("iterations,i",po::value()->default_value(5),"Number of iterations of EM training") ("beam_threshold,t",po::value()->default_value(-4),"log_10 of beam threshold (-10000 to include everything, 0 max)") ("no_null_word,N","Do not generate from the null token") + ("write_alignments,A", "Write alignments instead of parameters") + ("favor_diagonal,d", "Use a static alignment distribution that assigns higher probabilities to alignments near the diagonal") + ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)") + ("prob_align_null", po::value()->default_value(0.08), "When --favor_diagonal is set, what's the probability of a null alignment?") ("variational_bayes,v","Add a symmetric Dirichlet prior and infer VB estimate of weights") ("alpha,a", po::value()->default_value(0.01), "Hyperparameter for optional Dirichlet prior") ("no_add_viterbi,V","Do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)"); @@ -56,7 +60,12 @@ int main(int argc, char** argv) { const WordID kNULL = TD::Convert(""); const bool add_viterbi = (conf.count("no_add_viterbi") == 0); const bool variational_bayes = (conf.count("variational_bayes") > 0); + const bool write_alignments = (conf.count("write_alignments") > 0); + const double diagonal_tension = conf["diagonal_tension"].as(); + const double prob_align_null = conf["prob_align_null"].as(); + const double prob_align_not_null = 1.0 - prob_align_null; const double alpha = conf["alpha"].as(); + const bool favor_diagonal = conf.count("favor_diagonal"); if (variational_bayes && alpha <= 0.0) { cerr << "--alpha must be > 0\n"; return 1; @@ -93,31 +102,52 @@ int main(int argc, char** argv) { denom += trg.size(); vector probs(src.size() + 1); const double src_logprob = -log(src.size() + 1); + bool first_al = true; // used for write_alignments for (int j = 0; j < trg.size(); ++j) { const WordID& f_j = trg[j][0].label; double sum = 0; + const double j_over_ts = double(j) / trg.size(); + double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) if (use_null) { - probs[0] = tt.prob(kNULL, f_j); + if (favor_diagonal) prob_a_i = prob_align_null; + probs[0] = tt.prob(kNULL, f_j) * prob_a_i; sum += probs[0]; } + double az = 0; + if (favor_diagonal) { + for (int ta = 0; ta < src.size(); ++ta) + az += exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az /= prob_align_not_null; + } for (int i = 1; i <= src.size(); ++i) { - probs[i] = tt.prob(src[i-1][0].label, f_j); + if (favor_diagonal) + prob_a_i = exp(-fabs(double(i) / src.size() - j_over_ts) * diagonal_tension) / az; + probs[i] = 
tt.prob(src[i-1][0].label, f_j) * prob_a_i; sum += probs[i]; } if (final_iteration) { - if (add_viterbi) { + if (add_viterbi || write_alignments) { WordID max_i = 0; double max_p = -1; + int max_index = -1; if (use_null) { max_i = kNULL; + max_index = 0; max_p = probs[0]; } for (int i = 1; i <= src.size(); ++i) { if (probs[i] > max_p) { + max_index = i; max_p = probs[i]; max_i = src[i-1][0].label; } } + if (write_alignments) { + if (max_index > 0) { + if (first_al) first_al = false; else cout << ' '; + cout << (max_index - 1) << "-" << j; + } + } was_viterbi[max_i][f_j] = 1.0; } } else { @@ -128,6 +158,7 @@ int main(int argc, char** argv) { } likelihood += log(sum) + src_logprob; } + if (write_alignments && final_iteration) cout << endl; } // log(e) = 1.0 @@ -145,6 +176,8 @@ int main(int argc, char** argv) { tt.Normalize(); } } + if (write_alignments) return 0; + for (TTable::Word2Word2Double::iterator ei = tt.ttable.begin(); ei != tt.ttable.end(); ++ei) { const TTable::Word2Double& cpd = ei->second; const TTable::Word2Double& vit = was_viterbi[ei->first]; diff --git a/word-aligner/stemmers/ar.pl b/word-aligner/stemmers/ar.pl new file mode 100755 index 00000000..c85e883a --- /dev/null +++ b/word-aligner/stemmers/ar.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl -w + +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT,":utf8"); + +my $vocab = undef; +if (scalar @ARGV > 0) { + die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1); + $vocab = 1; +} + +my %dict; +while() { + chomp; + my @words = split /\s+/; + my @out = (); + for my $w (@words) { + my $tw = $dict{$w}; + if (!defined $tw) { + my $el = 4; + if ($w =~ /^(.st|.n|Al)/) { $el+=2; } + if ($w =~ /^(y|t|n)/) { $el++; } + if ($el > length($w)) { $el = length($w); } + $tw = substr $w, 0, $el; + $dict{$w} = $tw; + } + push @out, $tw; + } + if ($vocab) { + die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1; + print "$_ @out\n"; + } else { + print "@out\n"; + } +} + -- cgit v1.2.3 From 26d9ad04bd81508163d75c99726f970dd75f5127 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 23 Jan 2012 15:47:29 -0500 Subject: more alignment stuff --- gi/pf/Makefile.am | 4 +- gi/pf/align-lexonly-pyp.cc | 327 ++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 47 +++++++ gi/pf/base_measures.h | 18 +++ gi/pf/conditional_pseg.h | 74 ++++++++++ word-aligner/stemmers/ur.pl | 38 +++++ 6 files changed, 507 insertions(+), 1 deletion(-) create mode 100644 gi/pf/align-lexonly-pyp.cc create mode 100755 word-aligner/stemmers/ur.pl diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 7c8e89d0..28367e67 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,10 +1,12 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc align_lexonly_SOURCES = align-lexonly.cc +align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc + itg_SOURCES = itg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc new file mode 100644 index 00000000..d2630a2b --- /dev/null +++ b/gi/pf/align-lexonly-pyp.cc @@ -0,0 +1,327 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_measures.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" 
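For reference, the diagonal-favoring alignment distribution added to training/model1.cc above can be written as a standalone function. This is only an illustrative sketch: the function name is invented, the parameters mirror --diagonal_tension and --prob_align_null, and the patch computes the normalizer inline with slightly different indexing, but the shape of the distribution is the same (the NULL link gets a fixed probability and the remaining mass decays exponentially with distance from the diagonal):

    #include <cmath>
    #include <vector>

    // p(a_j = i) for i = 0 (NULL) and i = 1..m, where j is the (0-based) target
    // position in a sentence of length n, m is the source length, T is the
    // diagonal tension, and p_null is the null-alignment probability.
    std::vector<double> DiagonalAlignment(int j, int n, int m, double T, double p_null) {
      std::vector<double> p(m + 1);
      p[0] = p_null;                                    // mass reserved for the NULL word
      const double j_over_n = static_cast<double>(j) / n;
      double z = 0;
      for (int i = 1; i <= m; ++i) {
        p[i] = std::exp(-std::fabs(static_cast<double>(i) / m - j_over_n) * T);
        z += p[i];
      }
      for (int i = 1; i <= m; ++i)
        p[i] *= (1.0 - p_null) / z;                     // non-NULL probabilities sum to 1 - p_null
      return p;
    }

With T near 0 the non-NULL part is essentially uniform over source positions, while larger T concentrates the mass near i/m = j/n, which is exactly what the --diagonal_tension option ("<1 = flat >1 = sharp") controls.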
+#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0] = exp(logp0(rule.e_)); + return prob_t(r.prob(rule.e_, v, l)); + } + + void Increment(const TRule& rule) { + v[0] = exp(logp0(rule.e_)); + if (r.increment(rule.e_, v, l, &*prng).count) { + base *= prob_t(v[0] * l[0]); + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl; + for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR > r; + const double u0; + const vector l; + mutable vector v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); 
++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + cerr << " LLH_prev = " << Likelihood() << flush; + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << "\tLLH_post = " << Likelihood() << endl; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + MConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << tmodel.Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); + x.InitializeRandom(); + const unsigned samples = conf["samples"].as(); + for (int i = 0; i < samples; ++i) { + for (int j = 65; j < 67; ++j) Debug(corpus[j]); + cerr << i << "\t" << x.tmodel.r.size() << "\t"; + if (i % 10 == 0) x.ResampleHyperparemeters(); + x.ResampleCorpus(); + if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + } + for (unsigned i = 0; i < corpus.size(); ++i) + WriteAlignments(corpus[i]); + //ModelAndData posterior(x, &corpus, vocabe, vocabf); + x.tmodel.Summary(); + x.up0.Summary(); + + //posterior.Sample(); + + return 0; +} diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 97b4e698..7894d3e7 100644 --- a/gi/pf/base_measures.cc +++ 
b/gi/pf/base_measures.cc @@ -6,6 +6,53 @@ using namespace std; +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + vector le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x] = atof(TD::Convert(cur)); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, const vector& vtrg, int start_src, int start_trg) const { diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index a4e9ac28..7214aa22 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -72,6 +72,24 @@ struct UnigramWordBase { const UnigramWordModel un; }; +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map::const_iterator it = table.find(rule); + assert(it != table.end()); + return it->second; + } + + std::tr1::unordered_map table; +}; + struct PhraseConditionalUninformativeBase { explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : kUNIFORM_TARGET(1.0 / vocab_e_size) { diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index edcdc813..db951d15 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -8,10 +8,84 @@ #include "prob.h" #include "ccrp_nt.h" +#include "mfcr.h" #include "trule.h" #include "base_measures.h" #include "tdict.h" +template +struct MConditionalTranslationModel { + explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : + rp0(rcp0), lambdas(1, 1.0), p0s(1) {} + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << -1 << '\t' << i2->first << std::endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + int DecrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + const TableCount delta = it->second.decrement(rule, rng); + if (delta.count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return 
delta.count; + } + + int IncrementRule(const TRule& rule, MT19937* rng) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, MFCR(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + } + p0s[0] = rp0(rule).as_float(); + TableCount delta = it->second.increment(rule, p0s, lambdas, rng); + return delta.count; + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p.logeq(log(rp0(rule))); + } else { + p0s[0] = rp0(rule).as_float(); + p = prob_t(it->second.prob(rule, p0s, lambdas)); + } + return p; + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); +#if 0 + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + p *= rp0(i2->first); + } +#endif + return p; + } + + const ConditionalBaseMeasure& rp0; + typedef std::tr1::unordered_map, + MFCR, + boost::hash > > RuleModelHash; + RuleModelHash r; + std::vector lambdas; + mutable std::vector p0s; +}; + template struct ConditionalTranslationModel { explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : diff --git a/word-aligner/stemmers/ur.pl b/word-aligner/stemmers/ur.pl new file mode 100755 index 00000000..3a4f5a45 --- /dev/null +++ b/word-aligner/stemmers/ur.pl @@ -0,0 +1,38 @@ +#!/usr/bin/perl -w + +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT,":utf8"); + +my $vocab = undef; +if (scalar @ARGV > 0) { + die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1); + $vocab = 1; +} + +my %dict; +while() { + chomp; + my @words = split /\s+/; + my @out = (); + for my $w (@words) { + my $tw = $dict{$w}; + if (!defined $tw) { + my $el = 4; + if ($w =~ /^(al|Al)/) { $el++; } + if ($el > length($w)) { $el = length($w); } + $tw = substr $w, 0, $el; + $dict{$w} = $tw; + } + push @out, $tw; + } + if ($vocab) { + die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1; + print "$_ @out\n"; + } else { + print "@out\n"; + } +} + -- cgit v1.2.3 From 4c2360119def2fb624d2691b355b1908c511f004 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 24 Jan 2012 22:26:44 -0500 Subject: more models --- gi/pf/align-lexonly.cc | 14 +++++++---- gi/pf/base_measures.cc | 2 +- gi/pf/base_measures.h | 27 ++++++++++++++++++++- training/model1.cc | 64 +++++++++++++++++++++++++++++++++++++++++++++++--- 4 files changed, 98 insertions(+), 9 deletions(-) diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index e9f1e7b6..76e2e009 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -122,10 +122,11 @@ struct BasicLexicalAlignment { vector* corp) : letters(lets), corpus(*corp), + up0("fr-en.10k.translit-base.txt.gz"), //up0(words_e), //up0("en.chars.1gram", letters_e), //up0("en.words.1gram"), - up0(letters_e), + //up0(letters_e), //up0("en.chars.2gram"), tmodel(up0) { } @@ -180,14 +181,18 @@ struct BasicLexicalAlignment { //PhraseConditionalUninformativeUnigramBase up0; //UnigramWordBase up0; //HierarchicalUnigramBase up0; - HierarchicalWordBase up0; + TableLookupBase up0; + //HierarchicalWordBase up0; + //PoissonUniformUninformativeBase up0; //CompletelyUniformBase up0; //FixedNgramBase up0; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; - 
ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; //ConditionalTranslationModel tmodel; }; @@ -222,6 +227,7 @@ void BasicLexicalAlignment::ResampleCorpus() { void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + if (*it >= l->size()) { l->resize(*it + 1); } vector& letters = (*l)[*it]; if (letters.size()) continue; // if e and f have the same word @@ -308,7 +314,7 @@ int main(int argc, char** argv) { x.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { - for (int j = 4995; j < 4997; ++j) Debug(corpus[j]); + for (int j = 395; j < 397; ++j) Debug(corpus[j]); cerr << i << "\t" << x.tmodel.r.size() << "\t"; if (i % 10 == 0) x.ResampleHyperparemeters(); x.ResampleCorpus(); diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc index 7894d3e7..4b1863fa 100644 --- a/gi/pf/base_measures.cc +++ b/gi/pf/base_measures.cc @@ -37,7 +37,7 @@ TableLookupBase::TableLookupBase(const string& fname) { } else if (cc == 1) { x.e_.push_back(cur); } else if (cc == 2) { - table[x] = atof(TD::Convert(cur)); + table[x].logeq(atof(TD::Convert(cur))); ++cc; } else { if (flag) cerr << endl; diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h index 7214aa22..b0495bfd 100644 --- a/gi/pf/base_measures.h +++ b/gi/pf/base_measures.h @@ -51,6 +51,22 @@ struct Model1 { std::vector > ttable; }; +struct PoissonUniformUninformativeBase { + explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); + prob_t q = kUNIFORM; q.poweq(r.e_.size()); + p *= q; + return p; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + struct CompletelyUniformBase { explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} prob_t operator()(const TRule&) const { @@ -83,10 +99,19 @@ struct TableLookupBase { prob_t operator()(const TRule& rule) const { const std::tr1::unordered_map::const_iterator it = table.find(rule); - assert(it != table.end()); + if (it == table.end()) { + std::cerr << rule << " not found\n"; + abort(); + } return it->second; } + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + void Summary() const {} + std::tr1::unordered_map table; }; diff --git a/training/model1.cc b/training/model1.cc index 346c0033..40249aa3 100644 --- a/training/model1.cc +++ b/training/model1.cc @@ -14,6 +14,11 @@ namespace po = boost::program_options; using namespace std; +inline double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -25,6 +30,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (<1 = flat >1 = sharp)") ("prob_align_null", po::value()->default_value(0.08), "When 
--favor_diagonal is set, what's the probability of a null alignment?") ("variational_bayes,v","Add a symmetric Dirichlet prior and infer VB estimate of weights") + ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model") ("alpha,a", po::value()->default_value(0.01), "Hyperparameter for optional Dirichlet prior") ("no_add_viterbi,V","Do not add Viterbi alignment points (may generate a grammar where some training sentence pairs are unreachable)"); po::options_description clo("Command line options"); @@ -63,6 +69,8 @@ int main(int argc, char** argv) { const bool write_alignments = (conf.count("write_alignments") > 0); const double diagonal_tension = conf["diagonal_tension"].as(); const double prob_align_null = conf["prob_align_null"].as(); + string testset; + if (conf.count("testset")) testset = conf["testset"].as(); const double prob_align_not_null = 1.0 - prob_align_null; const double alpha = conf["alpha"].as(); const bool favor_diagonal = conf.count("favor_diagonal"); @@ -73,6 +81,8 @@ int main(int argc, char** argv) { TTable tt; TTable::Word2Word2Double was_viterbi; + double tot_len_ratio = 0; + double mean_srclen_multiplier = 0; for (int iter = 0; iter < ITERATIONS; ++iter) { const bool final_iteration = (iter == (ITERATIONS - 1)); cerr << "ITERATION " << (iter + 1) << (final_iteration ? " (FINAL)" : "") << endl; @@ -83,13 +93,13 @@ int main(int argc, char** argv) { int lc = 0; bool flag = false; string line; + string ssrc, strg; while(true) { getline(in, line); if (!in) break; ++lc; if (lc % 1000 == 0) { cerr << '.'; flag = true; } if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - string ssrc, strg; ParseTranslatorInput(line, &ssrc, &strg); Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); @@ -99,9 +109,10 @@ int main(int argc, char** argv) { assert(src.size() > 0); assert(trg.size() > 0); } + if (iter == 0) + tot_len_ratio += static_cast(trg.size()) / static_cast(src.size()); denom += trg.size(); vector probs(src.size() + 1); - const double src_logprob = -log(src.size() + 1); bool first_al = true; // used for write_alignments for (int j = 0; j < trg.size(); ++j) { const WordID& f_j = trg[j][0].label; @@ -156,7 +167,7 @@ int main(int argc, char** argv) { for (int i = 1; i <= src.size(); ++i) tt.Increment(src[i-1][0].label, f_j, probs[i] / sum); } - likelihood += log(sum) + src_logprob; + likelihood += log(sum); } if (write_alignments && final_iteration) cout << endl; } @@ -165,6 +176,10 @@ int main(int argc, char** argv) { double base2_likelihood = likelihood / log(2); if (flag) { cerr << endl; } + if (iter == 0) { + mean_srclen_multiplier = tot_len_ratio / lc; + cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; + } cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; @@ -176,6 +191,49 @@ int main(int argc, char** argv) { tt.Normalize(); } } + if (testset.size()) { + ReadFile rf(testset); + istream& in = *rf.stream(); + int lc = 0; + double tlp = 0; + string ssrc, strg, line; + while (getline(in, line)) { + ++lc; + ParseTranslatorInput(line, &ssrc, &strg); + Lattice src, trg; + LatticeTools::ConvertTextToLattice(ssrc, &src); + LatticeTools::ConvertTextToLattice(strg, &trg); + double log_prob = log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); + + // compute likelihood + for (int j = 0; 
j < trg.size(); ++j) { + const WordID& f_j = trg[j][0].label; + double sum = 0; + const double j_over_ts = double(j) / trg.size(); + double prob_a_i = 1.0 / (src.size() + use_null); // uniform (model 1) + if (use_null) { + if (favor_diagonal) prob_a_i = prob_align_null; + sum += tt.prob(kNULL, f_j) * prob_a_i; + } + double az = 0; + if (favor_diagonal) { + for (int ta = 0; ta < src.size(); ++ta) + az += exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az /= prob_align_not_null; + } + for (int i = 1; i <= src.size(); ++i) { + if (favor_diagonal) + prob_a_i = exp(-fabs(double(i) / src.size() - j_over_ts) * diagonal_tension) / az; + sum += tt.prob(src[i-1][0].label, f_j) * prob_a_i; + } + log_prob += log(sum); + } + tlp += log_prob; + cerr << ssrc << " ||| " << strg << " ||| " << log_prob << endl; + } + cerr << "TOTAL LOG PROB " << tlp << endl; + } + if (write_alignments) return 0; for (TTable::Word2Word2Double::iterator ei = tt.ttable.begin(); ei != tt.ttable.end(); ++ei) { -- cgit v1.2.3 From 481a120564fdb73c8c6833e2102acb533683261c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 02:31:00 -0500 Subject: migrate mert to the new scorer interface --- gi/pf/base_distributions.cc | 241 ++++++++++++++++++++++++++++++++++++++++ gi/pf/base_distributions.h | 261 ++++++++++++++++++++++++++++++++++++++++++++ gi/pf/base_measures.cc | 241 ---------------------------------------- gi/pf/base_measures.h | 247 ----------------------------------------- mteval/ns.cc | 4 + mteval/ns.h | 10 +- vest/ces.cc | 42 +++---- vest/ces.h | 10 +- vest/dist-vest.pl | 4 +- vest/error_surface.cc | 11 +- vest/error_surface.h | 6 +- vest/line_optimizer.cc | 20 ++-- vest/line_optimizer.h | 2 + vest/lo_test.cc | 21 ++-- vest/mr_vest_map.cc | 16 +-- vest/mr_vest_reduce.cc | 34 +++--- 16 files changed, 602 insertions(+), 568 deletions(-) create mode 100644 gi/pf/base_distributions.cc create mode 100644 gi/pf/base_distributions.h delete mode 100644 gi/pf/base_measures.cc delete mode 100644 gi/pf/base_measures.h diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc new file mode 100644 index 00000000..4b1863fa --- /dev/null +++ b/gi/pf/base_distributions.cc @@ -0,0 +1,241 @@ +#include "base_measures.h" + +#include + +#include "filelib.h" + +using namespace std; + +TableLookupBase::TableLookupBase(const string& fname) { + cerr << "TableLookupBase reading from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + const WordID kDIV = TD::Convert("|||"); + vector tmp; + vector le, lf; + TRule x; + x.lhs_ = -TD::Convert("X"); + bool flag = false; + while(getline(in, line)) { + ++lc; + if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } + else if (lc % 25000 == 0) { cerr << '.' 
<< flush; flag = true; } + tmp.clear(); + TD::ConvertSentence(line, &tmp); + x.f_.clear(); + x.e_.clear(); + size_t pos = 0; + int cc = 0; + while(pos < tmp.size()) { + const WordID cur = tmp[pos++]; + if (cur == kDIV) { + ++cc; + } else if (cc == 0) { + x.f_.push_back(cur); + } else if (cc == 1) { + x.e_.push_back(cur); + } else if (cc == 2) { + table[x].logeq(atof(TD::Convert(cur))); + ++cc; + } else { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (cc != 3) { + if (flag) cerr << endl; + cerr << "Bad format in " << lc << ": " << line << endl; abort(); + } + } + if (flag) cerr << endl; + cerr << " read " << lc << " entries\n"; +} + +prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform + return p; +} + +prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t p; + //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) + p *= kUNIFORM_TARGET; // draw e_i ~Uniform + return p; +} + +void Model1::LoadModel1(const string& fname) { + cerr << "Loading Model 1 parameters from " << fname << " ..." << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + int cur = 0; + int start = 0; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + const WordID src = TD::Convert(&line[0]); + ++cur; + start = cur; + while(cur < line.size() && line[cur] != ' ') { ++cur; } + assert(cur != line.size()); + line[cur] = 0; + WordID trg = TD::Convert(&line[start]); + const double logprob = strtod(&line[cur + 1], NULL); + if (src >= ttable.size()) ttable.resize(src + 1); + ttable[src][trg].logeq(logprob); + } + cerr << " read " << lc << " parameters.\n"; +} + +prob_t PhraseConditionalBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + for (int i = 0; i < elen; ++i) { // for each position i in e-RHS + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! 
" << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +prob_t PhraseJointBase::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t p; + p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + // elen | flen ~Pois(flen + 0.01) + prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + p *= ptrglen; + p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform + for (int i = 0; i < elen; ++i) { // for each position i in E + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + return p; +} + +prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, + const vector& vtrg, + int start_src, int start_trg) const { + const int flen = vsrc.size() - start_src; + const int elen = vtrg.size() - start_trg; + prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); + prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); + + prob_t p1; + p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + // elen | flen ~Pois(flen + 0.01) + prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + p1 *= ptrglen; + p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform + for (int i = 0; i < elen; ++i) { // for each position i in E + const WordID trg = vtrg[i + start_trg]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < flen; ++j) { + const WordID src = j < 0 ? 0 : vsrc[j + start_src]; + tp += kM1MIXTURE * model1(src, trg); + tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; + } + tp *= uniform_src_alignment; // draw a_i ~uniform + p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p1.is_0()) { + cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + + prob_t p2; + p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) + // flen | elen ~Pois(flen + 0.01) + prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); + p2 *= psrclen; + p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform + for (int i = 0; i < flen; ++i) { // for each position i in E + const WordID src = vsrc[i + start_src]; + prob_t tp = prob_t::Zero(); + for (int j = -1; j < elen; ++j) { + const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; + tp += kM1MIXTURE * invmodel1(trg, src); + tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; + } + tp *= uniform_trg_alignment; // draw a_i ~uniform + p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform + } + if (p2.is_0()) { + cerr << "Zero! 
" << vsrc << "\nTRG=" << vtrg << endl; + abort(); + } + + static const prob_t kHALF(0.5); + return (p1 + p2) * kHALF; +} + +JumpBase::JumpBase() : p(200) { + for (unsigned src_len = 1; src_len < 200; ++src_len) { + map& cpd = p[src_len]; + int min_jump = 1 - src_len; + int max_jump = src_len; + prob_t z; + for (int j = min_jump; j <= max_jump; ++j) { + prob_t& cp = cpd[j]; + if (j < 0) + cp.logeq(log_poisson(1.5-j, 1)); + else if (j > 0) + cp.logeq(log_poisson(j, 1)); + cp.poweq(0.2); + z += cp; + } + for (int j = min_jump; j <= max_jump; ++j) { + cpd[j] /= z; + } + } +} + diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h new file mode 100644 index 00000000..a23ac32b --- /dev/null +++ b/gi/pf/base_distributions.h @@ -0,0 +1,261 @@ +#ifndef _BASE_MEASURES_H_ +#define _BASE_MEASURES_H_ + +#include +#include +#include +#include +#include +#include + +#include "unigrams.h" +#include "trule.h" +#include "prob.h" +#include "tdict.h" +#include "sampler.h" + +inline double log_poisson(unsigned x, const double& lambda) { + assert(lambda > 0.0); + return log(lambda) * x - lgamma(x + 1) - lambda; +} + +inline double log_binom_coeff(unsigned n, unsigned k) { + assert(n >= k); + if (n == k) return 0.0; + return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); +} + +// http://en.wikipedia.org/wiki/Negative_binomial_distribution +inline double log_negative_binom(unsigned x, unsigned r, double p) { + assert(p > 0.0); + assert(p < 1.0); + return log_binom_coeff(x + r - 1, x) + r * log(1 - p) + x * log(p); +} + +inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +struct Model1 { + explicit Model1(const std::string& fname) : + kNULL(TD::Convert("")), + kZERO() { + LoadModel1(fname); + } + + void LoadModel1(const std::string& fname); + + // returns prob 0 if src or trg is not found + const prob_t& operator()(WordID src, WordID trg) const { + if (src == 0) src = kNULL; + if (src < ttable.size()) { + const std::map& cpd = ttable[src]; + const std::map::const_iterator it = cpd.find(trg); + if (it != cpd.end()) + return it->second; + } + return kZERO; + } + + const WordID kNULL; + const prob_t kZERO; + std::vector > ttable; +}; + +struct PoissonUniformUninformativeBase { + explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); + prob_t q = kUNIFORM; q.poweq(r.e_.size()); + p *= q; + return p; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct CompletelyUniformBase { + explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} + prob_t operator()(const TRule&) const { + return kUNIFORM; + } + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM; +}; + +struct UnigramWordBase { + explicit UnigramWordBase(const std::string& fname) : un(fname) {} + prob_t operator()(const TRule& r) const { + return un(r.e_); + } + const UnigramWordModel un; +}; + +struct RuleHasher { + size_t operator()(const TRule& r) const { + return hash_value(r); + } +}; + +struct TableLookupBase { + 
TableLookupBase(const std::string& fname); + + prob_t operator()(const TRule& rule) const { + const std::tr1::unordered_map::const_iterator it = table.find(rule); + if (it == table.end()) { + std::cerr << rule << " not found\n"; + abort(); + } + return it->second; + } + + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + void Summary() const {} + + std::tr1::unordered_map table; +}; + +struct PhraseConditionalUninformativeBase { + explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + void Summary() const {} + void ResampleHyperparameters(MT19937*) {} + void Increment(const TRule&) {} + void Decrement(const TRule&) {} + prob_t Likelihood() const { return prob_t::One(); } + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseConditionalUninformativeUnigramBase { + explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const UnigramModel u; +}; + +struct PhraseConditionalBase { + explicit PhraseConditionalBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ | rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase { + explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : + model1(m1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_SOURCE(1.0 / vocab_f_size), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ , rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_SOURCE; + const prob_t kUNIFORM_TARGET; +}; + +struct PhraseJointBase_BiDir { + explicit PhraseJointBase_BiDir(const Model1& m1, + const Model1& im1, + const double m1mixture, + const unsigned vocab_e_size, + const unsigned vocab_f_size) : + model1(m1), + invmodel1(im1), + kM1MIXTURE(m1mixture), + kUNIFORM_MIXTURE(1.0 - m1mixture), + kUNIFORM_SOURCE(1.0 / vocab_f_size), + kUNIFORM_TARGET(1.0 / vocab_e_size) { + 
assert(m1mixture >= 0.0 && m1mixture <= 1.0); + assert(vocab_e_size > 0); + } + + // return p0 of rule.e_ , rule.f_ + prob_t operator()(const TRule& rule) const { + return p0(rule.f_, rule.e_, 0, 0); + } + + prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; + + const Model1& model1; + const Model1& invmodel1; + const prob_t kM1MIXTURE; // Model 1 mixture component + const prob_t kUNIFORM_MIXTURE; // uniform mixture component + const prob_t kUNIFORM_SOURCE; + const prob_t kUNIFORM_TARGET; +}; + +// base distribution for jump size multinomials +// basically p(0) = 0 and then, p(1) is max, and then +// you drop as you move to the max jump distance +struct JumpBase { + JumpBase(); + + const prob_t& operator()(int jump, unsigned src_len) const { + assert(jump != 0); + const std::map::const_iterator it = p[src_len].find(jump); + assert(it != p[src_len].end()); + return it->second; + } + std::vector > p; +}; + + +#endif diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc deleted file mode 100644 index 4b1863fa..00000000 --- a/gi/pf/base_measures.cc +++ /dev/null @@ -1,241 +0,0 @@ -#include "base_measures.h" - -#include - -#include "filelib.h" - -using namespace std; - -TableLookupBase::TableLookupBase(const string& fname) { - cerr << "TableLookupBase reading from " << fname << " ..." << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - const WordID kDIV = TD::Convert("|||"); - vector tmp; - vector le, lf; - TRule x; - x.lhs_ = -TD::Convert("X"); - bool flag = false; - while(getline(in, line)) { - ++lc; - if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; } - else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; } - tmp.clear(); - TD::ConvertSentence(line, &tmp); - x.f_.clear(); - x.e_.clear(); - size_t pos = 0; - int cc = 0; - while(pos < tmp.size()) { - const WordID cur = tmp[pos++]; - if (cur == kDIV) { - ++cc; - } else if (cc == 0) { - x.f_.push_back(cur); - } else if (cc == 1) { - x.e_.push_back(cur); - } else if (cc == 2) { - table[x].logeq(atof(TD::Convert(cur))); - ++cc; - } else { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (cc != 3) { - if (flag) cerr << endl; - cerr << "Bad format in " << lc << ": " << line << endl; abort(); - } - } - if (flag) cerr << endl; - cerr << " read " << lc << " entries\n"; -} - -prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform - return p; -} - -prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t p; - //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) - p *= kUNIFORM_TARGET; // draw e_i ~Uniform - return p; -} - -void Model1::LoadModel1(const string& fname) { - cerr << "Loading Model 1 parameters from " << fname << " ..." 
<< endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - string line; - unsigned lc = 0; - while(getline(in, line)) { - ++lc; - int cur = 0; - int start = 0; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - const WordID src = TD::Convert(&line[0]); - ++cur; - start = cur; - while(cur < line.size() && line[cur] != ' ') { ++cur; } - assert(cur != line.size()); - line[cur] = 0; - WordID trg = TD::Convert(&line[start]); - const double logprob = strtod(&line[cur + 1], NULL); - if (src >= ttable.size()) ttable.resize(src + 1); - ttable[src][trg].logeq(logprob); - } - cerr << " read " << lc << " parameters.\n"; -} - -prob_t PhraseConditionalBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - for (int i = 0; i < elen; ++i) { // for each position i in e-RHS - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t p; - p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); - p *= ptrglen; - p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - return p; -} - -prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, - const vector& vtrg, - int start_src, int start_trg) const { - const int flen = vsrc.size() - start_src; - const int elen = vtrg.size() - start_trg; - prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); - prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); - - prob_t p1; - p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) - // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); - p1 *= ptrglen; - p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform - for (int i = 0; i < elen; ++i) { // for each position i in E - const WordID trg = vtrg[i + start_trg]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < flen; ++j) { - const WordID src = j < 0 ? 
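The PhraseConditionalBase::p0 density above draws the target length from Poisson(|f| + 0.01) and each target word from a mixture of Model 1 (under a uniformly chosen source position, NULL included) and a uniform target distribution. A rough numeric sketch of that generative story; the translation table, mixture weights, and vocabulary size below are made up for illustration:

    #include <cmath>
    #include <cstdio>
    #include <map>
    #include <string>
    #include <vector>

    double log_poisson(unsigned x, double lambda) {
      return std::log(lambda) * x - std::lgamma(x + 1.0) - lambda;
    }

    int main() {
      // Hypothetical Model 1 table p(e|f); the empty string plays the role of NULL.
      std::map<std::string, std::map<std::string, double> > m1;
      m1["le"]["the"] = 0.8;
      m1["chat"]["cat"] = 0.9;
      m1[""]["the"] = 0.1;

      std::vector<std::string> src, trg;
      src.push_back("le"); src.push_back("chat");
      trg.push_back("the"); trg.push_back("cat");

      const double m1_mix = 0.95, unif_mix = 0.05;   // mixture weights
      const double unif_target = 1.0 / 50000.0;      // 1 / |target vocab|, invented
      const double flen = src.size();

      double logp = log_poisson(trg.size(), flen + 0.01);   // target length model
      const double unif_align = 1.0 / (flen + 1.0);         // uniform alignment, incl. NULL
      for (size_t i = 0; i < trg.size(); ++i) {
        double tp = 0.0;
        for (int j = -1; j < (int)src.size(); ++j) {
          const std::string f = (j < 0) ? "" : src[j];
          tp += m1_mix * m1[f][trg[i]] + unif_mix * unif_target;
        }
        logp += std::log(tp * unif_align);                  // marginalize the alignment
      }
      std::printf("log p0(e|f) = %.4f\n", logp);
      return 0;
    }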
0 : vsrc[j + start_src]; - tp += kM1MIXTURE * model1(src, trg); - tp += kUNIFORM_MIXTURE * kUNIFORM_TARGET; - } - tp *= uniform_src_alignment; // draw a_i ~uniform - p1 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p1.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - prob_t p2; - p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) - // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); - p2 *= psrclen; - p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform - for (int i = 0; i < flen; ++i) { // for each position i in E - const WordID src = vsrc[i + start_src]; - prob_t tp = prob_t::Zero(); - for (int j = -1; j < elen; ++j) { - const WordID trg = j < 0 ? 0 : vtrg[j + start_trg]; - tp += kM1MIXTURE * invmodel1(trg, src); - tp += kUNIFORM_MIXTURE * kUNIFORM_SOURCE; - } - tp *= uniform_trg_alignment; // draw a_i ~uniform - p2 *= tp; // draw e_i ~Model1(f_a_i) / uniform - } - if (p2.is_0()) { - cerr << "Zero! " << vsrc << "\nTRG=" << vtrg << endl; - abort(); - } - - static const prob_t kHALF(0.5); - return (p1 + p2) * kHALF; -} - -JumpBase::JumpBase() : p(200) { - for (unsigned src_len = 1; src_len < 200; ++src_len) { - map& cpd = p[src_len]; - int min_jump = 1 - src_len; - int max_jump = src_len; - prob_t z; - for (int j = min_jump; j <= max_jump; ++j) { - prob_t& cp = cpd[j]; - if (j < 0) - cp.logeq(log_poisson(1.5-j, 1)); - else if (j > 0) - cp.logeq(log_poisson(j, 1)); - cp.poweq(0.2); - z += cp; - } - for (int j = min_jump; j <= max_jump; ++j) { - cpd[j] /= z; - } - } -} - diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h deleted file mode 100644 index b0495bfd..00000000 --- a/gi/pf/base_measures.h +++ /dev/null @@ -1,247 +0,0 @@ -#ifndef _BASE_MEASURES_H_ -#define _BASE_MEASURES_H_ - -#include -#include -#include -#include -#include - -#include "unigrams.h" -#include "trule.h" -#include "prob.h" -#include "tdict.h" -#include "sampler.h" - -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? 
"" : " ") << TD::Convert(p[i]); - return os << ']'; -} - -struct Model1 { - explicit Model1(const std::string& fname) : - kNULL(TD::Convert("")), - kZERO() { - LoadModel1(fname); - } - - void LoadModel1(const std::string& fname); - - // returns prob 0 if src or trg is not found - const prob_t& operator()(WordID src, WordID trg) const { - if (src == 0) src = kNULL; - if (src < ttable.size()) { - const std::map& cpd = ttable[src]; - const std::map::const_iterator it = cpd.find(trg); - if (it != cpd.end()) - return it->second; - } - return kZERO; - } - - const WordID kNULL; - const prob_t kZERO; - std::vector > ttable; -}; - -struct PoissonUniformUninformativeBase { - explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); - prob_t q = kUNIFORM; q.poweq(r.e_.size()); - p *= q; - return p; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct CompletelyUniformBase { - explicit CompletelyUniformBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} - prob_t operator()(const TRule&) const { - return kUNIFORM; - } - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM; -}; - -struct UnigramWordBase { - explicit UnigramWordBase(const std::string& fname) : un(fname) {} - prob_t operator()(const TRule& r) const { - return un(r.e_); - } - const UnigramWordModel un; -}; - -struct RuleHasher { - size_t operator()(const TRule& r) const { - return hash_value(r); - } -}; - -struct TableLookupBase { - TableLookupBase(const std::string& fname); - - prob_t operator()(const TRule& rule) const { - const std::tr1::unordered_map::const_iterator it = table.find(rule); - if (it == table.end()) { - std::cerr << rule << " not found\n"; - abort(); - } - return it->second; - } - - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - void Summary() const {} - - std::tr1::unordered_map table; -}; - -struct PhraseConditionalUninformativeBase { - explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) : - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - void Summary() const {} - void ResampleHyperparameters(MT19937*) {} - void Increment(const TRule&) {} - void Decrement(const TRule&) {} - prob_t Likelihood() const { return prob_t::One(); } - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseConditionalUninformativeUnigramBase { - explicit PhraseConditionalUninformativeUnigramBase(const std::string& file, const unsigned vocab_e_size) : u(file, vocab_e_size) {} - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const UnigramModel u; -}; - -struct PhraseConditionalBase { - explicit PhraseConditionalBase(const Model1& 
m1, const double m1mixture, const unsigned vocab_e_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ | rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase { - explicit PhraseJointBase(const Model1& m1, const double m1mixture, const unsigned vocab_e_size, const unsigned vocab_f_size) : - model1(m1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -struct PhraseJointBase_BiDir { - explicit PhraseJointBase_BiDir(const Model1& m1, - const Model1& im1, - const double m1mixture, - const unsigned vocab_e_size, - const unsigned vocab_f_size) : - model1(m1), - invmodel1(im1), - kM1MIXTURE(m1mixture), - kUNIFORM_MIXTURE(1.0 - m1mixture), - kUNIFORM_SOURCE(1.0 / vocab_f_size), - kUNIFORM_TARGET(1.0 / vocab_e_size) { - assert(m1mixture >= 0.0 && m1mixture <= 1.0); - assert(vocab_e_size > 0); - } - - // return p0 of rule.e_ , rule.f_ - prob_t operator()(const TRule& rule) const { - return p0(rule.f_, rule.e_, 0, 0); - } - - prob_t p0(const std::vector& vsrc, const std::vector& vtrg, int start_src, int start_trg) const; - - const Model1& model1; - const Model1& invmodel1; - const prob_t kM1MIXTURE; // Model 1 mixture component - const prob_t kUNIFORM_MIXTURE; // uniform mixture component - const prob_t kUNIFORM_SOURCE; - const prob_t kUNIFORM_TARGET; -}; - -// base distribution for jump size multinomials -// basically p(0) = 0 and then, p(1) is max, and then -// you drop as you move to the max jump distance -struct JumpBase { - JumpBase(); - - const prob_t& operator()(int jump, unsigned src_len) const { - assert(jump != 0); - const std::map::const_iterator it = p[src_len].find(jump); - assert(it != p[src_len].end()); - return it->second; - } - std::vector > p; -}; - - -#endif diff --git a/mteval/ns.cc b/mteval/ns.cc index 68c8deaa..da678b84 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -136,6 +136,10 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { float* correct, // N elements reserved float* hyp, // N elements reserved bool clip_counts = true) const { + // clear clipping stats + for (typename NGramCountMap::iterator it = ngrams_.begin(); it != ngrams_.end(); ++it) + it->second.second = 0; + vector ngram(N); *correct *= 0; *hyp *= 0; diff --git a/mteval/ns.h b/mteval/ns.h index 622265db..d88c263b 100644 --- a/mteval/ns.h +++ b/mteval/ns.h @@ -6,6 +6,7 @@ #include #include #include "wordid.h" +#include class SufficientStats { public: @@ -43,6 +44,11 @@ class 
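The small fix above ("clear clipping stats") resets each reference n-gram's per-sentence usage counter before counting, so clipping still works when the same evaluator scores many hypotheses in a row. For readers unfamiliar with clipping, here is the idea for unigrams in a tiny self-contained form (the example sentences are arbitrary):

    #include <cstdio>
    #include <map>
    #include <string>

    // Clipped unigram precision: each hypothesis word is credited at most as many
    // times as it appears in the reference; the patch applies the same idea to
    // n-grams, which is why the per-sentence clip counters must be reset.
    int main() {
      const char* hyp_toks[] = { "the", "the", "the", "cat" };
      const char* ref_toks[] = { "the", "cat", "sat" };
      std::map<std::string, int> ref_counts, used;
      for (int i = 0; i < 3; ++i) ref_counts[ref_toks[i]]++;
      int correct = 0, total = 0;
      for (int i = 0; i < 4; ++i, ++total) {
        int& u = used[hyp_toks[i]];
        if (u < ref_counts[hyp_toks[i]]) { ++u; ++correct; }
      }
      std::printf("clipped precision = %d/%d\n", correct, total);  // 2/4
      return 0;
    }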
SufficientStats { bool operator==(const SufficientStats& other) const { return other.fields == fields; } + bool IsAdditiveIdentity() const { + for (unsigned i = 0; i < fields.size(); ++i) + if (fields[i]) return false; + return true; + } size_t size() const { return fields.size(); } float operator[](size_t i) const { if (i < fields.size()) return fields[i]; @@ -54,12 +60,12 @@ class SufficientStats { std::vector fields; }; -inline const SufficientStats& operator+(const SufficientStats& a, const SufficientStats& b) { +inline const SufficientStats operator+(const SufficientStats& a, const SufficientStats& b) { SufficientStats res(a); return res += b; } -inline const SufficientStats& operator-(const SufficientStats& a, const SufficientStats& b) { +inline const SufficientStats operator-(const SufficientStats& a, const SufficientStats& b) { SufficientStats res(a); return res -= b; } diff --git a/vest/ces.cc b/vest/ces.cc index 4ae6b695..cd89aa69 100644 --- a/vest/ces.cc +++ b/vest/ces.cc @@ -4,25 +4,32 @@ #include #include -#include "aligner.h" +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" #include "lattice.h" #include "viterbi_envelope.h" #include "error_surface.h" +#include "ns.h" using boost::shared_ptr; using namespace std; const bool minimize_segments = true; // if adjacent segments have equal scores, merge them -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* env, const ScoreType type, const Hypergraph& hg) { +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ViterbiEnvelope& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { vector prev_trans; const vector >& ienv = ve.GetSortedSegs(); env->resize(ienv.size()); - ScoreP prev_score; + SufficientStats prev_score; // defaults to 0 int j = 0; for (int i = 0; i < ienv.size(); ++i) { const Segment& seg = *ienv[i]; vector trans; +#if 0 if (type == AER) { vector edges(hg.edges_.size(), false); seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi @@ -46,34 +53,31 @@ void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, Er string tstr = os.str(); TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); } else { +#endif seg.ConstructTranslation(&trans); - } - // cerr << "Scoring: " << TD::GetString(trans) << endl; + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; if (trans == prev_trans) { if (!minimize_segments) { - assert(prev_score); // if this fails, it means - // the decoder can generate null translations ErrorSegment& out = (*env)[j]; - out.delta = prev_score->GetZero(); + out.delta.fields.clear(); out.x = seg.x; ++j; } - // cerr << "Identical translation, skipping scoring\n"; + //cerr << "Identical translation, skipping scoring\n"; } else { - ScoreP score = ss.ScoreCandidate(trans); + SufficientStats score; + ss.Evaluate(trans, &score); // cerr << "score= " << score->ComputeScore() << "\n"; - ScoreP cur_delta_p = score->GetZero(); - Score* cur_delta = cur_delta_p.get(); - // just record the score diffs - if (!prev_score) - prev_score = score->GetZero(); - - score->Subtract(*prev_score, cur_delta); + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; prev_trans.swap(trans); prev_score = score; - if ((!minimize_segments) || (!cur_delta->IsAdditiveIdentity())) { + if 
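One easy-to-miss change above is the return type of operator+ and operator-: the old signatures returned a const reference to the local res, which dangles as soon as the function returns, while the new ones return by value. A toy illustration of the corrected pattern; the Stats type here is a stand-in, not the real SufficientStats:

    #include <cstdio>
    #include <vector>

    struct Stats {
      std::vector<float> fields;
      Stats& operator+=(const Stats& o) {
        if (o.fields.size() > fields.size()) fields.resize(o.fields.size(), 0.0f);
        for (size_t i = 0; i < o.fields.size(); ++i) fields[i] += o.fields[i];
        return *this;
      }
      bool IsAdditiveIdentity() const {
        for (size_t i = 0; i < fields.size(); ++i)
          if (fields[i]) return false;
        return true;
      }
    };

    // Return by value: 'res' is a local, so returning a reference to it
    // (as the old signature did) would be a dangling reference.
    inline Stats operator+(const Stats& a, const Stats& b) {
      Stats res(a);
      return res += b;
    }

    int main() {
      Stats a, b;
      a.fields.assign(4, 1.0f);
      b.fields.assign(4, 2.0f);
      Stats c = a + b;
      std::printf("c[0]=%.1f  identity? %d\n", c.fields[0], (int)c.IsAdditiveIdentity());
      return 0;
    }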
((!minimize_segments) || (!delta.IsAdditiveIdentity())) { ErrorSegment& out = (*env)[j]; - out.delta = cur_delta_p; + out.delta = delta; out.x = seg.x; ++j; } diff --git a/vest/ces.h b/vest/ces.h index 2f098990..e021e715 100644 --- a/vest/ces.h +++ b/vest/ces.h @@ -1,12 +1,16 @@ #ifndef _CES_H_ #define _CES_H_ -#include "scorer.h" - class ViterbiEnvelope; class Hypergraph; +class SegmentEvaluator; class ErrorSurface; +class EvaluationMetric; -void ComputeErrorSurface(const SentenceScorer& ss, const ViterbiEnvelope& ve, ErrorSurface* es, const ScoreType type, const Hypergraph& hg); +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ViterbiEnvelope& ve, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); #endif diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index c382a972..8cde748b 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -364,7 +364,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; if ($use_make) { my $script_file = "$dir/scripts/map.$shard"; open F, ">$script_file" or die "Can't write $script_file: $!"; @@ -424,7 +424,7 @@ while (1){ print STDERR "Results for $tol/$til lines\n"; print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; print STDERR unchecked_output("date"); - $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -l $metric > $dir/redoutput.$im1"; + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; print STDERR "COMMAND:\n$cmd\n"; check_bash_call($cmd); $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; diff --git a/vest/error_surface.cc b/vest/error_surface.cc index 754aa8de..515b67f8 100644 --- a/vest/error_surface.cc +++ b/vest/error_surface.cc @@ -5,8 +5,7 @@ using namespace std; -ErrorSurface::~ErrorSurface() { -} +ErrorSurface::~ErrorSurface() {} void ErrorSurface::Serialize(std::string* out) const { const int segments = this->size(); @@ -15,8 +14,8 @@ void ErrorSurface::Serialize(std::string* out) const { for (int i = 0; i < segments; ++i) { const ErrorSegment& cur = (*this)[i]; string senc; - cur.delta->Encode(&senc); - assert(senc.size() < 256); + cur.delta.Encode(&senc); + assert(senc.size() < 1024); unsigned char len = senc.size(); os.write((const char*)&cur.x, sizeof(cur.x)); os.write((const char*)&len, sizeof(len)); @@ -25,7 +24,7 @@ void ErrorSurface::Serialize(std::string* out) const { *out = os.str(); } -void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { +void ErrorSurface::Deserialize(const std::string& in) { istringstream is(in, ios::binary); int segments; is.read((char*)&segments, sizeof(segments)); @@ -37,7 +36,7 @@ void ErrorSurface::Deserialize(ScoreType type, const std::string& in) { is.read((char*)&len, sizeof(len)); string senc(len, '\0'); assert(senc.size() == len); is.read((char*)&senc[0], len); - cur.delta = SentenceScorer::CreateScoreFromString(type, senc); + cur.delta = SufficientStats(senc); } } diff --git a/vest/error_surface.h b/vest/error_surface.h index ad728cfa..bb65847b 100644 --- a/vest/error_surface.h +++ b/vest/error_surface.h @@ -4,13 +4,13 @@ #include #include -#include "scorer.h" +#include "ns.h" class Score; struct 
ErrorSegment { double x; - ScoreP delta; + SufficientStats delta; ErrorSegment() : x(0), delta() {} }; @@ -18,7 +18,7 @@ class ErrorSurface : public std::vector { public: ~ErrorSurface(); void Serialize(std::string* out) const; - void Deserialize(ScoreType type, const std::string& in); + void Deserialize(const std::string& in); }; #endif diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc index 7303df8d..49443fbe 100644 --- a/vest/line_optimizer.cc +++ b/vest/line_optimizer.cc @@ -4,7 +4,7 @@ #include #include "sparse_vector.h" -#include "scorer.h" +#include "ns.h" using namespace std; @@ -18,6 +18,7 @@ struct IntervalComp { }; double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, const vector& surfaces, const LineOptimizer::ScoreType type, float* best_score, @@ -32,8 +33,7 @@ double LineOptimizer::LineOptimize( } sort(all_ints.begin(), all_ints.end(), IntervalComp()); double last_boundary = all_ints.front()->x; - ScoreP accp = all_ints.front()->delta->GetZero(); - Score *acc=accp.get(); + SufficientStats acc; float& cur_best_score = *best_score; cur_best_score = (type == MAXIMIZE_SCORE ? -numeric_limits::max() : numeric_limits::max()); @@ -42,9 +42,8 @@ double LineOptimizer::LineOptimize( for (vector::iterator i = all_ints.begin(); i != all_ints.end(); ++i) { const ErrorSegment& seg = **i; - assert(seg.delta); if (seg.x - last_boundary > epsilon) { - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -54,16 +53,18 @@ double LineOptimizer::LineOptimize( } else { pos = last_boundary + (seg.x - last_boundary) / 2; } - // cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; } - // string xx; acc->ScoreDetails(&xx); cerr << "---- " << xx; + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; // cerr << "---- s=" << sco << "\n"; last_boundary = seg.x; } // cerr << "x-boundary=" << seg.x << "\n"; - acc->PlusEquals(*seg.delta); + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; } - float sco = acc->ComputeScore(); + float sco = metric->ComputeScore(acc); if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || (type == MINIMIZE_SCORE && sco < cur_best_score) ) { cur_best_score = sco; @@ -107,3 +108,4 @@ void LineOptimizer::CreateOptimizationDirections( RandomUnitVector(features_to_optimize, &out[i], rng); cerr << "Generated " << out.size() << " total axes to optimize along.\n"; } + diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h index 99a591f4..83819f41 100644 --- a/vest/line_optimizer.h +++ b/vest/line_optimizer.h @@ -7,6 +7,7 @@ #include "error_surface.h" #include "sampler.h" +class EvaluationMetric; class Weights; struct LineOptimizer { @@ -18,6 +19,7 @@ struct LineOptimizer { // merge all the error surfaces together into a global // error surface and find (the middle of) the best segment static double LineOptimize( + const EvaluationMetric* metric, const std::vector& envs, const LineOptimizer::ScoreType type, float* best_score, diff --git a/vest/lo_test.cc b/vest/lo_test.cc index f5638600..a67f65e1 100644 --- a/vest/lo_test.cc +++ b/vest/lo_test.cc @@ -5,6 +5,8 @@ #include #include +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "fdict.h" #include "hg.h" @@ -15,7 +17,6 @@ 
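LineOptimize above walks the merged, sorted error surface once: it keeps a running sum of the segment deltas, scores the accumulated statistics at each interval boundary, and remembers the midpoint of the best interval. The same sweep in one dimension with a plain float score and an invented error surface (epsilon and the surface values are arbitrary):

    #include <cstdio>
    #include <limits>
    #include <vector>

    struct Segment { double x; float delta; };   // boundary position and score change

    int main() {
      // Invented, already-merged error surface, sorted by x.
      std::vector<Segment> surface;
      Segment s;
      s.x = -0.7; s.delta = +0.10f; surface.push_back(s);
      s.x = -0.1; s.delta = +0.25f; surface.push_back(s);
      s.x =  0.4; s.delta = -0.15f; surface.push_back(s);
      s.x =  1.3; s.delta = +0.05f; surface.push_back(s);

      float acc = 0.0f, best = -std::numeric_limits<float>::max();
      double last = surface.front().x, best_x = 0.0;
      for (size_t i = 0; i < surface.size(); ++i) {
        if (surface[i].x - last > 1e-6) {
          if (acc > best) {                                  // maximizing the metric
            best = acc;
            best_x = last + (surface[i].x - last) / 2.0;     // middle of the best interval
          }
          last = surface[i].x;
        }
        acc += surface[i].delta;                             // apply this boundary's delta
      }
      if (acc > best) { best = acc; best_x = last + 0.1; }   // region past the last boundary
      std::printf("best step along direction: %.3f (score %.2f)\n", best_x, best);
      return 0;
    }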
#include "viterbi.h" #include "viterbi_envelope.h" #include "line_optimizer.h" -#include "scorer.h" using namespace std; using boost::shared_ptr; @@ -141,9 +142,6 @@ TEST_F(OptTest, TestS1) { TD::ConvertSentence(ref22, &refs2[1]); TD::ConvertSentence(ref32, &refs2[2]); TD::ConvertSentence(ref42, &refs2[3]); - ScoreType type = ScoreTypeFromString("ibm_bleu"); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, refs1); - ScorerP scorer2 = SentenceScorer::CreateSentenceScorer(type, refs2); vector envs(2); RandomNumberGenerator rng; @@ -167,14 +165,17 @@ TEST_F(OptTest, TestS1) { envs[1] = Inside(hg2, NULL, wf); vector es(2); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); - ComputeErrorSurface(*scorer2, envs[1], &es[1], IBM_BLEU, hg2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); cerr << envs[0].size() << " " << envs[1].size() << endl; cerr << es[0].size() << " " << es[1].size() << endl; envs.clear(); clock_t t_env=clock(); float score; - double m = LineOptimizer::LineOptimize(es, LineOptimizer::MAXIMIZE_SCORE, &score); + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); clock_t t_opt=clock(); cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; EXPECT_FLOAT_EQ(0.48719698, score); @@ -217,15 +218,15 @@ TEST_F(OptTest,TestZeroOrigin) { vector envs(1); envs[0] = Inside(hg, NULL, wf); - ScoreType type = ScoreTypeFromString("ibm_bleu"); vector > mr(4); TD::ConvertSentence("untitled", &mr[0]); TD::ConvertSentence("with no title", &mr[1]); TD::ConvertSentence("without a title", &mr[2]); TD::ConvertSentence("without title", &mr[3]); - ScorerP scorer1 = SentenceScorer::CreateSentenceScorer(type, mr); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); vector es(1); - ComputeErrorSurface(*scorer1, envs[0], &es[0], IBM_BLEU, hg); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); } int main(int argc, char **argv) { diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index 71dda6d7..8f6e085d 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -6,11 +6,12 @@ #include #include +#include "ns.h" +#include "ns_docscorer.h" #include "ces.h" #include "filelib.h" #include "stringlib.h" #include "sparse_vector.h" -#include "scorer.h" #include "viterbi_envelope.h" #include "inside_outside.h" #include "error_surface.h" @@ -25,7 +26,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") ("source,s",po::value(), "Source file (ignored, except for AER)") - ("loss_function,l",po::value()->default_value("ibm_bleu"), "Loss function being optimized") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") ("help,h", "Help"); po::options_description dcmdline_options; @@ -67,10 +68,10 @@ bool ReadSparseVectorString(const string& s, SparseVector* v) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType 
type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as >(), conf["source"].as()); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; + const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; Hypergraph hg; string last_file; ReadFile in_read(conf["input"].as()); @@ -97,7 +98,8 @@ int main(int argc, char** argv) { ViterbiEnvelopeWeightFunction wf(origin, axis); ViterbiEnvelope ve = Inside(hg, NULL, wf); ErrorSurface es; - ComputeErrorSurface(*ds[sent_id], ve, &es, type, hg); + + ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg); //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; // cerr << "Error surface has " << es.size() << " segments\n"; string val; diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc index 3df52020..dda61f88 100644 --- a/vest/mr_vest_reduce.cc +++ b/vest/mr_vest_reduce.cc @@ -10,6 +10,7 @@ #include "error_surface.h" #include "line_optimizer.h" #include "b64tools.h" +#include "stringlib.h" using namespace std; namespace po = boost::program_options; @@ -17,12 +18,12 @@ namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("loss_function,l",po::value(), "Loss function being optimized") + ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = conf->count("loss_function") == 0; + bool flag = conf->count("evaluation_metric") == 0; if (flag || conf->count("help")) { cerr << dcmdline_options << endl; exit(1); @@ -32,30 +33,27 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string loss_function = conf["loss_function"].as(); - ScoreType type = ScoreTypeFromString(loss_function); + const string evaluation_metric = conf["evaluation_metric"].as(); LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (type == TER || type == AER) { + if (UppercaseString(evaluation_metric) == "TER") opt_type = LineOptimizer::MINIMIZE_SCORE; - } - string last_key; + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + vector esv; - while(cin) { - string line; - getline(cin, line); - if (line.empty()) continue; + string last_key, line, key, val; + while(getline(cin, line)) { size_t ks = line.find("\t"); assert(string::npos != ks); assert(ks > 2); - string key = line.substr(2, ks - 2); - string val = line.substr(ks + 1); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); if (key != last_key) { if (!last_key.empty()) { float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } - last_key = key; + last_key.swap(key); esv.clear(); } if (val.size() % 4 != 0) { @@ -68,13 +66,11 @@ int main(int argc, char** argv) { continue; } esv.push_back(ErrorSurface()); - esv.back().Deserialize(type, encoded); + esv.back().Deserialize(encoded); } if 
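The reducer loop above follows the usual sorted-stream pattern: read key and value from each tab-separated line, buffer values while the key repeats, and run the line optimizer once per key. Stripped of the base64 decoding and the optimization itself, the skeleton looks like this (the "M " prefix handling of the real mapper output is omitted):

    #include <iostream>
    #include <string>
    #include <vector>

    int main() {
      std::string line, key, val, last_key;
      std::vector<std::string> group;               // all encoded surfaces for one key
      while (std::getline(std::cin, line)) {
        const size_t tab = line.find('\t');
        if (tab == std::string::npos) continue;
        key = line.substr(0, tab);
        val = line.substr(tab + 1);
        if (key != last_key) {
          if (!last_key.empty())
            std::cout << last_key << " -> optimize over " << group.size() << " surfaces\n";
          last_key.swap(key);
          group.clear();
        }
        group.push_back(val);
      }
      if (!group.empty())
        std::cout << last_key << " -> optimize over " << group.size() << " surfaces\n";
      return 0;
    }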
(!esv.empty()) { - // cerr << "ESV=" << esv.size() << endl; - // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; } float score; - double x = LineOptimizer::LineOptimize(esv, opt_type, &score); + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); cout << last_key << "|" << x << "|" << score << endl; } return 0; -- cgit v1.2.3 From 203c3c3357b9ed8cfe44932c2bf5ea19eba6238c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 13:19:27 -0500 Subject: migration to new metric api for vest, clean up of unsupported/not functional code --- mteval/mbr_kbest.cc | 21 +- utils/fast_sparse_vector.h | 6 + vest/dist-vest.pl | 22 +-- vest/mbr_kbest.cc | 138 ------------- vest/mr_vest_generate_mapper_input.cc | 356 ++++++---------------------------- vest/mr_vest_map.cc | 16 +- 6 files changed, 84 insertions(+), 475 deletions(-) delete mode 100644 vest/mbr_kbest.cc diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index 64a6a8bf..b5e4750c 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -5,7 +5,7 @@ #include "prob.h" #include "tdict.h" -#include "scorer.h" +#include "ns.h" #include "filelib.h" #include "stringlib.h" @@ -17,7 +17,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("scale,a",po::value()->default_value(1.0), "Posterior scaling factor (alpha)") - ("loss_function,l",po::value()->default_value("bleu"), "Loss function") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric") ("input,i",po::value()->default_value("-"), "File to read k-best lists from") ("output_list,L", "Show reranked list as output") ("help,h", "Help"); @@ -75,13 +75,14 @@ bool ReadKBestList(istream* in, string* sent_id, vector, pro int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); - const string metric = conf["loss_function"].as(); + const string smetric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(smetric); + const bool is_loss = (UppercaseString(smetric) == "TER"); const bool output_list = conf.count("output_list") > 0; const string file = conf["input"].as(); const double mbr_scale = conf["scale"].as(); cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; - ScoreType type = ScoreTypeFromString(metric); vector, prob_t> > list; ReadFile rf(file); string sent_id; @@ -99,15 +100,15 @@ int main(int argc, char** argv) { vector mbr_scores(output_list ? 
list.size() : 0); double mbr_loss = numeric_limits::max(); for (int i = 0 ; i < list.size(); ++i) { - vector > refs(1, list[i].first); - //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; - ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); + const vector > refs(1, list[i].first); + double wl_acc = 0; for (int j = 0; j < list.size(); ++j) { if (i != j) { - ScoreP s = scorer->ScoreCandidate(list[j].first); - double loss = 1.0 - s->ComputeScore(); - if (type == TER || type == AER) loss = 1.0 - loss; + SufficientStats ss; + metric->ComputeSufficientStatistics(list[j].first, refs, &ss); + double loss = 1.0 - metric->ComputeScore(ss); + if (is_loss) loss = 1.0 - loss; double weighted_loss = loss * (joints[j] / marginal).as_float(); wl_acc += weighted_loss; if ((!output_list) && wl_acc > mbr_loss) break; diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 1301581a..17fa47bf 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -178,6 +178,12 @@ class FastSparseVector { T l2norm() const { return sqrt(l2norm_sq()); } + T pnorm(const double p) const { + T sum = T(); + for (const_iterator it = begin(), e = end(); it != e; ++it) + sum += pow(fabs(it->second), p); + return pow(sum, 1.0 / p); + } // if values are binary, gives |A intersect B|/|A union B| template S tanimoto_coef(const FastSparseVector &vec) const { diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index 8cde748b..1ec8c6b1 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -65,8 +65,6 @@ my $oraclen=0; my $oracleb=20; my $bleu_weight=1; my $use_make = 1; # use make to parallelize line search -my $dirargs=''; -my $density_prune; my $useqsub; my $pass_suffix = ''; my $cpbin=1; @@ -75,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev"); if (GetOptions( "decoder=s" => \$decoderOpt, "jobs=i" => \$jobs, - "density-prune=f" => \$density_prune, "dont-clean" => \$disable_clean, "pass-suffix=s" => \$pass_suffix, "dry-run" => \$dryrun, @@ -87,15 +84,7 @@ if (GetOptions( "normalize=s" => \$normalize, "pmem=s" => \$pmem, "cpbin!" => \$cpbin, - "rand-directions=i" => \$rand_directions, - "random_directions=i" => \$rand_directions, - "bleu_weight=s" => \$bleu_weight, - "no-primary!" => \$noprimary, - "max-similarity=s" => \$maxsim, - "oracle-directions=i" => \$oraclen, - "n-oracle=i" => \$oraclen, - "oracle-batch=i" => \$oracleb, - "directions-args=s" => \$dirargs, + "random-directions=i" => \$rand_directions, "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, @@ -107,10 +96,6 @@ if (GetOptions( exit; } -if (defined $density_prune) { - die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0; -} - if ($useqsub) { $use_make = 0; die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); @@ -328,10 +313,7 @@ while (1){ print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; print STDERR unchecked_output("date"); $icc++; - my $nop=$noprimary?"--no_primary":""; - my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):""; - my $bwargs=$bleu_weight!=1 ? 
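The rewritten loop above is plain minimum Bayes risk selection: every k-best entry is scored against every other entry, the losses are averaged under the (alpha-scaled) posterior, and the entry with the lowest expected loss wins. A small self-contained version of the decision rule with a toy loss in place of 1 - BLEU; the sentences and model scores are invented:

    #include <cmath>
    #include <cstdio>
    #include <set>
    #include <sstream>
    #include <string>
    #include <vector>

    // Stand-in loss: 1 - (shared word types / hyp word types). Not BLEU; illustrative only.
    double Loss(const std::string& hyp, const std::string& ref) {
      std::set<std::string> h, r; std::string w;
      std::istringstream hs(hyp); while (hs >> w) h.insert(w);
      std::istringstream rs(ref); while (rs >> w) r.insert(w);
      int shared = 0;
      for (std::set<std::string>::const_iterator it = h.begin(); it != h.end(); ++it)
        if (r.count(*it)) ++shared;
      return h.empty() ? 1.0 : 1.0 - double(shared) / h.size();
    }

    int main() {
      // Hypothetical k-best list with log model scores.
      std::vector<std::string> kbest;
      std::vector<double> logp;
      kbest.push_back("the cat sat on the mat");    logp.push_back(-2.1);
      kbest.push_back("a cat sat on the mat");      logp.push_back(-2.3);
      kbest.push_back("the cat is sitting on mat"); logp.push_back(-2.6);

      const double alpha = 1.0;                     // posterior scaling factor
      std::vector<double> post(kbest.size());
      double z = 0.0;
      for (size_t i = 0; i < kbest.size(); ++i) { post[i] = std::exp(alpha * logp[i]); z += post[i]; }

      size_t best = 0; double best_risk = 1e100;
      for (size_t i = 0; i < kbest.size(); ++i) {   // candidate translation
        double risk = 0.0;
        for (size_t j = 0; j < kbest.size(); ++j)   // expectation over the "truth"
          if (i != j) risk += Loss(kbest[j], kbest[i]) * post[j] / z;
        if (risk < best_risk) { best_risk = risk; best = i; }
      }
      std::printf("MBR pick: %s (risk %.3f)\n", kbest[best].c_str(), best_risk);
      return 0;
    }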
"--bleu_weight=$bleu_weight":""; - $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter"; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; print STDERR "COMMAND:\n$cmd\n"; check_call($cmd); check_call("mkdir -p $dir/splag.$im1"); diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc deleted file mode 100644 index 2867b36b..00000000 --- a/vest/mbr_kbest.cc +++ /dev/null @@ -1,138 +0,0 @@ -#include -#include - -#include - -#include "prob.h" -#include "tdict.h" -#include "scorer.h" -#include "filelib.h" -#include "stringlib.h" - -using namespace std; - -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("scale,a",po::value()->default_value(1.0), "Posterior scaling factor (alpha)") - ("loss_function,l",po::value()->default_value("bleu"), "Loss function") - ("input,i",po::value()->default_value("-"), "File to read k-best lists from") - ("output_list,L", "Show reranked list as output") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct LossComparer { - bool operator()(const pair, double>& a, const pair, double>& b) const { - return a.second < b.second; - } -}; - -bool ReadKBestList(istream* in, string* sent_id, vector, prob_t> >* list) { - static string cache_id; - static pair, prob_t> cache_pair; - list->clear(); - string cur_id; - if (cache_pair.first.size() > 0) { - list->push_back(cache_pair); - cur_id = cache_id; - cache_pair.first.clear(); - } - string line; - string tstr; - while(*in) { - getline(*in, line); - if (line.empty()) continue; - size_t p1 = line.find(" ||| "); - if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p2 = line.find(" ||| ", p1 + 4); - if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); } - size_t p3 = line.rfind(" ||| "); - cache_id = line.substr(0, p1); - tstr = line.substr(p1 + 5, p2 - p1 - 5); - double val = strtod(line.substr(p3 + 5).c_str(), NULL); - TD::ConvertSentence(tstr, &cache_pair.first); - cache_pair.second.logeq(val); - if (cur_id.empty()) cur_id = cache_id; - if (cur_id == cache_id) { - list->push_back(cache_pair); - *sent_id = cur_id; - cache_pair.first.clear(); - } else { break; } - } - return !list->empty(); -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string metric = conf["loss_function"].as(); - const bool output_list = conf.count("output_list") > 0; - const string file = conf["input"].as(); - const double mbr_scale = conf["scale"].as(); - cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl; - - ScoreType type = ScoreTypeFromString(metric); - vector, prob_t> > list; - ReadFile rf(file); - string sent_id; - while(ReadKBestList(rf.stream(), &sent_id, &list)) { - vector joints(list.size()); - const prob_t max_score = pow(list.front().second, mbr_scale); - prob_t marginal = prob_t::Zero(); - for (int i = 0 ; i < list.size(); ++i) { - const prob_t joint = pow(list[i].second, mbr_scale) / max_score; - joints[i] = joint; - // cerr << 
"list[" << i << "] joint=" << log(joint) << endl; - marginal += joint; - } - int mbr_idx = -1; - vector mbr_scores(output_list ? list.size() : 0); - double mbr_loss = numeric_limits::max(); - for (int i = 0 ; i < list.size(); ++i) { - vector > refs(1, list[i].first); - //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl; - ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs); - double wl_acc = 0; - for (int j = 0; j < list.size(); ++j) { - if (i != j) { - ScoreP s = scorer->ScoreCandidate(list[j].first); - double loss = 1.0 - s->ComputeScore(); - if (type == TER || type == AER) loss = 1.0 - loss; - double weighted_loss = loss * (joints[j] / marginal); - wl_acc += weighted_loss; - if ((!output_list) && wl_acc > mbr_loss) break; - } - } - if (output_list) mbr_scores[i] = wl_acc; - if (wl_acc < mbr_loss) { - mbr_loss = wl_acc; - mbr_idx = i; - } - } - // cerr << "ML translation: " << TD::GetString(list[0].first) << endl; - cerr << "MBR Best idx: " << mbr_idx << endl; - if (output_list) { - for (int i = 0; i < list.size(); ++i) - list[i].second.logeq(mbr_scores[i]); - sort(list.begin(), list.end(), LossComparer()); - for (int i = 0; i < list.size(); ++i) - cout << sent_id << " ||| " - << TD::GetString(list[i].first) << " ||| " - << log(list[i].second) << endl; - } else { - cout << TD::GetString(list[mbr_idx].first) << endl; - } - } - return 0; -} - diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index 0c094fd5..59d4f24f 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -1,320 +1,78 @@ -//TODO: debug segfault when references supplied, null shared_ptr when oracle #include #include -#include #include #include -#include "sampler.h" #include "filelib.h" #include "weights.h" #include "line_optimizer.h" -#include "hg.h" -#include "hg_io.h" -#include "scorer.h" -#include "oracle_bleu.h" -#include "ff_bleu.h" - -const bool DEBUG_ORACLE=true; - -//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle -//void register_feature_functions(); -//FFRegistry ff_registry; -namespace { -void init_bleumodel() { - ff_registry.clear(); - ff_registry.Register(new FFFactory); -} - -struct init_ff { - init_ff() { - init_bleumodel(); - } -}; -//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead. 
-} using namespace std; namespace po = boost::program_options; -typedef SparseVector Dir; -typedef Dir Point; - -void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) { - // return; //TODO: debug - if (min_dist<=0) return; - double max_s=1.-min_dist; - if (log&&verbose) *log<<"max allowed S="< "<add_options() - ("dev_set_size,s",po::value(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value(&forest_repository),"[REQD] Path to forest repository") - ("weights,w",po::value(&weights_file),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value(&n_random)->default_value(10),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") - ("oracle_batch,b",po::value(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") - ("no_old_to_hope","don't emit the usual old -> hope oracle") - ("decoder_translations",po::value(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. 
sentences-seen-so-far BLEU") - ; - } - void InitCommandLine(int argc, char *argv[], po::variables_map *conf) { - po::options_description opts("Configuration options"); - AddOptions(&opts); - opts.add_options()("help,h", "Help"); - - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -s N\n"; - goto bad_cmdline; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w \n"; - goto bad_cmdline; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r \n"; - goto bad_cmdline; - } - if (n_oracle && oracle.refs.empty()) { - cerr<<"Specify references when using oracle directions\n"; - goto bad_cmdline; - } - if (conf->count("help")) { - cout << dcmdline_options << endl; - exit(0); - } - - return; - bad_cmdline: - cerr << dcmdline_options << endl; - exit(1); +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; } - - int main(int argc, char *argv[]) { - po::variables_map conf; - InitCommandLine(argc,argv,&conf); - init_bleumodel(); - UseConf(conf); - Run(); - return 0; + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; } - bool verbose() const { return oracle.verbose; } - void Run() { -// register_feature_functions(); - AddPrimaryAndRandomDirections(); - AddOracleDirections(); - compress_similar(directions,max_similarity,&cerr,true,verbose()); - Print(); + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; } - - - Point origin; // old weights that gave model 1best. 
- vector optimize_features; - void UseConf(po::variables_map const& conf) { - oracle.UseConf(conf); - include_primary=!conf.count("no_primary"); - old_to_hope=!conf.count("no_old_to_hope"); - - if (conf.count("optimize_feature") > 0) - optimize_features=conf["optimize_feature"].as >(); - Init(); + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); } +} - string weights_file; - double max_similarity; - unsigned n_oracle, oracle_batch; - string forest_repository; - unsigned dev_set_size; - vector oracles; - vector fids; - string forest_file(unsigned i) const { - ostringstream o; - o << forest_repository << '/' << i << ".json.gz"; - return o.str(); - } - - oracle_directions() { } - - Sentences model_hyps; - - vector model_scores; - bool have_doc; - void Init() { - have_doc=!decoder_translations_file.empty(); - if (have_doc) { - model_hyps.Load(decoder_translations_file); - if (verbose()) model_hyps.Print(cerr,5); - model_scores.resize(model_hyps.size()); - if (dev_set_size!=model_hyps.size()) { - cerr<<"You supplied decoder_translations with a different number of lines ("<ScoreCandidate(model_hyps[i]); - assert(model_scores[i]); - if (verbose()) cerr<<"Before model["<ScoreDetails()<PlusEquals(*model_scores[i]); - if (verbose()) cerr<<"After model["< features; - vector dorigin; - Weights::InitFromFile(weights_file, &dorigin, &features); - if (optimize_features.size()) - features=optimize_features; - Weights::InitSparseVector(dorigin, &origin); - fids.clear(); - AddFeatureIds(features); - oracles.resize(dev_set_size); - } - - void AddFeatureIds(vector const& features) { - int i = fids.size(); - fids.resize(fids.size()+features.size()); - for (; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - } - - - std::string decoder_translations_file; // one per line - //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg - void adjust_doc(unsigned i,double scale=1.) { - oracle.doc_score->PlusEquals(*model_scores[i],scale); - } - - Score &ds() { - return *oracle.doc_score; - } - - Oracle const& ComputeOracle(unsigned i) { - Oracle &o=oracles[i]; - if (o.is_null()) { - if (have_doc) { - if (verbose()) cerr<<"Before removing i="<PlusEquals(*hopesc,1); - cerr<<"With hope: "<PlusEquals(*hopesc,-1); - cerr<<"Without hope: "<ScoreDetails()<=dev_set_size) ? 
rsg() : b); - - if (old_to_hope) - o2hope+=o.ModelHopeGradient(); - if (fear_to_hope) - fear2hope+=o.FearHopeGradient(); - } - double N=(double)oracle_batch; - if (old_to_hope) { - o2hope/=N; - directions.push_back(o2hope); - } - if (fear_to_hope) { - fear2hope/=N; - directions.push_back(fear2hope); - } +int main(int argc, char** argv) { + RandomNumberGenerator rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector features; + SparseVector origin; + vector w; + Weights::InitFromFile(conf["weights"].as(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as >(); + vector > directions; + vector fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; } } -}; - -int main(int argc, char** argv) { - oracle_directions od; - return od.main(argc,argv); + return 0; } diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc index 8f6e085d..7d9625bc 100644 --- a/vest/mr_vest_map.cc +++ b/vest/mr_vest_map.cc @@ -82,20 +82,20 @@ int main(int argc, char** argv) { if (line.empty()) continue; istringstream is(line); int sent_id; - string file, s_origin, s_axis; + string file, s_origin, s_direction; // path-to-file (JSON) sent_ed starting-point search-direction - is >> file >> sent_id >> s_origin >> s_axis; + is >> file >> sent_id >> s_origin >> s_direction; SparseVector origin; - assert(ReadSparseVectorString(s_origin, &origin)); - SparseVector axis; - assert(ReadSparseVectorString(s_axis, &axis)); - // cerr << "File: " << file << "\nAxis: " << axis << "\n X: " << origin << endl; + ReadSparseVectorString(s_origin, &origin); + SparseVector direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; if (last_file != file) { last_file = file; ReadFile rf(file); HypergraphIO::ReadFromJSON(rf.stream(), &hg); } - ViterbiEnvelopeWeightFunction wf(origin, axis); + ViterbiEnvelopeWeightFunction wf(origin, direction); ViterbiEnvelope ve = Inside(hg, NULL, wf); ErrorSurface es; @@ -104,7 +104,7 @@ int main(int argc, char** argv) { // cerr << "Error surface has " << es.size() << " segments\n"; string val; es.Serialize(&val); - cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t'; + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; B64::b64encode(val.c_str(), val.size(), &cout); cout << endl << flush; } -- cgit v1.2.3 From 74c61ef9c9dc5cefbad4aa9513973965dd583ee7 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 27 Jan 2012 14:30:44 -0500 Subject: Silly windows users, code isn't executable --- utils/agenda.h | 0 utils/batched_append.h | 0 utils/best.h | 0 utils/fast_lexical_cast.hpp | 0 utils/feature_vector.h | 0 utils/ftoa.h | 0 utils/hash.h | 0 utils/have_64_bits.h | 0 utils/indices_after.h | 0 utils/int_or_pointer.h | 0 utils/intern_pool.h | 0 utils/intrusive_refcount.hpp | 0 utils/lvalue_pmap.h | 0 
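The slimmed-down generator above simply crosses every development sentence with every optimization direction and writes one line of mapper input per pair: forest path, sentence id, the origin (current weights), and the direction, each printed as feature=value pairs. A sketch of producing that kind of work list, with an ad-hoc random unit direction in place of LineOptimizer::CreateOptimizationDirections and invented feature names and paths:

    #include <cmath>
    #include <cstdio>
    #include <cstdlib>
    #include <vector>

    int main() {
      // Hypothetical feature set and current weights (the origin of every line search).
      const char*  feats[]  = { "LanguageModel", "PhraseModel_0", "WordPenalty" };
      const double origin[] = { 1.1, 0.7, -0.3 };
      const unsigned F = 3, dev_set_size = 2, n_directions = 3;

      // Random unit directions over the same features.
      std::vector<std::vector<double> > dirs(n_directions, std::vector<double>(F));
      std::srand(1);
      for (unsigned d = 0; d < n_directions; ++d) {
        double norm = 0.0;
        for (unsigned k = 0; k < F; ++k) {
          dirs[d][k] = 2.0 * std::rand() / RAND_MAX - 1.0;
          norm += dirs[d][k] * dirs[d][k];
        }
        norm = std::sqrt(norm);
        for (unsigned k = 0; k < F; ++k) dirs[d][k] /= norm;
      }

      // One work item per (sentence, direction): forest path, sentence id, origin, direction.
      for (unsigned i = 0; i < dev_set_size; ++i) {
        for (unsigned d = 0; d < n_directions; ++d) {
          std::printf("forests/%u.json.gz %u ", i, i);
          for (unsigned k = 0; k < F; ++k) std::printf("%s=%g;", feats[k], origin[k]);
          std::printf(" ");
          for (unsigned k = 0; k < F; ++k) std::printf("%s=%g;", feats[k], dirs[d][k]);
          std::printf("\n");
        }
      }
      return 0;
    }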
utils/max_plus.h | 0 utils/maybe_update_bound.h | 0 utils/murmur_hash.h | 0 utils/named_enum.h | 0 utils/nan.h | 0 utils/null_deleter.h | 0 utils/null_traits.h | 0 utils/semiring.h | 0 utils/show.h | 0 utils/static_utoa.h | 0 utils/string_to.h | 0 utils/stringlib_test.cc | 0 utils/swap_pod.h | 0 utils/utoa.h | 0 utils/value_array.h | 0 28 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 utils/agenda.h mode change 100755 => 100644 utils/batched_append.h mode change 100755 => 100644 utils/best.h mode change 100755 => 100644 utils/fast_lexical_cast.hpp mode change 100755 => 100644 utils/feature_vector.h mode change 100755 => 100644 utils/ftoa.h mode change 100755 => 100644 utils/hash.h mode change 100755 => 100644 utils/have_64_bits.h mode change 100755 => 100644 utils/indices_after.h mode change 100755 => 100644 utils/int_or_pointer.h mode change 100755 => 100644 utils/intern_pool.h mode change 100755 => 100644 utils/intrusive_refcount.hpp mode change 100755 => 100644 utils/lvalue_pmap.h mode change 100755 => 100644 utils/max_plus.h mode change 100755 => 100644 utils/maybe_update_bound.h mode change 100755 => 100644 utils/murmur_hash.h mode change 100755 => 100644 utils/named_enum.h mode change 100755 => 100644 utils/nan.h mode change 100755 => 100644 utils/null_deleter.h mode change 100755 => 100644 utils/null_traits.h mode change 100755 => 100644 utils/semiring.h mode change 100755 => 100644 utils/show.h mode change 100755 => 100644 utils/static_utoa.h mode change 100755 => 100644 utils/string_to.h mode change 100755 => 100644 utils/stringlib_test.cc mode change 100755 => 100644 utils/swap_pod.h mode change 100755 => 100644 utils/utoa.h mode change 100755 => 100644 utils/value_array.h diff --git a/utils/agenda.h b/utils/agenda.h old mode 100755 new mode 100644 diff --git a/utils/batched_append.h b/utils/batched_append.h old mode 100755 new mode 100644 diff --git a/utils/best.h b/utils/best.h old mode 100755 new mode 100644 diff --git a/utils/fast_lexical_cast.hpp b/utils/fast_lexical_cast.hpp old mode 100755 new mode 100644 diff --git a/utils/feature_vector.h b/utils/feature_vector.h old mode 100755 new mode 100644 diff --git a/utils/ftoa.h b/utils/ftoa.h old mode 100755 new mode 100644 diff --git a/utils/hash.h b/utils/hash.h old mode 100755 new mode 100644 diff --git a/utils/have_64_bits.h b/utils/have_64_bits.h old mode 100755 new mode 100644 diff --git a/utils/indices_after.h b/utils/indices_after.h old mode 100755 new mode 100644 diff --git a/utils/int_or_pointer.h b/utils/int_or_pointer.h old mode 100755 new mode 100644 diff --git a/utils/intern_pool.h b/utils/intern_pool.h old mode 100755 new mode 100644 diff --git a/utils/intrusive_refcount.hpp b/utils/intrusive_refcount.hpp old mode 100755 new mode 100644 diff --git a/utils/lvalue_pmap.h b/utils/lvalue_pmap.h old mode 100755 new mode 100644 diff --git a/utils/max_plus.h b/utils/max_plus.h old mode 100755 new mode 100644 diff --git a/utils/maybe_update_bound.h b/utils/maybe_update_bound.h old mode 100755 new mode 100644 diff --git a/utils/murmur_hash.h b/utils/murmur_hash.h old mode 100755 new mode 100644 diff --git a/utils/named_enum.h b/utils/named_enum.h old mode 100755 new mode 100644 diff --git a/utils/nan.h b/utils/nan.h old mode 100755 new mode 100644 diff --git a/utils/null_deleter.h b/utils/null_deleter.h old mode 100755 new mode 100644 diff --git a/utils/null_traits.h b/utils/null_traits.h old mode 100755 new mode 100644 diff --git a/utils/semiring.h b/utils/semiring.h old mode 100755 new mode 
100644 diff --git a/utils/show.h b/utils/show.h old mode 100755 new mode 100644 diff --git a/utils/static_utoa.h b/utils/static_utoa.h old mode 100755 new mode 100644 diff --git a/utils/string_to.h b/utils/string_to.h old mode 100755 new mode 100644 diff --git a/utils/stringlib_test.cc b/utils/stringlib_test.cc old mode 100755 new mode 100644 diff --git a/utils/swap_pod.h b/utils/swap_pod.h old mode 100755 new mode 100644 diff --git a/utils/utoa.h b/utils/utoa.h old mode 100755 new mode 100644 diff --git a/utils/value_array.h b/utils/value_array.h old mode 100755 new mode 100644 -- cgit v1.2.3 From 47aa8d94d3ddff39295966cee67ce884c98be8da Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 14:49:08 -0500 Subject: rename vest to dpmert (dynamic programming mert), rename variables and types to correspond to standard geometric concepts --- Makefile.am | 2 +- configure.ac | 2 +- dpmert/Makefile.am | 35 ++ dpmert/README.shared-mem | 9 + dpmert/cat.pl | 4 + dpmert/ces.cc | 91 ++++ dpmert/ces.h | 16 + dpmert/dpmert.pl | 700 ++++++++++++++++++++++++++++++ dpmert/error_surface.cc | 42 ++ dpmert/error_surface.h | 24 + dpmert/libcall.pl | 71 +++ dpmert/line_mediator.pl | 116 +++++ dpmert/line_optimizer.cc | 111 +++++ dpmert/line_optimizer.h | 48 ++ dpmert/lo_test.cc | 236 ++++++++++ dpmert/mert_geometry.cc | 186 ++++++++ dpmert/mert_geometry.h | 81 ++++ dpmert/mr_dpmert_generate_mapper_input.cc | 78 ++++ dpmert/mr_dpmert_map.cc | 112 +++++ dpmert/mr_dpmert_reduce.cc | 77 ++++ dpmert/parallelize.pl | 423 ++++++++++++++++++ dpmert/sentclient.c | 76 ++++ dpmert/sentserver.c | 515 ++++++++++++++++++++++ dpmert/sentserver.h | 6 + dpmert/tac.pl | 8 + dpmert/test_aer/README | 8 + dpmert/test_aer/cdec.ini | 3 + dpmert/test_aer/corpus.src | 3 + dpmert/test_aer/grammar | 12 + dpmert/test_aer/ref.0 | 3 + dpmert/test_aer/weights | 13 + dpmert/test_data/0.json.gz | Bin 0 -> 13709 bytes dpmert/test_data/1.json.gz | Bin 0 -> 204803 bytes dpmert/test_data/c2e.txt.0 | 2 + dpmert/test_data/c2e.txt.1 | 2 + dpmert/test_data/c2e.txt.2 | 2 + dpmert/test_data/c2e.txt.3 | 2 + dpmert/test_data/re.txt.0 | 5 + dpmert/test_data/re.txt.1 | 5 + dpmert/test_data/re.txt.2 | 5 + dpmert/test_data/re.txt.3 | 5 + vest/Makefile.am | 35 -- vest/README.shared-mem | 9 - vest/cat.pl | 4 - vest/ces.cc | 91 ---- vest/ces.h | 16 - vest/dist-vest.pl | 700 ------------------------------ vest/error_surface.cc | 42 -- vest/error_surface.h | 24 - vest/libcall.pl | 71 --- vest/line_mediator.pl | 116 ----- vest/line_optimizer.cc | 111 ----- vest/line_optimizer.h | 48 -- vest/lo_test.cc | 236 ---------- vest/mr_vest_generate_mapper_input.cc | 78 ---- vest/mr_vest_map.cc | 112 ----- vest/mr_vest_reduce.cc | 77 ---- vest/parallelize.pl | 423 ------------------ vest/sentclient.c | 76 ---- vest/sentserver.c | 515 ---------------------- vest/sentserver.h | 6 - vest/tac.pl | 8 - vest/test_aer/README | 8 - vest/test_aer/cdec.ini | 3 - vest/test_aer/corpus.src | 3 - vest/test_aer/grammar | 12 - vest/test_aer/ref.0 | 3 - vest/test_aer/weights | 13 - vest/test_data/0.json.gz | Bin 13709 -> 0 bytes vest/test_data/1.json.gz | Bin 204803 -> 0 bytes vest/test_data/c2e.txt.0 | 2 - vest/test_data/c2e.txt.1 | 2 - vest/test_data/c2e.txt.2 | 2 - vest/test_data/c2e.txt.3 | 2 - vest/test_data/re.txt.0 | 5 - vest/test_data/re.txt.1 | 5 - vest/test_data/re.txt.2 | 5 - vest/test_data/re.txt.3 | 5 - vest/viterbi_envelope.cc | 177 -------- vest/viterbi_envelope.h | 81 ---- 80 files changed, 3137 insertions(+), 3128 deletions(-) create mode 100644 
dpmert/Makefile.am create mode 100644 dpmert/README.shared-mem create mode 100755 dpmert/cat.pl create mode 100644 dpmert/ces.cc create mode 100644 dpmert/ces.h create mode 100755 dpmert/dpmert.pl create mode 100644 dpmert/error_surface.cc create mode 100644 dpmert/error_surface.h create mode 100644 dpmert/libcall.pl create mode 100755 dpmert/line_mediator.pl create mode 100644 dpmert/line_optimizer.cc create mode 100644 dpmert/line_optimizer.h create mode 100644 dpmert/lo_test.cc create mode 100644 dpmert/mert_geometry.cc create mode 100644 dpmert/mert_geometry.h create mode 100644 dpmert/mr_dpmert_generate_mapper_input.cc create mode 100644 dpmert/mr_dpmert_map.cc create mode 100644 dpmert/mr_dpmert_reduce.cc create mode 100755 dpmert/parallelize.pl create mode 100644 dpmert/sentclient.c create mode 100644 dpmert/sentserver.c create mode 100644 dpmert/sentserver.h create mode 100755 dpmert/tac.pl create mode 100644 dpmert/test_aer/README create mode 100644 dpmert/test_aer/cdec.ini create mode 100644 dpmert/test_aer/corpus.src create mode 100644 dpmert/test_aer/grammar create mode 100644 dpmert/test_aer/ref.0 create mode 100644 dpmert/test_aer/weights create mode 100644 dpmert/test_data/0.json.gz create mode 100644 dpmert/test_data/1.json.gz create mode 100644 dpmert/test_data/c2e.txt.0 create mode 100644 dpmert/test_data/c2e.txt.1 create mode 100644 dpmert/test_data/c2e.txt.2 create mode 100644 dpmert/test_data/c2e.txt.3 create mode 100644 dpmert/test_data/re.txt.0 create mode 100644 dpmert/test_data/re.txt.1 create mode 100644 dpmert/test_data/re.txt.2 create mode 100644 dpmert/test_data/re.txt.3 delete mode 100644 vest/Makefile.am delete mode 100644 vest/README.shared-mem delete mode 100755 vest/cat.pl delete mode 100644 vest/ces.cc delete mode 100644 vest/ces.h delete mode 100755 vest/dist-vest.pl delete mode 100644 vest/error_surface.cc delete mode 100644 vest/error_surface.h delete mode 100644 vest/libcall.pl delete mode 100755 vest/line_mediator.pl delete mode 100644 vest/line_optimizer.cc delete mode 100644 vest/line_optimizer.h delete mode 100644 vest/lo_test.cc delete mode 100644 vest/mr_vest_generate_mapper_input.cc delete mode 100644 vest/mr_vest_map.cc delete mode 100644 vest/mr_vest_reduce.cc delete mode 100755 vest/parallelize.pl delete mode 100644 vest/sentclient.c delete mode 100644 vest/sentserver.c delete mode 100644 vest/sentserver.h delete mode 100755 vest/tac.pl delete mode 100644 vest/test_aer/README delete mode 100644 vest/test_aer/cdec.ini delete mode 100644 vest/test_aer/corpus.src delete mode 100644 vest/test_aer/grammar delete mode 100644 vest/test_aer/ref.0 delete mode 100644 vest/test_aer/weights delete mode 100644 vest/test_data/0.json.gz delete mode 100644 vest/test_data/1.json.gz delete mode 100644 vest/test_data/c2e.txt.0 delete mode 100644 vest/test_data/c2e.txt.1 delete mode 100644 vest/test_data/c2e.txt.2 delete mode 100644 vest/test_data/c2e.txt.3 delete mode 100644 vest/test_data/re.txt.0 delete mode 100644 vest/test_data/re.txt.1 delete mode 100644 vest/test_data/re.txt.2 delete mode 100644 vest/test_data/re.txt.3 delete mode 100644 vest/viterbi_envelope.cc delete mode 100644 vest/viterbi_envelope.h diff --git a/Makefile.am b/Makefile.am index 59c2fc0a..c0fcb1f6 100644 --- a/Makefile.am +++ b/Makefile.am @@ -1,7 +1,7 @@ # warning - the subdirectories in the following list should # be kept in topologically sorted order. Also, DO NOT introduce # cyclic dependencies between these directories! 
-SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira vest pro-train extools gi/pf gi/markov_al +SUBDIRS = utils mteval klm/util klm/lm decoder phrasinator training mira dpmert pro-train extools gi/pf gi/markov_al #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava diff --git a/configure.ac b/configure.ac index 131a1705..cd78ee72 100644 --- a/configure.ac +++ b/configure.ac @@ -113,4 +113,4 @@ then AM_CONDITIONAL([GLC], true) fi -AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile vest/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) +AC_OUTPUT(Makefile utils/Makefile mteval/Makefile extools/Makefile decoder/Makefile phrasinator/Makefile training/Makefile dpmert/Makefile pro-train/Makefile klm/util/Makefile klm/lm/Makefile mira/Makefile gi/pyp-topics/src/Makefile gi/clda/src/Makefile gi/pf/Makefile gi/markov_al/Makefile) diff --git a/dpmert/Makefile.am b/dpmert/Makefile.am new file mode 100644 index 00000000..2676fb50 --- /dev/null +++ b/dpmert/Makefile.am @@ -0,0 +1,35 @@ +bin_PROGRAMS = \ + mr_dpmert_map \ + mr_dpmert_reduce \ + mr_dpmert_generate_mapper_input \ + sentserver \ + sentclient + +if HAVE_GTEST +noinst_PROGRAMS = \ + lo_test +TESTS = lo_test +endif + +sentserver_SOURCES = sentserver.c +sentserver_LDFLAGS = -all-static -pthread + +sentclient_SOURCES = sentclient.c +sentclient_LDFLAGS = -all-static -pthread + +mr_dpmert_generate_mapper_input_SOURCES = mr_dpmert_generate_mapper_input.cc line_optimizer.cc +mr_dpmert_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +# nbest2hg_SOURCES = nbest2hg.cc +# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz + +mr_dpmert_map_SOURCES = mert_geometry.cc ces.cc error_surface.cc mr_dpmert_map.cc line_optimizer.cc +mr_dpmert_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +mr_dpmert_reduce_SOURCES = error_surface.cc ces.cc mr_dpmert_reduce.cc line_optimizer.cc mert_geometry.cc +mr_dpmert_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +lo_test_SOURCES = lo_test.cc ces.cc mert_geometry.cc error_surface.cc line_optimizer.cc +lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz + +AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dpmert/README.shared-mem b/dpmert/README.shared-mem new file mode 100644 index 00000000..7728efc0 --- /dev/null +++ b/dpmert/README.shared-mem @@ -0,0 +1,9 @@ +If you want to run dist-vest.pl on a very large shared memory machine, do the +following: + + ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini + +This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the +decoder must load grammars, language models, etc., J should be smaller than I, but this will depend +on the system you are running on and the complexity of the models used for decoding. 
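+ As a purely illustrative example (the job counts below are hypothetical and should be tuned to the machine), a 64-core host might use I=48 line-search jobs and J=12 decoder jobs: + + ./dist-vest.pl --use-make 48 --decode-nodes 12 --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini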
+ diff --git a/dpmert/cat.pl b/dpmert/cat.pl new file mode 100755 index 00000000..2ecba3f9 --- /dev/null +++ b/dpmert/cat.pl @@ -0,0 +1,4 @@ +#!/usr/bin/perl + +$|=1; +print while(<>); diff --git a/dpmert/ces.cc b/dpmert/ces.cc new file mode 100644 index 00000000..a85454da --- /dev/null +++ b/dpmert/ces.cc @@ -0,0 +1,91 @@ +#include "ces.h" + +#include +#include +#include + +// TODO, if AER is to be optimized again, we will need this +// #include "aligner.h" +#include "lattice.h" +#include "mert_geometry.h" +#include "error_surface.h" +#include "ns.h" + +using boost::shared_ptr; +using namespace std; + +const bool minimize_segments = true; // if adjacent segments have equal scores, merge them + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& ve, + ErrorSurface* env, + const EvaluationMetric* metric, + const Hypergraph& hg) { + vector prev_trans; + const vector >& ienv = ve.GetSortedSegs(); + env->resize(ienv.size()); + SufficientStats prev_score; // defaults to 0 + int j = 0; + for (int i = 0; i < ienv.size(); ++i) { + const MERTPoint& seg = *ienv[i]; + vector trans; +#if 0 + if (type == AER) { + vector edges(hg.edges_.size(), false); + seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi + // alignment + ostringstream os; + const string* psrc = ss.GetSource(); + if (psrc == NULL) { + cerr << "AER scoring in VEST requires source, but it is missing!\n"; + abort(); + } + size_t pos = psrc->rfind(" ||| "); + if (pos == string::npos) { + cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; + abort(); + } + Lattice src; + Lattice ref; + LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); + LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); + AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); + string tstr = os.str(); + TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); + } else { +#endif + seg.ConstructTranslation(&trans); + //} + //cerr << "Scoring: " << TD::GetString(trans) << endl; + if (trans == prev_trans) { + if (!minimize_segments) { + ErrorSegment& out = (*env)[j]; + out.delta.fields.clear(); + out.x = seg.x; + ++j; + } + //cerr << "Identical translation, skipping scoring\n"; + } else { + SufficientStats score; + ss.Evaluate(trans, &score); + // cerr << "score= " << score->ComputeScore() << "\n"; + //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; + const SufficientStats delta = score - prev_score; + //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; + //string xx; delta.Encode(&xx); cerr << xx << endl; + prev_trans.swap(trans); + prev_score = score; + if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { + ErrorSegment& out = (*env)[j]; + out.delta = delta; + out.x = seg.x; + ++j; + } + } + } + // cerr << " In segments: " << ienv.size() << endl; + // cerr << "Out segments: " << j << endl; + assert(j > 0); + env->resize(j); +} + diff --git a/dpmert/ces.h b/dpmert/ces.h new file mode 100644 index 00000000..e4fa2080 --- /dev/null +++ b/dpmert/ces.h @@ -0,0 +1,16 @@ +#ifndef _CES_H_ +#define _CES_H_ + +class ConvexHull; +class Hypergraph; +class SegmentEvaluator; +class ErrorSurface; +class EvaluationMetric; + +void ComputeErrorSurface(const SegmentEvaluator& ss, + const ConvexHull& convex_hull, + ErrorSurface* es, + const EvaluationMetric* metric, + const Hypergraph& hg); + +#endif diff --git a/dpmert/dpmert.pl b/dpmert/dpmert.pl new file mode 100755 index 00000000..52ce0fc0 --- /dev/null +++ b/dpmert/dpmert.pl @@ -0,0 +1,700 @@ 
+#!/usr/bin/env perl +use strict; +my @ORIG_ARGV=@ARGV; +use Cwd qw(getcwd); +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } + +# Skip local config (used for distributing jobs) if we're running in local-only mode +use LocalConfig; +use Getopt::Long; +use IPC::Open2; +use POSIX ":sys_wait_h"; +my $QSUB_CMD = qsub_args(mert_memory()); + +require "libcall.pl"; + +# Default settings +my $srcFile; +my $refFiles; +my $default_jobs = env_default_jobs(); +my $bin_dir = $SCRIPT_DIR; +die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; +my $FAST_SCORE="$bin_dir/../mteval/fast_score"; +die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; +my $MAPINPUT = "$bin_dir/mr_dpmert_generate_mapper_input"; +my $MAPPER = "$bin_dir/mr_dpmert_map"; +my $REDUCER = "$bin_dir/mr_dpmert_reduce"; +my $parallelize = "$bin_dir/parallelize.pl"; +my $libcall = "$bin_dir/libcall.pl"; +my $sentserver = "$bin_dir/sentserver"; +my $sentclient = "$bin_dir/sentclient"; +my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; + +my $SCORER = $FAST_SCORE; +die "Can't find $MAPPER" unless -x $MAPPER; +my $cdec = "$bin_dir/../decoder/cdec"; +die "Can't find decoder in $cdec" unless -x $cdec; +die "Can't find $parallelize" unless -x $parallelize; +die "Can't find $libcall" unless -e $libcall; +my $decoder = $cdec; +my $lines_per_mapper = 400; +my $rand_directions = 15; +my $iteration = 1; +my $best_weights; +my $max_iterations = 15; +my $optimization_iters = 6; +my $jobs = $default_jobs; # number of decode nodes +my $pmem = "9g"; +my $disable_clean = 0; +my %seen_weights; +my $normalize; +my $help = 0; +my $epsilon = 0.0001; +my $interval = 5; +my $dryrun = 0; +my $last_score = -10000000; +my $metric = "ibm_bleu"; +my $dir; +my $iniFile; +my $weights; +my $initialWeights; +my $decoderOpt; +my $noprimary; +my $maxsim=0; +my $oraclen=0; +my $oracleb=20; +my $bleu_weight=1; +my $use_make = 1; # use make to parallelize line search +my $useqsub; +my $pass_suffix = ''; +my $cpbin=1; +# Process command-line options +Getopt::Long::Configure("no_auto_abbrev"); +if (GetOptions( + "decoder=s" => \$decoderOpt, + "jobs=i" => \$jobs, + "dont-clean" => \$disable_clean, + "pass-suffix=s" => \$pass_suffix, + "dry-run" => \$dryrun, + "epsilon=s" => \$epsilon, + "help" => \$help, + "interval" => \$interval, + "qsub" => \$useqsub, + "max-iterations=i" => \$max_iterations, + "normalize=s" => \$normalize, + "pmem=s" => \$pmem, + "cpbin!" => \$cpbin, + "random-directions=i" => \$rand_directions, + "ref-files=s" => \$refFiles, + "metric=s" => \$metric, + "source-file=s" => \$srcFile, + "weights=s" => \$initialWeights, + "workdir=s" => \$dir, + "opt-iterations=i" => \$optimization_iters, +) == 0 || @ARGV!=1 || $help) { + print_help(); + exit; +} + +if ($useqsub) { + $use_make = 0; + die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); +} + +my @missing_args = (); +if (!defined $srcFile) { push @missing_args, "--source-file"; } +if (!defined $refFiles) { push @missing_args, "--ref-files"; } +if (!defined $initialWeights) { push @missing_args, "--weights"; } +die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\n" if (@missing_args); + +if ($metric =~ /^(combi|ter)$/i) { + $lines_per_mapper = 40; +} elsif ($metric =~ /^meteor$/i) { + $lines_per_mapper = 2000; # start up time is really high +} + +($iniFile) = @ARGV; + + +sub write_config; +sub enseg; +sub print_help; + +my $nodelist; +my $host =check_output("hostname"); chomp $host; +my $bleu; +my $interval_count = 0; +my $logfile; +my $projected_score; + +# used in sorting scores +my $DIR_FLAG = '-r'; +if ($metric =~ /^ter$|^aer$/i) { + $DIR_FLAG = ''; +} + +my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); + +unless ($dir){ + $dir = "dpmert"; +} +unless ($dir =~ /^\//){ # convert relative path to absolute path + my $basedir = check_output("pwd"); + chomp $basedir; + $dir = "$basedir/$dir"; +} + +if ($decoderOpt){ $decoder = $decoderOpt; } + + +# Initializations and helper functions +srand; + +my @childpids = (); +my @cleanupcmds = (); + +sub cleanup { + print STDERR "Cleanup...\n"; + for my $pid (@childpids){ unchecked_call("kill $pid"); } + for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } + exit 1; +}; +# Always call cleanup, no matter how we exit +*CORE::GLOBAL::exit = + sub{ cleanup(); }; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + +my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; +my $newIniFile = "$dir/$decoderBase.ini"; +my $inputFileName = "$dir/input"; +my $user = $ENV{"USER"}; + + +# process ini file +-e $iniFile || die "Error: could not open $iniFile for reading\n"; +open(INI, $iniFile); + +use File::Basename qw(basename); +#pass bindir, refs to vars holding bin +sub modbin { + local $_; + my $bindir=shift; + check_call("mkdir -p $bindir"); + -d $bindir || die "couldn't make bindir $bindir"; + for (@_) { + my $src=$$_; + $$_="$bindir/".basename($src); + check_call("cp -p $src $$_"); + } +} +sub dirsize { + opendir ISEMPTY,$_[0]; + return scalar(readdir(ISEMPTY))-1; +} +if ($dryrun){ + write_config(*STDERR); + exit 0; +} else { + if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-dpmert.pl outputs + die "ERROR: working dir $dir already exists\n\n"; + } else { + -e $dir || mkdir $dir; + mkdir "$dir/hgs"; + modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; + mkdir "$dir/scripts"; + my $cmdfile="$dir/rerun-dpmert.sh"; + open CMD,'>',$cmdfile; + print CMD "cd ",&getcwd,"\n"; +# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
+ my $cline=&cmdline."\n"; + print CMD $cline; + close CMD; + print STDERR $cline; + chmod(0755,$cmdfile); + unless (-e $initialWeights) { + print STDERR "Please specify an initial weights file with --initial-weights\n"; + print_help(); + exit; + } + check_call("cp $initialWeights $dir/weights.0"); + die "Can't find weights.0" unless (-e "$dir/weights.0"); + } + write_config(*STDERR); +} + + +# Generate initial files and values +check_call("cp $iniFile $newIniFile"); +$iniFile = $newIniFile; + +my $newsrc = "$dir/dev.input"; +enseg($srcFile, $newsrc); +$srcFile = $newsrc; +my $devSize = 0; +open F, "<$srcFile" or die "Can't read $srcFile: $!"; +while() { $devSize++; } +close F; + +unless($best_weights){ $best_weights = $weights; } +unless($projected_score){ $projected_score = 0.0; } +$seen_weights{$weights} = 1; + +my $random_seed = int(time / 1000); +my $lastWeightsFile; +my $lastPScore = 0; +# main optimization loop +while (1){ + print STDERR "\n\nITERATION $iteration\n==========\n"; + + if ($iteration > $max_iterations){ + print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; + last; + } + # iteration-specific files + my $runFile="$dir/run.raw.$iteration"; + my $onebestFile="$dir/1best.$iteration"; + my $logdir="$dir/logs.$iteration"; + my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; + my $scorerLog="$logdir/scorer.log.$iteration"; + check_call("mkdir -p $logdir"); + + + #decode + print STDERR "RUNNING DECODER AT "; + print STDERR unchecked_output("date"); + my $im1 = $iteration - 1; + my $weightsFile="$dir/weights.$im1"; + my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; + my $pcmd; + if ($use_make) { + $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; + } else { + $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; + } + my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + my $num_hgs; + my $num_topbest; + my $retries = 0; + while($retries < 5) { + $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); + $num_topbest = check_output("wc -l < $runFile"); + print STDERR "NUMBER OF HGs: $num_hgs\n"; + print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; + if($devSize == $num_hgs && $devSize == $num_topbest) { + last; + } else { + print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; + sleep(3); + } + $retries++; + } + die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); + my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); + chomp $dec_score; + print STDERR "DECODER SCORE: $dec_score\n"; + + # save space + check_call("gzip -f $runFile"); + check_call("gzip -f $decoderLog"); + + # run optimizer + print STDERR "RUNNING OPTIMIZER AT "; + print STDERR unchecked_output("date"); + my $mergeLog="$logdir/prune-merge.log.$iteration"; + + my $score = 0; + my $icc = 0; + my $inweights="$dir/weights.$im1"; + for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { + print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; + print STDERR unchecked_output("date"); + $icc++; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + check_call("mkdir -p $dir/splag.$im1"); + $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; + print STDERR "COMMAND:\n$cmd\n"; + check_call($cmd); + opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; + my @shards = grep { /^mapinput\./ } readdir(DIR); + closedir DIR; + die "No shards!" unless scalar @shards > 0; + my $joblist = ""; + my $nmappers = 0; + my @mapoutputs = (); + @cleanupcmds = (); + my %o2i = (); + my $first_shard = 1; + my $mkfile; # only used with makefiles + my $mkfilename; + if ($use_make) { + $mkfilename = "$dir/splag.$im1/domap.mk"; + open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; + print $mkfile "all: $dir/splag.$im1/map.done\n\n"; + } + my @mkouts = (); # only used with makefiles + for my $shard (@shards) { + my $mapoutput = $shard; + my $client_name = $shard; + $client_name =~ s/mapinput.//; + $client_name = "dpmert.$client_name"; + $mapoutput =~ s/mapinput/mapoutput/; + push @mapoutputs, "$dir/splag.$im1/$mapoutput"; + $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; + if ($use_make) { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "#!/bin/bash\n"; + print F "$script\n"; + close F; + my $output = "$dir/splag.$im1/$mapoutput"; + push @mkouts, $output; + chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; + } else { + my $script_file = "$dir/scripts/map.$shard"; + open F, ">$script_file" or die "Can't write $script_file: $!"; + print F "$script\n"; + close F; + if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } + + $nmappers++; + my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; + my $jobid = check_output("$qcmd"); + chomp $jobid; + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + push(@cleanupcmds, "qdel $jobid 2> /dev/null"); + print STDERR " $jobid"; + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + } + } + if ($use_make) { + print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; + close $mkfile; + my $mcmd = "make -j $jobs -f $mkfilename"; + print STDERR "\nExecuting: $mcmd\n"; + check_call($mcmd); + } else { + print STDERR "\nLaunched $nmappers mappers.\n"; + sleep 8; + print STDERR "Waiting for mappers to complete...\n"; + while ($nmappers > 0) { + sleep 5; + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); + $nmappers = scalar @livejobs; + } + print STDERR "All mappers complete.\n"; + } + my $tol = 0; + my $til = 0; + for my $mo (@mapoutputs) { + my $olines = get_lines($mo); + my $ilines = get_lines($o2i{$mo}); + $tol += $olines; + $til += $ilines; + die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; + } + print STDERR "Results for $tol/$til lines\n"; + print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; + print STDERR unchecked_output("date"); + $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; + print STDERR "COMMAND:\n$cmd\n"; + check_bash_call($cmd); + $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; + # sort returns failure even when it doesn't fail for some reason + my $best=unchecked_output("$cmd"); chomp $best; + print STDERR "$best\n"; + my ($oa, $x, $xscore) = split /\|/, $best; + $score = $xscore; + print STDERR "PROJECTED SCORE: $score\n"; + if (abs($x) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; + last; + } + my $psd = $score - $last_score; + $last_score = $score; + if (abs($psd) < $epsilon) { + print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; + last; + } + my ($origin, $axis) = split /\s+/, $oa; + + my %ori = convert($origin); + my %axi = convert($axis); + + my $finalFile="$dir/weights.$im1-$opt_iter"; + open W, ">$finalFile" or die "Can't write: $finalFile: $!"; + my $norm = 0; + for my $k (sort keys %ori) { + my $dd = $ori{$k} + $axi{$k} * $x; + $norm += $dd * $dd; + } + $norm = sqrt($norm); + $norm = 1; + for my $k (sort keys %ori) { + my $v = ($ori{$k} + $axi{$k} * $x) / $norm; + print W "$k $v\n"; + } + check_call("rm $dir/splag.$im1/*"); + $inweights = $finalFile; + } + $lastWeightsFile = "$dir/weights.$iteration"; + check_call("cp $inweights $lastWeightsFile"); + if ($icc < 2) { + print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; + last; + } + $lastPScore = $score; + $iteration++; + print STDERR "\n==========\n"; +} + +print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; + +print STDOUT "$lastWeightsFile\n"; + +exit 0; + +sub normalize_weights { + my ($rfn, $rpts, $feat) = @_; + my @feat_names = @$rfn; + my @pts = @$rpts; + my $z = 1.0; + for (my $i=0; $i < scalar @feat_names; $i++) { + if ($feat_names[$i] eq $feat) { + $z = $pts[$i]; + last; + } + } + for (my $i=0; $i < scalar @feat_names; $i++) { + $pts[$i] /= $z; + } + print STDERR " NORM WEIGHTS: @pts\n"; + return @pts; +} + +sub get_lines { + my $fn = shift @_; + open FL, "<$fn" or die "Couldn't read $fn: $!"; + my $lc = 0; + while() { $lc++; } + return $lc; +} + +sub get_comma_sep_refs { + my ($r,$p) = @_; + my $o = check_output("echo $p"); + chomp $o; + my @files = split /\s+/, $o; + return "-$r " . 
join(" -$r ", @files); +} + +sub read_weights_file { + my ($file) = @_; + open F, "<$file" or die "Couldn't read $file: $!"; + my @r = (); + my $pm = -1; + while() { + next if /^#/; + next if /^\s*$/; + chomp; + if (/^(.+)\s+(.+)$/) { + my $m = $1; + my $w = $2; + die "Weights out of order: $m <= $pm" unless $m > $pm; + push @r, $w; + } else { + warn "Unexpected feature name in weight file: $_"; + } + } + close F; + return join ' ', @r; +} + +# subs +sub write_config { + my $fh = shift; + my $cleanup = "yes"; + if ($disable_clean) {$cleanup = "no";} + + print $fh "\n"; + print $fh "DECODER: $decoder\n"; + print $fh "INI FILE: $iniFile\n"; + print $fh "WORKING DIR: $dir\n"; + print $fh "SOURCE (DEV): $srcFile\n"; + print $fh "REFS (DEV): $refFiles\n"; + print $fh "EVAL METRIC: $metric\n"; + print $fh "START ITERATION: $iteration\n"; + print $fh "MAX ITERATIONS: $max_iterations\n"; + print $fh "PARALLEL JOBS: $jobs\n"; + print $fh "HEAD NODE: $host\n"; + print $fh "PMEM (DECODING): $pmem\n"; + print $fh "CLEANUP: $cleanup\n"; + print $fh "INITIAL WEIGHTS: $initialWeights\n"; +} + +sub update_weights_file { + my ($neww, $rfn, $rpts) = @_; + my @feats = @$rfn; + my @pts = @$rpts; + my $num_feats = scalar @feats; + my $num_pts = scalar @pts; + die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; + open G, ">$neww" or die; + for (my $i = 0; $i < $num_feats; $i++) { + my $f = $feats[$i]; + my $lambda = $pts[$i]; + print G "$f $lambda\n"; + } + close G; +} + +sub enseg { + my $src = shift; + my $newsrc = shift; + open(SRC, $src); + open(NEWSRC, ">$newsrc"); + my $i=0; + while (my $line=){ + chomp $line; + if ($line =~ /^\s* tags, you must include a zero-based id attribute"; + } + } else { + print NEWSRC "$line\n"; + } + $i++; + } + close SRC; + close NEWSRC; +} + +sub print_help { + + my $executable = check_output("basename $0"); chomp $executable; + print << "Help"; + +Usage: $executable [options] + + $executable [options] + Runs a complete MERT optimization using the decoder configuration + in . Required options are --weights, --source-file, and + --ref-files. + +Options: + + --help + Print this message and exit. + + --max-iterations + Maximum number of iterations to run. If not specified, defaults + to 10. + + --pass-suffix + If the decoder is doing multi-pass decoding, the pass suffix "2", + "3", etc., is used to control what iteration of weights is set. + + --ref-files + Dev set ref files. This option takes only a single string argument. + To use multiple files (including file globbing), this argument should + be quoted. + + --metric + Metric to optimize. + Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi + + --normalize + After each iteration, rescale all feature weights such that feature- + name has a weight of 1.0. + + --rand-directions + MERT will attempt to optimize along all of the principle directions, + set this parameter to explore other directions. Defaults to 5. + + --source-file + Dev set source file. + + --weights + A file specifying initial feature weights. The format is + FeatureName_1 value1 + FeatureName_2 value2 + **All and only the weights listed in will be optimized!** + + --workdir + Directory for intermediate and output files. If not specified, the + name is derived from the ini filename. Assuming that the ini + filename begins with the decoder name and ends with ini, the default + name of the working directory is inferred from the middle part of + the filename. E.g. 
an ini file named decoder.foo.ini would have + a default working directory name foo. + +Job control options: + + --jobs + Number of decoder processes to run in parallel. [default=$default_jobs] + + --qsub + Use qsub to run jobs in parallel (qsub must be configured in + environment/LocalEnvironment.pm) + + --pmem + Amount of physical memory requested for parallel decoding jobs + (used with qsub requests only) + +Help +} + +sub convert { + my ($str) = @_; + my @ps = split /;/, $str; + my %dict = (); + for my $p (@ps) { + my ($k, $v) = split /=/, $p; + $dict{$k} = $v; + } + return %dict; +} + + + +sub cmdline { + return join ' ',($0,@ORIG_ARGV); +} + +#buggy: last arg gets quoted sometimes? +my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; +my $shell_escape_in_quote=qr{[\\"\$`!]}; + +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} + +sub escaped_shell_args { + return map {local $_=$_;chomp;escape_shell($_)} @_; +} + +sub escaped_shell_args_str { + return join ' ',&escaped_shell_args(@_); +} + +sub escaped_cmdline { + return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); +} diff --git a/dpmert/error_surface.cc b/dpmert/error_surface.cc new file mode 100644 index 00000000..515b67f8 --- /dev/null +++ b/dpmert/error_surface.cc @@ -0,0 +1,42 @@ +#include "error_surface.h" + +#include +#include + +using namespace std; + +ErrorSurface::~ErrorSurface() {} + +void ErrorSurface::Serialize(std::string* out) const { + const int segments = this->size(); + ostringstream os(ios::binary); + os.write((const char*)&segments,sizeof(segments)); + for (int i = 0; i < segments; ++i) { + const ErrorSegment& cur = (*this)[i]; + string senc; + cur.delta.Encode(&senc); + assert(senc.size() < 1024); + unsigned char len = senc.size(); + os.write((const char*)&cur.x, sizeof(cur.x)); + os.write((const char*)&len, sizeof(len)); + os.write((const char*)&senc[0], len); + } + *out = os.str(); +} + +void ErrorSurface::Deserialize(const std::string& in) { + istringstream is(in, ios::binary); + int segments; + is.read((char*)&segments, sizeof(segments)); + this->resize(segments); + for (int i = 0; i < segments; ++i) { + ErrorSegment& cur = (*this)[i]; + unsigned char len; + is.read((char*)&cur.x, sizeof(cur.x)); + is.read((char*)&len, sizeof(len)); + string senc(len, '\0'); assert(senc.size() == len); + is.read((char*)&senc[0], len); + cur.delta = SufficientStats(senc); + } +} + diff --git a/dpmert/error_surface.h b/dpmert/error_surface.h new file mode 100644 index 00000000..bb65847b --- /dev/null +++ b/dpmert/error_surface.h @@ -0,0 +1,24 @@ +#ifndef _ERROR_SURFACE_H_ +#define _ERROR_SURFACE_H_ + +#include +#include + +#include "ns.h" + +class Score; + +struct ErrorSegment { + double x; + SufficientStats delta; + ErrorSegment() : x(0), delta() {} +}; + +class ErrorSurface : public std::vector { + public: + ~ErrorSurface(); + void Serialize(std::string* out) const; + void Deserialize(const std::string& in); +}; + +#endif diff --git a/dpmert/libcall.pl b/dpmert/libcall.pl new file mode 100644 index 00000000..c7d0f128 --- /dev/null +++ b/dpmert/libcall.pl @@ -0,0 +1,71 @@ +use IPC::Open3; +use Symbol qw(gensym); + +$DUMMY_STDERR = gensym(); +$DUMMY_STDIN = gensym(); + +# Run the command and ignore failures +sub unchecked_call { + system("@_") +} + +# Run the command and return its output, if any ignoring failures +sub unchecked_output { + return `@_` +} + +# WARNING: Do not use this 
for commands that will return large amounts +# of stdout or stderr -- they might block indefinitely +sub check_output { + print STDERR "Executing and gathering output: @_\n"; + + my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); + my $proc_output = ""; + while( ) { + $proc_output .= $_; + } + waitpid($pid, 0); + # TODO: Grab signal that the process died from + my $child_exit_status = $? >> 8; + if($child_exit_status == 0) { + return $proc_output; + } else { + print STDERR "ERROR: Execution of @_ failed.\n"; + exit(1); + } +} + +# Based on Moses' safesystem sub +sub check_call { + print STDERR "Executing: @_\n"; + system(@_); + my $exitcode = $? >> 8; + if($exitcode == 0) { + return 0; + } elsif ($? == -1) { + print STDERR "ERROR: Failed to execute: @_\n $!\n"; + exit(1); + + } elsif ($? & 127) { + printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", + ($? & 127), ($? & 128) ? 'with' : 'without'; + exit(1); + + } else { + print STDERR "Failed with exit code: $exitcode\n" if $exitcode; + exit($exitcode); + } +} + +sub check_bash_call { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + check_call(@args); +} + +sub check_bash_output { + my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); + return check_output(@args); +} + +# perl module weirdness... +return 1; diff --git a/dpmert/line_mediator.pl b/dpmert/line_mediator.pl new file mode 100755 index 00000000..bc2bb24c --- /dev/null +++ b/dpmert/line_mediator.pl @@ -0,0 +1,116 @@ +#!/usr/bin/perl -w +#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication + +# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) + +#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. + +use strict; +use IPC::Open2; +use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); + +my $quiet=!$ENV{DEBUG}; +$quiet=1 if $ENV{QUIET}; +sub info { + local $,=' '; + print STDERR @_ unless $quiet; +} + +my $mode='CROSS'; +my $ser='DIRECT'; +$mode='PIPE' if $ENV{PIPE}; +$mode='SNAKE' if $ENV{SNAKE}; +$mode='CROSS' if $ENV{CROSS}; +$ser='SERIAL' if $ENV{SERIAL}; +$ser='DIRECT' if $ENV{DIRECT}; +$ser='SERIAL' if $mode eq 'SNAKE'; +info("mode: $mode\n"); +info("connection: $ser\n"); + + +my @c1; +if (scalar @ARGV) { + do { + push @c1,shift + } while scalar @ARGV && $c1[$#c1] ne '--'; +} +pop @c1; +my @c2=@ARGV; +@ARGV=(); +(scalar @c1 && scalar @c2) || die qq{ +usage: $0 cmd1 args -- cmd2 args +all options are environment variables. +DEBUG=1 env var enables debugging output. +CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). +SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. +if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. +if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. +DIRECT=1 (default) will override SERIAL=1. 
+CROSS=1 (default) will override SNAKE or PIPE. +}; + +info("1 cmd:",@c1,"\n"); +info("2 cmd:",@c2,"\n"); + +sub lineto { + select $_[0]; + $|=1; + shift; + print @_; +} + +if ($ser eq 'SERIAL') { + my ($R1,$W1,$R2,$W2); + my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. + my $c2p=open2($R2,$W2,@c2); + if ($mode eq 'CROSS') { + while(<$R1>) { + info("1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("1|2:",$_); + lineto($W1,$_); + } + } else { + my $snake=$mode eq 'SNAKE'; + while() { + info("IN:",$_); + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1:",$_); + lineto($W2,$_); + last unless defined ($_=<$R2>); + info("IN|1|2:",$_); + if ($snake) { + lineto($W1,$_); + last unless defined ($_=<$R1>); + info("IN|1|2|1:",$_); + } + lineto(*STDOUT,$_); + } + } +} else { + info("DIRECT mode\n"); + my @rw1=POSIX::pipe(); + my @rw2=POSIX::pipe(); + my $pid=undef; + $SIG{CHLD} = sub { wait }; + while (not defined ($pid=fork())) { + sleep 1; + } + my $pipe = $mode eq 'PIPE'; + unless ($pipe) { + POSIX::close(STDOUT_FILENO); + POSIX::close(STDIN_FILENO); + } + if ($pid) { + POSIX::dup2($rw1[1],STDOUT_FILENO); + POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; + exec @c1; + } else { + POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; + POSIX::dup2($rw1[0],STDIN_FILENO); + exec @c2; + } + while (wait()!=-1) {} +} diff --git a/dpmert/line_optimizer.cc b/dpmert/line_optimizer.cc new file mode 100644 index 00000000..49443fbe --- /dev/null +++ b/dpmert/line_optimizer.cc @@ -0,0 +1,111 @@ +#include "line_optimizer.h" + +#include +#include + +#include "sparse_vector.h" +#include "ns.h" + +using namespace std; + +typedef ErrorSurface::const_iterator ErrorIter; + +// sort by increasing x-ints +struct IntervalComp { + bool operator() (const ErrorIter& a, const ErrorIter& b) const { + return a->x < b->x; + } +}; + +double LineOptimizer::LineOptimize( + const EvaluationMetric* metric, + const vector& surfaces, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon) { + // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl; + vector all_ints; + for (vector::const_iterator i = surfaces.begin(); + i != surfaces.end(); ++i) { + const ErrorSurface& surface = *i; + for (ErrorIter j = surface.begin(); j != surface.end(); ++j) + all_ints.push_back(j); + } + sort(all_ints.begin(), all_ints.end(), IntervalComp()); + double last_boundary = all_ints.front()->x; + SufficientStats acc; + float& cur_best_score = *best_score; + cur_best_score = (type == MAXIMIZE_SCORE ? 
+ -numeric_limits::max() : numeric_limits::max()); + bool left_edge = true; + double pos = numeric_limits::quiet_NaN(); + for (vector::iterator i = all_ints.begin(); + i != all_ints.end(); ++i) { + const ErrorSegment& seg = **i; + if (seg.x - last_boundary > epsilon) { + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = seg.x - 0.1; + left_edge = false; + } else { + pos = last_boundary + (seg.x - last_boundary) / 2; + } + //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; + } + // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; + // cerr << "---- s=" << sco << "\n"; + last_boundary = seg.x; + } + // cerr << "x-boundary=" << seg.x << "\n"; + //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; + //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; + acc += seg.delta; + } + float sco = metric->ComputeScore(acc); + if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || + (type == MINIMIZE_SCORE && sco < cur_best_score) ) { + cur_best_score = sco; + if (left_edge) { + pos = 0; + } else { + pos = last_boundary + 1000.0; + } + } + return pos; +} + +void LineOptimizer::RandomUnitVector(const vector& features_to_optimize, + SparseVector* axis, + RandomNumberGenerator* rng) { + axis->clear(); + for (int i = 0; i < features_to_optimize.size(); ++i) + axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); + (*axis) /= axis->l2norm(); +} + +void LineOptimizer::CreateOptimizationDirections( + const vector& features_to_optimize, + int additional_random_directions, + RandomNumberGenerator* rng, + vector >* dirs + , bool include_orthogonal + ) { + dirs->clear(); + typedef SparseVector Dir; + vector &out=*dirs; + int i=0; + if (include_orthogonal) + for (;i + +#include "sparse_vector.h" +#include "error_surface.h" +#include "sampler.h" + +class EvaluationMetric; +class Weights; + +struct LineOptimizer { + + // use MINIMIZE_SCORE for things like TER, WER + // MAXIMIZE_SCORE for things like BLEU + enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; + + // merge all the error surfaces together into a global + // error surface and find (the middle of) the best segment + static double LineOptimize( + const EvaluationMetric* metric, + const std::vector& envs, + const LineOptimizer::ScoreType type, + float* best_score, + const double epsilon = 1.0/65536.0); + + // return a random vector of length 1 where all dimensions + // not listed in dimensions will be 0. + static void RandomUnitVector(const std::vector& dimensions, + SparseVector* axis, + RandomNumberGenerator* rng); + + // generate a list of directions to optimize; the list will + // contain the orthogonal vectors corresponding to the dimensions in + // primary and then additional_random_directions directions in those + // dimensions as well. All vectors will be length 1. 
+ static void CreateOptimizationDirections( + const std::vector& primary, + int additional_random_directions, + RandomNumberGenerator* rng, + std::vector >* dirs + , bool include_primary=true + ); + +}; + +#endif diff --git a/dpmert/lo_test.cc b/dpmert/lo_test.cc new file mode 100644 index 00000000..d9b909b8 --- /dev/null +++ b/dpmert/lo_test.cc @@ -0,0 +1,236 @@ +#include +#include +#include + +#include +#include + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "fdict.h" +#include "hg.h" +#include "kbest.h" +#include "hg_io.h" +#include "filelib.h" +#include "inside_outside.h" +#include "viterbi.h" +#include "mert_geometry.h" +#include "line_optimizer.h" + +using namespace std; +using boost::shared_ptr; + +class OptTest : public testing::Test { + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +const char* ref11 = "australia reopens embassy in manila"; +const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; +const char* ref21 = "australia reopened manila embassy"; +const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; +const char* ref31 = "australia to reopen embassy in manila"; +const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; +const char* ref41 = "australia to re - open its embassy to manila"; +const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; + +TEST_F(OptTest, TestCheckNaN) { + double x = 0; + double y = 0; + double z = x / y; + EXPECT_EQ(true, isnan(z)); +} + +TEST_F(OptTest,TestConvexHull) { + shared_ptr a1(new MERTPoint(-1, 0)); + shared_ptr b1(new MERTPoint(1, 0)); + shared_ptr a2(new MERTPoint(-1, 1)); + shared_ptr b2(new MERTPoint(1, -1)); + vector > sa; sa.push_back(a1); sa.push_back(b1); + vector > sb; sb.push_back(a2); sb.push_back(b2); + ConvexHull a(sa); + cerr << a << endl; + ConvexHull b(sb); + ConvexHull c = a; + c *= b; + cerr << a << " (*) " << b << " = " << c << endl; + EXPECT_EQ(3, c.size()); +} + +TEST_F(OptTest,TestConvexHullInside) { + const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z 
[1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector wts; + wts.set_value(FD::Convert("f1"), 0.4); + wts.set_value(FD::Convert("f2"), 1.0); + hg.Reweight(wts); + vector, prob_t> > list; + std::vector > features; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + SparseVector dir; dir.set_value(FD::Convert("f1"), 1.0); + ConvexHullWeightFunction wf(wts, dir); + ConvexHull env = Inside(hg, NULL, wf); + cerr << env << endl; + const vector >& segs = env.GetSortedSegs(); + dir *= segs[1]->x; + wts += dir; + hg.Reweight(wts); + KBest::KBestDerivations, ESentenceTraversal> kbest2(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest2.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + for (int i = 0; i < segs.size(); ++i) { + cerr << "seg=" << i << endl; + vector trans; + segs[i]->ConstructTranslation(&trans); + cerr << TD::GetString(trans) << endl; + } +} + +TEST_F(OptTest, TestS1) { + int fPhraseModel_0 = FD::Convert("PhraseModel_0"); + int fPhraseModel_1 = FD::Convert("PhraseModel_1"); + int fPhraseModel_2 = FD::Convert("PhraseModel_2"); + int fLanguageModel = FD::Convert("LanguageModel"); + int fWordPenalty = FD::Convert("WordPenalty"); + int fPassThrough = FD::Convert("PassThrough"); + SparseVector wts; + wts.set_value(fWordPenalty, 4.25); + wts.set_value(fLanguageModel, -1.1165); + wts.set_value(fPhraseModel_0, -0.96); + wts.set_value(fPhraseModel_1, -0.65); + wts.set_value(fPhraseModel_2, -0.77); + wts.set_value(fPassThrough, -10.0); + + vector to_optimize; + to_optimize.push_back(fWordPenalty); + to_optimize.push_back(fLanguageModel); + to_optimize.push_back(fPhraseModel_0); + to_optimize.push_back(fPhraseModel_1); + to_optimize.push_back(fPhraseModel_2); + + Hypergraph hg; + ReadFile rf("./test_data/0.json.gz"); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + hg.Reweight(wts); + + Hypergraph hg2; + ReadFile rf2("./test_data/1.json.gz"); + HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); + hg2.Reweight(wts); + + vector > refs1(4); + TD::ConvertSentence(ref11, &refs1[0]); + TD::ConvertSentence(ref21, &refs1[1]); + TD::ConvertSentence(ref31, &refs1[2]); + TD::ConvertSentence(ref41, &refs1[3]); + vector > refs2(4); + TD::ConvertSentence(ref12, &refs2[0]); + TD::ConvertSentence(ref22, &refs2[1]); + 
TD::ConvertSentence(ref32, &refs2[2]); + TD::ConvertSentence(ref42, &refs2[3]); + vector envs(2); + + RandomNumberGenerator rng; + + vector > axes; // directions to search + LineOptimizer::CreateOptimizationDirections( + to_optimize, + 10, + &rng, + &axes); + assert(axes.size() == 10 + to_optimize.size()); + for (int i = 0; i < axes.size(); ++i) + cerr << axes[i] << endl; + const SparseVector& axis = axes[0]; + + cerr << "Computing Viterbi envelope using inside algorithm...\n"; + cerr << "axis: " << axis << endl; + clock_t t_start=clock(); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + envs[0] = Inside(hg, NULL, wf); + envs[1] = Inside(hg2, NULL, wf); + + vector es(2); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); + boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); + ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); + cerr << envs[0].size() << " " << envs[1].size() << endl; + cerr << es[0].size() << " " << es[1].size() << endl; + envs.clear(); + clock_t t_env=clock(); + float score; + double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); + clock_t t_opt=clock(); + cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; + EXPECT_FLOAT_EQ(0.48719698, score); + SparseVector res = axis; + res *= m; + res += wts; + cerr << "res: " << res << endl; + cerr << "ENVELOPE PROCESSING=" << (static_cast(t_env - t_start) / 1000.0) << endl; + cerr << " LINE OPTIMIZATION=" << (static_cast(t_opt - t_env) / 1000.0) << endl; + hg.Reweight(res); + hg2.Reweight(res); + vector t1,t2; + ViterbiESentence(hg, &t1); + ViterbiESentence(hg2, &t2); + cerr << TD::GetString(t1) << endl; + cerr << TD::GetString(t2) << endl; +} + +TEST_F(OptTest,TestZeroOrigin) { + const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| 
[1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; + Hypergraph hg; + istringstream instr(json); + HypergraphIO::ReadFromJSON(&instr, &hg); + SparseVector wts; + wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); + hg.Reweight(wts); + + vector, prob_t> > list; + std::vector > features; + KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); + for (int i = 0; i < 10; ++i) { + const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = + kbest.LazyKthBest(hg.nodes_.size() - 1, i); + if (!d) break; + cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; + } + + SparseVector axis; axis.set_value(FD::Convert("Glue"),1.0); + ConvexHullWeightFunction wf(wts, axis); // wts = starting point, axis = search direction + vector envs(1); + envs[0] = Inside(hg, NULL, wf); + + vector > mr(4); + TD::ConvertSentence("untitled", &mr[0]); + TD::ConvertSentence("with no title", &mr[1]); + TD::ConvertSentence("without a title", &mr[2]); + TD::ConvertSentence("without title", &mr[3]); + EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); + boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); + vector es(1); + ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); +} + +int main(int argc, char **argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/dpmert/mert_geometry.cc b/dpmert/mert_geometry.cc new file mode 100644 index 00000000..81b25af9 --- /dev/null +++ b/dpmert/mert_geometry.cc @@ -0,0 +1,186 @@ +#include "mert_geometry.h" + +#include +#include + +using namespace std; +using boost::shared_ptr; + +ConvexHull::ConvexHull(int i) { + if (i == 0) { + // do nothing - <> + } else if (i == 1) { + points.push_back(shared_ptr(new MERTPoint(0, 0, 0, shared_ptr(), shared_ptr()))); + assert(this->IsMultiplicativeIdentity()); + } else { + cerr << 
"Only can create ConvexHull semiring 0 and 1 with this constructor!\n"; + abort(); + } +} + +const ConvexHull ConvexHullWeightFunction::operator()(const Hypergraph::Edge& e) const { + const double m = direction.dot(e.feature_values_); + const double b = origin.dot(e.feature_values_); + MERTPoint* point = new MERTPoint(m, b, e); + return ConvexHull(1, point); +} + +ostream& operator<<(ostream& os, const ConvexHull& env) { + os << '<'; + const vector >& points = env.GetSortedSegs(); + for (int i = 0; i < points.size(); ++i) + os << (i==0 ? "" : "|") << "x=" << points[i]->x << ",b=" << points[i]->b << ",m=" << points[i]->m << ",p1=" << points[i]->p1 << ",p2=" << points[i]->p2; + return os << '>'; +} + +#define ORIGINAL_MERT_IMPLEMENTATION 1 +#ifdef ORIGINAL_MERT_IMPLEMENTATION + +struct SlopeCompare { + bool operator() (const shared_ptr& a, const shared_ptr& b) const { + return a->m < b->m; + } +}; + +const ConvexHull& ConvexHull::operator+=(const ConvexHull& other) { + if (!other.is_sorted) other.Sort(); + if (points.empty()) { + points = other.points; + return *this; + } + is_sorted = false; + int j = points.size(); + points.resize(points.size() + other.points.size()); + for (int i = 0; i < other.points.size(); ++i) + points[j++] = other.points[i]; + assert(j == points.size()); + return *this; +} + +void ConvexHull::Sort() const { + sort(points.begin(), points.end(), SlopeCompare()); + const int k = points.size(); + int j = 0; + for (int i = 0; i < k; ++i) { + MERTPoint l = *points[i]; + l.x = kMinusInfinity; + // cerr << "m=" << l.m << endl; + if (0 < j) { + if (points[j-1]->m == l.m) { // lines are parallel + if (l.b <= points[j-1]->b) continue; + --j; + } + while(0 < j) { + l.x = (l.b - points[j-1]->b) / (points[j-1]->m - l.m); + if (points[j-1]->x < l.x) break; + --j; + } + if (0 == j) l.x = kMinusInfinity; + } + *points[j++] = l; + } + points.resize(j); + is_sorted = true; +} + +const ConvexHull& ConvexHull::operator*=(const ConvexHull& other) { + if (other.IsMultiplicativeIdentity()) { return *this; } + if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } + + if (!is_sorted) Sort(); + if (!other.is_sorted) other.Sort(); + + if (this->IsEdgeEnvelope()) { +// if (other.size() > 1) +// cerr << *this << " (TIMES) " << other << endl; + shared_ptr edge_parent = points[0]; + const double& edge_b = edge_parent->b; + const double& edge_m = edge_parent->m; + points.clear(); + for (int i = 0; i < other.points.size(); ++i) { + const MERTPoint& p = *other.points[i]; + const double m = p.m + edge_m; + const double b = p.b + edge_b; + const double& x = p.x; // x's don't change with * + points.push_back(shared_ptr(new MERTPoint(x, m, b, edge_parent, other.points[i]))); + assert(points.back()->p1->edge); + } +// if (other.size() > 1) +// cerr << " = " << *this << endl; + } else { + vector > new_points; + int this_i = 0; + int other_i = 0; + const int this_size = points.size(); + const int other_size = other.points.size(); + double cur_x = kMinusInfinity; // moves from left to right across the + // real numbers, stopping for all inter- + // sections + double this_next_val = (1 < this_size ? points[1]->x : kPlusInfinity); + double other_next_val = (1 < other_size ? 
other.points[1]->x : kPlusInfinity); + while (this_i < this_size && other_i < other_size) { + const MERTPoint& this_point = *points[this_i]; + const MERTPoint& other_point= *other.points[other_i]; + const double m = this_point.m + other_point.m; + const double b = this_point.b + other_point.b; + + new_points.push_back(shared_ptr(new MERTPoint(cur_x, m, b, points[this_i], other.points[other_i]))); + int comp = 0; + if (this_next_val < other_next_val) comp = -1; else + if (this_next_val > other_next_val) comp = 1; + if (0 == comp) { // the next values are equal, advance both indices + ++this_i; + ++other_i; + cur_x = this_next_val; // could be other_next_val (they're equal!) + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } else { // advance the i with the lower x, update cur_x + if (-1 == comp) { + ++this_i; + cur_x = this_next_val; + this_next_val = (this_i+1 < this_size ? points[this_i+1]->x : kPlusInfinity); + } else { + ++other_i; + cur_x = other_next_val; + other_next_val = (other_i+1 < other_size ? other.points[other_i+1]->x : kPlusInfinity); + } + } + } + points.swap(new_points); + } + //cerr << "Multiply: result=" << (*this) << endl; + return *this; +} + +// recursively construct translation +void MERTPoint::ConstructTranslation(vector* trans) const { + const MERTPoint* cur = this; + vector > ant_trans; + while(!cur->edge) { + ant_trans.resize(ant_trans.size() + 1); + cur->p2->ConstructTranslation(&ant_trans.back()); + cur = cur->p1.get(); + } + size_t ant_size = ant_trans.size(); + vector*> pants(ant_size); + assert(ant_size == cur->edge->tail_nodes_.size()); + --ant_size; + for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; + cur->edge->rule_->ESubstitute(pants, trans); +} + +void MERTPoint::CollectEdgesUsed(std::vector* edges_used) const { + if (edge) { + assert(edge->id_ < edges_used->size()); + (*edges_used)[edge->id_] = true; + } + if (p1) p1->CollectEdgesUsed(edges_used); + if (p2) p2->CollectEdgesUsed(edges_used); +} + +#else + +// THIS IS THE NEW FASTER IMPLEMENTATION OF THE MERT SEMIRING OPERATIONS + +#endif + diff --git a/dpmert/mert_geometry.h b/dpmert/mert_geometry.h new file mode 100644 index 00000000..a8b6959e --- /dev/null +++ b/dpmert/mert_geometry.h @@ -0,0 +1,81 @@ +#ifndef _MERT_GEOMETRY_H_ +#define _MERT_GEOMETRY_H_ + +#include +#include +#include + +#include "hg.h" +#include "sparse_vector.h" + +static const double kMinusInfinity = -std::numeric_limits::infinity(); +static const double kPlusInfinity = std::numeric_limits::infinity(); + +struct MERTPoint { + MERTPoint() : x(), m(), b(), edge() {} + MERTPoint(double _m, double _b) : + x(kMinusInfinity), m(_m), b(_b), edge() {} + MERTPoint(double _x, double _m, double _b, const boost::shared_ptr& p1_, const boost::shared_ptr& p2_) : + x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} + MERTPoint(double _m, double _b, const Hypergraph::Edge& edge) : + x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} + + double x; // x intersection with previous segment in env, or -inf if none + double m; // this line's slope + double b; // intercept with y-axis + + // we keep a pointer to the "parents" of this segment so we can reconstruct + // the Viterbi translation corresponding to this segment + boost::shared_ptr p1; + boost::shared_ptr p2; + + // only MERTPoints created from an edge using the ConvexHullWeightFunction + // have rules + // TRulePtr rule; + const Hypergraph::Edge* 
edge; + + // recursively recover the Viterbi translation that will result from setting + // the weights to origin + axis * x, where x is any value from this->x up + // until the next largest x in the containing ConvexHull + void ConstructTranslation(std::vector* trans) const; + void CollectEdgesUsed(std::vector* edges_used) const; +}; + +// this is the semiring value type, +// it defines constructors for 0, 1, and the operations + and * +struct ConvexHull { + // create semiring zero + ConvexHull() : is_sorted(true) {} // zero + // for debugging: + ConvexHull(const std::vector >& s) : points(s) { Sort(); } + // create semiring 1 or 0 + explicit ConvexHull(int i); + ConvexHull(int n, MERTPoint* point) : is_sorted(true), points(n, boost::shared_ptr(point)) {} + const ConvexHull& operator+=(const ConvexHull& other); + const ConvexHull& operator*=(const ConvexHull& other); + bool IsMultiplicativeIdentity() const { + return size() == 1 && (points[0]->b == 0.0 && points[0]->m == 0.0) && (!points[0]->edge) && (!points[0]->p1) && (!points[0]->p2); } + const std::vector >& GetSortedSegs() const { + if (!is_sorted) Sort(); + return points; + } + size_t size() const { return points.size(); } + + private: + bool IsEdgeEnvelope() const { + return points.size() == 1 && points[0]->edge; } + void Sort() const; + mutable bool is_sorted; + mutable std::vector > points; +}; +std::ostream& operator<<(std::ostream& os, const ConvexHull& env); + +struct ConvexHullWeightFunction { + ConvexHullWeightFunction(const SparseVector& ori, + const SparseVector& dir) : origin(ori), direction(dir) {} + const ConvexHull operator()(const Hypergraph::Edge& e) const; + const SparseVector origin; + const SparseVector direction; +}; + +#endif diff --git a/dpmert/mr_dpmert_generate_mapper_input.cc b/dpmert/mr_dpmert_generate_mapper_input.cc new file mode 100644 index 00000000..59d4f24f --- /dev/null +++ b/dpmert/mr_dpmert_generate_mapper_input.cc @@ -0,0 +1,78 @@ +#include +#include + +#include +#include + +#include "filelib.h" +#include "weights.h" +#include "line_optimizer.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + RandomNumberGenerator rng; + po::variables_map conf; + 
InitCommandLine(argc, argv, &conf); + vector features; + SparseVector origin; + vector w; + Weights::InitFromFile(conf["weights"].as(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as >(); + vector > directions; + vector fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; + } + } + return 0; +} diff --git a/dpmert/mr_dpmert_map.cc b/dpmert/mr_dpmert_map.cc new file mode 100644 index 00000000..f3304f0f --- /dev/null +++ b/dpmert/mr_dpmert_map.cc @@ -0,0 +1,112 @@ +#include +#include +#include +#include + +#include +#include + +#include "ns.h" +#include "ns_docscorer.h" +#include "ces.h" +#include "filelib.h" +#include "stringlib.h" +#include "sparse_vector.h" +#include "mert_geometry.h" +#include "inside_outside.h" +#include "error_surface.h" +#include "b64tools.h" +#include "hg_io.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") + ("source,s",po::value(), "Source file (ignored, except for AER)") + ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") + ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (!conf->count("reference")) { + cerr << "Please specify one or more references using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +bool ReadSparseVectorString(const string& s, SparseVector* v) { +#if 0 + // this should work, but untested. 
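+  // For reference, each input line handed to this mapper (written by
+  // mr_dpmert_generate_mapper_input above) looks roughly like
+  //   <forest_repository>/<sent_id>.json.gz <sent_id> Glue=0.5;WordPenalty=-1 Glue=1;WordPenalty=0
+  // i.e. forest path, sentence id, origin vector, and search direction. Each
+  // sparse vector is encoded as ';'-separated name=value pairs (the feature names
+  // here are illustrative only); ReadSparseVectorString parses that encoding.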
 + std::istringstream i(s); + i>>*v; +#else + vector fields; + Tokenize(s, ';', &fields); + if (fields.empty()) return false; + for (int i = 0; i < fields.size(); ++i) { + vector pair(2); + Tokenize(fields[i], '=', &pair); + if (pair.size() != 2) { + cerr << "Error parsing vector string: " << fields[i] << endl; + return false; + } + v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); + } + return true; +#endif +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; + Hypergraph hg; + string last_file; + ReadFile in_read(conf["input"].as()); + istream &in=*in_read.stream(); + while(in) { + string line; + getline(in, line); + if (line.empty()) continue; + istringstream is(line); + int sent_id; + string file, s_origin, s_direction; + // path-to-file (JSON) sent_id starting-point search-direction + is >> file >> sent_id >> s_origin >> s_direction; + SparseVector origin; + ReadSparseVectorString(s_origin, &origin); + SparseVector direction; + ReadSparseVectorString(s_direction, &direction); + // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; + if (last_file != file) { + last_file = file; + ReadFile rf(file); + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + } + const ConvexHullWeightFunction wf(origin, direction); + const ConvexHull hull = Inside(hg, NULL, wf); + + ErrorSurface es; + ComputeErrorSurface(*ds[sent_id], hull, &es, metric, hg); + //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; + // cerr << "Error surface has " << es.size() << " segments\n"; + string val; + es.Serialize(&val); + cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; + B64::b64encode(val.c_str(), val.size(), &cout); + cout << endl << flush; + } + return 0; +} diff --git a/dpmert/mr_dpmert_reduce.cc b/dpmert/mr_dpmert_reduce.cc new file mode 100644 index 00000000..dda61f88 --- /dev/null +++ b/dpmert/mr_dpmert_reduce.cc @@ -0,0 +1,77 @@ +#include +#include +#include +#include + +#include +#include + +#include "sparse_vector.h" +#include "error_surface.h" +#include "line_optimizer.h" +#include "b64tools.h" +#include "stringlib.h" + +using namespace std; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = conf->count("evaluation_metric") == 0; + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + const string evaluation_metric = conf["evaluation_metric"].as(); + LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; + if (UppercaseString(evaluation_metric) == "TER") + opt_type = LineOptimizer::MINIMIZE_SCORE; + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + + vector esv; + string last_key, line, key, val; + while(getline(cin, line)) { + size_t ks = 
line.find("\t"); + assert(string::npos != ks); + assert(ks > 2); + key = line.substr(2, ks - 2); + val = line.substr(ks + 1); + if (key != last_key) { + if (!last_key.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + last_key.swap(key); + esv.clear(); + } + if (val.size() % 4 != 0) { + cerr << "B64 encoding error 1! Skipping.\n"; + continue; + } + string encoded(val.size() / 4 * 3, '\0'); + if (!B64::b64decode(reinterpret_cast(&val[0]), val.size(), &encoded[0], encoded.size())) { + cerr << "B64 encoding error 2! Skipping.\n"; + continue; + } + esv.push_back(ErrorSurface()); + esv.back().Deserialize(encoded); + } + if (!esv.empty()) { + float score; + double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); + cout << last_key << "|" << x << "|" << score << endl; + } + return 0; +} diff --git a/dpmert/parallelize.pl b/dpmert/parallelize.pl new file mode 100755 index 00000000..7d0365cc --- /dev/null +++ b/dpmert/parallelize.pl @@ -0,0 +1,423 @@ +#!/usr/bin/env perl + +# Author: Adam Lopez +# +# This script takes a command that processes input +# from stdin one-line-at-time, and parallelizes it +# on the cluster using David Chiang's sentserver/ +# sentclient architecture. +# +# Prerequisites: the command *must* read each line +# without waiting for subsequent lines of input +# (for instance, a command which must read all lines +# of input before processing will not work) and +# return it to the output *without* buffering +# multiple lines. + +#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder + +#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } +use LocalConfig; + +use Cwd qw/ abs_path cwd getcwd /; +use File::Temp qw/ tempfile /; +use Getopt::Long; +use IPC::Open2; +use strict; +use POSIX ":sys_wait_h"; + +use File::Basename; +my $myDir = dirname(__FILE__); +print STDERR __FILE__." -> $myDir\n"; +push(@INC, $myDir); +require "libcall.pl"; + +my $tailn=5; # +0 = concatenate all the client logs. 5 = last 5 lines +my $recycle_clients; # spawn new clients when previous ones terminate +my $stay_alive; # dont let server die when having zero clients +my $joblist = ""; +my $errordir=""; +my $multiline; +my @files_to_stage; +my $numnodes = 8; +my $user = $ENV{"USER"}; +my $pmem = "9g"; +my $basep=50300; +my $randp=300; +my $tryp=50; +my $no_which; +my $no_cd; + +my $DEBUG=$ENV{DEBUG}; +print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; +my $verbose = 1; +sub verbose { + if ($verbose) { + print STDERR @_,"\n"; + } +} +sub debug { + if ($DEBUG) { + my ($package, $filename, $line) = caller; + print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; + } +} +my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; +my $shell_escape_in_quote=qr.[\\"\$`!].; +sub escape_shell { + my ($arg)=@_; + return undef unless defined $arg; + return '""' unless $arg; + if ($arg =~ /$is_shell_special/) { + $arg =~ s/($shell_escape_in_quote)/\\$1/g; + return "\"$arg\""; + } + return $arg; +} +sub preview_files { + my ($l,$skipempty,$footer,$n)=@_; + $n=$tailn unless defined $n; + my @f=grep { ! 
($skipempty && -z $_) } @$l; + my $fn=join(' ',map {escape_shell($_)} @f); + my $cmd="tail -n $n $fn"; + unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); +} +sub prefix_dirname($) { + #like `dirname but if ends in / then return the whole thing + local ($_)=@_; + if (/\/$/) { + $_; + } else { + s#/[^/]$##; + $_ ? $_ : ''; + } +} +sub ensure_final_slash($) { + local ($_)=@_; + m#/$# ? $_ : ($_."/"); +} +sub extend_path($$;$$) { + my ($base,$ext,$mkdir,$baseisdir)=@_; + if (-d $base) { + $base.="/"; + } else { + my $dir; + if ($baseisdir) { + $dir=$base; + $base.='/' unless $base =~ /\/$/; + } else { + $dir=prefix_dirname($base); + } + my @cmd=("/bin/mkdir","-p",$dir); + check_call(@cmd) if $mkdir; + } + return $base.$ext; +} + +my $abscwd=abs_path(&getcwd); +sub print_help; + +my $use_fork; +my @pids; + +# Process command-line options +unless (GetOptions( + "stay-alive" => \$stay_alive, + "recycle-clients" => \$recycle_clients, + "error-dir=s" => \$errordir, + "multi-line" => \$multiline, + "file=s" => \@files_to_stage, + "use-fork" => \$use_fork, + "verbose" => \$verbose, + "jobs=i" => \$numnodes, + "pmem=s" => \$pmem, + "baseport=i" => \$basep, +# "iport=i" => \$randp, #for short name -i + "no-which!" => \$no_which, + "no-cd!" => \$no_cd, + "tailn=s" => \$tailn, +) && scalar @ARGV){ + print_help(); + die "bad options."; +} + +my $cmd = ""; +my $prog=shift; +if ($no_which) { + $cmd=$prog; +} else { + $cmd=check_output("which $prog"); + chomp $cmd; + die "$prog not found - $cmd" unless $cmd; +} +#$cmd=abs_path($cmd); +for my $arg (@ARGV) { + $cmd .= " ".escape_shell($arg); +} +die "Please specify a command to parallelize\n" if $cmd eq ''; + +my $cdcmd=$no_cd ? '' : ("cd ".escape_shell($abscwd)."\n"); + +my $executable = $cmd; +$executable =~ s/^\s*(\S+)($|\s.*)/$1/; +$executable=check_output("basename $executable"); +chomp $executable; + + +print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; + +# create -e dir and save .sh +use File::Temp qw/tempdir/; +unless ($errordir) { + $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); +} +if ($errordir) { + my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); + -d $errordir || die "should have created -e dir $errordir"; + open SF,">",$scriptfile || die; + print SF "$cdcmd$cmd\n"; + close SF; + chmod 0755,$scriptfile; + $errordir=abs_path($errordir); + &verbose("-e dir: $errordir"); +} + +# set cleanup handler +my @cleanup_cmds; +sub cleanup; +sub cleanup_and_die; +$SIG{INT} = "cleanup_and_die"; +$SIG{TERM} = "cleanup_and_die"; +$SIG{HUP} = "cleanup_and_die"; + +# other subs: +sub numof_live_jobs; +sub launch_job_on_node; + + +# vars +my $mydir = check_output("dirname $0"); chomp $mydir; +my $sentserver = "$mydir/sentserver"; +my $sentclient = "$mydir/sentclient"; +my $host = check_output("hostname"); +chomp $host; + + +# find open port +srand; +my $port = 50300+int(rand($randp)); +my $endp=$port+$tryp; +sub listening_port_lines { + my $quiet=$verbose?'':'2>/dev/null'; + return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); +} +my $netstat=&listening_port_lines; + +if ($verbose){ print STDERR "Testing port $port...";} + +while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ + if ($verbose){ print STDERR "port is busy\n";} + $port++; + if ($port > $endp){ + die "Unable to find open port\n"; + } + if ($verbose){ print STDERR "Testing port $port... 
"; } +} +if ($verbose){ + print STDERR "port $port is available\n"; +} + +my $key = int(rand()*1000000); + +my $multiflag = ""; +if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } +my $stay_alive_flag = ""; +if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } + +my $node_count = 0; +my $script = ""; +# fork == one thread runs the sentserver, while the +# other spawns the sentclient commands. +my $pid = fork; +if ($pid == 0) { # child + sleep 8; # give other thread time to start sentserver + $script = "$cdcmd$sentclient $host:$port:$key $cmd"; + + if ($verbose){ + print STDERR "Client script:\n====\n"; + print STDERR $script; + print STDERR "====\n"; + } + for (my $jobn=0; $jobn<$numnodes; $jobn++){ + launch_job(); + } + if ($recycle_clients) { + my $ret; + my $livejobs; + while (1) { + $ret = waitpid($pid, WNOHANG); + #print STDERR "waitpid $pid ret = $ret \n"; + last if ($ret != 0); + $livejobs = numof_live_jobs(); + if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j + print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; + launch_job(); + } else { + sleep 15; + } + } + } + print STDERR "CHILD PROCESSES SPAWNED ... WAITING\n"; + for my $p (@pids) { + waitpid($p, 0); + } +} else { +# my $todo = "$sentserver -k $key $multiflag $port "; + my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; + if ($verbose){ print STDERR "Running: $todo\n"; } + check_call($todo); + print STDERR "Call to $sentserver returned.\n"; + cleanup(); + exit(0); +} + +sub numof_live_jobs { + if ($use_fork) { + die "not implemented"; + } else { + # We can probably continue decoding if the qstat error is only temporary + my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); + return ($#livejobs + 1); + } +} +my (@errors,@outs,@cmds); + +sub launch_job { + if ($use_fork) { return launch_job_fork(); } + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; + push @cmds,$todo; + + print STDERR "Running: $todo\n"; + local(*QOUT, *QIN); + open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; + print QIN $script; + close QIN; + while (my $jobid=){ + chomp $jobid; + if ($verbose){ print STDERR "Launched client job: $jobid"; } + $jobid =~ s/^(\d+)(.*?)$/\1/g; + $jobid =~ s/^Your job (\d+) .*$/\1/; + print STDERR " short job id $jobid\n"; + if ($verbose){ + print STDERR "cd: $abscwd\n"; + print STDERR "cmd: $cmd\n"; + } + if ($joblist == "") { $joblist = $jobid; } + else {$joblist = $joblist . "\|" . 
$jobid; } + my $cleanfn="qdel $jobid 2> /dev/null"; + push(@cleanup_cmds, $cleanfn); + } + close QOUT; +} + +sub launch_job_fork { + my $errorfile = "/dev/null"; + my $outfile = "/dev/null"; + $node_count++; + my $clientname = $executable; + $clientname =~ s/^(.{4}).*$/$1/; + $clientname = "$clientname.$node_count"; + if ($errordir){ + $errorfile = "$errordir/$clientname.ER"; + $outfile = "$errordir/$clientname.OU"; + push @errors,$errorfile; + push @outs,$outfile; + } + my $pid = fork; + if ($pid == 0) { + my ($fh, $scr_name) = get_temp_script(); + print $fh $script; + close $fh; + my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; + print STDERR "EXEC: $todo\n"; + my $out = check_output("$todo"); + unlink $scr_name or warn "Failed to remove $scr_name"; + exit 0; + } else { + push @pids, $pid; + } +} + +sub get_temp_script { + my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh'); + return ($fh, $filename); +} + +sub cleanup_and_die { + cleanup(); + die "\n"; +} + +sub cleanup { + print STDERR "Cleaning up...\n"; + for $cmd (@cleanup_cmds){ + print STDERR " Cleanup command: $cmd\n"; + eval $cmd; + } + print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; + print STDERR "errors:\n",preview_files(\@errors,1),"\n"; + print STDERR "cmd:\n",$cmd,"\n"; + print STDERR " cat $errordir/*.ER\nfor logs.\n"; + print STDERR "Cleanup finished.\n"; +} + +sub print_help +{ + my $name = check_output("basename $0"); chomp $name; + print << "Help"; + +usage: $name [options] + + Automatic black-box parallelization of commands. + +options: + + --use-fork + Instead of using qsub, use fork. + + -e, --error-dir + Retain output files from jobs in , rather + than silently deleting them. + + -m, --multi-line + Expect that command may produce multiple output + lines for a single input line. $name makes a + reasonable attempt to obtain all output before + processing additional inputs. However, use of this + option is inherently unsafe. + + -v, --verbose + Print diagnostic informatoin on stderr. + + -j, --jobs + Number of jobs to use. + + -p, --pmem + pmem setting for each job. + +Help +} diff --git a/dpmert/sentclient.c b/dpmert/sentclient.c new file mode 100644 index 00000000..91d994ab --- /dev/null +++ b/dpmert/sentclient.c @@ -0,0 +1,76 @@ +/* Copyright (c) 2001 by David Chiang. 
All rights reserved.*/ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sentserver.h" + +int main (int argc, char *argv[]) { + int sock, port; + char *s, *key; + struct hostent *hp; + struct sockaddr_in server; + int errors = 0; + + if (argc < 3) { + fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n"); + exit(1); + } + + s = strchr(argv[1], ':'); + key = NULL; + + if (s == NULL) { + port = DEFAULT_PORT; + } else { + *s = '\0'; + s+=1; + /* dumb hack */ + key = strchr(s, ':'); + if (key != NULL){ + *key = '\0'; + key += 1; + } + port = atoi(s); + } + + sock = socket(AF_INET, SOCK_STREAM, 0); + + hp = gethostbyname(argv[1]); + if (hp == NULL) { + fprintf(stderr, "unknown host %s\n", argv[1]); + exit(1); + } + + bzero((char *)&server, sizeof(server)); + bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); + server.sin_family = hp->h_addrtype; + server.sin_port = htons(port); + + while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { + perror("connect()"); + sleep(1); + errors++; + if (errors > 5) + exit(1); + } + + close(0); + close(1); + dup2(sock, 0); + dup2(sock, 1); + + if (key != NULL){ + write(1, key, strlen(key)); + write(1, "\n", 1); + } + + execvp(argv[2], argv+2); + return 0; +} diff --git a/dpmert/sentserver.c b/dpmert/sentserver.c new file mode 100644 index 00000000..c20b4fa6 --- /dev/null +++ b/dpmert/sentserver.c @@ -0,0 +1,515 @@ +/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sentserver.h" + +#define MAX_CLIENTS 64 + +struct clientinfo { + int s; + struct sockaddr_in sin; +}; + +struct line { + int id; + char *s; + int status; + struct line *next; +} *head, **ptail; + +int n_sent = 0, n_received=0, n_flushed=0; + +#define STATUS_RUNNING 0 +#define STATUS_ABORTED 1 +#define STATUS_FINISHED 2 + +pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER; +pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER; + +int n_clients = 0; +int s; +int expect_multiline_output = 0; +int log_mutex = 0; +int stay_alive = 0; /* dont panic and die with zero clients */ + +void queue_finish(struct line *node, char *s, int fid); +char * read_line(int fd, int multiline); +void done (int code); + +struct line * queue_get(int fid) { + struct line *cur; + char *s, *synch; + + if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid); + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + /* First, check for aborted sentences. */ + + if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid); + for (cur = head; cur != NULL; cur = cur->next) { + if (cur->status == STATUS_ABORTED) { + cur->status = STATUS_RUNNING; + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return cur; + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + /* Otherwise, read a new one. 
*/ + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + s = read_line(0,0); + + while (s) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + + cur = malloc(sizeof (struct line)); + cur->id = n_sent; + cur->s = s; + cur->next = NULL; + + *ptail = cur; + ptail = &cur->next; + + n_sent++; + + if (strcmp(s,"===SYNCH===\n")==0){ + fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid); + // Note: queue_finish calls free(cur->s). + // Therefore we need to create a new string here. + synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char)); + synch = strcpy(synch, s); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + queue_finish(cur, synch, fid); /* handles its own lock */ + + if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); + if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); + pthread_mutex_lock(&input_mutex); + + s = read_line(0,0); + } else { + if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid); + cur->status = STATUS_RUNNING; + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + return cur; + } + } + + if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); + pthread_mutex_unlock(&input_mutex); + /* Only way to reach this point: no more output */ + + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + if (head == NULL) { + fprintf(stderr, "Reached end of file. Exiting.\n"); + done(0); + } else + ptail = NULL; /* This serves as a signal that there is no more input */ + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + + return NULL; +} + +void queue_panic() { + struct line *next; + while (head && head->status == STATUS_FINISHED) { + /* Write out finished sentences */ + if (head->status == STATUS_FINISHED) { + fputs(head->s, stdout); + fflush(stdout); + } + /* Write out blank line for unfinished sentences */ + if (head->status == STATUS_ABORTED) { + fputs("\n", stdout); + fflush(stdout); + } + /* By defition, there cannot be any RUNNING sentences, since + function is only called when n_clients == 0 */ + free(head->s); + next = head->next; + free(head); + head = next; + n_flushed++; + } + fclose(stdout); + fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n"); + done(1); +} + +void queue_abort(struct line *node, int fid) { + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + node->status = STATUS_ABORTED; + if (n_clients == 0) { + if (stay_alive) { + fprintf(stderr, "Warning! No live clients detected! 
Staying alive, will retry soon.\n"); + } else { + queue_panic(); + } + } + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); +} + + +void queue_print() { + struct line *cur; + + fprintf(stderr, " Queue\n"); + + for (cur = head; cur != NULL; cur = cur->next) { + switch(cur->status) { + case STATUS_RUNNING: + fprintf(stderr, " %d running ", cur->id); break; + case STATUS_ABORTED: + fprintf(stderr, " %d aborted ", cur->id); break; + case STATUS_FINISHED: + fprintf(stderr, " %d finished ", cur->id); break; + + } + fprintf(stderr, "\n"); + //fprintf(stderr, cur->s); + } +} + +void queue_finish(struct line *node, char *s, int fid) { + struct line *next; + if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); + pthread_mutex_lock(&queue_mutex); + + free(node->s); + node->s = s; + node->status = STATUS_FINISHED; + n_received++; + + /* Flush out finished nodes */ + while (head && head->status == STATUS_FINISHED) { + + if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id); + + fputs(head->s, stdout); + fflush(stdout); + if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id); + free(head->s); + + next = head->next; + free(head); + + head = next; + + n_flushed++; + + if (head == NULL) { /* empty queue */ + if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */ + fprintf(stderr, "All sentences finished. Exiting.\n"); + done(0); + } else /* ptail pointed at something which was just popped off the stack -- reset to head*/ + ptail = &head; + } + } + + if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id); + fflush(stdout); + fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed); + + if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); + pthread_mutex_unlock(&queue_mutex); + +} + +char * read_line(int fd, int multiline) { + int size = 80; + char errorbuf[100]; + char *s = malloc(size+2); + int result, errors=0; + int i = 0; + + result = read(fd, s+i, 1); + + while (1) { + if (result < 0) { + perror("read()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + errors++; + if (errors > 5) { + free(s); + return NULL; + } else { + sleep(1); /* retry after delay */ + } + } else if (result == 0) { + break; + } else if (multiline==0 && s[i] == '\n') { + break; + } else { + if (s[i] == '\n'){ + /* if we've reached this point, + then multiline must be 1, and we're + going to poll the fd for an additional + line of data. The basic design is to + run a select on the filedescriptor fd. + Select will return under two conditions: + if there is data on the fd, or if a + timeout is reached. We'll select on this + fd. If select returns because there's data + ready, keep going; else assume there's no + more and return the data we already have. 
+ */ + + fd_set set; + FD_ZERO(&set); + FD_SET(fd, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + break; // no more data, stop looping + } + } + i++; + + if (i == size) { + size = size*2; + s = realloc(s, size+2); + } + } + + result = read(fd, s+i, 1); + } + + if (result == 0 && i == 0) { /* end of file */ + free(s); + return NULL; + } + + s[i] = '\n'; + s[i+1] = '\0'; + + return s; +} + +void * new_client(void *arg) { + struct clientinfo *client = (struct clientinfo *)arg; + struct line *cur; + int result; + char *s; + char errorbuf[100]; + + pthread_mutex_lock(&clients_mutex); + n_clients++; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client connected (%d connected)\n", n_clients); + + for (;;) { + + cur = queue_get(client->s); + + if (cur) { + /* fprintf(stderr, "Sending to client: %s", cur->s); */ + fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s); + result = write(client->s, cur->s, strlen(cur->s)); + if (result < strlen(cur->s)){ + perror("write()"); + sprintf(errorbuf, "Error code: %d\n", errno); + fprintf(stderr, errorbuf); + + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + } else { + close(client->s); + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + fprintf(stderr, "Client dismissed (%d connected)\n", n_clients); + pthread_exit(NULL); + } + + s = read_line(client->s,expect_multiline_output); + if (s) { + /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */ + fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id); +// queue_print(); + queue_finish(cur, s, client->s); + } else { + pthread_mutex_lock(&clients_mutex); + n_clients--; + pthread_mutex_unlock(&clients_mutex); + + fprintf(stderr, "Client died (%d connected)\n", n_clients); + queue_abort(cur, client->s); + + close(client->s); + free(client); + + pthread_exit(NULL); + } + + } + return 0; +} + +void done (int code) { + close(s); + exit(code); +} + + + +int main (int argc, char *argv[]) { + struct sockaddr_in sin, from; + int g; + socklen_t len; + struct clientinfo *client; + int port; + int opt; + int errors = 0; + int argi; + char *key = NULL, *client_key; + int use_key = 0; + /* the key stuff here doesn't provide any + real measure of security, it's mainly to keep + jobs from bumping into each other. 
*/ + + pthread_t tid; + port = DEFAULT_PORT; + + for (argi=1; argi < argc; argi++){ + if (strcmp(argv[argi], "-m")==0){ + expect_multiline_output = 1; + } else if (strcmp(argv[argi], "-k")==0){ + argi++; + if (argi == argc){ + fprintf(stderr, "Key must be specified after -k\n"); + exit(1); + } + key = argv[argi]; + use_key = 1; + } else if (strcmp(argv[argi], "--stay-alive")==0){ + stay_alive = 1; /* dont panic and die with zero clients */ + } else { + port = atoi(argv[argi]); + } + } + + /* Initialize data structures */ + head = NULL; + ptail = &head; + + /* Set up listener */ + s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + opt = 1; + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(port); + while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { + perror("bind()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + len = sizeof(sin); + getsockname(s, (struct sockaddr *) &sin, &len); + + fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port)); + + while (listen(s, MAX_CLIENTS) < 0) { + perror("listen()"); + sleep(1); + errors++; + if (errors > 100) + exit(1); + } + + for (;;) { + len = sizeof(from); + g = accept(s, (struct sockaddr *)&from, &len); + if (g < 0) { + perror("accept()"); + sleep(1); + continue; + } + client = malloc(sizeof(struct clientinfo)); + client->s = g; + bcopy(&from, &client->sin, len); + + if (use_key){ + fd_set set; + FD_ZERO(&set); + FD_SET(client->s, &set); + + struct timeval timeout; + timeout.tv_sec = 3; // number of seconds for timeout + timeout.tv_usec = 0; + + int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); + if (ready<1){ + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } else { + client_key = read_line(client->s,0); + client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */ + if (strcmp(key, client_key)==0){ + pthread_create(&tid, NULL, new_client, client); + } else { + fprintf(stderr, "Prospective client failed to respond with correct key.\n"); + close(client->s); + free(client); + } + free(client_key); + } + } else { + pthread_create(&tid, NULL, new_client, client); + } + } + +} + + + diff --git a/dpmert/sentserver.h b/dpmert/sentserver.h new file mode 100644 index 00000000..cd17a546 --- /dev/null +++ b/dpmert/sentserver.h @@ -0,0 +1,6 @@ +#ifndef SENTSERVER_H +#define SENTSERVER_H + +#define DEFAULT_PORT 50000 + +#endif diff --git a/dpmert/tac.pl b/dpmert/tac.pl new file mode 100755 index 00000000..9fb525c1 --- /dev/null +++ b/dpmert/tac.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl + +while(<>) { + chomp; + $|=1; + print (scalar reverse($_)); + print "\n"; +} diff --git a/dpmert/test_aer/README b/dpmert/test_aer/README new file mode 100644 index 00000000..819b2e32 --- /dev/null +++ b/dpmert/test_aer/README @@ -0,0 +1,8 @@ +To run the test: + +../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights + +This will optimize the parameters of the tiny lexical translation model +so as to minimize the AER of the Viterbi alignment on the development +set in corpus.src according to the reference alignments in ref.0. 
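+
+As a rough worked example (assuming the standard sure-links-only AER,
+AER = 1 - 2|A∩S| / (|A| + |S|)): the first reference alignment in ref.0 below is
+0-0 1-2 2-1 for "el gato negro ||| the black cat". If the Viterbi alignment were
+the monotone 0-0 1-1 2-2, only the link 0-0 would match, giving
+AER = 1 - 2*1/(3+3) ≈ 0.67; recovering the reference alignment exactly gives
+AER = 0, which is the behavior the optimizer is pushing toward.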
+ diff --git a/dpmert/test_aer/cdec.ini b/dpmert/test_aer/cdec.ini new file mode 100644 index 00000000..08187848 --- /dev/null +++ b/dpmert/test_aer/cdec.ini @@ -0,0 +1,3 @@ +formalism=lextrans +grammar=grammar +aligner=true diff --git a/dpmert/test_aer/corpus.src b/dpmert/test_aer/corpus.src new file mode 100644 index 00000000..31b23971 --- /dev/null +++ b/dpmert/test_aer/corpus.src @@ -0,0 +1,3 @@ +el gato negro ||| the black cat +el gato ||| the cat +el libro ||| the book diff --git a/dpmert/test_aer/grammar b/dpmert/test_aer/grammar new file mode 100644 index 00000000..9d857824 --- /dev/null +++ b/dpmert/test_aer/grammar @@ -0,0 +1,12 @@ +el ||| cat ||| F1=1 +el ||| the ||| F2=1 +el ||| black ||| F3=1 +el ||| book ||| F11=1 +gato ||| cat ||| F4=1 NN=1 +gato ||| black ||| F5=1 +gato ||| the ||| F6=1 +negro ||| the ||| F7=1 +negro ||| cat ||| F8=1 +negro ||| black ||| F9=1 +libro ||| the ||| F10=1 +libro ||| book ||| F12=1 NN=1 diff --git a/dpmert/test_aer/ref.0 b/dpmert/test_aer/ref.0 new file mode 100644 index 00000000..734a9c5b --- /dev/null +++ b/dpmert/test_aer/ref.0 @@ -0,0 +1,3 @@ +0-0 1-2 2-1 +0-0 1-1 +0-0 1-1 diff --git a/dpmert/test_aer/weights b/dpmert/test_aer/weights new file mode 100644 index 00000000..afc9282e --- /dev/null +++ b/dpmert/test_aer/weights @@ -0,0 +1,13 @@ +F1 0.1 +F2 -.5980815 +F3 0.24235 +F4 0.625 +F5 0.4514 +F6 0.112316 +F7 -0.123415 +F8 -0.25390285 +F9 -0.23852 +F10 0.646 +F11 0.413141 +F12 0.343216 +NN -0.1215 diff --git a/dpmert/test_data/0.json.gz b/dpmert/test_data/0.json.gz new file mode 100644 index 00000000..30f8dd77 Binary files /dev/null and b/dpmert/test_data/0.json.gz differ diff --git a/dpmert/test_data/1.json.gz b/dpmert/test_data/1.json.gz new file mode 100644 index 00000000..c82cc179 Binary files /dev/null and b/dpmert/test_data/1.json.gz differ diff --git a/dpmert/test_data/c2e.txt.0 b/dpmert/test_data/c2e.txt.0 new file mode 100644 index 00000000..12c4abe9 --- /dev/null +++ b/dpmert/test_data/c2e.txt.0 @@ -0,0 +1,2 @@ +australia reopens embassy in manila +( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.1 b/dpmert/test_data/c2e.txt.1 new file mode 100644 index 00000000..4ac12df1 --- /dev/null +++ b/dpmert/test_data/c2e.txt.1 @@ -0,0 +1,2 @@ +australia reopened manila embassy +( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/dpmert/test_data/c2e.txt.2 b/dpmert/test_data/c2e.txt.2 new file mode 100644 index 00000000..2f67b72f --- /dev/null +++ b/dpmert/test_data/c2e.txt.2 @@ -0,0 +1,2 @@ +australia to reopen embassy in manila +( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/dpmert/test_data/c2e.txt.3 b/dpmert/test_data/c2e.txt.3 new file mode 100644 index 00000000..5483cef6 --- /dev/null +++ b/dpmert/test_data/c2e.txt.3 @@ -0,0 +1,2 @@ +australia to re - open its embassy to manila +( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . 
diff --git a/dpmert/test_data/re.txt.0 b/dpmert/test_data/re.txt.0 new file mode 100644 index 00000000..86eff087 --- /dev/null +++ b/dpmert/test_data/re.txt.0 @@ -0,0 +1,5 @@ +erdogan states turkey to reject any pressures to urge it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . +erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . +we will discuss this dossier in the course of membership negotiations . " +he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.1 b/dpmert/test_data/re.txt.1 new file mode 100644 index 00000000..2140f198 --- /dev/null +++ b/dpmert/test_data/re.txt.1 @@ -0,0 +1,5 @@ +erdogan confirms turkey will resist any pressure to recognize cyprus +ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . +erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . +we shall discuss this issue in the course of the membership negotiations . " +he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/dpmert/test_data/re.txt.2 b/dpmert/test_data/re.txt.2 new file mode 100644 index 00000000..94e46286 --- /dev/null +++ b/dpmert/test_data/re.txt.2 @@ -0,0 +1,5 @@ +erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus +ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . +erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . +we shall discuss this dossier during the negotiations on joining . " +and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/dpmert/test_data/re.txt.3 b/dpmert/test_data/re.txt.3 new file mode 100644 index 00000000..f87c3308 --- /dev/null +++ b/dpmert/test_data/re.txt.3 @@ -0,0 +1,5 @@ +erdogan stresses that turkey will reject all pressures to force it to recognize cyprus +ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . 
+erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . +we will discuss this file during the negotiations on joining . " +he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/vest/Makefile.am b/vest/Makefile.am deleted file mode 100644 index 05fa5639..00000000 --- a/vest/Makefile.am +++ /dev/null @@ -1,35 +0,0 @@ -bin_PROGRAMS = \ - mr_vest_map \ - mr_vest_reduce \ - mr_vest_generate_mapper_input \ - sentserver \ - sentclient - -if HAVE_GTEST -noinst_PROGRAMS = \ - lo_test -TESTS = lo_test -endif - -sentserver_SOURCES = sentserver.c -sentserver_LDFLAGS = -all-static -pthread - -sentclient_SOURCES = sentclient.c -sentclient_LDFLAGS = -all-static -pthread - -mr_vest_generate_mapper_input_SOURCES = mr_vest_generate_mapper_input.cc line_optimizer.cc -mr_vest_generate_mapper_input_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -# nbest2hg_SOURCES = nbest2hg.cc -# nbest2hg_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lfst -lz - -mr_vest_map_SOURCES = viterbi_envelope.cc ces.cc error_surface.cc mr_vest_map.cc line_optimizer.cc -mr_vest_map_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -mr_vest_reduce_SOURCES = error_surface.cc ces.cc mr_vest_reduce.cc line_optimizer.cc viterbi_envelope.cc -mr_vest_reduce_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -lo_test_SOURCES = lo_test.cc ces.cc viterbi_envelope.cc error_surface.cc line_optimizer.cc -lo_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a -lz - -AM_CPPFLAGS = -W -Wall -Wno-sign-compare $(GTEST_CPPFLAGS) -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/vest/README.shared-mem b/vest/README.shared-mem deleted file mode 100644 index 7728efc0..00000000 --- a/vest/README.shared-mem +++ /dev/null @@ -1,9 +0,0 @@ -If you want to run dist-vest.pl on a very large shared memory machine, do the -following: - - ./dist-vest.pl --use-make I --decode-nodes J --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini - -This will use I jobs for doing the line search and J jobs to run the decoder. Typically, since the -decoder must load grammars, language models, etc., J should be smaller than I, but this will depend -on the system you are running on and the complexity of the models used for decoding. 
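(An illustrative invocation, with I=16 line-search jobs and J=4 decoder jobs; the counts are arbitrary and should be tuned to the machine: ./dist-vest.pl --use-make 16 --decode-nodes 4 --weights weights.init --source-file=dev.src --ref-files=dev.ref.* cdec.ini)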
- diff --git a/vest/cat.pl b/vest/cat.pl deleted file mode 100755 index 2ecba3f9..00000000 --- a/vest/cat.pl +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/perl - -$|=1; -print while(<>); diff --git a/vest/ces.cc b/vest/ces.cc deleted file mode 100644 index cd89aa69..00000000 --- a/vest/ces.cc +++ /dev/null @@ -1,91 +0,0 @@ -#include "ces.h" - -#include -#include -#include - -// TODO, if AER is to be optimized again, we will need this -// #include "aligner.h" -#include "lattice.h" -#include "viterbi_envelope.h" -#include "error_surface.h" -#include "ns.h" - -using boost::shared_ptr; -using namespace std; - -const bool minimize_segments = true; // if adjacent segments have equal scores, merge them - -void ComputeErrorSurface(const SegmentEvaluator& ss, - const ViterbiEnvelope& ve, - ErrorSurface* env, - const EvaluationMetric* metric, - const Hypergraph& hg) { - vector prev_trans; - const vector >& ienv = ve.GetSortedSegs(); - env->resize(ienv.size()); - SufficientStats prev_score; // defaults to 0 - int j = 0; - for (int i = 0; i < ienv.size(); ++i) { - const Segment& seg = *ienv[i]; - vector trans; -#if 0 - if (type == AER) { - vector edges(hg.edges_.size(), false); - seg.CollectEdgesUsed(&edges); // get the set of edges in the viterbi - // alignment - ostringstream os; - const string* psrc = ss.GetSource(); - if (psrc == NULL) { - cerr << "AER scoring in VEST requires source, but it is missing!\n"; - abort(); - } - size_t pos = psrc->rfind(" ||| "); - if (pos == string::npos) { - cerr << "Malformed source for AER: expected |||\nINPUT: " << *psrc << endl; - abort(); - } - Lattice src; - Lattice ref; - LatticeTools::ConvertTextOrPLF(psrc->substr(0, pos), &src); - LatticeTools::ConvertTextOrPLF(psrc->substr(pos + 5), &ref); - AlignerTools::WriteAlignment(src, ref, hg, &os, true, 0, &edges); - string tstr = os.str(); - TD::ConvertSentence(tstr.substr(tstr.rfind(" ||| ") + 5), &trans); - } else { -#endif - seg.ConstructTranslation(&trans); - //} - //cerr << "Scoring: " << TD::GetString(trans) << endl; - if (trans == prev_trans) { - if (!minimize_segments) { - ErrorSegment& out = (*env)[j]; - out.delta.fields.clear(); - out.x = seg.x; - ++j; - } - //cerr << "Identical translation, skipping scoring\n"; - } else { - SufficientStats score; - ss.Evaluate(trans, &score); - // cerr << "score= " << score->ComputeScore() << "\n"; - //string x1; score.Encode(&x1); cerr << "STATS: " << x1 << endl; - const SufficientStats delta = score - prev_score; - //string x2; delta.Encode(&x2); cerr << "DELTA: " << x2 << endl; - //string xx; delta.Encode(&xx); cerr << xx << endl; - prev_trans.swap(trans); - prev_score = score; - if ((!minimize_segments) || (!delta.IsAdditiveIdentity())) { - ErrorSegment& out = (*env)[j]; - out.delta = delta; - out.x = seg.x; - ++j; - } - } - } - // cerr << " In segments: " << ienv.size() << endl; - // cerr << "Out segments: " << j << endl; - assert(j > 0); - env->resize(j); -} - diff --git a/vest/ces.h b/vest/ces.h deleted file mode 100644 index e021e715..00000000 --- a/vest/ces.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef _CES_H_ -#define _CES_H_ - -class ViterbiEnvelope; -class Hypergraph; -class SegmentEvaluator; -class ErrorSurface; -class EvaluationMetric; - -void ComputeErrorSurface(const SegmentEvaluator& ss, - const ViterbiEnvelope& ve, - ErrorSurface* es, - const EvaluationMetric* metric, - const Hypergraph& hg); - -#endif diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl deleted file mode 100755 index 1ec8c6b1..00000000 --- a/vest/dist-vest.pl +++ /dev/null @@ -1,700 +0,0 @@ 
-#!/usr/bin/env perl -use strict; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } - -# Skip local config (used for distributing jobs) if we're running in local-only mode -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; -my $QSUB_CMD = qsub_args(mert_memory()); - -require "libcall.pl"; - -# Default settings -my $srcFile; -my $refFiles; -my $default_jobs = env_default_jobs(); -my $bin_dir = $SCRIPT_DIR; -die "Bin directory $bin_dir missing/inaccessible" unless -d $bin_dir; -my $FAST_SCORE="$bin_dir/../mteval/fast_score"; -die "Can't execute $FAST_SCORE" unless -x $FAST_SCORE; -my $MAPINPUT = "$bin_dir/mr_vest_generate_mapper_input"; -my $MAPPER = "$bin_dir/mr_vest_map"; -my $REDUCER = "$bin_dir/mr_vest_reduce"; -my $parallelize = "$bin_dir/parallelize.pl"; -my $libcall = "$bin_dir/libcall.pl"; -my $sentserver = "$bin_dir/sentserver"; -my $sentclient = "$bin_dir/sentclient"; -my $LocalConfig = "$SCRIPT_DIR/../environment/LocalConfig.pm"; - -my $SCORER = $FAST_SCORE; -die "Can't find $MAPPER" unless -x $MAPPER; -my $cdec = "$bin_dir/../decoder/cdec"; -die "Can't find decoder in $cdec" unless -x $cdec; -die "Can't find $parallelize" unless -x $parallelize; -die "Can't find $libcall" unless -e $libcall; -my $decoder = $cdec; -my $lines_per_mapper = 400; -my $rand_directions = 15; -my $iteration = 1; -my $best_weights; -my $max_iterations = 15; -my $optimization_iters = 6; -my $jobs = $default_jobs; # number of decode nodes -my $pmem = "9g"; -my $disable_clean = 0; -my %seen_weights; -my $normalize; -my $help = 0; -my $epsilon = 0.0001; -my $interval = 5; -my $dryrun = 0; -my $last_score = -10000000; -my $metric = "ibm_bleu"; -my $dir; -my $iniFile; -my $weights; -my $initialWeights; -my $decoderOpt; -my $noprimary; -my $maxsim=0; -my $oraclen=0; -my $oracleb=20; -my $bleu_weight=1; -my $use_make = 1; # use make to parallelize line search -my $useqsub; -my $pass_suffix = ''; -my $cpbin=1; -# Process command-line options -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "decoder=s" => \$decoderOpt, - "jobs=i" => \$jobs, - "dont-clean" => \$disable_clean, - "pass-suffix=s" => \$pass_suffix, - "dry-run" => \$dryrun, - "epsilon=s" => \$epsilon, - "help" => \$help, - "interval" => \$interval, - "qsub" => \$useqsub, - "max-iterations=i" => \$max_iterations, - "normalize=s" => \$normalize, - "pmem=s" => \$pmem, - "cpbin!" => \$cpbin, - "random-directions=i" => \$rand_directions, - "ref-files=s" => \$refFiles, - "metric=s" => \$metric, - "source-file=s" => \$srcFile, - "weights=s" => \$initialWeights, - "workdir=s" => \$dir, - "opt-iterations=i" => \$optimization_iters, -) == 0 || @ARGV!=1 || $help) { - print_help(); - exit; -} - -if ($useqsub) { - $use_make = 0; - die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub(); -} - -my @missing_args = (); -if (!defined $srcFile) { push @missing_args, "--source-file"; } -if (!defined $refFiles) { push @missing_args, "--ref-files"; } -if (!defined $initialWeights) { push @missing_args, "--weights"; } -die "Please specify missing arguments: " . join (', ', @missing_args) . 
"\n" if (@missing_args); - -if ($metric =~ /^(combi|ter)$/i) { - $lines_per_mapper = 40; -} elsif ($metric =~ /^meteor$/i) { - $lines_per_mapper = 2000; # start up time is really high -} - -($iniFile) = @ARGV; - - -sub write_config; -sub enseg; -sub print_help; - -my $nodelist; -my $host =check_output("hostname"); chomp $host; -my $bleu; -my $interval_count = 0; -my $logfile; -my $projected_score; - -# used in sorting scores -my $DIR_FLAG = '-r'; -if ($metric =~ /^ter$|^aer$/i) { - $DIR_FLAG = ''; -} - -my $refs_comma_sep = get_comma_sep_refs('r',$refFiles); - -unless ($dir){ - $dir = "vest"; -} -unless ($dir =~ /^\//){ # convert relative path to absolute path - my $basedir = check_output("pwd"); - chomp $basedir; - $dir = "$basedir/$dir"; -} - -if ($decoderOpt){ $decoder = $decoderOpt; } - - -# Initializations and helper functions -srand; - -my @childpids = (); -my @cleanupcmds = (); - -sub cleanup { - print STDERR "Cleanup...\n"; - for my $pid (@childpids){ unchecked_call("kill $pid"); } - for my $cmd (@cleanupcmds){ unchecked_call("$cmd"); } - exit 1; -}; -# Always call cleanup, no matter how we exit -*CORE::GLOBAL::exit = - sub{ cleanup(); }; -$SIG{INT} = "cleanup"; -$SIG{TERM} = "cleanup"; -$SIG{HUP} = "cleanup"; - -my $decoderBase = check_output("basename $decoder"); chomp $decoderBase; -my $newIniFile = "$dir/$decoderBase.ini"; -my $inputFileName = "$dir/input"; -my $user = $ENV{"USER"}; - - -# process ini file --e $iniFile || die "Error: could not open $iniFile for reading\n"; -open(INI, $iniFile); - -use File::Basename qw(basename); -#pass bindir, refs to vars holding bin -sub modbin { - local $_; - my $bindir=shift; - check_call("mkdir -p $bindir"); - -d $bindir || die "couldn't make bindir $bindir"; - for (@_) { - my $src=$$_; - $$_="$bindir/".basename($src); - check_call("cp -p $src $$_"); - } -} -sub dirsize { - opendir ISEMPTY,$_[0]; - return scalar(readdir(ISEMPTY))-1; -} -if ($dryrun){ - write_config(*STDERR); - exit 0; -} else { - if (-e $dir && dirsize($dir)>1 && -e "$dir/hgs" ){ # allow preexisting logfile, binaries, but not dist-vest.pl outputs - die "ERROR: working dir $dir already exists\n\n"; - } else { - -e $dir || mkdir $dir; - mkdir "$dir/hgs"; - modbin("$dir/bin",\$LocalConfig,\$cdec,\$SCORER,\$MAPINPUT,\$MAPPER,\$REDUCER,\$parallelize,\$sentserver,\$sentclient,\$libcall) if $cpbin; - mkdir "$dir/scripts"; - my $cmdfile="$dir/rerun-vest.sh"; - open CMD,'>',$cmdfile; - print CMD "cd ",&getcwd,"\n"; -# print CMD &escaped_cmdline,"\n"; #buggy - last arg is quoted. 
- my $cline=&cmdline."\n"; - print CMD $cline; - close CMD; - print STDERR $cline; - chmod(0755,$cmdfile); - unless (-e $initialWeights) { - print STDERR "Please specify an initial weights file with --initial-weights\n"; - print_help(); - exit; - } - check_call("cp $initialWeights $dir/weights.0"); - die "Can't find weights.0" unless (-e "$dir/weights.0"); - } - write_config(*STDERR); -} - - -# Generate initial files and values -check_call("cp $iniFile $newIniFile"); -$iniFile = $newIniFile; - -my $newsrc = "$dir/dev.input"; -enseg($srcFile, $newsrc); -$srcFile = $newsrc; -my $devSize = 0; -open F, "<$srcFile" or die "Can't read $srcFile: $!"; -while() { $devSize++; } -close F; - -unless($best_weights){ $best_weights = $weights; } -unless($projected_score){ $projected_score = 0.0; } -$seen_weights{$weights} = 1; - -my $random_seed = int(time / 1000); -my $lastWeightsFile; -my $lastPScore = 0; -# main optimization loop -while (1){ - print STDERR "\n\nITERATION $iteration\n==========\n"; - - if ($iteration > $max_iterations){ - print STDERR "\nREACHED STOPPING CRITERION: Maximum iterations\n"; - last; - } - # iteration-specific files - my $runFile="$dir/run.raw.$iteration"; - my $onebestFile="$dir/1best.$iteration"; - my $logdir="$dir/logs.$iteration"; - my $decoderLog="$logdir/decoder.sentserver.log.$iteration"; - my $scorerLog="$logdir/scorer.log.$iteration"; - check_call("mkdir -p $logdir"); - - - #decode - print STDERR "RUNNING DECODER AT "; - print STDERR unchecked_output("date"); - my $im1 = $iteration - 1; - my $weightsFile="$dir/weights.$im1"; - my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs"; - my $pcmd; - if ($use_make) { - $pcmd = "cat $srcFile | $parallelize --use-fork -p $pmem -e $logdir -j $jobs --"; - } else { - $pcmd = "cat $srcFile | $parallelize -p $pmem -e $logdir -j $jobs --"; - } - my $cmd = "$pcmd $decoder_cmd 2> $decoderLog 1> $runFile"; - print STDERR "COMMAND:\n$cmd\n"; - check_bash_call($cmd); - my $num_hgs; - my $num_topbest; - my $retries = 0; - while($retries < 5) { - $num_hgs = check_output("ls $dir/hgs/*.gz | wc -l"); - $num_topbest = check_output("wc -l < $runFile"); - print STDERR "NUMBER OF HGs: $num_hgs\n"; - print STDERR "NUMBER OF TOP-BEST HYPs: $num_topbest\n"; - if($devSize == $num_hgs && $devSize == $num_topbest) { - last; - } else { - print STDERR "Incorrect number of hypergraphs or topbest. Waiting for distributed filesystem and retrying...\n"; - sleep(3); - } - $retries++; - } - die "Dev set contains $devSize sentences, but we don't have topbest and hypergraphs for all these! Decoder failure? 
Check $decoderLog\n" if ($devSize != $num_hgs || $devSize != $num_topbest); - my $dec_score = check_output("cat $runFile | $SCORER $refs_comma_sep -m $metric"); - chomp $dec_score; - print STDERR "DECODER SCORE: $dec_score\n"; - - # save space - check_call("gzip -f $runFile"); - check_call("gzip -f $decoderLog"); - - # run optimizer - print STDERR "RUNNING OPTIMIZER AT "; - print STDERR unchecked_output("date"); - my $mergeLog="$logdir/prune-merge.log.$iteration"; - - my $score = 0; - my $icc = 0; - my $inweights="$dir/weights.$im1"; - for (my $opt_iter=1; $opt_iter<$optimization_iters; $opt_iter++) { - print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; - print STDERR unchecked_output("date"); - $icc++; - $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; - print STDERR "COMMAND:\n$cmd\n"; - check_call($cmd); - check_call("mkdir -p $dir/splag.$im1"); - $cmd="split -a 3 -l $lines_per_mapper $dir/agenda.$im1-$opt_iter $dir/splag.$im1/mapinput."; - print STDERR "COMMAND:\n$cmd\n"; - check_call($cmd); - opendir(DIR, "$dir/splag.$im1") or die "Can't open directory: $!"; - my @shards = grep { /^mapinput\./ } readdir(DIR); - closedir DIR; - die "No shards!" unless scalar @shards > 0; - my $joblist = ""; - my $nmappers = 0; - my @mapoutputs = (); - @cleanupcmds = (); - my %o2i = (); - my $first_shard = 1; - my $mkfile; # only used with makefiles - my $mkfilename; - if ($use_make) { - $mkfilename = "$dir/splag.$im1/domap.mk"; - open $mkfile, ">$mkfilename" or die "Couldn't write $mkfilename: $!"; - print $mkfile "all: $dir/splag.$im1/map.done\n\n"; - } - my @mkouts = (); # only used with makefiles - for my $shard (@shards) { - my $mapoutput = $shard; - my $client_name = $shard; - $client_name =~ s/mapinput.//; - $client_name = "vest.$client_name"; - $mapoutput =~ s/mapinput/mapoutput/; - push @mapoutputs, "$dir/splag.$im1/$mapoutput"; - $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep < $dir/splag.$im1/$shard | sort -t \$'\\t' -k 1 > $dir/splag.$im1/$mapoutput"; - if ($use_make) { - my $script_file = "$dir/scripts/map.$shard"; - open F, ">$script_file" or die "Can't write $script_file: $!"; - print F "#!/bin/bash\n"; - print F "$script\n"; - close F; - my $output = "$dir/splag.$im1/$mapoutput"; - push @mkouts, $output; - chmod(0755, $script_file) or die "Can't chmod $script_file: $!"; - if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } - print $mkfile "$output: $dir/splag.$im1/$shard\n\t$script_file\n\n"; - } else { - my $script_file = "$dir/scripts/map.$shard"; - open F, ">$script_file" or die "Can't write $script_file: $!"; - print F "$script\n"; - close F; - if ($first_shard) { print STDERR "$script\n"; $first_shard=0; } - - $nmappers++; - my $qcmd = "$QSUB_CMD -N $client_name -o /dev/null -e $logdir/$client_name.ER $script_file"; - my $jobid = check_output("$qcmd"); - chomp $jobid; - $jobid =~ s/^(\d+)(.*?)$/\1/g; - $jobid =~ s/^Your job (\d+) .*$/\1/; - push(@cleanupcmds, "qdel $jobid 2> /dev/null"); - print STDERR " $jobid"; - if ($joblist == "") { $joblist = $jobid; } - else {$joblist = $joblist . "\|" . 
$jobid; } - } - } - if ($use_make) { - print $mkfile "$dir/splag.$im1/map.done: @mkouts\n\ttouch $dir/splag.$im1/map.done\n\n"; - close $mkfile; - my $mcmd = "make -j $jobs -f $mkfilename"; - print STDERR "\nExecuting: $mcmd\n"; - check_call($mcmd); - } else { - print STDERR "\nLaunched $nmappers mappers.\n"; - sleep 8; - print STDERR "Waiting for mappers to complete...\n"; - while ($nmappers > 0) { - sleep 5; - my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat | grep -v ' C '"))); - $nmappers = scalar @livejobs; - } - print STDERR "All mappers complete.\n"; - } - my $tol = 0; - my $til = 0; - for my $mo (@mapoutputs) { - my $olines = get_lines($mo); - my $ilines = get_lines($o2i{$mo}); - $tol += $olines; - $til += $ilines; - die "$mo: output lines ($olines) doesn't match input lines ($ilines)" unless $olines==$ilines; - } - print STDERR "Results for $tol/$til lines\n"; - print STDERR "\nSORTING AND RUNNING VEST REDUCER\n"; - print STDERR unchecked_output("date"); - $cmd="sort -t \$'\\t' -k 1 @mapoutputs | $REDUCER -m $metric > $dir/redoutput.$im1"; - print STDERR "COMMAND:\n$cmd\n"; - check_bash_call($cmd); - $cmd="sort -nk3 $DIR_FLAG '-t|' $dir/redoutput.$im1 | head -1"; - # sort returns failure even when it doesn't fail for some reason - my $best=unchecked_output("$cmd"); chomp $best; - print STDERR "$best\n"; - my ($oa, $x, $xscore) = split /\|/, $best; - $score = $xscore; - print STDERR "PROJECTED SCORE: $score\n"; - if (abs($x) < $epsilon) { - print STDERR "\nOPTIMIZER: no score improvement: abs($x) < $epsilon\n"; - last; - } - my $psd = $score - $last_score; - $last_score = $score; - if (abs($psd) < $epsilon) { - print STDERR "\nOPTIMIZER: no score improvement: abs($psd) < $epsilon\n"; - last; - } - my ($origin, $axis) = split /\s+/, $oa; - - my %ori = convert($origin); - my %axi = convert($axis); - - my $finalFile="$dir/weights.$im1-$opt_iter"; - open W, ">$finalFile" or die "Can't write: $finalFile: $!"; - my $norm = 0; - for my $k (sort keys %ori) { - my $dd = $ori{$k} + $axi{$k} * $x; - $norm += $dd * $dd; - } - $norm = sqrt($norm); - $norm = 1; - for my $k (sort keys %ori) { - my $v = ($ori{$k} + $axi{$k} * $x) / $norm; - print W "$k $v\n"; - } - check_call("rm $dir/splag.$im1/*"); - $inweights = $finalFile; - } - $lastWeightsFile = "$dir/weights.$iteration"; - check_call("cp $inweights $lastWeightsFile"); - if ($icc < 2) { - print STDERR "\nREACHED STOPPING CRITERION: score change too little\n"; - last; - } - $lastPScore = $score; - $iteration++; - print STDERR "\n==========\n"; -} - -print STDERR "\nFINAL WEIGHTS: $lastWeightsFile\n(Use -w with the decoder)\n\n"; - -print STDOUT "$lastWeightsFile\n"; - -exit 0; - -sub normalize_weights { - my ($rfn, $rpts, $feat) = @_; - my @feat_names = @$rfn; - my @pts = @$rpts; - my $z = 1.0; - for (my $i=0; $i < scalar @feat_names; $i++) { - if ($feat_names[$i] eq $feat) { - $z = $pts[$i]; - last; - } - } - for (my $i=0; $i < scalar @feat_names; $i++) { - $pts[$i] /= $z; - } - print STDERR " NORM WEIGHTS: @pts\n"; - return @pts; -} - -sub get_lines { - my $fn = shift @_; - open FL, "<$fn" or die "Couldn't read $fn: $!"; - my $lc = 0; - while() { $lc++; } - return $lc; -} - -sub get_comma_sep_refs { - my ($r,$p) = @_; - my $o = check_output("echo $p"); - chomp $o; - my @files = split /\s+/, $o; - return "-$r " . 
join(" -$r ", @files); -} - -sub read_weights_file { - my ($file) = @_; - open F, "<$file" or die "Couldn't read $file: $!"; - my @r = (); - my $pm = -1; - while() { - next if /^#/; - next if /^\s*$/; - chomp; - if (/^(.+)\s+(.+)$/) { - my $m = $1; - my $w = $2; - die "Weights out of order: $m <= $pm" unless $m > $pm; - push @r, $w; - } else { - warn "Unexpected feature name in weight file: $_"; - } - } - close F; - return join ' ', @r; -} - -# subs -sub write_config { - my $fh = shift; - my $cleanup = "yes"; - if ($disable_clean) {$cleanup = "no";} - - print $fh "\n"; - print $fh "DECODER: $decoder\n"; - print $fh "INI FILE: $iniFile\n"; - print $fh "WORKING DIR: $dir\n"; - print $fh "SOURCE (DEV): $srcFile\n"; - print $fh "REFS (DEV): $refFiles\n"; - print $fh "EVAL METRIC: $metric\n"; - print $fh "START ITERATION: $iteration\n"; - print $fh "MAX ITERATIONS: $max_iterations\n"; - print $fh "PARALLEL JOBS: $jobs\n"; - print $fh "HEAD NODE: $host\n"; - print $fh "PMEM (DECODING): $pmem\n"; - print $fh "CLEANUP: $cleanup\n"; - print $fh "INITIAL WEIGHTS: $initialWeights\n"; -} - -sub update_weights_file { - my ($neww, $rfn, $rpts) = @_; - my @feats = @$rfn; - my @pts = @$rpts; - my $num_feats = scalar @feats; - my $num_pts = scalar @pts; - die "$num_feats (num_feats) != $num_pts (num_pts)" unless $num_feats == $num_pts; - open G, ">$neww" or die; - for (my $i = 0; $i < $num_feats; $i++) { - my $f = $feats[$i]; - my $lambda = $pts[$i]; - print G "$f $lambda\n"; - } - close G; -} - -sub enseg { - my $src = shift; - my $newsrc = shift; - open(SRC, $src); - open(NEWSRC, ">$newsrc"); - my $i=0; - while (my $line=){ - chomp $line; - if ($line =~ /^\s* tags, you must include a zero-based id attribute"; - } - } else { - print NEWSRC "$line\n"; - } - $i++; - } - close SRC; - close NEWSRC; -} - -sub print_help { - - my $executable = check_output("basename $0"); chomp $executable; - print << "Help"; - -Usage: $executable [options] - - $executable [options] - Runs a complete MERT optimization using the decoder configuration - in . Required options are --weights, --source-file, and - --ref-files. - -Options: - - --help - Print this message and exit. - - --max-iterations - Maximum number of iterations to run. If not specified, defaults - to 10. - - --pass-suffix - If the decoder is doing multi-pass decoding, the pass suffix "2", - "3", etc., is used to control what iteration of weights is set. - - --ref-files - Dev set ref files. This option takes only a single string argument. - To use multiple files (including file globbing), this argument should - be quoted. - - --metric - Metric to optimize. - Example values: IBM_BLEU, NIST_BLEU, Koehn_BLEU, TER, Combi - - --normalize - After each iteration, rescale all feature weights such that feature- - name has a weight of 1.0. - - --rand-directions - MERT will attempt to optimize along all of the principle directions, - set this parameter to explore other directions. Defaults to 5. - - --source-file - Dev set source file. - - --weights - A file specifying initial feature weights. The format is - FeatureName_1 value1 - FeatureName_2 value2 - **All and only the weights listed in will be optimized!** - - --workdir - Directory for intermediate and output files. If not specified, the - name is derived from the ini filename. Assuming that the ini - filename begins with the decoder name and ends with ini, the default - name of the working directory is inferred from the middle part of - the filename. E.g. 
an ini file named decoder.foo.ini would have - a default working directory name foo. - -Job control options: - - --jobs - Number of decoder processes to run in parallel. [default=$default_jobs] - - --qsub - Use qsub to run jobs in parallel (qsub must be configured in - environment/LocalEnvironment.pm) - - --pmem - Amount of physical memory requested for parallel decoding jobs - (used with qsub requests only) - -Help -} - -sub convert { - my ($str) = @_; - my @ps = split /;/, $str; - my %dict = (); - for my $p (@ps) { - my ($k, $v) = split /=/, $p; - $dict{$k} = $v; - } - return %dict; -} - - - -sub cmdline { - return join ' ',($0,@ORIG_ARGV); -} - -#buggy: last arg gets quoted sometimes? -my $is_shell_special=qr{[ \t\n\\><|&;"'`~*?{}$!()]}; -my $shell_escape_in_quote=qr{[\\"\$`!]}; - -sub escape_shell { - my ($arg)=@_; - return undef unless defined $arg; - if ($arg =~ /$is_shell_special/) { - $arg =~ s/($shell_escape_in_quote)/\\$1/g; - return "\"$arg\""; - } - return $arg; -} - -sub escaped_shell_args { - return map {local $_=$_;chomp;escape_shell($_)} @_; -} - -sub escaped_shell_args_str { - return join ' ',&escaped_shell_args(@_); -} - -sub escaped_cmdline { - return "$0 ".&escaped_shell_args_str(@ORIG_ARGV); -} diff --git a/vest/error_surface.cc b/vest/error_surface.cc deleted file mode 100644 index 515b67f8..00000000 --- a/vest/error_surface.cc +++ /dev/null @@ -1,42 +0,0 @@ -#include "error_surface.h" - -#include -#include - -using namespace std; - -ErrorSurface::~ErrorSurface() {} - -void ErrorSurface::Serialize(std::string* out) const { - const int segments = this->size(); - ostringstream os(ios::binary); - os.write((const char*)&segments,sizeof(segments)); - for (int i = 0; i < segments; ++i) { - const ErrorSegment& cur = (*this)[i]; - string senc; - cur.delta.Encode(&senc); - assert(senc.size() < 1024); - unsigned char len = senc.size(); - os.write((const char*)&cur.x, sizeof(cur.x)); - os.write((const char*)&len, sizeof(len)); - os.write((const char*)&senc[0], len); - } - *out = os.str(); -} - -void ErrorSurface::Deserialize(const std::string& in) { - istringstream is(in, ios::binary); - int segments; - is.read((char*)&segments, sizeof(segments)); - this->resize(segments); - for (int i = 0; i < segments; ++i) { - ErrorSegment& cur = (*this)[i]; - unsigned char len; - is.read((char*)&cur.x, sizeof(cur.x)); - is.read((char*)&len, sizeof(len)); - string senc(len, '\0'); assert(senc.size() == len); - is.read((char*)&senc[0], len); - cur.delta = SufficientStats(senc); - } -} - diff --git a/vest/error_surface.h b/vest/error_surface.h deleted file mode 100644 index bb65847b..00000000 --- a/vest/error_surface.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _ERROR_SURFACE_H_ -#define _ERROR_SURFACE_H_ - -#include -#include - -#include "ns.h" - -class Score; - -struct ErrorSegment { - double x; - SufficientStats delta; - ErrorSegment() : x(0), delta() {} -}; - -class ErrorSurface : public std::vector { - public: - ~ErrorSurface(); - void Serialize(std::string* out) const; - void Deserialize(const std::string& in); -}; - -#endif diff --git a/vest/libcall.pl b/vest/libcall.pl deleted file mode 100644 index c7d0f128..00000000 --- a/vest/libcall.pl +++ /dev/null @@ -1,71 +0,0 @@ -use IPC::Open3; -use Symbol qw(gensym); - -$DUMMY_STDERR = gensym(); -$DUMMY_STDIN = gensym(); - -# Run the command and ignore failures -sub unchecked_call { - system("@_") -} - -# Run the command and return its output, if any ignoring failures -sub unchecked_output { - return `@_` -} - -# WARNING: Do not use this for 
commands that will return large amounts -# of stdout or stderr -- they might block indefinitely -sub check_output { - print STDERR "Executing and gathering output: @_\n"; - - my $pid = open3($DUMMY_STDIN, \*PH, $DUMMY_STDERR, @_); - my $proc_output = ""; - while( ) { - $proc_output .= $_; - } - waitpid($pid, 0); - # TODO: Grab signal that the process died from - my $child_exit_status = $? >> 8; - if($child_exit_status == 0) { - return $proc_output; - } else { - print STDERR "ERROR: Execution of @_ failed.\n"; - exit(1); - } -} - -# Based on Moses' safesystem sub -sub check_call { - print STDERR "Executing: @_\n"; - system(@_); - my $exitcode = $? >> 8; - if($exitcode == 0) { - return 0; - } elsif ($? == -1) { - print STDERR "ERROR: Failed to execute: @_\n $!\n"; - exit(1); - - } elsif ($? & 127) { - printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n", - ($? & 127), ($? & 128) ? 'with' : 'without'; - exit(1); - - } else { - print STDERR "Failed with exit code: $exitcode\n" if $exitcode; - exit($exitcode); - } -} - -sub check_bash_call { - my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); - check_call(@args); -} - -sub check_bash_output { - my @args = ( "bash", "-auxeo", "pipefail", "-c", "@_"); - return check_output(@args); -} - -# perl module weirdness... -return 1; diff --git a/vest/line_mediator.pl b/vest/line_mediator.pl deleted file mode 100755 index bc2bb24c..00000000 --- a/vest/line_mediator.pl +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/perl -w -#hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication - -# if you don't know how to fork/exec in a C program, this could be helpful under limited cirmustances (would be ok to liaise with sentserver) - -#WARNING: because it waits for the result from command 2 after sending every line, and especially if command 1 does the same, using sentserver as command 2 won't actually buy you any real parallelism. - -use strict; -use IPC::Open2; -use POSIX qw(pipe dup2 STDIN_FILENO STDOUT_FILENO); - -my $quiet=!$ENV{DEBUG}; -$quiet=1 if $ENV{QUIET}; -sub info { - local $,=' '; - print STDERR @_ unless $quiet; -} - -my $mode='CROSS'; -my $ser='DIRECT'; -$mode='PIPE' if $ENV{PIPE}; -$mode='SNAKE' if $ENV{SNAKE}; -$mode='CROSS' if $ENV{CROSS}; -$ser='SERIAL' if $ENV{SERIAL}; -$ser='DIRECT' if $ENV{DIRECT}; -$ser='SERIAL' if $mode eq 'SNAKE'; -info("mode: $mode\n"); -info("connection: $ser\n"); - - -my @c1; -if (scalar @ARGV) { - do { - push @c1,shift - } while scalar @ARGV && $c1[$#c1] ne '--'; -} -pop @c1; -my @c2=@ARGV; -@ARGV=(); -(scalar @c1 && scalar @c2) || die qq{ -usage: $0 cmd1 args -- cmd2 args -all options are environment variables. -DEBUG=1 env var enables debugging output. -CROSS=1 hooks up two processes, 2nd of which has one line of output per line of input, expected by the first, which starts off the communication. crosses stdin/stderr of cmd1 and cmd2 line by line (both must flush on newline and output. cmd1 initiates the conversation (sends the first line). default: attempts to cross stdin/stdout of c1 and c2 directly (via two unidirectional posix pipes created before fork). -SERIAL=1: (no parallelism possible) but lines exchanged are logged if DEBUG. -if SNAKE then stdin -> c1 -> c2 -> c1 -> stdout. -if PIPE then stdin -> c1 -> c2 -> stdout (same as shell c1|c2, but with SERIAL you can see the intermediate in real time; you could do similar with c1 | tee /dev/fd/2 |c2. -DIRECT=1 (default) will override SERIAL=1. 
-CROSS=1 (default) will override SNAKE or PIPE. -}; - -info("1 cmd:",@c1,"\n"); -info("2 cmd:",@c2,"\n"); - -sub lineto { - select $_[0]; - $|=1; - shift; - print @_; -} - -if ($ser eq 'SERIAL') { - my ($R1,$W1,$R2,$W2); - my $c1p=open2($R1,$W1,@c1); # Open2 R W backward from Open3. - my $c2p=open2($R2,$W2,@c2); - if ($mode eq 'CROSS') { - while(<$R1>) { - info("1:",$_); - lineto($W2,$_); - last unless defined ($_=<$R2>); - info("1|2:",$_); - lineto($W1,$_); - } - } else { - my $snake=$mode eq 'SNAKE'; - while() { - info("IN:",$_); - lineto($W1,$_); - last unless defined ($_=<$R1>); - info("IN|1:",$_); - lineto($W2,$_); - last unless defined ($_=<$R2>); - info("IN|1|2:",$_); - if ($snake) { - lineto($W1,$_); - last unless defined ($_=<$R1>); - info("IN|1|2|1:",$_); - } - lineto(*STDOUT,$_); - } - } -} else { - info("DIRECT mode\n"); - my @rw1=POSIX::pipe(); - my @rw2=POSIX::pipe(); - my $pid=undef; - $SIG{CHLD} = sub { wait }; - while (not defined ($pid=fork())) { - sleep 1; - } - my $pipe = $mode eq 'PIPE'; - unless ($pipe) { - POSIX::close(STDOUT_FILENO); - POSIX::close(STDIN_FILENO); - } - if ($pid) { - POSIX::dup2($rw1[1],STDOUT_FILENO); - POSIX::dup2($rw2[0],STDIN_FILENO) unless $pipe; - exec @c1; - } else { - POSIX::dup2($rw2[1],STDOUT_FILENO) unless $pipe; - POSIX::dup2($rw1[0],STDIN_FILENO); - exec @c2; - } - while (wait()!=-1) {} -} diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc deleted file mode 100644 index 49443fbe..00000000 --- a/vest/line_optimizer.cc +++ /dev/null @@ -1,111 +0,0 @@ -#include "line_optimizer.h" - -#include -#include - -#include "sparse_vector.h" -#include "ns.h" - -using namespace std; - -typedef ErrorSurface::const_iterator ErrorIter; - -// sort by increasing x-ints -struct IntervalComp { - bool operator() (const ErrorIter& a, const ErrorIter& b) const { - return a->x < b->x; - } -}; - -double LineOptimizer::LineOptimize( - const EvaluationMetric* metric, - const vector& surfaces, - const LineOptimizer::ScoreType type, - float* best_score, - const double epsilon) { - // cerr << "MIN=" << MINIMIZE_SCORE << " MAX=" << MAXIMIZE_SCORE << " MINE=" << type << endl; - vector all_ints; - for (vector::const_iterator i = surfaces.begin(); - i != surfaces.end(); ++i) { - const ErrorSurface& surface = *i; - for (ErrorIter j = surface.begin(); j != surface.end(); ++j) - all_ints.push_back(j); - } - sort(all_ints.begin(), all_ints.end(), IntervalComp()); - double last_boundary = all_ints.front()->x; - SufficientStats acc; - float& cur_best_score = *best_score; - cur_best_score = (type == MAXIMIZE_SCORE ? 
- -numeric_limits::max() : numeric_limits::max()); - bool left_edge = true; - double pos = numeric_limits::quiet_NaN(); - for (vector::iterator i = all_ints.begin(); - i != all_ints.end(); ++i) { - const ErrorSegment& seg = **i; - if (seg.x - last_boundary > epsilon) { - float sco = metric->ComputeScore(acc); - if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || - (type == MINIMIZE_SCORE && sco < cur_best_score) ) { - cur_best_score = sco; - if (left_edge) { - pos = seg.x - 0.1; - left_edge = false; - } else { - pos = last_boundary + (seg.x - last_boundary) / 2; - } - //cerr << "NEW BEST: " << pos << " (score=" << cur_best_score << ")\n"; - } - // string xx = metric->DetailedScore(acc); cerr << "---- " << xx; - // cerr << "---- s=" << sco << "\n"; - last_boundary = seg.x; - } - // cerr << "x-boundary=" << seg.x << "\n"; - //string x2; acc.Encode(&x2); cerr << " ACC: " << x2 << endl; - //string x1; seg.delta.Encode(&x1); cerr << " DELTA: " << x1 << endl; - acc += seg.delta; - } - float sco = metric->ComputeScore(acc); - if ((type == MAXIMIZE_SCORE && sco > cur_best_score) || - (type == MINIMIZE_SCORE && sco < cur_best_score) ) { - cur_best_score = sco; - if (left_edge) { - pos = 0; - } else { - pos = last_boundary + 1000.0; - } - } - return pos; -} - -void LineOptimizer::RandomUnitVector(const vector& features_to_optimize, - SparseVector* axis, - RandomNumberGenerator* rng) { - axis->clear(); - for (int i = 0; i < features_to_optimize.size(); ++i) - axis->set_value(features_to_optimize[i], rng->NextNormal(0.0,1.0)); - (*axis) /= axis->l2norm(); -} - -void LineOptimizer::CreateOptimizationDirections( - const vector& features_to_optimize, - int additional_random_directions, - RandomNumberGenerator* rng, - vector >* dirs - , bool include_orthogonal - ) { - dirs->clear(); - typedef SparseVector Dir; - vector &out=*dirs; - int i=0; - if (include_orthogonal) - for (;i - -#include "sparse_vector.h" -#include "error_surface.h" -#include "sampler.h" - -class EvaluationMetric; -class Weights; - -struct LineOptimizer { - - // use MINIMIZE_SCORE for things like TER, WER - // MAXIMIZE_SCORE for things like BLEU - enum ScoreType { MAXIMIZE_SCORE, MINIMIZE_SCORE }; - - // merge all the error surfaces together into a global - // error surface and find (the middle of) the best segment - static double LineOptimize( - const EvaluationMetric* metric, - const std::vector& envs, - const LineOptimizer::ScoreType type, - float* best_score, - const double epsilon = 1.0/65536.0); - - // return a random vector of length 1 where all dimensions - // not listed in dimensions will be 0. - static void RandomUnitVector(const std::vector& dimensions, - SparseVector* axis, - RandomNumberGenerator* rng); - - // generate a list of directions to optimize; the list will - // contain the orthogonal vectors corresponding to the dimensions in - // primary and then additional_random_directions directions in those - // dimensions as well. All vectors will be length 1. 
- static void CreateOptimizationDirections( - const std::vector& primary, - int additional_random_directions, - RandomNumberGenerator* rng, - std::vector >* dirs - , bool include_primary=true - ); - -}; - -#endif diff --git a/vest/lo_test.cc b/vest/lo_test.cc deleted file mode 100644 index a67f65e1..00000000 --- a/vest/lo_test.cc +++ /dev/null @@ -1,236 +0,0 @@ -#include -#include -#include - -#include -#include - -#include "ns.h" -#include "ns_docscorer.h" -#include "ces.h" -#include "fdict.h" -#include "hg.h" -#include "kbest.h" -#include "hg_io.h" -#include "filelib.h" -#include "inside_outside.h" -#include "viterbi.h" -#include "viterbi_envelope.h" -#include "line_optimizer.h" - -using namespace std; -using boost::shared_ptr; - -class OptTest : public testing::Test { - protected: - virtual void SetUp() { } - virtual void TearDown() { } -}; - -const char* ref11 = "australia reopens embassy in manila"; -const char* ref12 = "( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack ."; -const char* ref21 = "australia reopened manila embassy"; -const char* ref22 = "( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack ."; -const char* ref31 = "australia to reopen embassy in manila"; -const char* ref32 = "( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so - called confirmed terrorist attack threats ."; -const char* ref41 = "australia to re - open its embassy to manila"; -const char* ref42 = "( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so - called \" clear \" threat of terrorist attack 7 weeks ago ."; - -TEST_F(OptTest, TestCheckNaN) { - double x = 0; - double y = 0; - double z = x / y; - EXPECT_EQ(true, isnan(z)); -} - -TEST_F(OptTest,TestViterbiEnvelope) { - shared_ptr a1(new Segment(-1, 0)); - shared_ptr b1(new Segment(1, 0)); - shared_ptr a2(new Segment(-1, 1)); - shared_ptr b2(new Segment(1, -1)); - vector > sa; sa.push_back(a1); sa.push_back(b1); - vector > sb; sb.push_back(a2); sb.push_back(b2); - ViterbiEnvelope a(sa); - cerr << a << endl; - ViterbiEnvelope b(sb); - ViterbiEnvelope c = a; - c *= b; - cerr << a << " (*) " << b << " = " << c << endl; - EXPECT_EQ(3, c.size()); -} - -TEST_F(OptTest,TestViterbiEnvelopeInside) { - const string json = "{\"rules\":[1,\"[X] ||| a\",2,\"[X] ||| A [1]\",3,\"[X] ||| c\",4,\"[X] ||| C [1]\",5,\"[X] ||| [1] B [2]\",6,\"[X] ||| [1] b [2]\",7,\"[X] ||| X [1]\",8,\"[X] ||| Z 
[1]\"],\"features\":[\"f1\",\"f2\",\"Feature_1\",\"Feature_0\",\"Model_0\",\"Model_1\",\"Model_2\",\"Model_3\",\"Model_4\",\"Model_5\",\"Model_6\",\"Model_7\"],\"edges\":[{\"tail\":[],\"feats\":[],\"rule\":1}],\"node\":{\"in_edges\":[0]},\"edges\":[{\"tail\":[0],\"feats\":[0,-0.8,1,-0.1],\"rule\":2}],\"node\":{\"in_edges\":[1]},\"edges\":[{\"tail\":[],\"feats\":[1,-1],\"rule\":3}],\"node\":{\"in_edges\":[2]},\"edges\":[{\"tail\":[2],\"feats\":[0,-0.2,1,-0.1],\"rule\":4}],\"node\":{\"in_edges\":[3]},\"edges\":[{\"tail\":[1,3],\"feats\":[0,-1.2,1,-0.2],\"rule\":5},{\"tail\":[1,3],\"feats\":[0,-0.5,1,-1.3],\"rule\":6}],\"node\":{\"in_edges\":[4,5]},\"edges\":[{\"tail\":[4],\"feats\":[0,-0.5,1,-0.8],\"rule\":7},{\"tail\":[4],\"feats\":[0,-0.7,1,-0.9],\"rule\":8}],\"node\":{\"in_edges\":[6,7]}}"; - Hypergraph hg; - istringstream instr(json); - HypergraphIO::ReadFromJSON(&instr, &hg); - SparseVector wts; - wts.set_value(FD::Convert("f1"), 0.4); - wts.set_value(FD::Convert("f2"), 1.0); - hg.Reweight(wts); - vector, prob_t> > list; - std::vector > features; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; - } - SparseVector dir; dir.set_value(FD::Convert("f1"), 1.0); - ViterbiEnvelopeWeightFunction wf(wts, dir); - ViterbiEnvelope env = Inside(hg, NULL, wf); - cerr << env << endl; - const vector >& segs = env.GetSortedSegs(); - dir *= segs[1]->x; - wts += dir; - hg.Reweight(wts); - KBest::KBestDerivations, ESentenceTraversal> kbest2(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest2.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; - } - for (int i = 0; i < segs.size(); ++i) { - cerr << "seg=" << i << endl; - vector trans; - segs[i]->ConstructTranslation(&trans); - cerr << TD::GetString(trans) << endl; - } -} - -TEST_F(OptTest, TestS1) { - int fPhraseModel_0 = FD::Convert("PhraseModel_0"); - int fPhraseModel_1 = FD::Convert("PhraseModel_1"); - int fPhraseModel_2 = FD::Convert("PhraseModel_2"); - int fLanguageModel = FD::Convert("LanguageModel"); - int fWordPenalty = FD::Convert("WordPenalty"); - int fPassThrough = FD::Convert("PassThrough"); - SparseVector wts; - wts.set_value(fWordPenalty, 4.25); - wts.set_value(fLanguageModel, -1.1165); - wts.set_value(fPhraseModel_0, -0.96); - wts.set_value(fPhraseModel_1, -0.65); - wts.set_value(fPhraseModel_2, -0.77); - wts.set_value(fPassThrough, -10.0); - - vector to_optimize; - to_optimize.push_back(fWordPenalty); - to_optimize.push_back(fLanguageModel); - to_optimize.push_back(fPhraseModel_0); - to_optimize.push_back(fPhraseModel_1); - to_optimize.push_back(fPhraseModel_2); - - Hypergraph hg; - ReadFile rf("./test_data/0.json.gz"); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - hg.Reweight(wts); - - Hypergraph hg2; - ReadFile rf2("./test_data/1.json.gz"); - HypergraphIO::ReadFromJSON(rf2.stream(), &hg2); - hg2.Reweight(wts); - - vector > refs1(4); - TD::ConvertSentence(ref11, &refs1[0]); - TD::ConvertSentence(ref21, &refs1[1]); - TD::ConvertSentence(ref31, &refs1[2]); - TD::ConvertSentence(ref41, &refs1[3]); - vector > refs2(4); - TD::ConvertSentence(ref12, &refs2[0]); - TD::ConvertSentence(ref22, 
&refs2[1]); - TD::ConvertSentence(ref32, &refs2[2]); - TD::ConvertSentence(ref42, &refs2[3]); - vector envs(2); - - RandomNumberGenerator rng; - - vector > axes; // directions to search - LineOptimizer::CreateOptimizationDirections( - to_optimize, - 10, - &rng, - &axes); - assert(axes.size() == 10 + to_optimize.size()); - for (int i = 0; i < axes.size(); ++i) - cerr << axes[i] << endl; - const SparseVector& axis = axes[0]; - - cerr << "Computing Viterbi envelope using inside algorithm...\n"; - cerr << "axis: " << axis << endl; - clock_t t_start=clock(); - ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction - envs[0] = Inside(hg, NULL, wf); - envs[1] = Inside(hg2, NULL, wf); - - vector es(2); - EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); - boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(refs1); - boost::shared_ptr scorer2 = metric->CreateSegmentEvaluator(refs2); - ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); - ComputeErrorSurface(*scorer2, envs[1], &es[1], metric, hg2); - cerr << envs[0].size() << " " << envs[1].size() << endl; - cerr << es[0].size() << " " << es[1].size() << endl; - envs.clear(); - clock_t t_env=clock(); - float score; - double m = LineOptimizer::LineOptimize(metric,es, LineOptimizer::MAXIMIZE_SCORE, &score); - clock_t t_opt=clock(); - cerr << "line optimizer returned: " << m << " (SCORE=" << score << ")\n"; - EXPECT_FLOAT_EQ(0.48719698, score); - SparseVector res = axis; - res *= m; - res += wts; - cerr << "res: " << res << endl; - cerr << "ENVELOPE PROCESSING=" << (static_cast(t_env - t_start) / 1000.0) << endl; - cerr << " LINE OPTIMIZATION=" << (static_cast(t_opt - t_env) / 1000.0) << endl; - hg.Reweight(res); - hg2.Reweight(res); - vector t1,t2; - ViterbiESentence(hg, &t1); - ViterbiESentence(hg2, &t2); - cerr << TD::GetString(t1) << endl; - cerr << TD::GetString(t2) << endl; -} - -TEST_F(OptTest,TestZeroOrigin) { - const string json = "{\"rules\":[1,\"[X7] ||| blA ||| without ||| LHSProb=3.92173 LexE2F=2.90799 LexF2E=1.85003 GenerativeProb=10.5381 RulePenalty=1 XFE=2.77259 XEF=0.441833 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=0.693147\",2,\"[X7] ||| blA ||| except ||| LHSProb=4.92173 LexE2F=3.90799 LexF2E=1.85003 GenerativeProb=11.5381 RulePenalty=1 XFE=2.77259 XEF=1.44183 LabelledEF=2.63906 LabelledFE=4.96981 LogRuleCount=1.69315\",3,\"[S] ||| [X7,1] ||| [1] ||| GlueTop=1\",4,\"[X28] ||| EnwAn ||| title ||| LHSProb=3.96802 LexE2F=2.22462 LexF2E=1.83258 GenerativeProb=10.0863 RulePenalty=1 XFE=0 XEF=1.20397 LabelledEF=1.20397 LabelledFE=-1.98341e-08 LogRuleCount=1.09861\",5,\"[X0] ||| EnwAn ||| funny ||| LHSProb=3.98479 LexE2F=1.79176 LexF2E=3.21888 GenerativeProb=11.1681 RulePenalty=1 XFE=0 XEF=2.30259 LabelledEF=2.30259 LabelledFE=0 LogRuleCount=0 SingletonRule=1\",6,\"[X8] ||| [X7,1] EnwAn ||| entitled [1] ||| LHSProb=3.82533 LexE2F=3.21888 LexF2E=2.52573 GenerativeProb=11.3276 RulePenalty=1 XFE=1.20397 XEF=1.20397 LabelledEF=2.30259 LabelledFE=2.30259 LogRuleCount=0 SingletonRule=1\",7,\"[S] ||| [S,1] [X28,2] ||| [1] [2] ||| Glue=1\",8,\"[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1\",9,\"[S] ||| [X8,1] ||| [1] ||| GlueTop=1\",10,\"[Goal] ||| [S,1] ||| 
[1]\"],\"features\":[\"PassThrough\",\"Glue\",\"GlueTop\",\"LanguageModel\",\"WordPenalty\",\"LHSProb\",\"LexE2F\",\"LexF2E\",\"GenerativeProb\",\"RulePenalty\",\"XFE\",\"XEF\",\"LabelledEF\",\"LabelledFE\",\"LogRuleCount\",\"SingletonRule\"],\"edges\":[{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,3.92173,6,2.90799,7,1.85003,8,10.5381,9,1,10,2.77259,11,0.441833,12,2.63906,13,4.96981,14,0.693147],\"rule\":1},{\"tail\":[],\"spans\":[0,1,-1,-1],\"feats\":[5,4.92173,6,3.90799,7,1.85003,8,11.5381,9,1,10,2.77259,11,1.44183,12,2.63906,13,4.96981,14,1.69315],\"rule\":2}],\"node\":{\"in_edges\":[0,1],\"cat\":\"X7\"},\"edges\":[{\"tail\":[0],\"spans\":[0,1,-1,-1],\"feats\":[2,1],\"rule\":3}],\"node\":{\"in_edges\":[2],\"cat\":\"S\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.96802,6,2.22462,7,1.83258,8,10.0863,9,1,11,1.20397,12,1.20397,13,-1.98341e-08,14,1.09861],\"rule\":4}],\"node\":{\"in_edges\":[3],\"cat\":\"X28\"},\"edges\":[{\"tail\":[],\"spans\":[1,2,-1,-1],\"feats\":[5,3.98479,6,1.79176,7,3.21888,8,11.1681,9,1,11,2.30259,12,2.30259,15,1],\"rule\":5}],\"node\":{\"in_edges\":[4],\"cat\":\"X0\"},\"edges\":[{\"tail\":[0],\"spans\":[0,2,-1,-1],\"feats\":[5,3.82533,6,3.21888,7,2.52573,8,11.3276,9,1,10,1.20397,11,1.20397,12,2.30259,13,2.30259,15,1],\"rule\":6}],\"node\":{\"in_edges\":[5],\"cat\":\"X8\"},\"edges\":[{\"tail\":[1,2],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":7},{\"tail\":[1,3],\"spans\":[0,2,-1,-1],\"feats\":[1,1],\"rule\":8},{\"tail\":[4],\"spans\":[0,2,-1,-1],\"feats\":[2,1],\"rule\":9}],\"node\":{\"in_edges\":[6,7,8],\"cat\":\"S\"},\"edges\":[{\"tail\":[5],\"spans\":[0,2,-1,-1],\"feats\":[],\"rule\":10}],\"node\":{\"in_edges\":[9],\"cat\":\"Goal\"}}"; - Hypergraph hg; - istringstream instr(json); - HypergraphIO::ReadFromJSON(&instr, &hg); - SparseVector wts; - wts.set_value(FD::Convert("PassThrough"), -0.929201533002898); - hg.Reweight(wts); - - vector, prob_t> > list; - std::vector > features; - KBest::KBestDerivations, ESentenceTraversal> kbest(hg, 10); - for (int i = 0; i < 10; ++i) { - const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = - kbest.LazyKthBest(hg.nodes_.size() - 1, i); - if (!d) break; - cerr << log(d->score) << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << endl; - } - - SparseVector axis; axis.set_value(FD::Convert("Glue"),1.0); - ViterbiEnvelopeWeightFunction wf(wts, axis); // wts = starting point, axis = search direction - vector envs(1); - envs[0] = Inside(hg, NULL, wf); - - vector > mr(4); - TD::ConvertSentence("untitled", &mr[0]); - TD::ConvertSentence("with no title", &mr[1]); - TD::ConvertSentence("without a title", &mr[2]); - TD::ConvertSentence("without title", &mr[3]); - EvaluationMetric* metric = EvaluationMetric::Instance("IBM_BLEU"); - boost::shared_ptr scorer1 = metric->CreateSegmentEvaluator(mr); - vector es(1); - ComputeErrorSurface(*scorer1, envs[0], &es[0], metric, hg); -} - -int main(int argc, char **argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} - diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc deleted file mode 100644 index 59d4f24f..00000000 --- a/vest/mr_vest_generate_mapper_input.cc +++ /dev/null @@ -1,78 +0,0 @@ -#include -#include - -#include -#include - -#include "filelib.h" -#include "weights.h" -#include "line_optimizer.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description 
opts("Configuration options"); - opts.add_options() - ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value(),"[REQD] Path to forest repository") - ("weights,w",po::value(),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -d N\n"; - flag = true; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w \n"; - flag = true; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r \n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - RandomNumberGenerator rng; - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - vector features; - SparseVector origin; - vector w; - Weights::InitFromFile(conf["weights"].as(), &w, &features); - Weights::InitSparseVector(w, &origin); - const string forest_repository = conf["forest_repository"].as(); - assert(DirectoryExists(forest_repository)); - if (conf.count("optimize_feature") > 0) - features=conf["optimize_feature"].as >(); - vector > directions; - vector fids(features.size()); - for (int i = 0; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - LineOptimizer::CreateOptimizationDirections( - fids, - conf["random_directions"].as(), - &rng, - &directions); - unsigned dev_set_size = conf["dev_set_size"].as(); - for (unsigned i = 0; i < dev_set_size; ++i) { - for (unsigned j = 0; j < directions.size(); ++j) { - cout << forest_repository << '/' << i << ".json.gz " << i << ' '; - print(cout, origin, "=", ";"); - cout << ' '; - print(cout, directions[j], "=", ";"); - cout << endl; - } - } - return 0; -} diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc deleted file mode 100644 index 7d9625bc..00000000 --- a/vest/mr_vest_map.cc +++ /dev/null @@ -1,112 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include "ns.h" -#include "ns_docscorer.h" -#include "ces.h" -#include "filelib.h" -#include "stringlib.h" -#include "sparse_vector.h" -#include "viterbi_envelope.h" -#include "inside_outside.h" -#include "error_surface.h" -#include "b64tools.h" -#include "hg_io.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("reference,r",po::value >(), "[REQD] Reference translation (tokenized text)") - ("source,s",po::value(), "Source file (ignored, except for AER)") - ("evaluation_metric,m",po::value()->default_value("ibm_bleu"), "Evaluation metric being optimized") - ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (!conf->count("reference")) 
{ - cerr << "Please specify one or more references using -r \n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -bool ReadSparseVectorString(const string& s, SparseVector* v) { -#if 0 - // this should work, but untested. - std::istringstream i(s); - i>>*v; -#else - vector fields; - Tokenize(s, ';', &fields); - if (fields.empty()) return false; - for (int i = 0; i < fields.size(); ++i) { - vector pair(2); - Tokenize(fields[i], '=', &pair); - if (pair.size() != 2) { - cerr << "Error parsing vector string: " << fields[i] << endl; - return false; - } - v->set_value(FD::Convert(pair[0]), atof(pair[1].c_str())); - } - return true; -#endif -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string evaluation_metric = conf["evaluation_metric"].as(); - EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); - DocumentScorer ds(metric, conf["reference"].as >()); - cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; - Hypergraph hg; - string last_file; - ReadFile in_read(conf["input"].as()); - istream &in=*in_read.stream(); - while(in) { - string line; - getline(in, line); - if (line.empty()) continue; - istringstream is(line); - int sent_id; - string file, s_origin, s_direction; - // path-to-file (JSON) sent_ed starting-point search-direction - is >> file >> sent_id >> s_origin >> s_direction; - SparseVector origin; - ReadSparseVectorString(s_origin, &origin); - SparseVector direction; - ReadSparseVectorString(s_direction, &direction); - // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl; - if (last_file != file) { - last_file = file; - ReadFile rf(file); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - } - ViterbiEnvelopeWeightFunction wf(origin, direction); - ViterbiEnvelope ve = Inside(hg, NULL, wf); - ErrorSurface es; - - ComputeErrorSurface(*ds[sent_id], ve, &es, metric, hg); - //cerr << "Viterbi envelope has " << ve.size() << " segments\n"; - // cerr << "Error surface has " << es.size() << " segments\n"; - string val; - es.Serialize(&val); - cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t'; - B64::b64encode(val.c_str(), val.size(), &cout); - cout << endl << flush; - } - return 0; -} diff --git a/vest/mr_vest_reduce.cc b/vest/mr_vest_reduce.cc deleted file mode 100644 index dda61f88..00000000 --- a/vest/mr_vest_reduce.cc +++ /dev/null @@ -1,77 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include "sparse_vector.h" -#include "error_surface.h" -#include "line_optimizer.h" -#include "b64tools.h" -#include "stringlib.h" - -using namespace std; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("evaluation_metric,m",po::value(), "Evaluation metric (IBM_BLEU, etc.)") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = conf->count("evaluation_metric") == 0; - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - const string evaluation_metric = conf["evaluation_metric"].as(); - LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; 
- if (UppercaseString(evaluation_metric) == "TER") - opt_type = LineOptimizer::MINIMIZE_SCORE; - EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); - - vector esv; - string last_key, line, key, val; - while(getline(cin, line)) { - size_t ks = line.find("\t"); - assert(string::npos != ks); - assert(ks > 2); - key = line.substr(2, ks - 2); - val = line.substr(ks + 1); - if (key != last_key) { - if (!last_key.empty()) { - float score; - double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); - cout << last_key << "|" << x << "|" << score << endl; - } - last_key.swap(key); - esv.clear(); - } - if (val.size() % 4 != 0) { - cerr << "B64 encoding error 1! Skipping.\n"; - continue; - } - string encoded(val.size() / 4 * 3, '\0'); - if (!B64::b64decode(reinterpret_cast(&val[0]), val.size(), &encoded[0], encoded.size())) { - cerr << "B64 encoding error 2! Skipping.\n"; - continue; - } - esv.push_back(ErrorSurface()); - esv.back().Deserialize(encoded); - } - if (!esv.empty()) { - float score; - double x = LineOptimizer::LineOptimize(metric, esv, opt_type, &score); - cout << last_key << "|" << x << "|" << score << endl; - } - return 0; -} diff --git a/vest/parallelize.pl b/vest/parallelize.pl deleted file mode 100755 index 7d0365cc..00000000 --- a/vest/parallelize.pl +++ /dev/null @@ -1,423 +0,0 @@ -#!/usr/bin/env perl - -# Author: Adam Lopez -# -# This script takes a command that processes input -# from stdin one-line-at-time, and parallelizes it -# on the cluster using David Chiang's sentserver/ -# sentclient architecture. -# -# Prerequisites: the command *must* read each line -# without waiting for subsequent lines of input -# (for instance, a command which must read all lines -# of input before processing will not work) and -# return it to the output *without* buffering -# multiple lines. - -#TODO: if -j 1, run immediately, not via sentserver? possible differences in environment might make debugging harder - -#ANNOYANCE: if input is shorter than -j n lines, or at the very last few lines, repeatedly sleeps. time cut down to 15s from 60s - -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; - -use Cwd qw/ abs_path cwd getcwd /; -use File::Temp qw/ tempfile /; -use Getopt::Long; -use IPC::Open2; -use strict; -use POSIX ":sys_wait_h"; - -use File::Basename; -my $myDir = dirname(__FILE__); -print STDERR __FILE__." -> $myDir\n"; -push(@INC, $myDir); -require "libcall.pl"; - -my $tailn=5; # +0 = concatenate all the client logs. 
5 = last 5 lines -my $recycle_clients; # spawn new clients when previous ones terminate -my $stay_alive; # dont let server die when having zero clients -my $joblist = ""; -my $errordir=""; -my $multiline; -my @files_to_stage; -my $numnodes = 8; -my $user = $ENV{"USER"}; -my $pmem = "9g"; -my $basep=50300; -my $randp=300; -my $tryp=50; -my $no_which; -my $no_cd; - -my $DEBUG=$ENV{DEBUG}; -print STDERR "DEBUG=$DEBUG output enabled.\n" if $DEBUG; -my $verbose = 1; -sub verbose { - if ($verbose) { - print STDERR @_,"\n"; - } -} -sub debug { - if ($DEBUG) { - my ($package, $filename, $line) = caller; - print STDERR "DEBUG: $filename($line): ",join(' ',@_),"\n"; - } -} -my $is_shell_special=qr.[ \t\n\\><|&;"'`~*?{}$!()].; -my $shell_escape_in_quote=qr.[\\"\$`!].; -sub escape_shell { - my ($arg)=@_; - return undef unless defined $arg; - return '""' unless $arg; - if ($arg =~ /$is_shell_special/) { - $arg =~ s/($shell_escape_in_quote)/\\$1/g; - return "\"$arg\""; - } - return $arg; -} -sub preview_files { - my ($l,$skipempty,$footer,$n)=@_; - $n=$tailn unless defined $n; - my @f=grep { ! ($skipempty && -z $_) } @$l; - my $fn=join(' ',map {escape_shell($_)} @f); - my $cmd="tail -n $n $fn"; - unchecked_output("$cmd").($footer?"\nNONEMPTY FILES:\n$fn\n":""); -} -sub prefix_dirname($) { - #like `dirname but if ends in / then return the whole thing - local ($_)=@_; - if (/\/$/) { - $_; - } else { - s#/[^/]$##; - $_ ? $_ : ''; - } -} -sub ensure_final_slash($) { - local ($_)=@_; - m#/$# ? $_ : ($_."/"); -} -sub extend_path($$;$$) { - my ($base,$ext,$mkdir,$baseisdir)=@_; - if (-d $base) { - $base.="/"; - } else { - my $dir; - if ($baseisdir) { - $dir=$base; - $base.='/' unless $base =~ /\/$/; - } else { - $dir=prefix_dirname($base); - } - my @cmd=("/bin/mkdir","-p",$dir); - check_call(@cmd) if $mkdir; - } - return $base.$ext; -} - -my $abscwd=abs_path(&getcwd); -sub print_help; - -my $use_fork; -my @pids; - -# Process command-line options -unless (GetOptions( - "stay-alive" => \$stay_alive, - "recycle-clients" => \$recycle_clients, - "error-dir=s" => \$errordir, - "multi-line" => \$multiline, - "file=s" => \@files_to_stage, - "use-fork" => \$use_fork, - "verbose" => \$verbose, - "jobs=i" => \$numnodes, - "pmem=s" => \$pmem, - "baseport=i" => \$basep, -# "iport=i" => \$randp, #for short name -i - "no-which!" => \$no_which, - "no-cd!" => \$no_cd, - "tailn=s" => \$tailn, -) && scalar @ARGV){ - print_help(); - die "bad options."; -} - -my $cmd = ""; -my $prog=shift; -if ($no_which) { - $cmd=$prog; -} else { - $cmd=check_output("which $prog"); - chomp $cmd; - die "$prog not found - $cmd" unless $cmd; -} -#$cmd=abs_path($cmd); -for my $arg (@ARGV) { - $cmd .= " ".escape_shell($arg); -} -die "Please specify a command to parallelize\n" if $cmd eq ''; - -my $cdcmd=$no_cd ? 
'' : ("cd ".escape_shell($abscwd)."\n"); - -my $executable = $cmd; -$executable =~ s/^\s*(\S+)($|\s.*)/$1/; -$executable=check_output("basename $executable"); -chomp $executable; - - -print STDERR "Parallelizing ($numnodes ways): $cmd\n\n"; - -# create -e dir and save .sh -use File::Temp qw/tempdir/; -unless ($errordir) { - $errordir=tempdir("$executable.XXXXXX",CLEANUP=>1); -} -if ($errordir) { - my $scriptfile=extend_path("$errordir/","$executable.sh",1,1); - -d $errordir || die "should have created -e dir $errordir"; - open SF,">",$scriptfile || die; - print SF "$cdcmd$cmd\n"; - close SF; - chmod 0755,$scriptfile; - $errordir=abs_path($errordir); - &verbose("-e dir: $errordir"); -} - -# set cleanup handler -my @cleanup_cmds; -sub cleanup; -sub cleanup_and_die; -$SIG{INT} = "cleanup_and_die"; -$SIG{TERM} = "cleanup_and_die"; -$SIG{HUP} = "cleanup_and_die"; - -# other subs: -sub numof_live_jobs; -sub launch_job_on_node; - - -# vars -my $mydir = check_output("dirname $0"); chomp $mydir; -my $sentserver = "$mydir/sentserver"; -my $sentclient = "$mydir/sentclient"; -my $host = check_output("hostname"); -chomp $host; - - -# find open port -srand; -my $port = 50300+int(rand($randp)); -my $endp=$port+$tryp; -sub listening_port_lines { - my $quiet=$verbose?'':'2>/dev/null'; - return unchecked_output("netstat -a -n $quiet | grep LISTENING | grep -i tcp"); -} -my $netstat=&listening_port_lines; - -if ($verbose){ print STDERR "Testing port $port...";} - -while ($netstat=~/$port/ || &listening_port_lines=~/$port/){ - if ($verbose){ print STDERR "port is busy\n";} - $port++; - if ($port > $endp){ - die "Unable to find open port\n"; - } - if ($verbose){ print STDERR "Testing port $port... "; } -} -if ($verbose){ - print STDERR "port $port is available\n"; -} - -my $key = int(rand()*1000000); - -my $multiflag = ""; -if ($multiline){ $multiflag = "-m"; print STDERR "expecting multiline output.\n"; } -my $stay_alive_flag = ""; -if ($stay_alive){ $stay_alive_flag = "--stay-alive"; print STDERR "staying alive while no clients are connected.\n"; } - -my $node_count = 0; -my $script = ""; -# fork == one thread runs the sentserver, while the -# other spawns the sentclient commands. -my $pid = fork; -if ($pid == 0) { # child - sleep 8; # give other thread time to start sentserver - $script = "$cdcmd$sentclient $host:$port:$key $cmd"; - - if ($verbose){ - print STDERR "Client script:\n====\n"; - print STDERR $script; - print STDERR "====\n"; - } - for (my $jobn=0; $jobn<$numnodes; $jobn++){ - launch_job(); - } - if ($recycle_clients) { - my $ret; - my $livejobs; - while (1) { - $ret = waitpid($pid, WNOHANG); - #print STDERR "waitpid $pid ret = $ret \n"; - last if ($ret != 0); - $livejobs = numof_live_jobs(); - if ($numnodes >= $livejobs ) { # a client terminated, OR # lines of input was less than -j - print STDERR "num of requested nodes = $numnodes; num of currently live jobs = $livejobs; Client terminated - launching another.\n"; - launch_job(); - } else { - sleep 15; - } - } - } - print STDERR "CHILD PROCESSES SPAWNED ... 
WAITING\n"; - for my $p (@pids) { - waitpid($p, 0); - } -} else { -# my $todo = "$sentserver -k $key $multiflag $port "; - my $todo = "$sentserver -k $key $multiflag $port $stay_alive_flag "; - if ($verbose){ print STDERR "Running: $todo\n"; } - check_call($todo); - print STDERR "Call to $sentserver returned.\n"; - cleanup(); - exit(0); -} - -sub numof_live_jobs { - if ($use_fork) { - die "not implemented"; - } else { - # We can probably continue decoding if the qstat error is only temporary - my @livejobs = grep(/$joblist/, split(/\n/, unchecked_output("qstat"))); - return ($#livejobs + 1); - } -} -my (@errors,@outs,@cmds); - -sub launch_job { - if ($use_fork) { return launch_job_fork(); } - my $errorfile = "/dev/null"; - my $outfile = "/dev/null"; - $node_count++; - my $clientname = $executable; - $clientname =~ s/^(.{4}).*$/$1/; - $clientname = "$clientname.$node_count"; - if ($errordir){ - $errorfile = "$errordir/$clientname.ER"; - $outfile = "$errordir/$clientname.OU"; - push @errors,$errorfile; - push @outs,$outfile; - } - my $todo = qsub_args($pmem) . " -N $clientname -o $outfile -e $errorfile"; - push @cmds,$todo; - - print STDERR "Running: $todo\n"; - local(*QOUT, *QIN); - open2(\*QOUT, \*QIN, $todo) or die "Failed to open2: $!"; - print QIN $script; - close QIN; - while (my $jobid=){ - chomp $jobid; - if ($verbose){ print STDERR "Launched client job: $jobid"; } - $jobid =~ s/^(\d+)(.*?)$/\1/g; - $jobid =~ s/^Your job (\d+) .*$/\1/; - print STDERR " short job id $jobid\n"; - if ($verbose){ - print STDERR "cd: $abscwd\n"; - print STDERR "cmd: $cmd\n"; - } - if ($joblist == "") { $joblist = $jobid; } - else {$joblist = $joblist . "\|" . $jobid; } - my $cleanfn="qdel $jobid 2> /dev/null"; - push(@cleanup_cmds, $cleanfn); - } - close QOUT; -} - -sub launch_job_fork { - my $errorfile = "/dev/null"; - my $outfile = "/dev/null"; - $node_count++; - my $clientname = $executable; - $clientname =~ s/^(.{4}).*$/$1/; - $clientname = "$clientname.$node_count"; - if ($errordir){ - $errorfile = "$errordir/$clientname.ER"; - $outfile = "$errordir/$clientname.OU"; - push @errors,$errorfile; - push @outs,$outfile; - } - my $pid = fork; - if ($pid == 0) { - my ($fh, $scr_name) = get_temp_script(); - print $fh $script; - close $fh; - my $todo = "/bin/bash -xeo pipefail $scr_name 1> $outfile 2> $errorfile"; - print STDERR "EXEC: $todo\n"; - my $out = check_output("$todo"); - unlink $scr_name or warn "Failed to remove $scr_name"; - exit 0; - } else { - push @pids, $pid; - } -} - -sub get_temp_script { - my ($fh, $filename) = tempfile( "workXXXX", SUFFIX => '.sh'); - return ($fh, $filename); -} - -sub cleanup_and_die { - cleanup(); - die "\n"; -} - -sub cleanup { - print STDERR "Cleaning up...\n"; - for $cmd (@cleanup_cmds){ - print STDERR " Cleanup command: $cmd\n"; - eval $cmd; - } - print STDERR "outputs:\n",preview_files(\@outs,1),"\n"; - print STDERR "errors:\n",preview_files(\@errors,1),"\n"; - print STDERR "cmd:\n",$cmd,"\n"; - print STDERR " cat $errordir/*.ER\nfor logs.\n"; - print STDERR "Cleanup finished.\n"; -} - -sub print_help -{ - my $name = check_output("basename $0"); chomp $name; - print << "Help"; - -usage: $name [options] - - Automatic black-box parallelization of commands. - -options: - - --use-fork - Instead of using qsub, use fork. - - -e, --error-dir - Retain output files from jobs in , rather - than silently deleting them. - - -m, --multi-line - Expect that command may produce multiple output - lines for a single input line. 
$name makes a - reasonable attempt to obtain all output before - processing additional inputs. However, use of this - option is inherently unsafe. - - -v, --verbose - Print diagnostic informatoin on stderr. - - -j, --jobs - Number of jobs to use. - - -p, --pmem - pmem setting for each job. - -Help -} diff --git a/vest/sentclient.c b/vest/sentclient.c deleted file mode 100644 index 91d994ab..00000000 --- a/vest/sentclient.c +++ /dev/null @@ -1,76 +0,0 @@ -/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sentserver.h" - -int main (int argc, char *argv[]) { - int sock, port; - char *s, *key; - struct hostent *hp; - struct sockaddr_in server; - int errors = 0; - - if (argc < 3) { - fprintf(stderr, "Usage: sentclient host[:port[:key]] command [args ...]\n"); - exit(1); - } - - s = strchr(argv[1], ':'); - key = NULL; - - if (s == NULL) { - port = DEFAULT_PORT; - } else { - *s = '\0'; - s+=1; - /* dumb hack */ - key = strchr(s, ':'); - if (key != NULL){ - *key = '\0'; - key += 1; - } - port = atoi(s); - } - - sock = socket(AF_INET, SOCK_STREAM, 0); - - hp = gethostbyname(argv[1]); - if (hp == NULL) { - fprintf(stderr, "unknown host %s\n", argv[1]); - exit(1); - } - - bzero((char *)&server, sizeof(server)); - bcopy(hp->h_addr, (char *)&server.sin_addr, hp->h_length); - server.sin_family = hp->h_addrtype; - server.sin_port = htons(port); - - while (connect(sock, (struct sockaddr *)&server, sizeof(server)) < 0) { - perror("connect()"); - sleep(1); - errors++; - if (errors > 5) - exit(1); - } - - close(0); - close(1); - dup2(sock, 0); - dup2(sock, 1); - - if (key != NULL){ - write(1, key, strlen(key)); - write(1, "\n", 1); - } - - execvp(argv[2], argv+2); - return 0; -} diff --git a/vest/sentserver.c b/vest/sentserver.c deleted file mode 100644 index c20b4fa6..00000000 --- a/vest/sentserver.c +++ /dev/null @@ -1,515 +0,0 @@ -/* Copyright (c) 2001 by David Chiang. All rights reserved.*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "sentserver.h" - -#define MAX_CLIENTS 64 - -struct clientinfo { - int s; - struct sockaddr_in sin; -}; - -struct line { - int id; - char *s; - int status; - struct line *next; -} *head, **ptail; - -int n_sent = 0, n_received=0, n_flushed=0; - -#define STATUS_RUNNING 0 -#define STATUS_ABORTED 1 -#define STATUS_FINISHED 2 - -pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_mutex_t clients_mutex = PTHREAD_MUTEX_INITIALIZER; -pthread_mutex_t input_mutex = PTHREAD_MUTEX_INITIALIZER; - -int n_clients = 0; -int s; -int expect_multiline_output = 0; -int log_mutex = 0; -int stay_alive = 0; /* dont panic and die with zero clients */ - -void queue_finish(struct line *node, char *s, int fid); -char * read_line(int fd, int multiline); -void done (int code); - -struct line * queue_get(int fid) { - struct line *cur; - char *s, *synch; - - if (log_mutex) fprintf(stderr, "Getting for data for fid %d\n", fid); - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - - /* First, check for aborted sentences. 
*/ - - if (log_mutex) fprintf(stderr, " Checking queue for aborted jobs (fid %d)\n", fid); - for (cur = head; cur != NULL; cur = cur->next) { - if (cur->status == STATUS_ABORTED) { - cur->status = STATUS_RUNNING; - - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - - return cur; - } - } - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - - /* Otherwise, read a new one. */ - if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); - if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); - pthread_mutex_lock(&input_mutex); - s = read_line(0,0); - - while (s) { - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); - pthread_mutex_unlock(&input_mutex); - - cur = malloc(sizeof (struct line)); - cur->id = n_sent; - cur->s = s; - cur->next = NULL; - - *ptail = cur; - ptail = &cur->next; - - n_sent++; - - if (strcmp(s,"===SYNCH===\n")==0){ - fprintf(stderr, "Received ===SYNCH=== signal (fid %d)\n", fid); - // Note: queue_finish calls free(cur->s). - // Therefore we need to create a new string here. - synch = malloc((strlen("===SYNCH===\n")+2) * sizeof (char)); - synch = strcpy(synch, s); - - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - queue_finish(cur, synch, fid); /* handles its own lock */ - - if (log_mutex) fprintf(stderr, "Locking input mutex (%d)\n", fid); - if (log_mutex) fprintf(stderr, " Reading input for new data (fid %d)\n", fid); - pthread_mutex_lock(&input_mutex); - - s = read_line(0,0); - } else { - if (log_mutex) fprintf(stderr, " Received new data %d (fid %d)\n", cur->id, fid); - cur->status = STATUS_RUNNING; - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - return cur; - } - } - - if (log_mutex) fprintf(stderr, "Unlocking input mutex (%d)\n", fid); - pthread_mutex_unlock(&input_mutex); - /* Only way to reach this point: no more output */ - - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - if (head == NULL) { - fprintf(stderr, "Reached end of file. Exiting.\n"); - done(0); - } else - ptail = NULL; /* This serves as a signal that there is no more input */ - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - - return NULL; -} - -void queue_panic() { - struct line *next; - while (head && head->status == STATUS_FINISHED) { - /* Write out finished sentences */ - if (head->status == STATUS_FINISHED) { - fputs(head->s, stdout); - fflush(stdout); - } - /* Write out blank line for unfinished sentences */ - if (head->status == STATUS_ABORTED) { - fputs("\n", stdout); - fflush(stdout); - } - /* By defition, there cannot be any RUNNING sentences, since - function is only called when n_clients == 0 */ - free(head->s); - next = head->next; - free(head); - head = next; - n_flushed++; - } - fclose(stdout); - fprintf(stderr, "All clients died. Panicking, flushing completed sentences and exiting.\n"); - done(1); -} - -void queue_abort(struct line *node, int fid) { - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - node->status = STATUS_ABORTED; - if (n_clients == 0) { - if (stay_alive) { - fprintf(stderr, "Warning! 
No live clients detected! Staying alive, will retry soon.\n"); - } else { - queue_panic(); - } - } - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); -} - - -void queue_print() { - struct line *cur; - - fprintf(stderr, " Queue\n"); - - for (cur = head; cur != NULL; cur = cur->next) { - switch(cur->status) { - case STATUS_RUNNING: - fprintf(stderr, " %d running ", cur->id); break; - case STATUS_ABORTED: - fprintf(stderr, " %d aborted ", cur->id); break; - case STATUS_FINISHED: - fprintf(stderr, " %d finished ", cur->id); break; - - } - fprintf(stderr, "\n"); - //fprintf(stderr, cur->s); - } -} - -void queue_finish(struct line *node, char *s, int fid) { - struct line *next; - if (log_mutex) fprintf(stderr, "Locking queue mutex (%d)\n", fid); - pthread_mutex_lock(&queue_mutex); - - free(node->s); - node->s = s; - node->status = STATUS_FINISHED; - n_received++; - - /* Flush out finished nodes */ - while (head && head->status == STATUS_FINISHED) { - - if (log_mutex) fprintf(stderr, " Flushing finished node %d\n", head->id); - - fputs(head->s, stdout); - fflush(stdout); - if (log_mutex) fprintf(stderr, " Flushed node %d\n", head->id); - free(head->s); - - next = head->next; - free(head); - - head = next; - - n_flushed++; - - if (head == NULL) { /* empty queue */ - if (ptail == NULL) { /* This can only happen if set in queue_get as signal that there is no more input. */ - fprintf(stderr, "All sentences finished. Exiting.\n"); - done(0); - } else /* ptail pointed at something which was just popped off the stack -- reset to head*/ - ptail = &head; - } - } - - if (log_mutex) fprintf(stderr, " Flushing output %d\n", head->id); - fflush(stdout); - fprintf(stderr, "%d sentences sent, %d sentences finished, %d sentences flushed\n", n_sent, n_received, n_flushed); - - if (log_mutex) fprintf(stderr, "Unlocking queue mutex (%d)\n", fid); - pthread_mutex_unlock(&queue_mutex); - -} - -char * read_line(int fd, int multiline) { - int size = 80; - char errorbuf[100]; - char *s = malloc(size+2); - int result, errors=0; - int i = 0; - - result = read(fd, s+i, 1); - - while (1) { - if (result < 0) { - perror("read()"); - sprintf(errorbuf, "Error code: %d\n", errno); - fprintf(stderr, errorbuf); - errors++; - if (errors > 5) { - free(s); - return NULL; - } else { - sleep(1); /* retry after delay */ - } - } else if (result == 0) { - break; - } else if (multiline==0 && s[i] == '\n') { - break; - } else { - if (s[i] == '\n'){ - /* if we've reached this point, - then multiline must be 1, and we're - going to poll the fd for an additional - line of data. The basic design is to - run a select on the filedescriptor fd. - Select will return under two conditions: - if there is data on the fd, or if a - timeout is reached. We'll select on this - fd. If select returns because there's data - ready, keep going; else assume there's no - more and return the data we already have. 
- */ - - fd_set set; - FD_ZERO(&set); - FD_SET(fd, &set); - - struct timeval timeout; - timeout.tv_sec = 3; // number of seconds for timeout - timeout.tv_usec = 0; - - int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); - if (ready<1){ - break; // no more data, stop looping - } - } - i++; - - if (i == size) { - size = size*2; - s = realloc(s, size+2); - } - } - - result = read(fd, s+i, 1); - } - - if (result == 0 && i == 0) { /* end of file */ - free(s); - return NULL; - } - - s[i] = '\n'; - s[i+1] = '\0'; - - return s; -} - -void * new_client(void *arg) { - struct clientinfo *client = (struct clientinfo *)arg; - struct line *cur; - int result; - char *s; - char errorbuf[100]; - - pthread_mutex_lock(&clients_mutex); - n_clients++; - pthread_mutex_unlock(&clients_mutex); - - fprintf(stderr, "Client connected (%d connected)\n", n_clients); - - for (;;) { - - cur = queue_get(client->s); - - if (cur) { - /* fprintf(stderr, "Sending to client: %s", cur->s); */ - fprintf(stderr, "Sending data %d to client (fid %d)\n", cur->id, client->s); - result = write(client->s, cur->s, strlen(cur->s)); - if (result < strlen(cur->s)){ - perror("write()"); - sprintf(errorbuf, "Error code: %d\n", errno); - fprintf(stderr, errorbuf); - - pthread_mutex_lock(&clients_mutex); - n_clients--; - pthread_mutex_unlock(&clients_mutex); - - fprintf(stderr, "Client died (%d connected)\n", n_clients); - queue_abort(cur, client->s); - - close(client->s); - free(client); - - pthread_exit(NULL); - } - } else { - close(client->s); - pthread_mutex_lock(&clients_mutex); - n_clients--; - pthread_mutex_unlock(&clients_mutex); - fprintf(stderr, "Client dismissed (%d connected)\n", n_clients); - pthread_exit(NULL); - } - - s = read_line(client->s,expect_multiline_output); - if (s) { - /* fprintf(stderr, "Client (fid %d) returned: %s", client->s, s); */ - fprintf(stderr, "Client (fid %d) returned data %d\n", client->s, cur->id); -// queue_print(); - queue_finish(cur, s, client->s); - } else { - pthread_mutex_lock(&clients_mutex); - n_clients--; - pthread_mutex_unlock(&clients_mutex); - - fprintf(stderr, "Client died (%d connected)\n", n_clients); - queue_abort(cur, client->s); - - close(client->s); - free(client); - - pthread_exit(NULL); - } - - } - return 0; -} - -void done (int code) { - close(s); - exit(code); -} - - - -int main (int argc, char *argv[]) { - struct sockaddr_in sin, from; - int g; - socklen_t len; - struct clientinfo *client; - int port; - int opt; - int errors = 0; - int argi; - char *key = NULL, *client_key; - int use_key = 0; - /* the key stuff here doesn't provide any - real measure of security, it's mainly to keep - jobs from bumping into each other. 
*/ - - pthread_t tid; - port = DEFAULT_PORT; - - for (argi=1; argi < argc; argi++){ - if (strcmp(argv[argi], "-m")==0){ - expect_multiline_output = 1; - } else if (strcmp(argv[argi], "-k")==0){ - argi++; - if (argi == argc){ - fprintf(stderr, "Key must be specified after -k\n"); - exit(1); - } - key = argv[argi]; - use_key = 1; - } else if (strcmp(argv[argi], "--stay-alive")==0){ - stay_alive = 1; /* dont panic and die with zero clients */ - } else { - port = atoi(argv[argi]); - } - } - - /* Initialize data structures */ - head = NULL; - ptail = &head; - - /* Set up listener */ - s = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); - opt = 1; - setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)); - - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = htons(port); - while (bind(s, (struct sockaddr *) &sin, sizeof(sin)) < 0) { - perror("bind()"); - sleep(1); - errors++; - if (errors > 100) - exit(1); - } - - len = sizeof(sin); - getsockname(s, (struct sockaddr *) &sin, &len); - - fprintf(stderr, "Listening on port %hu\n", ntohs(sin.sin_port)); - - while (listen(s, MAX_CLIENTS) < 0) { - perror("listen()"); - sleep(1); - errors++; - if (errors > 100) - exit(1); - } - - for (;;) { - len = sizeof(from); - g = accept(s, (struct sockaddr *)&from, &len); - if (g < 0) { - perror("accept()"); - sleep(1); - continue; - } - client = malloc(sizeof(struct clientinfo)); - client->s = g; - bcopy(&from, &client->sin, len); - - if (use_key){ - fd_set set; - FD_ZERO(&set); - FD_SET(client->s, &set); - - struct timeval timeout; - timeout.tv_sec = 3; // number of seconds for timeout - timeout.tv_usec = 0; - - int ready = select(FD_SETSIZE, &set, NULL, NULL, &timeout); - if (ready<1){ - fprintf(stderr, "Prospective client failed to respond with correct key.\n"); - close(client->s); - free(client); - } else { - client_key = read_line(client->s,0); - client_key[strlen(client_key)-1]='\0'; /* chop trailing newline */ - if (strcmp(key, client_key)==0){ - pthread_create(&tid, NULL, new_client, client); - } else { - fprintf(stderr, "Prospective client failed to respond with correct key.\n"); - close(client->s); - free(client); - } - free(client_key); - } - } else { - pthread_create(&tid, NULL, new_client, client); - } - } - -} - - - diff --git a/vest/sentserver.h b/vest/sentserver.h deleted file mode 100644 index cd17a546..00000000 --- a/vest/sentserver.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef SENTSERVER_H -#define SENTSERVER_H - -#define DEFAULT_PORT 50000 - -#endif diff --git a/vest/tac.pl b/vest/tac.pl deleted file mode 100755 index 9fb525c1..00000000 --- a/vest/tac.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { - chomp; - $|=1; - print (scalar reverse($_)); - print "\n"; -} diff --git a/vest/test_aer/README b/vest/test_aer/README deleted file mode 100644 index 819b2e32..00000000 --- a/vest/test_aer/README +++ /dev/null @@ -1,8 +0,0 @@ -To run the test: - -../dist-vest.pl --local --metric aer cdec.ini --source-file corpus.src --ref-files=ref.0 --weights weights - -This will optimize the parameters of the tiny lexical translation model -so as to minimize the AER of the Viterbi alignment on the development -set in corpus.src according to the reference alignments in ref.0. 
- diff --git a/vest/test_aer/cdec.ini b/vest/test_aer/cdec.ini deleted file mode 100644 index 08187848..00000000 --- a/vest/test_aer/cdec.ini +++ /dev/null @@ -1,3 +0,0 @@ -formalism=lextrans -grammar=grammar -aligner=true diff --git a/vest/test_aer/corpus.src b/vest/test_aer/corpus.src deleted file mode 100644 index 31b23971..00000000 --- a/vest/test_aer/corpus.src +++ /dev/null @@ -1,3 +0,0 @@ -el gato negro ||| the black cat -el gato ||| the cat -el libro ||| the book diff --git a/vest/test_aer/grammar b/vest/test_aer/grammar deleted file mode 100644 index 9d857824..00000000 --- a/vest/test_aer/grammar +++ /dev/null @@ -1,12 +0,0 @@ -el ||| cat ||| F1=1 -el ||| the ||| F2=1 -el ||| black ||| F3=1 -el ||| book ||| F11=1 -gato ||| cat ||| F4=1 NN=1 -gato ||| black ||| F5=1 -gato ||| the ||| F6=1 -negro ||| the ||| F7=1 -negro ||| cat ||| F8=1 -negro ||| black ||| F9=1 -libro ||| the ||| F10=1 -libro ||| book ||| F12=1 NN=1 diff --git a/vest/test_aer/ref.0 b/vest/test_aer/ref.0 deleted file mode 100644 index 734a9c5b..00000000 --- a/vest/test_aer/ref.0 +++ /dev/null @@ -1,3 +0,0 @@ -0-0 1-2 2-1 -0-0 1-1 -0-0 1-1 diff --git a/vest/test_aer/weights b/vest/test_aer/weights deleted file mode 100644 index afc9282e..00000000 --- a/vest/test_aer/weights +++ /dev/null @@ -1,13 +0,0 @@ -F1 0.1 -F2 -.5980815 -F3 0.24235 -F4 0.625 -F5 0.4514 -F6 0.112316 -F7 -0.123415 -F8 -0.25390285 -F9 -0.23852 -F10 0.646 -F11 0.413141 -F12 0.343216 -NN -0.1215 diff --git a/vest/test_data/0.json.gz b/vest/test_data/0.json.gz deleted file mode 100644 index 30f8dd77..00000000 Binary files a/vest/test_data/0.json.gz and /dev/null differ diff --git a/vest/test_data/1.json.gz b/vest/test_data/1.json.gz deleted file mode 100644 index c82cc179..00000000 Binary files a/vest/test_data/1.json.gz and /dev/null differ diff --git a/vest/test_data/c2e.txt.0 b/vest/test_data/c2e.txt.0 deleted file mode 100644 index 12c4abe9..00000000 --- a/vest/test_data/c2e.txt.0 +++ /dev/null @@ -1,2 +0,0 @@ -australia reopens embassy in manila -( afp , manila , january 2 ) australia reopened its embassy in the philippines today , which was shut down about seven weeks ago due to what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.1 b/vest/test_data/c2e.txt.1 deleted file mode 100644 index 4ac12df1..00000000 --- a/vest/test_data/c2e.txt.1 +++ /dev/null @@ -1,2 +0,0 @@ -australia reopened manila embassy -( agence france-presse , manila , 2nd ) - australia reopened its embassy in the philippines today . the embassy was closed seven weeks ago after what was described as a specific threat of a terrorist attack . diff --git a/vest/test_data/c2e.txt.2 b/vest/test_data/c2e.txt.2 deleted file mode 100644 index 2f67b72f..00000000 --- a/vest/test_data/c2e.txt.2 +++ /dev/null @@ -1,2 +0,0 @@ -australia to reopen embassy in manila -( afp report from manila , january 2 ) australia reopened its embassy in the philippines today . seven weeks ago , the embassy was shut down due to so-called confirmed terrorist attack threats . diff --git a/vest/test_data/c2e.txt.3 b/vest/test_data/c2e.txt.3 deleted file mode 100644 index 5483cef6..00000000 --- a/vest/test_data/c2e.txt.3 +++ /dev/null @@ -1,2 +0,0 @@ -australia to re - open its embassy to manila -( afp , manila , thursday ) australia reopens its embassy to manila , which was closed for the so-called " clear " threat of terrorist attack 7 weeks ago . 
diff --git a/vest/test_data/re.txt.0 b/vest/test_data/re.txt.0 deleted file mode 100644 index 86eff087..00000000 --- a/vest/test_data/re.txt.0 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan states turkey to reject any pressures to urge it to recognize cyprus -ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara will reject any pressure by the european union to urge it to recognize cyprus . this comes two weeks before the summit of european union state and government heads who will decide whether or nor membership negotiations with ankara should be opened . -erdogan told " ntv " television station that " the european union cannot address us by imposing new conditions on us with regard to cyprus . -we will discuss this dossier in the course of membership negotiations . " -he added " let me be clear , i cannot sidestep turkey , this is something we cannot accept . " diff --git a/vest/test_data/re.txt.1 b/vest/test_data/re.txt.1 deleted file mode 100644 index 2140f198..00000000 --- a/vest/test_data/re.txt.1 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan confirms turkey will resist any pressure to recognize cyprus -ankara 12 - 1 ( afp ) - the turkish head of government , recep tayyip erdogan , announced today ( wednesday ) that ankara would resist any pressure the european union might exercise in order to force it into recognizing cyprus . this comes two weeks before a summit of european union heads of state and government , who will decide whether or not to open membership negotiations with ankara . -erdogan said to the ntv television channel : " the european union cannot engage with us through imposing new conditions on us with regard to cyprus . -we shall discuss this issue in the course of the membership negotiations . " -he added : " let me be clear - i cannot confine turkey . this is something we do not accept . " diff --git a/vest/test_data/re.txt.2 b/vest/test_data/re.txt.2 deleted file mode 100644 index 94e46286..00000000 --- a/vest/test_data/re.txt.2 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan confirms that turkey will reject any pressures to encourage it to recognize cyprus -ankara , 12 / 1 ( afp ) - the turkish prime minister recep tayyip erdogan declared today , wednesday , that ankara will reject any pressures that the european union may apply on it to encourage to recognize cyprus . this comes two weeks before a summit of the heads of countries and governments of the european union , who will decide on whether or not to start negotiations on joining with ankara . -erdogan told the ntv television station that " it is not possible for the european union to talk to us by imposing new conditions on us regarding cyprus . -we shall discuss this dossier during the negotiations on joining . " -and he added , " let me be clear . turkey's arm should not be twisted ; this is something we cannot accept . " diff --git a/vest/test_data/re.txt.3 b/vest/test_data/re.txt.3 deleted file mode 100644 index f87c3308..00000000 --- a/vest/test_data/re.txt.3 +++ /dev/null @@ -1,5 +0,0 @@ -erdogan stresses that turkey will reject all pressures to force it to recognize cyprus -ankara 12 - 1 ( afp ) - turkish prime minister recep tayyip erdogan announced today , wednesday , that ankara would refuse all pressures applied on it by the european union to force it to recognize cyprus . that came two weeks before the summit of the presidents and prime ministers of the european union , who would decide on whether to open negotiations on joining with ankara or not . 
-erdogan said to " ntv " tv station that the " european union can not communicate with us by imposing on us new conditions related to cyprus . -we will discuss this file during the negotiations on joining . " -he added , " let me be clear . turkey's arm should not be twisted . this is unacceptable to us . " diff --git a/vest/viterbi_envelope.cc b/vest/viterbi_envelope.cc deleted file mode 100644 index 9fcf75a0..00000000 --- a/vest/viterbi_envelope.cc +++ /dev/null @@ -1,177 +0,0 @@ -#include "viterbi_envelope.h" - -#include -#include - -using namespace std; -using boost::shared_ptr; - -ostream& operator<<(ostream& os, const ViterbiEnvelope& env) { - os << '<'; - const vector >& segs = env.GetSortedSegs(); - for (int i = 0; i < segs.size(); ++i) - os << (i==0 ? "" : "|") << "x=" << segs[i]->x << ",b=" << segs[i]->b << ",m=" << segs[i]->m << ",p1=" << segs[i]->p1 << ",p2=" << segs[i]->p2; - return os << '>'; -} - -ViterbiEnvelope::ViterbiEnvelope(int i) { - if (i == 0) { - // do nothing - <> - } else if (i == 1) { - segs.push_back(shared_ptr(new Segment(0, 0, 0, shared_ptr(), shared_ptr()))); - assert(this->IsMultiplicativeIdentity()); - } else { - cerr << "Only can create ViterbiEnvelope semiring 0 and 1 with this constructor!\n"; - abort(); - } -} - -struct SlopeCompare { - bool operator() (const shared_ptr& a, const shared_ptr& b) const { - return a->m < b->m; - } -}; - -const ViterbiEnvelope& ViterbiEnvelope::operator+=(const ViterbiEnvelope& other) { - if (!other.is_sorted) other.Sort(); - if (segs.empty()) { - segs = other.segs; - return *this; - } - is_sorted = false; - int j = segs.size(); - segs.resize(segs.size() + other.segs.size()); - for (int i = 0; i < other.segs.size(); ++i) - segs[j++] = other.segs[i]; - assert(j == segs.size()); - return *this; -} - -void ViterbiEnvelope::Sort() const { - sort(segs.begin(), segs.end(), SlopeCompare()); - const int k = segs.size(); - int j = 0; - for (int i = 0; i < k; ++i) { - Segment l = *segs[i]; - l.x = kMinusInfinity; - // cerr << "m=" << l.m << endl; - if (0 < j) { - if (segs[j-1]->m == l.m) { // lines are parallel - if (l.b <= segs[j-1]->b) continue; - --j; - } - while(0 < j) { - l.x = (l.b - segs[j-1]->b) / (segs[j-1]->m - l.m); - if (segs[j-1]->x < l.x) break; - --j; - } - if (0 == j) l.x = kMinusInfinity; - } - *segs[j++] = l; - } - segs.resize(j); - is_sorted = true; -} - -const ViterbiEnvelope& ViterbiEnvelope::operator*=(const ViterbiEnvelope& other) { - if (other.IsMultiplicativeIdentity()) { return *this; } - if (this->IsMultiplicativeIdentity()) { (*this) = other; return *this; } - - if (!is_sorted) Sort(); - if (!other.is_sorted) other.Sort(); - - if (this->IsEdgeEnvelope()) { -// if (other.size() > 1) -// cerr << *this << " (TIMES) " << other << endl; - shared_ptr edge_parent = segs[0]; - const double& edge_b = edge_parent->b; - const double& edge_m = edge_parent->m; - segs.clear(); - for (int i = 0; i < other.segs.size(); ++i) { - const Segment& seg = *other.segs[i]; - const double m = seg.m + edge_m; - const double b = seg.b + edge_b; - const double& x = seg.x; // x's don't change with * - segs.push_back(shared_ptr(new Segment(x, m, b, edge_parent, other.segs[i]))); - assert(segs.back()->p1->edge); - } -// if (other.size() > 1) -// cerr << " = " << *this << endl; - } else { - vector > new_segs; - int this_i = 0; - int other_i = 0; - const int this_size = segs.size(); - const int other_size = other.segs.size(); - double cur_x = kMinusInfinity; // moves from left to right across the - // real numbers, stopping for all inter- 
- // sections - double this_next_val = (1 < this_size ? segs[1]->x : kPlusInfinity); - double other_next_val = (1 < other_size ? other.segs[1]->x : kPlusInfinity); - while (this_i < this_size && other_i < other_size) { - const Segment& this_seg = *segs[this_i]; - const Segment& other_seg= *other.segs[other_i]; - const double m = this_seg.m + other_seg.m; - const double b = this_seg.b + other_seg.b; - - new_segs.push_back(shared_ptr(new Segment(cur_x, m, b, segs[this_i], other.segs[other_i]))); - int comp = 0; - if (this_next_val < other_next_val) comp = -1; else - if (this_next_val > other_next_val) comp = 1; - if (0 == comp) { // the next values are equal, advance both indices - ++this_i; - ++other_i; - cur_x = this_next_val; // could be other_next_val (they're equal!) - this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); - other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); - } else { // advance the i with the lower x, update cur_x - if (-1 == comp) { - ++this_i; - cur_x = this_next_val; - this_next_val = (this_i+1 < this_size ? segs[this_i+1]->x : kPlusInfinity); - } else { - ++other_i; - cur_x = other_next_val; - other_next_val = (other_i+1 < other_size ? other.segs[other_i+1]->x : kPlusInfinity); - } - } - } - segs.swap(new_segs); - } - //cerr << "Multiply: result=" << (*this) << endl; - return *this; -} - -// recursively construct translation -void Segment::ConstructTranslation(vector* trans) const { - const Segment* cur = this; - vector > ant_trans; - while(!cur->edge) { - ant_trans.resize(ant_trans.size() + 1); - cur->p2->ConstructTranslation(&ant_trans.back()); - cur = cur->p1.get(); - } - size_t ant_size = ant_trans.size(); - vector*> pants(ant_size); - assert(ant_size == cur->edge->tail_nodes_.size()); - --ant_size; - for (int i = 0; i < pants.size(); ++i) pants[ant_size - i] = &ant_trans[i]; - cur->edge->rule_->ESubstitute(pants, trans); -} - -void Segment::CollectEdgesUsed(std::vector* edges_used) const { - if (edge) { - assert(edge->id_ < edges_used->size()); - (*edges_used)[edge->id_] = true; - } - if (p1) p1->CollectEdgesUsed(edges_used); - if (p2) p2->CollectEdgesUsed(edges_used); -} - -ViterbiEnvelope ViterbiEnvelopeWeightFunction::operator()(const Hypergraph::Edge& e) const { - const double m = direction.dot(e.feature_values_); - const double b = origin.dot(e.feature_values_); - Segment* seg = new Segment(m, b, e); - return ViterbiEnvelope(1, seg); -} - diff --git a/vest/viterbi_envelope.h b/vest/viterbi_envelope.h deleted file mode 100644 index 60ad82d8..00000000 --- a/vest/viterbi_envelope.h +++ /dev/null @@ -1,81 +0,0 @@ -#ifndef _VITERBI_ENVELOPE_H_ -#define _VITERBI_ENVELOPE_H_ - -#include -#include -#include - -#include "hg.h" -#include "sparse_vector.h" - -static const double kMinusInfinity = -std::numeric_limits::infinity(); -static const double kPlusInfinity = std::numeric_limits::infinity(); - -struct Segment { - Segment() : x(), m(), b(), edge() {} - Segment(double _m, double _b) : - x(kMinusInfinity), m(_m), b(_b), edge() {} - Segment(double _x, double _m, double _b, const boost::shared_ptr& p1_, const boost::shared_ptr& p2_) : - x(_x), m(_m), b(_b), p1(p1_), p2(p2_), edge() {} - Segment(double _m, double _b, const Hypergraph::Edge& edge) : - x(kMinusInfinity), m(_m), b(_b), edge(&edge) {} - - double x; // x intersection with previous segment in env, or -inf if none - double m; // this line's slope - double b; // intercept with y-axis - - // we keep a pointer to the "parents" of this segment so we can 
reconstruct - // the Viterbi translation corresponding to this segment - boost::shared_ptr p1; - boost::shared_ptr p2; - - // only Segments created from an edge using the ViterbiEnvelopeWeightFunction - // have rules - // TRulePtr rule; - const Hypergraph::Edge* edge; - - // recursively recover the Viterbi translation that will result from setting - // the weights to origin + axis * x, where x is any value from this->x up - // until the next largest x in the containing ViterbiEnvelope - void ConstructTranslation(std::vector* trans) const; - void CollectEdgesUsed(std::vector* edges_used) const; -}; - -// this is the semiring value type, -// it defines constructors for 0, 1, and the operations + and * -struct ViterbiEnvelope { - // create semiring zero - ViterbiEnvelope() : is_sorted(true) {} // zero - // for debugging: - ViterbiEnvelope(const std::vector >& s) : segs(s) { Sort(); } - // create semiring 1 or 0 - explicit ViterbiEnvelope(int i); - ViterbiEnvelope(int n, Segment* seg) : is_sorted(true), segs(n, boost::shared_ptr(seg)) {} - const ViterbiEnvelope& operator+=(const ViterbiEnvelope& other); - const ViterbiEnvelope& operator*=(const ViterbiEnvelope& other); - bool IsMultiplicativeIdentity() const { - return size() == 1 && (segs[0]->b == 0.0 && segs[0]->m == 0.0) && (!segs[0]->edge) && (!segs[0]->p1) && (!segs[0]->p2); } - const std::vector >& GetSortedSegs() const { - if (!is_sorted) Sort(); - return segs; - } - size_t size() const { return segs.size(); } - - private: - bool IsEdgeEnvelope() const { - return segs.size() == 1 && segs[0]->edge; } - void Sort() const; - mutable bool is_sorted; - mutable std::vector > segs; -}; -std::ostream& operator<<(std::ostream& os, const ViterbiEnvelope& env); - -struct ViterbiEnvelopeWeightFunction { - ViterbiEnvelopeWeightFunction(const SparseVector& ori, - const SparseVector& dir) : origin(ori), direction(dir) {} - ViterbiEnvelope operator()(const Hypergraph::Edge& e) const; - const SparseVector origin; - const SparseVector direction; -}; - -#endif -- cgit v1.2.3 From 8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 06:29:50 +0000 Subject: lopez suffix array extractor with copyrighted david chiang code excised --- sa-extract/README | 50 + sa-extract/calignment.pxd | 10 + sa-extract/calignment.pyx | 128 ++ sa-extract/cdat.pxd | 12 + sa-extract/cdat.pyx | 178 +++ sa-extract/cfloatlist.pxd | 10 + sa-extract/cfloatlist.pyx | 93 ++ sa-extract/cintlist.pxd | 15 + sa-extract/cintlist.pyx | 196 +++ sa-extract/clex.pyx | 460 +++++++ sa-extract/cmath.pxd | 2 + sa-extract/cn.py | 164 +++ sa-extract/compile_bin.py | 148 +++ sa-extract/context_model.py | 234 ++++ sa-extract/cstrmap.pxd | 12 + sa-extract/cstrmap.pyx | 14 + sa-extract/csuf.pxd | 11 + sa-extract/csuf.pyx | 321 +++++ sa-extract/cveb.pxd | 15 + sa-extract/cveb.pyx | 390 ++++++ sa-extract/example/README | 8 + sa-extract/example/corpus.align.gz | Bin 0 -> 829334 bytes sa-extract/example/corpus.de.gz | Bin 0 -> 1724393 bytes sa-extract/example/corpus.en.gz | Bin 0 -> 1457711 bytes sa-extract/example/test.de | 10 + sa-extract/example/test.ref.en | 10 + sa-extract/extract.ini | 116 ++ sa-extract/extractor.py | 60 + sa-extract/lcp.pyx | 113 ++ sa-extract/lcp_ops.py | 52 + sa-extract/log.py | 18 + sa-extract/manager.py | 100 ++ sa-extract/model.py | 12 + sa-extract/monitor.py | 48 + sa-extract/precomputation.pxd | 13 + sa-extract/precomputation.pyx | 478 ++++++++ sa-extract/rule.pxd | 13 + sa-extract/rule.pyx | 286 +++++ 
sa-extract/rulefactory.pyx | 2360 ++++++++++++++++++++++++++++++++++++ sa-extract/sa-compile.pl | 322 +++++ sa-extract/setup.cfg | 2 + sa-extract/setup.py | 45 + sa-extract/sgml.py | 194 +++ sa-extract/strmap.cc | 232 ++++ sa-extract/strmap.h | 22 + sa-extract/strutil.c | 63 + sa-extract/strutil.h | 8 + sa-extract/sym.pxd | 17 + sa-extract/sym.pyx | 155 +++ 49 files changed, 7220 insertions(+) create mode 100644 sa-extract/README create mode 100644 sa-extract/calignment.pxd create mode 100644 sa-extract/calignment.pyx create mode 100644 sa-extract/cdat.pxd create mode 100644 sa-extract/cdat.pyx create mode 100644 sa-extract/cfloatlist.pxd create mode 100644 sa-extract/cfloatlist.pyx create mode 100644 sa-extract/cintlist.pxd create mode 100644 sa-extract/cintlist.pyx create mode 100644 sa-extract/clex.pyx create mode 100644 sa-extract/cmath.pxd create mode 100644 sa-extract/cn.py create mode 100755 sa-extract/compile_bin.py create mode 100644 sa-extract/context_model.py create mode 100644 sa-extract/cstrmap.pxd create mode 100644 sa-extract/cstrmap.pyx create mode 100644 sa-extract/csuf.pxd create mode 100644 sa-extract/csuf.pyx create mode 100644 sa-extract/cveb.pxd create mode 100644 sa-extract/cveb.pyx create mode 100644 sa-extract/example/README create mode 100644 sa-extract/example/corpus.align.gz create mode 100644 sa-extract/example/corpus.de.gz create mode 100644 sa-extract/example/corpus.en.gz create mode 100644 sa-extract/example/test.de create mode 100644 sa-extract/example/test.ref.en create mode 100644 sa-extract/extract.ini create mode 100755 sa-extract/extractor.py create mode 100644 sa-extract/lcp.pyx create mode 100755 sa-extract/lcp_ops.py create mode 100644 sa-extract/log.py create mode 100644 sa-extract/manager.py create mode 100644 sa-extract/model.py create mode 100644 sa-extract/monitor.py create mode 100644 sa-extract/precomputation.pxd create mode 100644 sa-extract/precomputation.pyx create mode 100644 sa-extract/rule.pxd create mode 100644 sa-extract/rule.pyx create mode 100644 sa-extract/rulefactory.pyx create mode 100755 sa-extract/sa-compile.pl create mode 100644 sa-extract/setup.cfg create mode 100644 sa-extract/setup.py create mode 100644 sa-extract/sgml.py create mode 100644 sa-extract/strmap.cc create mode 100644 sa-extract/strmap.h create mode 100644 sa-extract/strutil.c create mode 100644 sa-extract/strutil.h create mode 100644 sa-extract/sym.pxd create mode 100644 sa-extract/sym.pyx diff --git a/sa-extract/README b/sa-extract/README new file mode 100644 index 00000000..f43e58cc --- /dev/null +++ b/sa-extract/README @@ -0,0 +1,50 @@ +SUFFIX-ARRAY-EXTRACT README + Feb 1, 2012 + +Written by Adam Lopez, repackaged by Chris Dyer. + +Originally based on parts of Hiero, by David Chiang, but these dependencies +have been removed or rewritten. 
+ + +BUILD INSTRUCTIONS +============================================================================== + +Requirements: + Python 2.7 or later (http://www.python.org) + Cython 0.14.1 or later (http://cython.org/) + +- Edit Makefile to set the location of Python/Cython, then do: + + make + + +COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT +============================================================================== + +- Run sa-compile.pl to compile the training data and generate an extract.ini + file (which is written to STDOUT): + + sa-compile.pl -b bitext_name=source.fr,target.en \ + -a alignment_name=alignment.txt > extract.ini + + +EXTRACTION OF PER-SENTENCE GRAMMARS +============================================================================== +- Example: + cat test.fr | extractor.py -c extract.ini + + +EXTRACTION OF COMPLETE TEST-SET GRAMMARS +============================================================================== +Edit the generated extract.ini file and change per_sentence_grammar +to False. Then, run extraction as normal. + +Note: extracting a single grammar for an entire test set will consume more +memory during extraction and (probably) during decoding. + + +EXAMPLE +============================================================================== +- See example/ and the README therein. + + diff --git a/sa-extract/calignment.pxd b/sa-extract/calignment.pxd new file mode 100644 index 00000000..a7d3001f --- /dev/null +++ b/sa-extract/calignment.pxd @@ -0,0 +1,10 @@ +cimport cintlist +from libc.stdio cimport FILE + +cdef class Alignment: + + cdef cintlist.CIntList links + cdef cintlist.CIntList sent_index + cdef int link(self, int i, int j) + cdef _unlink(self, int link, int* f, int* e) + cdef int* _get_sent_links(self, int sent_id, int* num_links) diff --git a/sa-extract/calignment.pyx b/sa-extract/calignment.pyx new file mode 100644 index 00000000..976fcd66 --- /dev/null +++ b/sa-extract/calignment.pyx @@ -0,0 +1,128 @@ +import log +import gzip +import cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free + +# Note: Callison-Burch uses short instead of int. +# We have the space for our corpus, so this is not a problem; +# May need to revisit if things get really tight, though.
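A minimal sketch of the link packing used by the Alignment class below: each alignment link (i, j) is stored as a single int, i*65536 + j, so both word positions are assumed to fit in 16 bits. The helper names here are hypothetical and only illustrate the arithmetic.

    def pack_link(i, j):
        # mirrors Alignment.link(): two word positions packed into one int
        assert 0 <= i < 65536 and 0 <= j < 65536
        return i * 65536 + j

    def unpack_link(link):
        # mirrors Alignment.unlink(): recover (i, j) from the packed int
        return link // 65536, link % 65536

    assert pack_link(3, 7) == 196615
    assert unpack_link(196615) == (3, 7)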
+cdef class Alignment: + + + cdef int link(self, int i, int j): + '''Integerizes an alignment link pair''' + return i*65536 + j + + + def unlink(self, link): + '''De-integerizes an alignment link pair''' + return (link/65536, link%65536) + + + cdef _unlink(self, int link, int* f, int* e): + f[0] = link/65536 + e[0] = link%65536 + + + def get_sent_links(self, int sent_id): + cdef cintlist.CIntList sent_links + cdef int* arr + cdef int arr_len + + sent_links = cintlist.CIntList() + arr = self._get_sent_links(sent_id, &arr_len) + sent_links._extend_arr(arr, arr_len*2) + free(arr) + return sent_links + + + cdef int* _get_sent_links(self, int sent_id, int* num_links): + cdef int* sent_links + cdef int i, start, end + + start = self.sent_index.arr[sent_id] + end = self.sent_index.arr[sent_id+1] + num_links[0] = end - start + sent_links = malloc(2*num_links[0]*sizeof(int)) + for i from 0 <= i < num_links[0]: + self._unlink(self.links.arr[start + i], sent_links + (2*i), sent_links + (2*i) + 1) + return sent_links + + + def __cinit__(self, filename, from_binary=False): + self.links = cintlist.CIntList(1000,1000) + self.sent_index = cintlist.CIntList(1000,1000) + log.writeln("Reading alignment from file %s" % filename) + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def read_text(self, filename): + if filename[-2:] == "gz": + f = gzip.GzipFile(filename) + else: + f = open(filename) + for line in f: + self.sent_index.append(len(self.links)) + pairs = line.split() + for pair in pairs: + (i, j) = map(int, pair.split('-')) + self.links.append(self.link(i, j)) + self.sent_index.append(len(self.links)) + + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.links.read_handle(f) + self.sent_index.read_handle(f) + fclose(f) + + + def write_text(self, filename): + f = open(filename, "w") + sent_num = 0 + for i, link in enumerate(self.links): + while i >= self.sent_index[sent_num]: + f.write("\n") + sent_num = sent_num + 1 + f.write("%d-%d " % self.unlink(link)) + f.write("\n") + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.links.write_handle(f) + self.sent_index.write_handle(f) + fclose(f) + + + def write_enhanced(self, filename): + f = open(filename, "w") + sent_num = 1 + for link in self.links: + f.write("%d " % link) + f.write("\n") + for i in self.sent_index: + f.write("%d " % i) + f.write("\n") + + + def alignment(self, i): + '''Return all (e,f) pairs for sentence i''' + cdef int j, start, end + result = [] + start = self.sent_index.arr[i] + end = self.sent_index.arr[i+1] + for j from start <= j < end: + result.append(self.unlink(self.links.arr[j])) + return result diff --git a/sa-extract/cdat.pxd b/sa-extract/cdat.pxd new file mode 100644 index 00000000..b686f611 --- /dev/null +++ b/sa-extract/cdat.pxd @@ -0,0 +1,12 @@ +cimport cintlist +from libc.stdio cimport FILE + +cdef class DataArray: + cdef word2id + cdef id2word + cdef cintlist.CIntList data + cdef cintlist.CIntList sent_id + cdef cintlist.CIntList sent_index + cdef use_sent_id + cdef void write_handle(self, FILE* f) + cdef void read_handle(self, FILE* f) diff --git a/sa-extract/cdat.pyx b/sa-extract/cdat.pyx new file mode 100644 index 00000000..57d3ad63 --- /dev/null +++ b/sa-extract/cdat.pyx @@ -0,0 +1,178 @@ +# cdat.pyx +# Defines "data arrays" that can be directly written to/read from disk 
in binary format +# In particular, the array itself is written/read directly as a glob of binary data +# Adam Lopez + +import sys +import gzip +import log +import cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef class DataArray: + + def __init__(self, filename=None, from_binary=False, use_sent_id=False): + self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1} + self.id2word = ["END_OF_FILE", "END_OF_LINE"] + self.data = cintlist.CIntList(1000,1000) + self.sent_id = cintlist.CIntList(1000,1000) + self.sent_index = cintlist.CIntList(1000,1000) + self.use_sent_id = use_sent_id + + if filename is not None: + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def __len__(self): + return len(self.data) + + + def getSentId(self, i): + return self.sent_id.arr[i] + + + def getSent(self, i): + cdef int j, start, stop + sent = [] + start = self.sent_index.arr[i] + stop = self.sent_index.arr[i+1] + for i from start <= i < stop: + sent.append(self.id2word[self.data.arr[i]]) + return sent + + + def getSentPos(self, loc): + return loc - self.sent_index.arr[self.sent_id.arr[loc]] + + + def get_id(self, word): + if not word in self.word2id: + self.word2id[word] = len(self.id2word) + self.id2word.append(word) + return self.word2id[word] + + + def get_word(self, id): + return self.id2word[id] + + + def write_text(self, filename): + f = open(filename, "w") + for w_id in self.data: + if w_id > 1: + f.write("%s " % self.get_word(w_id)) + if w_id == 1: + f.write("\n") + f.close() + + + def read_text(self, filename): + cdef int word_count + + if filename[-2:] == "gz": + file = gzip.GzipFile(filename) + else: + file = open(filename) + word_count = 0 + for line_num, line in enumerate(file): + self.sent_index.append(word_count) + for word in line.split(): + self.data.append(self.get_id(word)) + if self.use_sent_id: + self.sent_id.append(line_num) + word_count = word_count + 1 + self.data.append(1) + if self.use_sent_id: + self.sent_id.append(line_num) + word_count = word_count + 1 + self.data.append(0) + self.sent_index.append(word_count) + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.read_handle(f) + fclose(f) + + + cdef void read_handle(self, FILE* f): + cdef int num_words + cdef int word_len + cdef char* c_word + cdef bytes py_word + self.data.read_handle(f) + self.sent_index.read_handle(f) + self.sent_id.read_handle(f) + fread(&(num_words), sizeof(int), 1, f) + for i in xrange(num_words): + fread(&(word_len), sizeof(int), 1, f) + c_word = malloc (word_len * sizeof(char)) + fread(c_word, sizeof(char), word_len, f) + py_word = c_word + free(c_word) + self.word2id[py_word] = len(self.id2word) + self.id2word.append(py_word) + if len(self.sent_id) == 0: + self.use_sent_id = False + else: + self.use_sent_id = True + + + cdef void write_handle(self, FILE* f): + cdef int word_len + cdef int num_words + cdef char* c_word + + self.data.write_handle(f) + self.sent_index.write_handle(f) + self.sent_id.write_handle(f) + num_words = len(self.id2word) - 2 + fwrite(&(num_words), sizeof(int), 1, f) + for word in self.id2word[2:]: + c_word = word + word_len = strlen(c_word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(c_word, sizeof(char), word_len, f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* 
cfilename = bfilename + f = fopen(cfilename, "w") + self.write_handle(f) + fclose(f) + + + def write_enhanced_handle(self, f): + for i in self.data: + f.write("%d " %i) + f.write("\n") + for i in self.sent_index: + f.write("%d " %i) + f.write("\n") + for i in self.sent_id: + f.write("%d " %i) + f.write("\n") + for word in self.id2word: + f.write("%s %d " % (word, self.word2id[word])) + f.write("\n") + + + + def write_enhanced(self, filename): + f = open(filename, "w") + self.write_enhanced_handle(self, f) + f.close() + + + + diff --git a/sa-extract/cfloatlist.pxd b/sa-extract/cfloatlist.pxd new file mode 100644 index 00000000..026f2739 --- /dev/null +++ b/sa-extract/cfloatlist.pxd @@ -0,0 +1,10 @@ +from libc.stdio cimport FILE + +cdef class CFloatList: + cdef int size + cdef int increment + cdef int len + cdef float* arr + cdef void write_handle(self, FILE* f) + cdef void read_handle(self, FILE* f) + cdef void set(self, int i, float v) diff --git a/sa-extract/cfloatlist.pyx b/sa-extract/cfloatlist.pyx new file mode 100644 index 00000000..18a0ef2a --- /dev/null +++ b/sa-extract/cfloatlist.pyx @@ -0,0 +1,93 @@ +# conveniencelist.pyx +# defines int arrays in C, with some convenience methods +# for reading arrays directly as globs directly from disk. +# Adam Lopez + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef class CFloatList: + + def __cinit__(self, size=0, increment=1, initial_len=0): + if initial_len > size: + size = initial_len + self.arr = malloc(size*sizeof(float)) + memset(self.arr, 0, initial_len*sizeof(float)) + + + def __init__(self, size=0, increment=1, initial_len=0): + self.size = size + if initial_len > size: + self.size = initial_len + self.increment = increment + self.len = initial_len + + + def __dealloc__(self): + free(self.arr) + + + def __getitem__(self, i): + j = i + if i<0: + j = self.len + i + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len)) + return self.arr[j] + + + cdef void set(self, int i, float v): + j = i + if i<0: + j = self.len + i + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length FloatList" % (i, self.len)) + self.arr[j] = v + + def __setitem__(self, i, val): + self.set(i, val) + + def __len__(self): + return self.len + + + def append(self, float val): + if self.len == self.size: + self.size = self.size + self.increment + self.arr = realloc(self.arr, self.size*sizeof(float)) + self.arr[self.len] = val + self.len = self.len + 1 + + + cdef void write_handle(self, FILE* f): + fwrite(&(self.len), sizeof(float), 1, f) + fwrite(self.arr, sizeof(float), self.len, f) + + + def write(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.write_handle(f) + fclose(f) + + + cdef void read_handle(self, FILE* f): + free(self.arr) + fread(&(self.len), sizeof(float), 1, f) + self.arr = malloc(self.len * sizeof(float)) + self.size = self.len + fread(self.arr, sizeof(float), self.len, f) + + + def read(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.read_handle(f) + fclose(f) + + diff --git a/sa-extract/cintlist.pxd b/sa-extract/cintlist.pxd new file mode 100644 index 00000000..8a3a655c --- /dev/null +++ b/sa-extract/cintlist.pxd @@ -0,0 +1,15 @@ +from libc.stdio cimport FILE + +cdef class CIntList: + cdef int size 
+ cdef int increment + cdef int len + cdef int* arr + cdef void write_handle(self, FILE* f) + cdef void read_handle(self, FILE* f) + cdef void _append(self, int val) + cdef void _extend(self, CIntList other) + cdef void _extend_arr(self, int* other, int other_len) + cdef void _clear(self) + cdef void set(self, int i, int val) + diff --git a/sa-extract/cintlist.pyx b/sa-extract/cintlist.pyx new file mode 100644 index 00000000..9d0a058e --- /dev/null +++ b/sa-extract/cintlist.pyx @@ -0,0 +1,196 @@ +# cintlist.pyx +# defines int arrays in C, with some convenience methods +# for reading arrays as globs directly from disk. +# Adam Lopez + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy + +cdef class CIntList: + + def __cinit__(self, size=0, increment=1, initial_len=0): + if initial_len > size: + size = initial_len + self.arr = malloc(size*sizeof(int)) + memset(self.arr, 0, initial_len*sizeof(int)) + + def __str__(self): + ret = "CIntList[" + for idx in xrange(self.size): + if idx>0: + ret += "," + ret += str(self.arr[idx]) + ret += "]" + ret += "len=" + ret += self.len + return ret + + def index(self, val): + for i in xrange(self.len): + if self.arr[i] == val: + return i + + return IndexError + + def partition(self,start,end): + pivot = self.arr[end] + bottom = start-1 + top = end + done = 0 + while not done: + while not done: + bottom += 1 + if bottom == top: + done = 1 + break + if self.arr[bottom] > pivot: + self.arr[top] = self.arr[bottom] + break + while not done: + top -= 1 + if top == bottom: + done = 1 + break + if self.arr[top] < pivot: + self.arr[bottom] = self.arr[top] + break + self.arr[top] = pivot + return top + + def _doquicksort(self,start,end): + if start < end: + split = self.partition(start,end) + self._doquicksort(start,split-1) + self._doquicksort(split+1,end) + else: + return + + def sort(self): + self._doquicksort(0,self.len-1) + + def reset(self): + self.len = 0 + + def __init__(self, size=0, increment=1, initial_len=0): + self.size = size + if initial_len > size: + self.size = initial_len + self.increment = increment + self.len = initial_len + + + def __dealloc__(self): + free(self.arr) + + + def __getitem__(self, index): + cdef int i, j, k + + if type(index) == int: + j = index + if j < 0: + j = self.len + j + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length CIntList" % (index, self.len)) + return self.arr[j] + elif type(index) == slice: + i = index.start + j = index.stop + if i < 0: + i = self.len + i + if j < 0: + j = self.len + j + if i < 0 or i >= self.len or j < 0 or j > self.len: + raise IndexError("Requested index %d:%d of %d-length CIntList" % (index.start, index.stop, self.len)) + result = () + for k from i <= k < j: + result = result + (self.arr[k],) + return result + else: + raise IndexError("Illegal key type %s for CIntList" % (type(index))) + + cdef void set(self, int i, int val): + j = i + if i<0: + j = self.len + i + if j<0 or j>=self.len: + raise IndexError("Requested index %d of %d-length IntList" % (i, self.len)) + if type(val) != int: + raise TypeError + self.arr[j] = val + + + def __setitem__(self, i, val): + self.set(i, val) + + def __len__(self): + return self.len + + def getSize(self): + return self.size + + def append(self, int val): + self._append(val) + + cdef void _append(self, int val): + if self.len == self.size: + self.size = self.size + self.increment + self.arr = realloc(self.arr, self.size*sizeof(int)) + 
self.arr[self.len] = val + self.len = self.len + 1 + + def extend(self, other): + self._extend(other) + + + cdef void _extend(self, CIntList other): + self._extend_arr(other.arr, other.len) + + + cdef void _extend_arr(self, int* other, int other_len): + if self.size < self.len + other_len: + self.size = self.len + other_len + self.arr = realloc(self.arr, self.size*sizeof(int)) + memcpy(self.arr+self.len, other, other_len*sizeof(int)) + self.len = self.len + other_len + + + cdef void _clear(self): + free(self.arr) + self.len = 0 + self.size = 0 + self.arr = malloc(0) + + + cdef void write_handle(self, FILE* f): + fwrite(&(self.len), sizeof(int), 1, f) + fwrite(self.arr, sizeof(int), self.len, f) + + + def write(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.write_handle(f) + fclose(f) + + + cdef void read_handle(self, FILE* f): + (self.arr) + fread(&(self.len), sizeof(int), 1, f) + self.arr = malloc(self.len * sizeof(int)) + self.size = self.len + fread(self.arr, sizeof(int), self.len, f) + + + def read(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.read_handle(f) + fclose(f) + + diff --git a/sa-extract/clex.pyx b/sa-extract/clex.pyx new file mode 100644 index 00000000..fa30caad --- /dev/null +++ b/sa-extract/clex.pyx @@ -0,0 +1,460 @@ +# clex.pyx +# defines bilexical dictionaries in C, with some convenience methods +# for reading arrays directly as globs directly from disk. +# Adam Lopez + +import gzip +import sys +import context_model + +cimport cintlist +cimport cfloatlist +cimport calignment +cimport csuf +cimport cdat + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, strcpy, strlen + +cdef struct _node: + _node* smaller + _node* bigger + int key + int val + +cdef _node* new_node(int key): + cdef _node* n + n = <_node*> malloc(sizeof(_node)) + n.smaller = NULL + n.bigger = NULL + n.key = key + n.val = 0 + return n + + +cdef del_node(_node* n): + if n.smaller != NULL: + del_node(n.smaller) + if n.bigger != NULL: + del_node(n.bigger) + free(n) + + +cdef int* get_val(_node* n, int key): + if key == n.key: + return &n.val + elif key < n.key: + if n.smaller == NULL: + n.smaller = new_node(key) + return &(n.smaller.val) + return get_val(n.smaller, key) + else: + if n.bigger == NULL: + n.bigger = new_node(key) + return &(n.bigger.val) + return get_val(n.bigger, key) + + +cdef class CLex: + + cdef cfloatlist.CFloatList col1 + cdef cfloatlist.CFloatList col2 + cdef cintlist.CIntList f_index + cdef cintlist.CIntList e_index + cdef id2eword, id2fword, eword2id, fword2id + + def __init__(self, filename, from_binary=False, + from_data=False, earray=None, fsarray=None): + self.id2eword = [] + self.id2fword = [] + self.eword2id = {} + self.fword2id = {} + self.e_index = cintlist.CIntList() + self.f_index = cintlist.CIntList() + self.col1 = cfloatlist.CFloatList() + self.col2 = cfloatlist.CFloatList() + if from_binary: + self.read_binary(filename) + else: + if from_data: + self.compute_from_data(fsarray, earray, filename) + else: + self.read_text(filename) + '''print "self.eword2id" + print "=============" + for x in self.eword2id: + print x + print "self.fword2id" + print "=============" + for x in self.fword2id: + print x + print "-------------"''' + + + cdef compute_from_data(self, csuf.SuffixArray fsa, cdat.DataArray eda, 
calignment.Alignment aa): + cdef int sent_id, num_links, l, i, j, f_i, e_j, I, J, V_E, V_F, num_pairs + cdef int *fsent, *esent, *alignment, *links, *ealigned, *faligned + cdef _node** dict + cdef int *fmargin, *emargin, *count + cdef bytes word + cdef int null_word + + null_word = 0 + for word in fsa.darray.id2word: # I miss list comprehensions + self.id2fword.append(word) + self.id2fword[null_word] = "NULL" + for id, word in enumerate(self.id2fword): + self.fword2id[word] = id + + for word in eda.id2word: + self.id2eword.append(word) + self.id2eword[null_word] = "NULL" + for id, word in enumerate(self.id2eword): + self.eword2id[word] = id + + num_pairs = 0 + + V_E = len(eda.id2word) + V_F = len(fsa.darray.id2word) + fmargin = malloc(V_F*sizeof(int)) + emargin = malloc(V_E*sizeof(int)) + memset(fmargin, 0, V_F*sizeof(int)) + memset(emargin, 0, V_E*sizeof(int)) + + dict = <_node**> malloc(V_F*sizeof(_node*)) + memset(dict, 0, V_F*sizeof(_node*)) + + num_sents = len(fsa.darray.sent_index) + for sent_id from 0 <= sent_id < num_sents-1: + + fsent = fsa.darray.data.arr + fsa.darray.sent_index.arr[sent_id] + I = fsa.darray.sent_index.arr[sent_id+1] - fsa.darray.sent_index.arr[sent_id] - 1 + faligned = malloc(I*sizeof(int)) + memset(faligned, 0, I*sizeof(int)) + + esent = eda.data.arr + eda.sent_index.arr[sent_id] + J = eda.sent_index.arr[sent_id+1] - eda.sent_index.arr[sent_id] - 1 + ealigned = malloc(J*sizeof(int)) + memset(ealigned, 0, J*sizeof(int)) + + links = aa._get_sent_links(sent_id, &num_links) + + for l from 0 <= l < num_links: + i = links[l*2] + j = links[l*2+1] + if i >= I or j >= J: + sys.stderr.write(" %d-%d out of bounds (I=%d,J=%d) in line %d\n" % (i,j,I,J,sent_id+1)) + assert i < I + assert j < J + f_i = fsent[i] + e_j = esent[j] + fmargin[f_i] = fmargin[f_i]+1 + emargin[e_j] = emargin[e_j]+1 + if dict[f_i] == NULL: + dict[f_i] = new_node(e_j) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + # add count + faligned[i] = 1 + ealigned[j] = 1 + for i from 0 <= i < I: + if faligned[i] == 0: + f_i = fsent[i] + fmargin[f_i] = fmargin[f_i] + 1 + emargin[null_word] = emargin[null_word] + 1 + if dict[f_i] == NULL: + dict[f_i] = new_node(null_word) + dict[f_i].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[f_i], null_word) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + for j from 0 <= j < J: + if ealigned[j] == 0: + e_j = esent[j] + fmargin[null_word] = fmargin[null_word] + 1 + emargin[e_j] = emargin[e_j] + 1 + if dict[null_word] == NULL: + dict[null_word] = new_node(e_j) + dict[null_word].val = 1 + num_pairs = num_pairs + 1 + else: + count = get_val(dict[null_word], e_j) + if count[0] == 0: + num_pairs = num_pairs + 1 + count[0] = count[0] + 1 + free(links) + free(faligned) + free(ealigned) + self.f_index = cintlist.CIntList(initial_len=V_F) + self.e_index = cintlist.CIntList(initial_len=num_pairs) + self.col1 = cfloatlist.CFloatList(initial_len=num_pairs) + self.col2 = cfloatlist.CFloatList(initial_len=num_pairs) + + num_pairs = 0 + for i from 0 <= i < V_F: + #self.f_index[i] = num_pairs + self.f_index.set(i, num_pairs) + if dict[i] != NULL: + self._add_node(dict[i], &num_pairs, float(fmargin[i]), emargin) + del_node(dict[i]) + free(fmargin) + free(emargin) + free(dict) + return + + + cdef _add_node(self, _node* n, int* num_pairs, float fmargin, int* emargin): + cdef int loc + if n.smaller != NULL: + 
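+            # In-order walk (smaller subtree, this node, bigger subtree), so
+            # e-word ids land in e_index in increasing order within each
+            # f-word bucket; get_score() depends on that ordering for its
+            # binary search. col1 gets count/fmargin (~p(e|f)), col2 gets
+            # count/emargin (~p(f|e)).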
self._add_node(n.smaller, num_pairs, fmargin, emargin) + loc = num_pairs[0] + self.e_index.set(loc, n.key) + self.col1.set(loc, float(n.val)/fmargin) + self.col2.set(loc, float(n.val)/float(emargin[n.key])) + num_pairs[0] = loc + 1 + if n.bigger != NULL: + self._add_node(n.bigger, num_pairs, fmargin, emargin) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.f_index.write_handle(f) + self.e_index.write_handle(f) + self.col1.write_handle(f) + self.col2.write_handle(f) + self.write_wordlist(self.id2fword, f) + self.write_wordlist(self.id2eword, f) + fclose(f) + + + cdef write_wordlist(self, wordlist, FILE* f): + cdef int word_len + cdef int num_words + cdef char* c_word + + num_words = len(wordlist) + fwrite(&(num_words), sizeof(int), 1, f) + for word in wordlist: + c_word = word + word_len = strlen(c_word) + 1 + fwrite(&(word_len), sizeof(int), 1, f) + fwrite(c_word, sizeof(char), word_len, f) + + + cdef read_wordlist(self, word2id, id2word, FILE* f): + cdef int num_words + cdef int word_len + cdef char* c_word + cdef bytes py_word + + fread(&(num_words), sizeof(int), 1, f) + for i from 0 <= i < num_words: + fread(&(word_len), sizeof(int), 1, f) + c_word = malloc (word_len * sizeof(char)) + fread(c_word, sizeof(char), word_len, f) + py_word = c_word + free(c_word) + word2id[py_word] = len(id2word) + id2word.append(py_word) + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.f_index.read_handle(f) + self.e_index.read_handle(f) + self.col1.read_handle(f) + self.col2.read_handle(f) + self.read_wordlist(self.fword2id, self.id2fword, f) + self.read_wordlist(self.eword2id, self.id2eword, f) + fclose(f) + + + def get_e_id(self, eword): + if eword not in self.eword2id: + e_id = len(self.id2eword) + self.id2eword.append(eword) + self.eword2id[eword] = e_id + return self.eword2id[eword] + + + def get_f_id(self, fword): + if fword not in self.fword2id: + f_id = len(self.id2fword) + self.id2fword.append(fword) + self.fword2id[fword] = f_id + return self.fword2id[fword] + + + def read_text(self, filename): + cdef i, j, w, e_id, f_id, n_f, n_e, N + cdef cintlist.CIntList fcount + + fcount = cintlist.CIntList() + if filename[-2:] == "gz": + f = gzip.GzipFile(filename) + else: + f = open(filename) + + # first loop merely establishes size of array objects + sys.stderr.write("Initial read...\n") + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + while f_id >= len(fcount): + fcount.append(0) + fcount.arr[f_id] = fcount.arr[f_id] + 1 + + # Allocate space for dictionary in arrays + N = 0 + n_f = len(fcount) + self.f_index = cintlist.CIntList(initial_len=n_f+1) + for i from 0 <= i < n_f: + self.f_index.arr[i] = N + N = N + fcount.arr[i] + fcount.arr[i] = 0 + self.f_index.arr[n_f] = N + self.e_index = cintlist.CIntList(initial_len=N) + self.col1 = cfloatlist.CFloatList(initial_len=N) + self.col2 = cfloatlist.CFloatList(initial_len=N) + + # Re-read file, placing words into buckets + sys.stderr.write("Bucket sort...\n") + f.seek(0) + for line in f: + (fword, eword, score1, score2) = line.split() + f_id = self.get_f_id(fword) + e_id = self.get_e_id(eword) + index = self.f_index.arr[f_id] + fcount.arr[f_id] + fcount.arr[f_id] = fcount.arr[f_id] + 1 + self.e_index.arr[index] = int(e_id) + self.col1[index] = float(score1) + 
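+            # Entries for a given f_id occupy the contiguous slice
+            # [f_index[f_id], f_index[f_id+1]) of e_index/col1/col2;
+            # `index` is the next free slot in that bucket.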
self.col2[index] = float(score2) + f.close() + + sys.stderr.write("Final sort...\n") + # Sort buckets by eword + for b from 0 <= b < n_f: + i = self.f_index.arr[b] + j = self.f_index.arr[b+1] + self.qsort(i,j, "") + + + cdef swap(self, int i, int j): + cdef int itmp + cdef float ftmp + + if i == j: + return + + itmp = self.e_index.arr[i] + self.e_index.arr[i] = self.e_index.arr[j] + self.e_index.arr[j] = itmp + + ftmp = self.col1.arr[i] + self.col1.arr[i] = self.col1.arr[j] + self.col1.arr[j] = ftmp + + ftmp = self.col2.arr[i] + self.col2.arr[i] = self.col2.arr[j] + self.col2.arr[j] = ftmp + + + cdef qsort(self, int i, int j, pad): + cdef int pval, p + + if i > j: + raise Exception("Sort error in CLex") + if i == j: #empty interval + return + if i == j-1: # singleton interval + return + + p = (i+j)/2 + pval = self.e_index.arr[p] + self.swap(i, p) + p = i + for k from i+1 <= k < j: + if pval >= self.e_index.arr[k]: + self.swap(p+1, k) + self.swap(p, p+1) + p = p + 1 + self.qsort(i,p, pad+" ") + self.qsort(p+1,j, pad+" ") + + + def write_enhanced(self, filename): + f = open(filename, "w") + for i in self.f_index: + f.write("%d " % i) + f.write("\n") + for i, s1, s2 in zip(self.e_index, self.col1, self.col2): + f.write("%d %f %f " % (i, s1, s2)) + f.write("\n") + for i, w in enumerate(self.id2fword): + f.write("%d %s " % (i, w)) + f.write("\n") + for i, w in enumerate(self.id2eword): + f.write("%d %s " % (i, w)) + f.write("\n") + f.close() + + + def get_score(self, fword, eword, col): + cdef e_id, f_id, low, high, midpoint, val + #print "get_score fword=",fword,", eword=",eword,", col=",col + + if eword not in self.eword2id: + return None + if fword not in self.fword2id: + return None + f_id = self.fword2id[fword] + e_id = self.eword2id[eword] + low = self.f_index.arr[f_id] + high = self.f_index.arr[f_id+1] + while high - low > 0: + midpoint = (low+high)/2 + val = self.e_index.arr[midpoint] + if val == e_id: + if col == 0: + return self.col1.arr[midpoint] + if col == 1: + return self.col2.arr[midpoint] + if val > e_id: + high = midpoint + if val < e_id: + low = midpoint + 1 + return None + + + def write_text(self, filename): + """Note: does not guarantee writing the dictionary in the original order""" + cdef i, N, e_id, f_id + + f = open(filename, "w") + N = len(self.e_index) + f_id = 0 + for i from 0 <= i < N: + while self.f_index.arr[f_id+1] == i: + f_id = f_id + 1 + e_id = self.e_index.arr[i] + score1 = self.col1.arr[i] + score2 = self.col2.arr[i] + f.write("%s %s %.6f %.6f\n" % (self.id2fword[f_id], self.id2eword[e_id], score1, score2)) + f.close() + + diff --git a/sa-extract/cmath.pxd b/sa-extract/cmath.pxd new file mode 100644 index 00000000..3aaaa2a3 --- /dev/null +++ b/sa-extract/cmath.pxd @@ -0,0 +1,2 @@ +cdef extern from "math.h": + double log(double) diff --git a/sa-extract/cn.py b/sa-extract/cn.py new file mode 100644 index 00000000..e534783f --- /dev/null +++ b/sa-extract/cn.py @@ -0,0 +1,164 @@ +# cn.py +# Chris Dyer +# Copyright (c) 2006 University of Maryland. + +# vim:tabstop=4:autoindent:expandtab + +import sys +import math +import sym +import log +import sgml + +epsilon = sym.fromstring('*EPS*'); + +class CNStats(object): + def __init__(self): + self.read = 0 + self.colls = 0 + self.words = 0 + + def collect(self, cn): + self.read += 1 + self.colls += cn.get_length() + for col in cn.columns: + self.words += len(col) + + def __str__(self): + return "confusion net statistics:\n succ. read: %d\n columns: %d\n words: %d\n avg. words/column:\t%f\n avg. 
cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read)) + +class ConfusionNet(object): + def __init__(self, sent): + object.__init__(self) + if (len(sent.words) == 0): + self.columns = () + return # empty line, it happens + line = sent.words[0] + if (line.startswith("(((")): + if (len(sent.words) > 1): + log.write("Bad sentence: %s\n" % (line)) + assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets! + line = "((('',1.0,1),),"+line[1:len(line)-1]+"(('',1.0,1),))" + cols = eval(line) + res = [] + for col in cols: + x = [] + for alt in col: + costs = alt[1] + if (type(costs) != type((1,2))): + costs=(float(costs),) + j=[] + for c in costs: + j.append(float(c)) + cost = tuple(j) + spanlen = 1 + if (len(alt) == 3): + spanlen = alt[2] + x.append((sym.fromstring(alt[0],terminal=True), None, spanlen)) + res.append(tuple(x)) + self.columns = tuple(res) + else: # convert a string of input into a CN + res = []; + res.append(((sym.fromstring('',terminal=True), None, 1), )) + for word in sent.words: + res.append(((sym.fromstring(word,terminal=True), None, 1), )); # (alt=word, cost=0.0) + res.append(((sym.fromstring('',terminal=True), None, 1), )) + self.columns = tuple(res) + + def is_epsilon(self, position): + x = self.columns[position[0]][position[1]][0] + return x == epsilon + + def compute_epsilon_run_length(self, cn_path): + if (len(cn_path) == 0): + return 0 + x = len(cn_path) - 1 + res = 0 + ''' -1 denotes a non-terminal ''' + while (x >= 0 and cn_path[x][0] >= 0 and self.is_epsilon(cn_path[x])): + res += 1 + x -= 1 + return res + + def compute_cn_cost(self, cn_path): + c = None + for (col, row) in cn_path: + if (col >= 0): + if c is None: + c = self.columns[col][row][1].clone() + else: + c += self.columns[col][row][1] + return c + + def get_column(self, col): + return self.columns[col] + + def get_length(self): + return len(self.columns) + + def __str__(self): + r = "conf net: %d\n" % (len(self.columns),) + i = 0 + for col in self.columns: + r += "%d -- " % i + i += 1 + for alternative in col: + r += "(%s, %s, %s) " % (sym.tostring(alternative[0]), alternative[1], alternative[2]) + r += "\n" + return r + + def listdown(_columns, col = 0): + # output all the possible sentences out of the self lattice + # will be used by the "dumb" adaptation of lattice decoding with suffix array + result = [] + for entry in _columns[col]: + if col+entry[2]+1<=len(_columns) : + for suffix in self.listdown(_columns,col+entry[2]): + result.append(entry[0]+' '+suffix) + #result.append(entry[0]+' '+suffix) + else: + result.append(entry[0]) + #result.append(entry[0]) + return result + + def next(self,_columns,curr_idx, min_dist=1): + # can be used only when prev_id is defined + result = [] + #print "curr_idx=%i\n" % curr_idx + if curr_idx+min_dist >= len(_columns): + return result + for alt_idx in xrange(len(_columns[curr_idx])): + alt = _columns[curr_idx][alt_idx] + #print "checking %i alternative : " % alt_idx + #print "%s %f %i\n" % (alt[0],alt[1],alt[2]) + #print alt + if alt[2] "+ + "\n\nNote: -d,-s,-a, and -p are mutually exclusive") + parser.add_option("-d", "--data-array", + action="store_true", default=False, + dest="da", help="Compile file into data array (default)") + parser.add_option("-s", "--suffix-array", + action="store_true", default=False, + dest="sa", help="Compile file into suffix array") + parser.add_option("-a", "--alignment", + action="store_true", default=False, + dest="a", help="Compile 
file into alignment") + parser.add_option("-l", "--lexical", + action="store_true", default=False, + dest="l", help="Compile file into lex file") + parser.add_option("-x", "--compute_lexical", action="store", nargs=2, + dest="lex_args", help="Compute lex file from data", + metavar=" ") + parser.add_option("-p", "--parse", + action="store_true", default=False, + dest="p", help="Compile file into parse") + parser.add_option("-b", "--binary-infile", + action="store_true", default=False, + dest="bin", help="Input file is binary (default: text)") + parser.add_option("-t", "--text-outfile", + action="store_true", default=False, + dest="text", help="Output file is text (default: binary)") + parser.add_option("-e", "--enhanced-outfile", + action="store_true", default=False, + dest="enhanced", help="Output file is enhanced text (default: binary)") + parser.add_option("-r", action="store", nargs=7, + dest="precomp_args", help="Precompute collocations (Hiero only)", + metavar="max-len= max-nt= max-size= min-gap= rank1= rank2= sa=") + (options, args) = parser.parse_args() + + filetype_opts = [options.da, options.sa, options.a, options.p] + + if (len(filter(lambda x: x, filetype_opts))) > 1 or len(args) != 2: + parser.print_help() + sys.exit(1) + + (infilename, outfilename) = args + if options.bin: + bin = " binary" + else: + bin = "" + + start_time = monitor.cpu() + if options.precomp_args: + if options.bin: + obj = precomputation.Precomputation(infilename, from_binary=True) + else: + keys = set(["max-len", "max-nt", "max-size", "min-gap", "rank1", "rank2", "sa"]) + precomp_opts = {} + sys.stderr.write("Precomputing statistics for list %s\n" % infilename) + for pair in options.precomp_args: + (key, val) = pair.split("=") + if key in keys: + keys.remove(key) + if key != "sa": + val = int(val) + precomp_opts[key] = val + else: + sys.stderr.write("Unknown keyword arg %s for -r (must be one of: max-len, max-nt, max-size, min-gap, rank1, rank2)\n" % key) + return 1 + sa = csuf.SuffixArray(precomp_opts["sa"], True) + obj = precomputation.Precomputation(infilename, sa, + precompute_rank=precomp_opts["rank1"], + precompute_secondary_rank=precomp_opts["rank2"], + max_length=precomp_opts["max-len"], + max_nonterminals=precomp_opts["max-nt"], + train_max_initial_size=precomp_opts["max-size"], + train_min_gap_size=precomp_opts["min-gap"]) + elif options.sa: + sys.stderr.write("Reading %s as%s suffix array...\n" % (infilename, bin)) + obj = csuf.SuffixArray(infilename, options.bin) + elif options.a: + sys.stderr.write("Reading %s as%s alignment array...\n" % (infilename, bin)) + obj = calignment.Alignment(infilename, options.bin) + elif options.p: + sys.stderr.write("Reading %s as%s parse array...\n" % (infilename, bin)) + obj = parse.ParseArray(infilename, options.bin) + elif options.l: + sys.stderr.write("Reading %s as%s lex array...\n" % (infilename, bin)) + obj = clex.CLex(infilename, options.bin) + elif options.lex_args: + ffile = options.lex_args[0] + efile = options.lex_args[1] + sys.stderr.write("Computing lex array from:\n A=%s\n F=%s\n E=%s\n" % (infilename, ffile, efile)) + fsarray = csuf.SuffixArray(ffile, True) + earray = cdat.DataArray(efile, True) + aarray = calignment.Alignment(infilename, True) + obj = clex.CLex(aarray, from_data=True, earray=earray, fsarray=fsarray) + else: + sys.stderr.write("Reading %s as%s data array...\n" % (infilename, bin)) + obj = cdat.DataArray(infilename, options.bin) + + sys.stderr.write(" Total time for read: %f\n" % (monitor.cpu() - start_time)) + start_time = 
monitor.cpu() + if options.text: + sys.stderr.write("Writing text file %s...\n" % outfilename) + obj.write_text(outfilename) + elif options.enhanced: + sys.stderr.write("Writing enhanced text file %s...\n" % outfilename) + obj.write_enhanced(outfilename) + else: + sys.stderr.write("Writing binary file %s...\n" % outfilename) + obj.write_binary(outfilename) + sys.stderr.write("Finished.\n") + sys.stderr.write(" Total time for write: %f\n" % (monitor.cpu() - start_time)) + + mem_use = float(monitor.memory()) + metric = "B" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "KB" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "MB" + if mem_use / 1000 > 1: + mem_use /= 1000 + metric = "GB" + sys.stderr.write(" Memory usage: %.1f%s\n" % (mem_use, metric)) + + + +if __name__ == "__main__": + sys.exit(main(sys.argv)) diff --git a/sa-extract/context_model.py b/sa-extract/context_model.py new file mode 100644 index 00000000..8cb6c174 --- /dev/null +++ b/sa-extract/context_model.py @@ -0,0 +1,234 @@ +#!/usr/bin/env python +import sys +import model +import sym +import log +import math + +class ContextModel(model.Model): + '''A ContextModel is one that is computed using information + from the Context object''' + + def __init__(self, context_manager, default=0.0): + model.Model.__init__(self) + self.wordless = 0 + self.initial = None + self.default = default + self.context_manager = context_manager + self.id = self.context_manager.add_model(self) + + '''The next feature is true if the model depends in + some way on the entire input sentence; that is, if + it cannot be scored when created, but must be scored + no earlier than during the input method (note that + this is less strict than stateful)''' + self.contextual = True + ''' It may seem somewhat counterintuitive that a + ContextModel can be non-contextual, but a good + example is the rule probabilites; although these + are computed using the Context object, they don't + really depend in any way on context''' + + + '''inherited from model.Model, called once for each input sentence''' + def input(self, fwords, meta): + # all ContextModels must make this call + self.context_manager.input(self, fwords, meta) + + + '''This function will be called via the input method + only for contextual models''' + def compute_contextual_score(self, r): + return 0.0 + + '''This function is only called on rule creation for + contextless models''' + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return 0.0 + + '''Stateless models should not need to + override this function, unless they define + something for model.TO_GOAL''' + def transition (self, r, antstates, i, j, j1=None): + return (None, 0.0) + + def estimate(self, r): + return r.getscore("context", self.id) + + def transition(self, r, antstates, i, j, j1=None): + return (None, r.getscore("context", self.id)) + + def finaltransition(self, state): + return 0.0 + + def rescore(self, ewords, score): + return score + + + +'''p(e|f)''' +class EgivenF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(paircount)/float(fcount) + return -math.log10(prob) + +class CountEF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, 
paircount, fcount, fsample_count): + return math.log10(1.0 + float(paircount)) + +class SampleCountF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return math.log10(1.0 + float(fsample_count)) + + + +class EgivenFCoherent(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(paircount)/float(fsample_count) + #print "paircount=",paircount," , fsample_count=",fsample_count,", prob=",prob + if (prob == 0.0): return 99.0 + return -math.log10(prob) + + + +class CoherenceProb(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + prob = float(fcount)/float(fsample_count) + return -math.log10(prob) + + + +class MaxLexEgivenF(ContextModel): + + def __init__(self, context_manager, ttable, col=0): + ContextModel.__init__(self, context_manager) + self.ttable = ttable + self.col = col + self.wordless = 0 + self.initial = None + self.contextual = False + + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + totalscore = 1.0 + fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) + fwords.append("NULL") + ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) + for e in ewords: + maxScore = 0.0 + for f in fwords: + score = self.ttable.get_score(f, e, self.col) + #print "score(MaxLexEgivenF) = ",score + if score > maxScore: + maxScore = score + totalscore *= maxScore + if totalscore == 0.0: + return 999 + else: + return -math.log10(totalscore) + + +class MaxLexFgivenE(ContextModel): + + def __init__(self, context_manager, ttable, col=1): + ContextModel.__init__(self, context_manager) + self.ttable = ttable + self.col = col + self.wordless = 0 + self.initial = None + self.contextual = False + + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + totalscore = 1.0 + fwords = map(sym.tostring, filter(lambda x: not sym.isvar(x), fphrase)) + ewords = map(sym.tostring, filter(lambda x: not sym.isvar(x), ephrase)) + ewords.append("NULL") + for f in fwords: + maxScore = 0.0 + for e in ewords: + score = self.ttable.get_score(f, e, self.col) + #print "score(MaxLexFgivenE) = ",score + if score > maxScore: + maxScore = score + totalscore *= maxScore + if totalscore == 0.0: + return 999 + else: + return -math.log10(totalscore) + + +class IsSingletonF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount==1) + + +class IsSingletonFE(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount==1) + +class IsNotSingletonF(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + 
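+    # Like the other IsSingleton*/IsNotSingleton* features this returns a
+    # boolean, presumably used as a 0/1 indicator when the feature weights
+    # from the extract config are applied; the probability-based features
+    # above return -log10 costs and the count features return log10 counts.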
+ def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (fcount>1) + + +class IsNotSingletonFE(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount>1) + + +class IsFEGreaterThanZero(ContextModel): + + def __init__(self, context_manager, default=0.0): + ContextModel.__init__(self, context_manager) + self.contextual = False + + def compute_contextless_score(self, fphrase, ephrase, paircount, fcount, fsample_count): + return (paircount > 0.01) + + diff --git a/sa-extract/cstrmap.pxd b/sa-extract/cstrmap.pxd new file mode 100644 index 00000000..53becbc6 --- /dev/null +++ b/sa-extract/cstrmap.pxd @@ -0,0 +1,12 @@ +cdef extern from "strmap.h": + ctypedef struct StrMap + StrMap* stringmap_new() + void stringmap_delete(StrMap *vocab) + int stringmap_index(StrMap *vocab, char *s) + char* stringmap_word(StrMap *vocab, int i) + +cdef class StringMap: + cdef StrMap *vocab + cdef char *word(self, int i) + cdef int index(self, char *s) + diff --git a/sa-extract/cstrmap.pyx b/sa-extract/cstrmap.pyx new file mode 100644 index 00000000..d3883ea5 --- /dev/null +++ b/sa-extract/cstrmap.pyx @@ -0,0 +1,14 @@ + +cdef class StringMap: + def __cinit__(self): + self.vocab = stringmap_new() + + def __dealloc__(self): + stringmap_delete(self.vocab) + + cdef char *word(self, int i): + return stringmap_word(self.vocab, i) + + cdef int index(self, char *s): + return stringmap_index(self.vocab, s) + diff --git a/sa-extract/csuf.pxd b/sa-extract/csuf.pxd new file mode 100644 index 00000000..f44167dd --- /dev/null +++ b/sa-extract/csuf.pxd @@ -0,0 +1,11 @@ +cimport cdat +cimport cintlist + +cdef class SuffixArray: + cdef cdat.DataArray darray + cdef cintlist.CIntList sa + cdef cintlist.CIntList ha + cdef __lookup_helper(self, int word_id, int offset, int low, int high) + cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint) + cdef __search_low(self, int word_id, int offset, int low, int high) + cdef __search_high(self, word_id, offset, low, high) diff --git a/sa-extract/csuf.pyx b/sa-extract/csuf.pyx new file mode 100644 index 00000000..64c44788 --- /dev/null +++ b/sa-extract/csuf.pyx @@ -0,0 +1,321 @@ +# csuf.pyx +# Defines suffix arrays that can be directly written to/read from disk in binary format +# Adam Lopez + +import sys +import log +import cdat +import cintlist +import monitor + +from libc.stdio cimport FILE, fclose, fopen + +cdef class SuffixArray: + + def __init__(self, filename, from_binary=False): + self.darray = cdat.DataArray() + self.sa = cintlist.CIntList() + self.ha = cintlist.CIntList() + if from_binary: + self.read_binary(filename) + else: + self.read_text(filename) + + + def __getitem__(self, i): + return self.sa.arr[i] + + + def getSentId(self, i): + return self.darray.getSentId(i) + + + def getSent(self, i): + return self.darray.getSent(i) + + + def getSentPos(self, loc): + return self.darray.getSentPos(loc) + + def read_text(self, filename): + '''Constructs suffix array using the algorithm + of Larsson & Sadahkane (1999)''' + cdef int V, N, i, j, h, a_i, n, current_run, skip + cdef cintlist.CIntList isa, word_count + + self.darray = cdat.DataArray(filename, from_binary=False, use_sent_id=True) + N = len(self.darray) + V = len(self.darray.id2word) + + self.sa = cintlist.CIntList(initial_len=N) + self.ha = 
cintlist.CIntList(initial_len=V+1) + + isa = cintlist.CIntList(initial_len=N) + word_count = cintlist.CIntList(initial_len=V+1) + + '''Step 1: bucket sort data''' + sort_start_time = monitor.cpu() + start_time = sort_start_time + for i from 0 <= i < N: + a_i = self.darray.data.arr[i] + word_count.arr[a_i] = word_count.arr[a_i] + 1 + + n = 0 + for i from 0 <= i < V+1: + self.ha.arr[i] = n + n = n + word_count.arr[i] + word_count.arr[i] = 0 + + for i from 0 <= i < N: + a_i = self.darray.data.arr[i] + self.sa.arr[self.ha.arr[a_i] + word_count.arr[a_i]] = i + isa.arr[i] = self.ha.arr[a_i + 1] - 1 # bucket pointer is last index in bucket + word_count.arr[a_i] = word_count.arr[a_i] + 1 + + '''Determine size of initial runs''' + current_run = 0 + for i from 0 <= i < V+1: + if i < V and self.ha.arr[i+1] - self.ha.arr[i] == 1: + current_run = current_run + 1 + else: + if current_run > 0: + self.sa.arr[self.ha.arr[i] - current_run] = -current_run + current_run = 0 + + sys.stderr.write(" Bucket sort took %f seconds\n" % (monitor.cpu() - sort_start_time)) + + '''Step 2: prefix-doubling sort''' + h = 1 + while self.sa.arr[0] != -N: + sort_start_time = monitor.cpu() + sys.stderr.write(" Refining, sort depth = %d\n" % (h,)) + i = 0 + skip = 0 + while i < N: + if self.sa.arr[i] < 0: + #sys.stderr.write("Skip from %d to %d\n" % (i, i-self.sa.arr[i]-1)) + skip = skip + self.sa.arr[i] + i = i - self.sa.arr[i] + else: + if skip < 0: + self.sa.arr[i+skip] = skip + skip = 0 + j = isa.arr[self.sa.arr[i]] + #sys.stderr.write("Process from %d to %d (%d, %d, %d)\n" % (i, j, self.sa.arr[i], self.darray.data.arr[self.sa.arr[i]], isa.arr[self.sa.arr[i]])) + self.q3sort(i, j, h, isa) + i = j+1 + if skip < 0: + self.sa.arr[i+skip] = skip + h = h * 2 + sys.stderr.write(" Refinement took %f seconds\n" % (monitor.cpu() - sort_start_time)) + + '''Step 3: read off suffix array from inverse suffix array''' + sys.stderr.write(" Finalizing sort...\n") + for i from 0 <= i < N: + j = isa.arr[i] + self.sa.arr[j] = i + sys.stderr.write("Suffix array construction took %f seconds\n" % (monitor.cpu() - start_time)) + + def q3sort(self, int i, int j, int h, cintlist.CIntList isa, pad=""): + '''This is a ternary quicksort. It divides the array into + three partitions: items less than the pivot, items equal + to pivot, and items greater than pivot. The first and last + of these partitions are then recursively sorted''' + cdef int k, midpoint, pval, phead, ptail, tmp + + if j-i < -1: + raise Exception("Unexpected condition found in q3sort: sort from %d to %d" % (i,j)) + if j-i == -1: # recursive base case -- empty interval + return + if (j-i == 0): # recursive base case -- singleton interval + isa.arr[self.sa.arr[i]] = i + self.sa.arr[i] = -1 + return + + # NOTE: choosing the first item as a pivot value resulted in + # stack overflow for some very large buckets. I think there + # is a natural bias towards order due the way the word ids are + # assigned; thus this resulted in the range to the left of the + # pivot being nearly empty. Therefore, choose the middle item. + # If the method of assigning word_id's is changed, this method + # may need to be reconsidered as well. + midpoint = (i+j)/2 + pval = isa.arr[self.sa.arr[midpoint] + h] + if i != midpoint: + tmp = self.sa.arr[midpoint] + self.sa.arr[midpoint] = self.sa.arr[i] + self.sa.arr[i] = tmp + phead = i + ptail = i + + # find the three partitions. 
phead marks the first element + # of the middle partition, and ptail marks the last element + for k from i+1 <= k < j+1: + if isa.arr[self.sa.arr[k] + h] < pval: + if k > ptail+1: + tmp = self.sa.arr[phead] + self.sa.arr[phead] = self.sa.arr[k] + self.sa.arr[k] = self.sa.arr[ptail+1] + self.sa.arr[ptail+1] = tmp + else: # k == ptail+1 + tmp = self.sa.arr[phead] + self.sa.arr[phead] = self.sa.arr[k] + self.sa.arr[k] = tmp + phead = phead + 1 + ptail = ptail + 1 + else: + if isa.arr[self.sa.arr[k] + h] == pval: + if k > ptail+1: + tmp = self.sa.arr[ptail+1] + self.sa.arr[ptail+1] = self.sa.arr[k] + self.sa.arr[k] = tmp + ptail = ptail + 1 + + # recursively sort smaller suffixes + self.q3sort(i, phead-1, h, isa, pad+" ") + + # update suffixes with pivot value + # corresponds to update_group function in Larsson & Sadakane + for k from phead <= k < ptail+1: + isa.arr[self.sa.arr[k]] = ptail + if phead == ptail: + self.sa.arr[phead] = -1 + + # recursively sort larger suffixes + self.q3sort(ptail+1, j, h, isa, pad+" ") + + + def write_text(self, filename): + self.darray.write_text(filename) + + + def read_binary(self, filename): + cdef FILE *f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + self.darray.read_handle(f) + self.sa.read_handle(f) + self.ha.read_handle(f) + fclose(f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "w") + self.darray.write_handle(f) + self.sa.write_handle(f) + self.ha.write_handle(f) + fclose(f) + + + def write_enhanced(self, filename): + f = open(filename, "w") + self.darray.write_enhanced_handle(f) + for a_i in self.sa: + f.write("%d " % a_i) + f.write("\n") + for w_i in self.ha: + f.write("%d " % w_i) + f.write("\n") + f.close() + + + cdef __search_high(self, word_id, offset, low, high): + cdef int midpoint + + if low >= high: + return high + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__search_high(word_id, offset, midpoint+1, high) + else: + return self.__search_high(word_id, offset, low, midpoint) + + + cdef __search_low(self, int word_id, int offset, int low, int high): + cdef int midpoint + + if low >= high: + return high + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__search_low(word_id, offset, low, midpoint) + else: + return self.__search_low(word_id, offset, midpoint+1, high) + + + cdef __get_range(self, int word_id, int offset, int low, int high, int midpoint): + return (self.__search_low(word_id, offset, low, midpoint), + self.__search_high(word_id, offset, midpoint, high)) + + + cdef __lookup_helper(self, int word_id, int offset, int low, int high): + cdef int midpoint + + if offset == 0: + return (self.ha.arr[word_id], self.ha.arr[word_id+1]) + if low >= high: + return None + + midpoint = (high + low) / 2 + if self.darray.data.arr[self.sa.arr[midpoint] + offset] == word_id: + return self.__get_range(word_id, offset, low, high, midpoint) + if self.darray.data.arr[self.sa.arr[midpoint] + offset] > word_id: + return self.__lookup_helper(word_id, offset, low, midpoint) + else: + return self.__lookup_helper(word_id, offset, midpoint+1, high) + + + def lookup(self, word, offset, int low, int high): + if low == -1: + low = 0 + if high == -1: + high = len(self.sa) + if word in self.darray.word2id: + word_id = self.darray.word2id[word] + return self.__lookup_helper(word_id, offset, 
low, high) + else: + return None + + + + def print_sa(self, isa): + '''Slow; Use only in case of emergency''' + cdef int i, j, k, N + cdef cintlist.CIntList tmp_sa + + N = len(self.sa) + for i from 0 <= i < N: + sys.stderr.write("%2d " % i) + sys.stderr.write("\n") + for i from 0 <= i < N: + sys.stderr.write("%2d " % self.darray.data.arr[i]) + sys.stderr.write("\n") + for i from 0 <= i < N: + sys.stderr.write("%2d " % isa.arr[i]) + sys.stderr.write("\n\n\n") + + # Recover partially sorted array + tmp_sa = cintlist.CIntList(initial_len=N) + for i from 0 <= i < N: + j = isa.arr[i] + tmp_sa.arr[j] = i + for i from 0 <= i < N: + if self.sa.arr[i] > 0: + tmp_sa.arr[i] = self.sa.arr[i] + + for i from 0 <= i < N: + j = tmp_sa.arr[i] + sys.stderr.write("%2d %2d | " % (i, self.sa.arr[i])) + for k from j <= k < N: + sys.stderr.write("%2d " % self.darray.data.arr[k]) + sys.stderr.write("\n") + sys.stderr.write("\n") + + + + + diff --git a/sa-extract/cveb.pxd b/sa-extract/cveb.pxd new file mode 100644 index 00000000..8967f8e3 --- /dev/null +++ b/sa-extract/cveb.pxd @@ -0,0 +1,15 @@ +cdef struct _VEB: + int top_universe_size + int num_bottom_bits + int max_val + int min_val + int size + void* top + void** bottom + + +cdef class VEB: + cdef _VEB* veb + cdef int _findsucc(self, int i) + cdef int _insert(self, int i) + cdef int _first(self) diff --git a/sa-extract/cveb.pyx b/sa-extract/cveb.pyx new file mode 100644 index 00000000..ca87becc --- /dev/null +++ b/sa-extract/cveb.pyx @@ -0,0 +1,390 @@ +#!/usr/bin/env python2.4 +'''This module implements a partial stratified tree (van Emde Boas, 1977). +Only insert findsucc, __iter__, and __contains__ are implemented. +Delete is currently not supported. +There is very little error-checking in this code -- it is designed +to be used in the limited situation described in Lopez (EMNLP-CoNLL 2007), +which doesn't cover all of the possible ways that you could misuse it +(e.g. trying to insert a key larger than the universe size) +Other notes -- this code is really rather ugly C code masquerading as +Pyrex/Python. 
Virtual function calls are bypassed by hand in several +places for the sake of efficiency, and other Python niceties are +removed for the same reason.''' + +from libc.stdlib cimport malloc, free +from libc.math cimport log, ceil +from libc.string cimport memset + +cdef int MIN_BOTTOM_SIZE +cdef int MIN_BOTTOM_BITS + +MIN_BOTTOM_SIZE = 32 +MIN_BOTTOM_BITS = 5 + +cdef int lower_mask[32] +cdef int i, mask + +for i from 0 <= i < MIN_BOTTOM_SIZE: + mask = (mask << 1) + 1 + lower_mask[i] = mask + + +cdef struct _BitSet: + long bitset + int min_val + int max_val + int size + + +cdef _BitSet* new_BitSet(): + cdef _BitSet* b + + b = <_BitSet*> malloc(sizeof(_BitSet)) + b.bitset = 0 + b.min_val = -1 + b.max_val = -1 + b.size = 0 + return b + + +cdef int bitset_findsucc(_BitSet* b, int i): + cdef int bitset, mask + cdef int low, high, mid + + if b.max_val == -1 or i >= b.max_val: + return -1 + if i < b.min_val: + return b.min_val + + bitset = b.bitset & ~lower_mask[i] + low = i+1 + high = b.max_val+1 + while low < high-1: + mid = (high + low)/2 + mask = ~(lower_mask[high-1] ^ lower_mask[mid-1]) + if bitset & mask == 0: + low = mid + else: + bitset = bitset & mask + high = mid + return low + + +cdef int bitset_insert(_BitSet* b, int i): + cdef int val + + val = 1 << i + if b.bitset & val == 0: + b.bitset = b.bitset | val + if b.size == 0: + b.min_val = i + b.max_val = i + else: + if i < b.min_val: + b.min_val = i + if i > b.max_val: + b.max_val = i + b.size = b.size + 1 + return 1 + return 0 + + +cdef int bitset_contains(_BitSet* b, int i): + cdef int val + + val = 1 << i + if b.bitset & val == 0: + return 0 + else: + return 1 + + +cdef class BitSetIterator: + cdef _BitSet* b + cdef int next_val + + def __next__(self): + cdef int ret_val + + if self.next_val == -1: + raise StopIteration() + ret_val = self.next_val + self.next_val = bitset_findsucc(self.b, ret_val) + return ret_val + + + +# This is a Python wrapper class to give access to the +# (entirely C-implemented) _BitSet struct. 
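+# A _BitSet packs up to MIN_BOTTOM_SIZE (=32) keys into a single long and is
+# what the _VEB tree below uses at its lowest levels; bitset_findsucc does a
+# binary search over the masked word to find the smallest stored key > i.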
+# Very slow; use only for debugging +cdef class BitSet: + + cdef _BitSet* b + + def __cinit__(self): + self.b = new_BitSet() + + def __dealloc__(self): + free(self.b) + + def __iter__(self): + cdef BitSetIterator it + it = BitSetIterator() + it.b = self.b + it.next_val = self.b.min_val + return it + + def insert(self, i): + return bitset_insert(self.b, i) + + def findsucc(self, i): + return bitset_findsucc(self.b, i) + + def __str__(self): + return dec2bin(self.b.bitset)+" ("+str(self.b.size)+","+str(self.b.min_val)+","+str(self.b.max_val)+")" + + def min(self): + return self.b.min_val + + def max(self): + return self.b.max_val + + def __len__(self): + return self.b.size + + def __contains__(self, i): + return bool(bitset_contains(self.b, i)) + + +def dec2bin(i): + cdef d + result = "" + for d from 0 <= d < MIN_BOTTOM_SIZE: + if i & lower_mask[0] == 0: + result = "0"+result + else: + result = "1"+result + i = i >> 1 + return result + + +cdef _VEB* new_VEB(int n): + cdef _VEB* veb + cdef int num_bits, num_top_bits, i + + veb = <_VEB*> malloc(sizeof(_VEB)) + + num_bits = int(ceil(log(n) / log(2))) + veb.num_bottom_bits = num_bits/2 + if veb.num_bottom_bits < MIN_BOTTOM_BITS: + veb.num_bottom_bits = MIN_BOTTOM_BITS + veb.top_universe_size = (n >> veb.num_bottom_bits) + 1 + + veb.bottom = malloc(veb.top_universe_size * sizeof(void*)) + memset(veb.bottom, 0, veb.top_universe_size * sizeof(void*)) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + veb.top = new_VEB(veb.top_universe_size) + else: + veb.top = new_BitSet() + + veb.max_val = -1 + veb.min_val = -1 + veb.size = 0 + return veb + + +cdef int VEB_insert(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b, tmp + + if veb.size == 0: + veb.min_val = i + veb.max_val = i + elif i == veb.min_val or i == veb.max_val: + return 0 + else: + if i < veb.min_val: + tmp = i + i = veb.min_val + veb.min_val = tmp + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + if veb.bottom[a] == NULL: + if veb.top_universe_size > MIN_BOTTOM_SIZE: + subv = <_VEB*> veb.top + VEB_insert(subv, a) + else: + subb = <_BitSet*> veb.top + bitset_insert(subb, a) + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + veb.bottom[a] = new_VEB(1 << veb.num_bottom_bits) + else: + veb.bottom[a] = new_BitSet() + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[a] + if VEB_insert(subv, b) == 0: + return 0 + else: + subb = <_BitSet*> veb.bottom[a] + if bitset_insert(subb, b) == 0: + return 0 + + if i > veb.max_val: + veb.max_val = i + veb.size = veb.size + 1 + return 1 + + +cdef del_VEB(_VEB* veb): + cdef int i + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + i = (<_VEB*> veb.top).min_val + else: + i = (<_BitSet*> veb.top).min_val + + while i != -1: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + del_VEB(<_VEB*> veb.bottom[i]) + else: + free(<_BitSet*> veb.bottom[i]) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + i = VEB_findsucc(<_VEB*> veb.top, i) + else: + i = bitset_findsucc(<_BitSet*> veb.top, i) + + if veb.top_universe_size > MIN_BOTTOM_SIZE: + del_VEB(<_VEB*> veb.top) + else: + free(<_BitSet*> veb.top) + free(veb.bottom) + free(veb) + + +cdef int VEB_findsucc(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b, j, c, found + + if veb.max_val == -1 or i>=veb.max_val: + return -1 + if i < veb.min_val: + return veb.min_val + + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + found = 0 + if veb.bottom[a] != NULL: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> 
veb.bottom[a] + if subv.max_val > b: + j = (a << veb.num_bottom_bits) + VEB_findsucc(subv, b) + found = 1 + else: + subb = <_BitSet*> veb.bottom[a] + if subb.max_val > b: + j = (a << veb.num_bottom_bits) + bitset_findsucc(subb, b) + found = 1 + if found==0: + if veb.top_universe_size > MIN_BOTTOM_SIZE: + subv = <_VEB*> veb.top + c = VEB_findsucc(subv, a) + else: + subb = <_BitSet*> veb.top + c = bitset_findsucc(subb, a) + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[c] + j = (c << veb.num_bottom_bits) + subv.min_val + else: + subb = <_BitSet*> veb.bottom[c] + j = (c << veb.num_bottom_bits) + subb.min_val + return j + + +cdef int VEB_contains(_VEB* veb, int i): + cdef _VEB* subv + cdef _BitSet* subb + cdef int a, b + + if veb.size == 0 or i < veb.min_val or i > veb.max_val: + return 0 + + if veb.min_val == i: + return 1 + else: + if veb.size == 1: + return 0 + + a = i >> veb.num_bottom_bits + b = i & lower_mask[veb.num_bottom_bits-1] + if veb.bottom[a] == NULL: + return 0 + else: + if veb.num_bottom_bits > MIN_BOTTOM_BITS: + subv = <_VEB*> veb.bottom[a] + return VEB_contains(subv, b) + else: + subb = <_BitSet*> veb.bottom[a] + return bitset_contains(subb, b) + + +cdef class VEBIterator: + cdef _VEB* v + cdef int next_val + + def __next__(self): + cdef int ret_val + + if self.next_val == -1: + raise StopIteration() + ret_val = self.next_val + self.next_val = VEB_findsucc(self.v, ret_val) + return ret_val + + +cdef class VEB: + + def __init__(self, size): + pass + + def __cinit__(self, int size): + self.veb = new_VEB(size) + + def __dealloc__(self): + del_VEB(self.veb) + + def __iter__(self): + cdef VEBIterator it + it = VEBIterator() + it.v = self.veb + it.next_val = self.veb.min_val + return it + + def insert(self, i): + return VEB_insert(self.veb, i) + + cdef int _insert(self, int i): + return VEB_insert(self.veb, i) + + def findsucc(self, i): + return VEB_findsucc(self.veb, i) + + cdef int _first(self): + return self.veb.min_val + + cdef int _findsucc(self, int i): + return VEB_findsucc(self.veb, i) + + def __len__(self): + return self.veb.size + + def __contains__(self, i): + return VEB_contains(self.veb, i) + + + + + diff --git a/sa-extract/example/README b/sa-extract/example/README new file mode 100644 index 00000000..9819ba5f --- /dev/null +++ b/sa-extract/example/README @@ -0,0 +1,8 @@ +Commands to compile a corpus and extract some grammars +====================================================== + +# compile +../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini +# extract +cat test.de | ../extractor.py -c extract.ini + diff --git a/sa-extract/example/corpus.align.gz b/sa-extract/example/corpus.align.gz new file mode 100644 index 00000000..741de7e4 Binary files /dev/null and b/sa-extract/example/corpus.align.gz differ diff --git a/sa-extract/example/corpus.de.gz b/sa-extract/example/corpus.de.gz new file mode 100644 index 00000000..0d66470a Binary files /dev/null and b/sa-extract/example/corpus.de.gz differ diff --git a/sa-extract/example/corpus.en.gz b/sa-extract/example/corpus.en.gz new file mode 100644 index 00000000..28cb5c58 Binary files /dev/null and b/sa-extract/example/corpus.en.gz differ diff --git a/sa-extract/example/test.de b/sa-extract/example/test.de new file mode 100644 index 00000000..8923329f --- /dev/null +++ b/sa-extract/example/test.de @@ -0,0 +1,10 @@ +dies ist der richtige ansatz für diejenigen in chinas politischer führung , die aus der who den maximalen nutzen für die unterstützung der inneren reform 
ziehen wollen . +taiwan hat sich auch vorgenommen , konstruktiv zu sein - wenn china mitspielt . +die stadt staaten hongkong und singapur verfolgen eine klarsichtige who - politik und konzentrieren sich auf markt zugänge und starke regeln . +malaysia und thailand sind auch recht aktiv innerhalb der who , mit verschiedenen positionen , die vom frei handel bis zum protektionismus reichen . +indonesien und die philippinen sind schwächer , überwältigt von politischer zusammen hanglosigkeit und ganz in anspruch genommen von den anstrengungen , das schlimmste zu hause zu verhüten , so dass nur geringe kräfte übrig bleiben , mit der stets anschwellenden und immer komplizierteren agenda der who fertig zu werden . +die who steht vor einer wichtigen entscheidung . +sie muss dringend den handel progressiv liberalisieren . +eine starke führung seitens der usa ist erforderlich , damit die who in diese richtung gebracht werden kann und man gleichzeitig vermeidet , die zukunft nach dem muster der eu zu gestalten ( regel wucherung ) oder nach dem muster der uno ( macht lose gespräch runde ) . +dies geschieht sicher besser unter bush , mit einem klaren bekenntnis zum offenen markt und einer aktiveren außen politik , als es unter irgendeiner demokratischen alternative geschehen könnte . +robert zoellick , präsident bushs handel beauftragter , braucht aber verbündete . diff --git a/sa-extract/example/test.ref.en b/sa-extract/example/test.ref.en new file mode 100644 index 00000000..e50edcac --- /dev/null +++ b/sa-extract/example/test.ref.en @@ -0,0 +1,10 @@ +this is the right approach for those in china 's leadership who wish to extract maximum benefits from the wto to bolster domestic reform . +taiwan is also set to play a constructive role -- if mainland china plays along . +the city states , hong kong and singapore , have clear - sighted wto policies , focusing on market access and strong rules . +malaysia and thailand are also fairly active in the wto , with a mix of free - market and protectionist positions . +indonesia and the philippines are weaker , overwhelmed by policy incoherence and fire - fighting at home , and with insufficient capacity to deal with the wto 's burgeoning and increasingly complicated agenda . +the wto is at a crossroads . +it sorely needs to liberalize trade progressively . +strong us leadership is required to push the wto in this direction while avoiding an eu - style future ( regulatory overload ) or a un - style future ( an irrelevant talking shop ) . +this is more likely under a bush administration with better open - market credentials and a more assertive foreign policy than any democratic alternative . +however , robert zoellick , president bush 's trade representative , needs allies . diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini new file mode 100644 index 00000000..56913245 --- /dev/null +++ b/sa-extract/extract.ini @@ -0,0 +1,116 @@ +# This .ini file extracts grammars to a file using +# the pattern matching infrastructure. +# +# Does not do any decoding. +# +# Variables can be set using sa-system.pl +# +# Usage: decoder.py -c [-x ] +# +# If the -x option is used, grammar will be written to the +# specified file, otherwise it is written to $PWD/grammar.out +# +# NOTE: all information about rules is cached, so use generous +# memory limits (rules themselves are not cached.) 
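+#
+# This config is executed (via execfile) by extractor.py, so names used below
+# but not imported here -- add_model, opts, log -- are expected to come from
+# that script's namespace.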
+ +import os +import manager +import clex +import context_model +import rulefactory +import calignment +import sys + +out_grammar_file = "grammar.out" +if opts.extra: + out_grammar_file = opts.extra + +# *** these variables written by sa-system.pl. Do not modify *** +lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz" +f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin" +e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin" +a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin" +lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin" +max_len = 5 +max_nt = 2 +max_size=10 +min_gap=1 +rank1 = 100 +rank2 = 10 +precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin" + +# check for path errors +if not os.path.exists(f_sa_file): + raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file) +if not os.path.exists(e_file): + raise Exception("Cannot find compiled target language array file %s" % e_file) +if not os.path.exists(a_file): + raise Exception("Cannot find compiled alignment file %s" % a_file) +if not os.path.exists(lex_file): + raise Exception("Cannot find compiled lexical weights file %s" % lex_file) +if not os.path.exists(precompute_file): + log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file) + precompute_file = None + +### Output options +mark_phrases = False # show derivation as SGML markup in output +mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!) + +# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose +log.level = 1 +log.file = sys.stderr + +# pattern-matching stuff +class PhonyGrammar: # saves us the cost of keeping the rules around + def add(self, thing): + pass + +local_grammar = PhonyGrammar() +xcat="X" + +cm = manager.ContextManager( + f_sa_file, + e_file, + sampler=rulefactory.Sampler(300), # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!) + rulefactory=rulefactory.HieroCachingRuleFactory( + alignment=calignment.Alignment( # compiled alignment object (REQUIRED) + a_file, + from_binary=True + ), + category="["+xcat+"]", # name of generic nonterminal used by Hiero + grammar=local_grammar, # do not change for extraction + max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 + max_initial_size=15, # maximum span of a grammar rule in TEST DATA + max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule + max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) + max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 + max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size + min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA + precompute_file=precompute_file, # filename of file containing precomputed collocations + precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). 
+ precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) + require_aligned_terminal=True, # require extracted rules to have at least one aligned word + require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word + per_sentence_grammar=True, # generate a complete grammar for each input sentence + rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars) + train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA + train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA + tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower) + ), + from_binary=True + ) + +# lexical weighting tables +tt = clex.CLex(lex_file, from_binary=True) + +# Only include features that depend on rule identity here +add_model(context_model.EgivenFCoherent(cm), 0.125) +add_model(context_model.SampleCountF(cm), 0.125) +add_model(context_model.CountEF(cm), 0.125) +add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) +add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) +add_model(context_model.IsSingletonF(cm), 0.125) +add_model(context_model.IsSingletonFE(cm), 0.125) + +# grammars, search parameters and all that other stuff are irrelevant + diff --git a/sa-extract/extractor.py b/sa-extract/extractor.py new file mode 100755 index 00000000..9d66ebf0 --- /dev/null +++ b/sa-extract/extractor.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +# vim:expandtab:shiftwidth=4 + +import sys, gc, monitor, sgml +import optparse +import model +import log +import cn + +models = [] + +def add_model(m,w=0.0): + models.append(m) + +def extract_grammar(input): + confnet = cn.ConfusionNet(input) + meta = input.meta + for m in models: + m.input(confnet.columns, meta) + +if __name__ == "__main__": + optparser = optparse.OptionParser() + optparser.add_option("-c", "--config", dest="config", help="configuration module") + optparser.add_option("-x", "--extra", dest="extra", help="output grammar name override") + (opts,args) = optparser.parse_args() + + if opts.config is None: + raise ValueError, "You must specify a configuration file." + else: + if log.level >= 1: + log.write("Reading configuration from %s\n" % opts.config) + execfile(opts.config) + + if len(args) >= 1 and args[0] != "-": + input_file = file(args[0], "r") + else: + input_file = sys.stdin + + if len(args) >= 2 and args[1] != "-": + output_file = file(args[1], "w") + else: + output_file = sys.stdout + + gc.collect() + if log.level >= 1: + log.write("all structures loaded, memory %s, time %s\n" % (monitor.memory(), monitor.cpu())) + log.write("models: %s\n" % (" ".join(str(x.name) for x in models))) + + sents = sgml.read_raw(input_file) + for sent in sents: + mark = sent.getmark() + if mark is not None: + (tag, attrs) = mark + if tag == "seg": + sent.unmark() + dattrs = sgml.attrs_to_dict(attrs) + sent.meta = attrs + extract_grammar(sent) + diff --git a/sa-extract/lcp.pyx b/sa-extract/lcp.pyx new file mode 100644 index 00000000..a992d3ee --- /dev/null +++ b/sa-extract/lcp.pyx @@ -0,0 +1,113 @@ +#!/usr/bin/env python2.4 + +'''Compute LCP array for a suffix array using the Kasai et al. 
algorithm''' +'''Can also be used to compute statistics such +as k most frequent n-grams''' + +import sys + +cimport cintlist +cimport csuf +cimport cdat +cimport cveb + +cdef class LCP: + + cdef csuf.SuffixArray sa + cdef cintlist.CIntList lcp + + def __init__(self, sa): + self._construct(sa) + + cdef _construct(self, csuf.SuffixArray sa): + cdef int i, k, j, h, n + cdef cintlist.CIntList rank + + sys.stderr.write("Constructing LCP array\n") + self.sa = sa + n = self.sa.sa.len + self.lcp = cintlist.CIntList(initial_len=n) + + rank = cintlist.CIntList(initial_len=n) + for i from 0 <= i < n: + rank.arr[sa.sa.arr[i]] = i + + h = 0 + for i from 0 <= i < n: + k = rank.arr[i] + if k == 0: + self.lcp.arr[k] = -1 + else: + j = sa.sa.arr[k-1] + while i+h < n and j+h < n and sa.darray.data.arr[i+h] == sa.darray.data.arr[j+h]: + h = h+1 + self.lcp.arr[k] = h + if h > 0: + h = h-1 + sys.stderr.write("LCP array completed\n") + + + def compute_stats(self, max_n): + self._compute_stats(max_n) + + cdef _compute_stats(self, int max_n): + '''Note: the output of this function is not exact. In + particular, the frequency associated with each word is + not guaranteed to be correct. This is due to a bit of + laxness in the design; the function is intended only to + obtain a list of the most frequent words; for this + purpose it is perfectly fine''' + cdef int i, ii, iii, j, k, h, n, N, rs, freq, valid + cdef cintlist.CIntList run_start + cdef cintlist.CIntList ngram_start + cdef cveb.VEB veb + + N = self.sa.sa.len + + ngram_starts = [] + for n from 0 <= n < max_n: + ngram_starts.append(cintlist.CIntList(initial_len=N)) + + run_start = cintlist.CIntList(initial_len=max_n) + veb = cveb.VEB(N) + + for i from 0 <= i < N: + h = self.lcp.arr[i] + if h < 0: + h = 0 + for n from h <= n < max_n: + rs = run_start.arr[n] + run_start.arr[n] = i + freq = i - rs + if freq > 1000: # arbitrary, but see note below + veb._insert(freq) + ngram_start = ngram_starts[n] + while ngram_start.arr[freq] > 0: + freq = freq + 1 # cheating a bit, should be ok for sparse histogram + ngram_start.arr[freq] = rs + i = veb.veb.min_val + while i != -1: + ii = veb._findsucc(i) + for n from 0 <= n < max_n: + ngram_start = ngram_starts[n] + iii = i + rs = ngram_start.arr[iii] + while (ii==-1 or iii < ii) and rs != 0: + j = self.sa.sa.arr[rs] + valid = 1 + for k from 0 <= k < n+1: + if self.sa.darray.data.arr[j+k] < 2: + valid = 0 + if valid: + ngram = "" + for k from 0 <= k < n+1: + ngram= ngram+ self.sa.darray.id2word[self.sa.darray.data.arr[j+k]] + " " + print i, n+1, ngram + iii = iii + 1 + rs = ngram_start.arr[iii] + i = ii + + + + + diff --git a/sa-extract/lcp_ops.py b/sa-extract/lcp_ops.py new file mode 100755 index 00000000..9df6e82a --- /dev/null +++ b/sa-extract/lcp_ops.py @@ -0,0 +1,52 @@ +#!/usr/bin/env python + +import lcp +import sys +import optparse +import csuf + +'''Run various computations using the LCP array''' +def main(): + + optparser = optparse.OptionParser() +# optparser.add_option("-c", "--config", dest="config", help="configuration module") + optparser.add_option("-s", "--sa-check", dest="sa_check", default=False, action="store_true") + optparser.add_option("-l", "--lcp-check", dest="lcp_check", default=False, action="store_true") + optparser.add_option("-t", "--stats", dest="stats", default=0, type="int", action="store") + optparser.add_option("-u", "--unigram", dest="uni_check", default=False, action="store_true") + optparser.add_option("-r", "--report-long-lcps", dest="long_lcp", type="int", default=0, action="store") + 
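+    # -t/--stats N prints the most frequent n-grams up to length N via
+    # LCP.compute_stats; per that method's docstring the reported frequencies
+    # are approximate, so treat the output as a ranking rather than exact
+    # counts.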
(opts,args) = optparser.parse_args() + + if len(args) < 1: + print >>sys.stderr, "Usage: lcp.py [opts] " + sys.exit(1) + + safile = args[0] + sa = csuf.SuffixArray(safile, from_binary=True) + +# if opts.sa_check: +# check_sufarray(sa) + + l = lcp.LCP(sa) + + if opts.lcp_check: + print >>sys.stderr, "Checking LCP Array..." + l.check() + print >>sys.stderr, "Check finished" + + if opts.stats > 0: + l.compute_stats(opts.stats) + +# if opts.uni_check: +# if lcp is None: +# lcp = LCP(sa) +# unigram_stats(sa, lcp) +# +# if opts.long_lcp: +# if lcp is None: +# lcp = LCP(sa, opts.long_lcp) + +if __name__ == "__main__": + sys.exit(main()) + + diff --git a/sa-extract/log.py b/sa-extract/log.py new file mode 100644 index 00000000..d4f96cb4 --- /dev/null +++ b/sa-extract/log.py @@ -0,0 +1,18 @@ +import sys + +level = 1 +file = sys.stderr + +def writeln(s="", l=0): + if level >= l: + file.write("%s\n" % s) + file.flush() + +def write(s, l=0): + if level >= l: + file.write(s) + file.flush() + + + + diff --git a/sa-extract/manager.py b/sa-extract/manager.py new file mode 100644 index 00000000..767192c1 --- /dev/null +++ b/sa-extract/manager.py @@ -0,0 +1,100 @@ +import csuf +import cdat + +class Sampler(object): + '''A Sampler implements a logic for choosing + samples from a population range''' + + def __init__(self): + pass + + def registerContext(self, context_manager): + self.context_manager = context_manager + + def sample(self, phrase_location): + return cintlist.CIntList() + + + +class Extractor(object): + '''Extractor is responsible for extracting rules + from a given context; once a sentence id/location + is found for the source fwords, extractor is + responsible for producing any matching rule(s). + Optionally, extractor may return an empty list''' + + def __init__(self): + pass + + def registerContext(self, context_manager): + self.context_manager = context_manager + + def extract(self, fwords, loc): + return [] + + + +class RuleFactory(object): + '''RuleFactory is a class that manages the + generation of translation rules, using the Context + and (optionally) any of its contained classes or + data. The RuleFactory is responsible for handling + any caching (i.e. 
when presented with an input + sentence, it may lookup a rule from its cache + rather than extracting a new rule)''' + + def __init__(self): + self.num_lookups = 0 + self.num_extractions = 0 + self.num_rules = 0 + self.time = 0.0 + + + def registerContext(self, context_manager): + self.context_manager = context_manager + + + def input(self, fwords): + '''Manages the process of enumerating + rules for a given input sentence, and + looking them with calls to Context, + Sampler, and Extractor''' + return [] + + +class ContextManager(object): + + def __init__(self, ffile, efile, extractor=None, sampler=None, rulefactory=None, from_binary=False): + # NOTE: Extractor does not have a default value because + # the only nontrivial extractor right now depends on an + # alignment file + + self.fsarray = csuf.SuffixArray(ffile, from_binary) + self.edarray = cdat.DataArray(efile, from_binary) + + self.factory = rulefactory + self.factory.registerContext(self) + + self.sampler = sampler + self.sampler.registerContext(self) + + self.models = [] + self.owner = None + + + def add_model(self, model): + if self.owner is None: + self.owner = model + model_id = len(self.models) + self.models.append(model) + return model_id + + + def input(self, model, fwords, meta): + if model != self.owner: + return + self.fwords = fwords + self.factory.input(self.fwords, meta) + + + diff --git a/sa-extract/model.py b/sa-extract/model.py new file mode 100644 index 00000000..66c51051 --- /dev/null +++ b/sa-extract/model.py @@ -0,0 +1,12 @@ + +class Model(object): + def __init__(self, name=None): + object.__init__(self) + if name is None: + self.name = self.__class__.__name__ + else: + self.name = name + + def input(self, fwords, meta): + pass + diff --git a/sa-extract/monitor.py b/sa-extract/monitor.py new file mode 100644 index 00000000..eb0bed57 --- /dev/null +++ b/sa-extract/monitor.py @@ -0,0 +1,48 @@ +import os, resource + +def cpu(): + return (resource.getrusage(resource.RUSAGE_SELF).ru_utime+ + resource.getrusage(resource.RUSAGE_SELF).ru_stime) + +# from http://aspn.activestate.com/ASPN/Cookbook/Python/Recipe/286222 + + +_proc_status = '/proc/%d/status' % os.getpid() + +_scale = {'kB': 1024.0, 'mB': 1024.0*1024.0, + 'KB': 1024.0, 'MB': 1024.0*1024.0} + +def _VmB(VmKey): + '''Private. + ''' + global _proc_status, _scale + # get pseudo file /proc//status + try: + t = open(_proc_status) + v = t.read() + t.close() + except: + return 0.0 # non-Linux? + # get VmKey line e.g. 'VmRSS: 9999 kB\n ...' + i = v.index(VmKey) + v = v[i:].split(None, 3) # whitespace + if len(v) < 3: + return 0.0 # invalid format? + # convert Vm value to bytes + return float(v[1]) * _scale[v[2]] + +def memory(since=0.0): + '''Return memory usage in bytes. + ''' + return _VmB('VmSize:') - since + +def resident(since=0.0): + '''Return resident memory usage in bytes. + ''' + return _VmB('VmRSS:') - since + + +def stacksize(since=0.0): + '''Return stack size in bytes. 
+ ''' + return _VmB('VmStk:') - since diff --git a/sa-extract/precomputation.pxd b/sa-extract/precomputation.pxd new file mode 100644 index 00000000..c75d5eef --- /dev/null +++ b/sa-extract/precomputation.pxd @@ -0,0 +1,13 @@ +from libc.stdio cimport FILE + +cdef class Precomputation: + cdef int precompute_rank + cdef int precompute_secondary_rank + cdef int max_length + cdef int max_nonterminals + cdef int train_max_initial_size + cdef int train_min_gap_size + cdef precomputed_index + cdef precomputed_collocations + cdef read_map(self, FILE* f) + cdef write_map(self, m, FILE* f) diff --git a/sa-extract/precomputation.pyx b/sa-extract/precomputation.pyx new file mode 100644 index 00000000..ce4c21aa --- /dev/null +++ b/sa-extract/precomputation.pyx @@ -0,0 +1,478 @@ +# precomputes a set of collocations by advancing over the text. +# warning: nasty C code + +import log +import monitor + +cimport csuf +cimport cdat +cimport cintlist + +from libc.stdio cimport FILE, fopen, fread, fwrite, fclose +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy + +cdef struct _Trie_Node # forward decl + +cdef struct _Trie_Edge: + int val + _Trie_Node* node + _Trie_Edge* bigger + _Trie_Edge* smaller + +cdef struct _Trie_Node: + _Trie_Edge* root + int* arr + int arr_len + +cdef _Trie_Node* new_trie_node(): + cdef _Trie_Node* node + node = <_Trie_Node*> malloc(sizeof(_Trie_Node)) + node.root = NULL + node.arr_len = 0 + node.arr = malloc(sizeof(0*sizeof(int))) + return node + +cdef _Trie_Edge* new_trie_edge(int val): + cdef _Trie_Edge* edge + edge = <_Trie_Edge*> malloc(sizeof(_Trie_Edge)) + edge.node = new_trie_node() + edge.bigger = NULL + edge.smaller = NULL + edge.val = val + return edge + +cdef free_trie_node(_Trie_Node* node): + if node != NULL: + free_trie_edge(node.root) + free(node.arr) + +cdef free_trie_edge(_Trie_Edge* edge): + if edge != NULL: + free_trie_node(edge.node) + free_trie_edge(edge.bigger) + free_trie_edge(edge.smaller) + +cdef _Trie_Node* trie_find(_Trie_Node* node, int val): + cdef _Trie_Edge* cur + cur = node.root + while cur != NULL and cur.val != val: + if val > cur.val: + cur = cur.bigger + elif val < cur.val: + cur = cur.smaller + if cur == NULL: + return NULL + else: + return cur.node + +cdef trie_node_data_append(_Trie_Node* node, int val): + cdef int new_len + new_len = node.arr_len + 1 + node.arr = realloc(node.arr, new_len*sizeof(int)) + node.arr[node.arr_len] = val + node.arr_len = new_len + +cdef trie_node_data_extend(_Trie_Node* node, int* vals, int num_vals): + cdef int new_len + new_len = node.arr_len + num_vals + node.arr = realloc(node.arr, new_len*sizeof(int)) + memcpy(node.arr + node.arr_len, vals, num_vals*sizeof(int)) + node.arr_len = new_len + + +cdef _Trie_Node* trie_insert(_Trie_Node* node, int val): + cdef _Trie_Edge** cur + cur = &node.root + while cur[0] != NULL and cur[0].val != val: + if val > cur[0].val: + cur = &cur[0].bigger + elif val < cur[0].val: + cur = &cur[0].smaller + if cur[0] == NULL: + cur[0] = new_trie_edge(val) + return cur[0].node + +cdef trie_node_to_map(_Trie_Node* node, result, prefix, int include_zeros): + cdef cintlist.CIntList arr + + if include_zeros or node.arr_len > 0: + arr = cintlist.CIntList() + free(arr.arr) + arr.arr = malloc(node.arr_len * sizeof(int)) + memcpy(arr.arr, node.arr, node.arr_len * sizeof(int)) + arr.len = node.arr_len + arr.size = node.arr_len + result[prefix] = arr + trie_edge_to_map(node.root, result, prefix, include_zeros) + +cdef trie_edge_to_map(_Trie_Edge* edge, result, 
prefix, int include_zeros): + if edge != NULL: + trie_edge_to_map(edge.smaller, result, prefix, include_zeros) + trie_edge_to_map(edge.bigger, result, prefix, include_zeros) + prefix = prefix + (edge.val,) + trie_node_to_map(edge.node, result, prefix, include_zeros) + +cdef class TrieMap: + + cdef _Trie_Node** root + cdef int V + + def __init__(self, alphabet_size): + self.V = alphabet_size + self.root = <_Trie_Node**> malloc(self.V * sizeof(_Trie_Node*)) + memset(self.root, 0, self.V * sizeof(_Trie_Node*)) + + + def __dealloc__(self): + cdef int i + for i from 0 <= i < self.V: + if self.root[i] != NULL: + free_trie_node(self.root[i]) + free(self.root) + + + def insert(self, pattern): + cdef int* p + cdef int i, l + l = len(pattern) + p = malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + self._insert(p,l) + free(p) + + + cdef _Trie_Node* _insert(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + if self.root[pattern[0]] == NULL: + self.root[pattern[0]] = new_trie_node() + node = self.root[pattern[0]] + for i from 1 <= i < pattern_len: + node = trie_insert(node, pattern[i]) + return node + + def contains(self, pattern): + cdef int* p + cdef int i, l + cdef _Trie_Node* node + l = len(pattern) + p = malloc(l*sizeof(int)) + for i from 0 <= i < l: + p[i] = pattern[i] + node = self._contains(p,l) + free(p) + if node == NULL: + return False + else: + return True + + cdef _Trie_Node* _contains(self, int* pattern, int pattern_len): + cdef int i + cdef _Trie_Node* node + node = self.root[pattern[0]] + i = 1 + while node != NULL and i < pattern_len: + node = trie_find(node, pattern[i]) + i = i+1 + return node + + def toMap(self, flag): + cdef int i, include_zeros + + if flag: + include_zeros=1 + else: + include_zeros=0 + result = {} + for i from 0 <= i < self.V: + if self.root[i] != NULL: + trie_node_to_map(self.root[i], result, (i,), include_zeros) + return result + + +cdef class Precomputation: + +# Defined in .pxd file, here for reference: +# cdef int precompute_rank +# cdef int precompute_secondary_rank +# cdef int max_length +# cdef int max_nonterminals +# cdef int train_max_initial_size +# cdef int train_min_gap_size +# cdef precomputed_index +# cdef precomputed_collocations + + def __init__(self, filename, sa=None, precompute_rank=1000, precompute_secondary_rank=20, max_length=5, + max_nonterminals=2, train_max_initial_size=10, train_min_gap_size=2, from_binary=False): + self.precompute_rank = precompute_rank + self.precompute_secondary_rank = precompute_secondary_rank + self.max_length = max_length + self.max_nonterminals = max_nonterminals + self.train_max_initial_size = train_max_initial_size + self.train_min_gap_size = train_min_gap_size + if from_binary: + self.read_binary(filename) + else: + self.precompute(filename, sa) + + + def read_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + f = fopen(cfilename, "r") + fread(&(self.precompute_rank), sizeof(int), 1, f) + fread(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fread(&(self.max_length), sizeof(int), 1, f) + fread(&(self.max_nonterminals), sizeof(int), 1, f) + fread(&(self.train_max_initial_size), sizeof(int), 1, f) + fread(&(self.train_min_gap_size), sizeof(int), 1, f) + self.precomputed_index = self.read_map(f) + self.precomputed_collocations = self.read_map(f) + fclose(f) + + + def write_binary(self, filename): + cdef FILE* f + cdef bytes bfilename = filename + cdef char* cfilename = bfilename + + f = fopen(cfilename, 
"w") + fwrite(&(self.precompute_rank), sizeof(int), 1, f) + fwrite(&(self.precompute_secondary_rank), sizeof(int), 1, f) + fwrite(&(self.max_length), sizeof(int), 1, f) + fwrite(&(self.max_nonterminals), sizeof(int), 1, f) + fwrite(&(self.train_max_initial_size), sizeof(int), 1, f) + fwrite(&(self.train_min_gap_size), sizeof(int), 1, f) + self.write_map(self.precomputed_index, f) + self.write_map(self.precomputed_collocations, f) + fclose(f) + + + cdef write_map(self, m, FILE* f): + cdef int i, N + cdef cintlist.CIntList arr + + N = len(m) + fwrite(&(N), sizeof(int), 1, f) + for pattern, val in m.iteritems(): + N = len(pattern) + fwrite(&(N), sizeof(int), 1, f) + for word_id in pattern: + i = word_id + fwrite(&(i), sizeof(int), 1, f) + arr = val + arr.write_handle(f) + + + cdef read_map(self, FILE* f): + cdef int i, j, k, word_id, N + cdef cintlist.CIntList arr + + m = {} + fread(&(N), sizeof(int), 1, f) + for j from 0 <= j < N: + fread(&(i), sizeof(int), 1, f) + key = () + for k from 0 <= k < i: + fread(&(word_id), sizeof(int), 1, f) + key = key + (word_id,) + arr = cintlist.CIntList() + arr.read_handle(f) + m[key] = arr + return m + + + def precompute(self, filename, sa): + cdef int i, l, N, max_pattern_len, i1, l1, i2, l2, i3, l3, ptr1, ptr2, ptr3, is_super, sent_count, max_rank + cdef csuf.SuffixArray sarray + cdef cdat.DataArray darray + cdef cintlist.CIntList data, queue, cost_by_rank, count_by_rank + cdef TrieMap frequent_patterns, super_frequent_patterns, collocations + cdef _Trie_Node* node + + sarray = sa + darray = sarray.darray + data = darray.data + + frequent_patterns = TrieMap(len(darray.id2word)) + super_frequent_patterns = TrieMap(len(darray.id2word)) + collocations = TrieMap(len(darray.id2word)) + + I_set = set() + J_set = set() + J2_set = set() + IJ_set = set() + pattern_rank = {} + + log.writeln("Precomputing frequent intersections\n", 1) + start_time = monitor.cpu() + + max_pattern_len = 0 + if filename is not None: + precompute_file = open(filename) + for rank, line in enumerate(precompute_file): + if rank >= self.precompute_rank: + break + phrase_words = line.split()[2:] + phrase = () + for word in phrase_words: + phrase = phrase + (darray.word2id[word],) + max_pattern_len = max(max_pattern_len, len(phrase)) + frequent_patterns.insert(phrase) + I_set.add(phrase) + pattern_rank[phrase] = rank + if rank < self.precompute_secondary_rank: + super_frequent_patterns.insert(phrase) + J_set.add(phrase) + precompute_file.close() + + queue = cintlist.CIntList(increment=1000) + + log.writeln(" Computing inverted indexes...", 1) + N = len(data) + for i from 0 <= i < N: + sa_word_id = data.arr[i] + if sa_word_id == 1: + queue._append(-1) + else: + for l from 1 <= l <= max_pattern_len: + node = frequent_patterns._contains(data.arr+i, l) + if node == NULL: + break + queue._append(i) + queue._append(l) + trie_node_data_append(node, i) + + log.writeln(" Computing collocations...", 1) + N = len(queue) + ptr1 = 0 + sent_count = 0 + while ptr1 < N: # main loop + i1 = queue.arr[ptr1] + if i1 > -1: + l1 = queue.arr[ptr1+1] + ptr2 = ptr1 + 2 + while ptr2 < N: + i2 = queue.arr[ptr2] + if i2 == -1 or i2 - i1 >= self.train_max_initial_size: + break + l2 = queue.arr[ptr2+1] + if i2 - i1 - l1 >= self.train_min_gap_size and i2 + l2 - i1 <= self.train_max_initial_size and l1+l2+1 <= self.max_length: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, data.arr[i]) + trie_node_data_append(node, i1) + 
trie_node_data_append(node, i2) + if super_frequent_patterns._contains(data.arr+i2, l2) != NULL: + if super_frequent_patterns._contains(data.arr+i1, l1) != NULL: + is_super = 1 + else: + is_super = 0 + ptr3 = ptr2 + 2 + while ptr3 < N: + i3 = queue.arr[ptr3] + if i3 == -1 or i3 - i1 >= self.train_max_initial_size: + break + l3 = queue.arr[ptr3+1] + if i3 - i2 - l2 >= self.train_min_gap_size and i3 + l3 - i1 <= self.train_max_initial_size and l1+l2+l3+2 <= self.max_length: + if is_super or super_frequent_patterns._contains(data.arr+i3, l3) != NULL: + node = collocations._insert(data.arr+i1, l1) + node = trie_insert(node, -1) + for i from i2 <= i < i2+l2: + node = trie_insert(node, data.arr[i]) + node = trie_insert(node, -1) + for i from i3 <= i < i3+l3: + node = trie_insert(node, data.arr[i]) + trie_node_data_append(node, i1) + trie_node_data_append(node, i2) + trie_node_data_append(node, i3) + ptr3 = ptr3 + 2 + ptr2 = ptr2 + 2 + ptr1 = ptr1 + 2 + else: + sent_count = sent_count + 1 + if sent_count % 10000 == 0: + log.writeln(" %d sentences" % sent_count) + ptr1 = ptr1 + 1 + + self.precomputed_collocations = collocations.toMap(False) + self.precomputed_index = frequent_patterns.toMap(True) + + x = 0 + for pattern1 in J_set: + for pattern2 in J_set: + if len(pattern1) + len(pattern2) + 1 < self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + J2_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in I_set: + x = x+1 + if len(pattern1) + len(pattern2) + 1 <= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + + for pattern1 in I_set: + for pattern2 in J2_set: + x = x+2 + if len(pattern1) + len(pattern2) + 1<= self.max_length: + combined_pattern = pattern1 + (-1,) + pattern2 + IJ_set.add(combined_pattern) + combined_pattern = pattern2 + (-1,) + pattern1 + IJ_set.add(combined_pattern) + + N = len(pattern_rank) + cost_by_rank = cintlist.CIntList(initial_len=N) + count_by_rank = cintlist.CIntList(initial_len=N) + for pattern, arr in self.precomputed_collocations.iteritems(): + if pattern not in IJ_set: + s = "" + for word_id in pattern: + if word_id == -1: + s = s + "X " + else: + s = s + darray.id2word[word_id] + " " + log.writeln("ERROR: unexpected pattern %s in set of precomputed collocations" % (s), 1) + else: + chunk = () + max_rank = 0 + arity = 0 + for word_id in pattern: + if word_id == -1: + max_rank = max(max_rank, pattern_rank[chunk]) + arity = arity + 1 + chunk = () + else: + chunk = chunk + (word_id,) + max_rank = max(max_rank, pattern_rank[chunk]) + cost_by_rank.arr[max_rank] = cost_by_rank.arr[max_rank] + (4*len(arr)) + count_by_rank.arr[max_rank] = count_by_rank.arr[max_rank] + (len(arr)/(arity+1)) + + cumul_cost = 0 + cumul_count = 0 + for i from 0 <= i < N: + cumul_cost = cumul_cost + cost_by_rank.arr[i] + cumul_count = cumul_count + count_by_rank.arr[i] + log.writeln("RANK %d\tCOUNT, COST: %d %d\tCUMUL: %d, %d" % (i, count_by_rank.arr[i], cost_by_rank.arr[i], cumul_count, cumul_cost)) + + num_found_patterns = len(self.precomputed_collocations) + for pattern in IJ_set: + if pattern not in self.precomputed_collocations: + self.precomputed_collocations[pattern] = cintlist.CIntList() + + stop_time = monitor.cpu() + log.writeln("Precomputed collocations for %d patterns out of %d possible (upper bound %d)" % (num_found_patterns,len(self.precomputed_collocations),x)) + log.writeln("Precomputed inverted index for %d patterns " % len(self.precomputed_index)) + log.writeln("Precomputation took %f seconds" % 
(stop_time - start_time)) + log.writeln("Detailed statistics:") + + + + + + + diff --git a/sa-extract/rule.pxd b/sa-extract/rule.pxd new file mode 100644 index 00000000..c9c84e5c --- /dev/null +++ b/sa-extract/rule.pxd @@ -0,0 +1,13 @@ +cdef class Phrase: + cdef int *syms + cdef int n, *varpos, n_vars + cdef public int chunkpos(self, int k) + cdef public int chunklen(self, int k) + +cdef class Rule: + cdef public int lhs + cdef readonly Phrase f, e + cdef float *cscores + cdef int n_scores + cdef public owner, word_alignments + diff --git a/sa-extract/rule.pyx b/sa-extract/rule.pyx new file mode 100644 index 00000000..7cd3efda --- /dev/null +++ b/sa-extract/rule.pyx @@ -0,0 +1,286 @@ +from libc.stdlib cimport malloc, calloc, realloc, free, strtof, strtol +from libc.string cimport strsep, strcpy, strlen + +cdef extern from "strutil.h": + char *strstrsep(char **stringp, char *delim) + char *strip(char *s) + char **split(char *s, char *delim, int *pn) + +import sys + +import sym +cimport sym +cdef sym.Alphabet alphabet +alphabet = sym.alphabet + +global span_limit +span_limit = None + +cdef int bufsize +cdef char *buf +bufsize = 100 +buf = malloc(bufsize) +cdef ensurebufsize(int size): + global buf, bufsize + if size > bufsize: + buf = realloc(buf, size*sizeof(char)) + bufsize = size + +cdef class Phrase: + def __cinit__(self, words): + cdef int i, j, n, n_vars + cdef char **toks + cdef bytes bwords + cdef char* cwords + + n_vars = 0 + if type(words) is str: + ensurebufsize(len(words)+1) + bwords = words + cwords = bwords + strcpy(buf, cwords) + toks = split(buf, NULL, &n) + self.syms = malloc(n*sizeof(int)) + for i from 0 <= i < n: + self.syms[i] = alphabet.fromstring(toks[i], 0) + if alphabet.isvar(self.syms[i]): + n_vars = n_vars + 1 + + else: + n = len(words) + self.syms = malloc(n*sizeof(int)) + for i from 0 <= i < n: + self.syms[i] = words[i] + if alphabet.isvar(self.syms[i]): + n_vars = n_vars + 1 + self.n = n + self.n_vars = n_vars + self.varpos = malloc(n_vars*sizeof(int)) + j = 0 + for i from 0 <= i < n: + if alphabet.isvar(self.syms[i]): + self.varpos[j] = i + j = j + 1 + + def __dealloc__(self): + free(self.syms) + free(self.varpos) + + def __str__(self): + strs = [] + cdef int i, s + for i from 0 <= i < self.n: + s = self.syms[i] + strs.append(alphabet.tostring(s)) + return " ".join(strs) + + def instantiable(self, i, j, n): + return span_limit is None or (j-i) <= span_limit + + def handle(self): + """return a hashable representation that normalizes the ordering + of the nonterminal indices""" + norm = [] + cdef int i, j, s + i = 1 + j = 0 + for j from 0 <= j < self.n: + s = self.syms[j] + if alphabet.isvar(s): + s = alphabet.setindex(s,i) + i = i + 1 + norm.append(s) + return tuple(norm) + + def strhandle(self): + strs = [] + norm = [] + cdef int i, j, s + i = 1 + j = 0 + for j from 0 <= j < self.n: + s = self.syms[j] + if alphabet.isvar(s): + s = alphabet.setindex(s,i) + i = i + 1 + norm.append(alphabet.tostring(s)) + return " ".join(norm) + + def arity(self): + return self.n_vars + + def getvarpos(self, i): + if 0 <= i < self.n_vars: + return self.varpos[i] + else: + raise IndexError + + def getvar(self, i): + if 0 <= i < self.n_vars: + return self.syms[self.varpos[i]] + else: + raise IndexError + + cdef int chunkpos(self, int k): + if k == 0: + return 0 + else: + return self.varpos[k-1]+1 + + cdef int chunklen(self, int k): + if self.n_vars == 0: + return self.n + elif k == 0: + return self.varpos[0] + elif k == self.n_vars: + return self.n-self.varpos[k-1]-1 + else: + return 
self.varpos[k]-self.varpos[k-1]-1 + + def clen(self, k): + return self.chunklen(k) + + def getchunk(self, ci): + cdef int start, stop + start = self.chunkpos(ci) + stop = start+self.chunklen(ci) + chunk = [] + for i from start <= i < stop: + chunk.append(self.syms[i]) + return chunk + + def __cmp__(self, other): + cdef Phrase otherp + cdef int i + otherp = other + for i from 0 <= i < min(self.n, otherp.n): + if self.syms[i] < otherp.syms[i]: + return -1 + elif self.syms[i] > otherp.syms[i]: + return 1 + if self.n < otherp.n: + return -1 + elif self.n > otherp.n: + return 1 + else: + return 0 + + def __hash__(self): + cdef int i + cdef unsigned h + h = 0 + for i from 0 <= i < self.n: + if self.syms[i] > 0: + h = (h << 1) + self.syms[i] + else: + h = (h << 1) + -self.syms[i] + return h + + def __len__(self): + return self.n + + def __getitem__(self, i): + return self.syms[i] + + def __iter__(self): + cdef int i + l = [] + for i from 0 <= i < self.n: + l.append(self.syms[i]) + return iter(l) + + def subst(self, start, children): + cdef int i + for i from 0 <= i < self.n: + if alphabet.isvar(self.syms[i]): + start = start + children[alphabet.getindex(self.syms[i])-1] + else: + start = start + (self.syms[i],) + return start + +cdef class Rule: + def __cinit__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): + cdef int i, n + cdef char *rest + + self.word_alignments = word_alignments + if scores is None: + self.cscores = NULL + self.n_scores = 0 + else: + n = len(scores) + self.cscores = malloc(n*sizeof(float)) + self.n_scores = n + for i from 0 <= i < n: + self.cscores[i] = scores[i] + + def __init__(self, lhs, f, e, owner=None, scores=None, word_alignments=None): + if not sym.isvar(lhs): + sys.stderr.write("error: lhs=%d\n" % lhs) + self.lhs = lhs + self.f = f + self.e = e + self.word_alignments = word_alignments + + def __dealloc__(self): + if self.cscores != NULL: + free(self.cscores) + + def __str__(self): + return self.to_line() + + def __hash__(self): + return hash((self.lhs, self.f, self.e)) + + def __cmp__(self, Rule other): + return cmp((self.lhs, self.f, self.e, self.word_alignments), (other.lhs, other.f, other.e, self.word_alignments)) + + def __iadd__(self, Rule other): + if self.n_scores != other.n_scores: + raise ValueError + for i from 0 <= i < self.n_scores: + self.cscores[i] = self.cscores[i] + other.cscores[i] + return self + + def fmerge(self, Phrase f): + if self.f == f: + self.f = f + + def arity(self): + return self.f.arity() + + def to_line(self): + scorestrs = [] + for i from 0 <= i < self.n_scores: + scorestrs.append(str(self.cscores[i])) + fields = [alphabet.tostring(self.lhs), str(self.f), str(self.e), " ".join(scorestrs)] + if self.word_alignments is not None: + alignstr = [] + for i from 0 <= i < len(self.word_alignments): + alignstr.append("%d-%d" % (self.word_alignments[i]/65536, self.word_alignments[i]%65536)) + #for s,t in self.word_alignments: + #alignstr.append("%d-%d" % (s,t)) + fields.append(" ".join(alignstr)) + + return " ||| ".join(fields) + + property scores: + def __get__(self): + s = [None]*self.n_scores + for i from 0 <= i < self.n_scores: + s[i] = self.cscores[i] + return s + + def __set__(self, s): + if self.cscores != NULL: + free(self.cscores) + self.cscores = malloc(len(s)*sizeof(float)) + self.n_scores = len(s) + for i from 0 <= i < self.n_scores: + self.cscores[i] = s[i] + +def rule_copy(r): + r1 = Rule(r.lhs, r.f, r.e, r.owner, r.scores) + r1.word_alignments = r.word_alignments + return r1 + diff --git 
a/sa-extract/rulefactory.pyx b/sa-extract/rulefactory.pyx new file mode 100644 index 00000000..20ea80d2 --- /dev/null +++ b/sa-extract/rulefactory.pyx @@ -0,0 +1,2360 @@ +# Pyrex implementation of the algorithms described in +# Lopez, EMNLP-CoNLL 2007 +# Much faster than the Python numbers reported there. +# Note to reader: this code is closer to C than Python +import sys +import sym +import log +import rule +import monitor +import cintlist +import csuf +import cdat +import cveb +import precomputation +import gc +import cn +import sgml + +cimport cmath +cimport csuf +cimport cdat +cimport cintlist +cimport rule +cimport cveb +cimport precomputation +cimport calignment + +from libc.stdlib cimport malloc, realloc, free +from libc.string cimport memset, memcpy +from libc.math cimport fmod, ceil, floor + +cdef int PRECOMPUTE +cdef int MERGE +cdef int BAEZA_YATES + +PRECOMPUTE = 0 +MERGE = 1 +BAEZA_YATES = 2 + +#cdef int node_count +#node_count = 0 + +cdef class TrieNode: + cdef public children + #cdef int id + + def __init__(self): + self.children = {} + #self.id = node_count + #node_count += 1 + + +cdef class ExtendedTrieNode(TrieNode): + cdef public phrase + cdef public phrase_location + cdef public suffix_link + + def __init__(self, phrase=None, phrase_location=None, suffix_link=None): + TrieNode.__init__(self) + self.phrase = phrase + self.phrase_location = phrase_location + self.suffix_link = suffix_link + + +cdef class TrieTable: + cdef public int extended + cdef public int count + cdef public root + def __cinit__(self, extended=False): + self.count = 0 + self.extended = extended + if extended: + self.root = ExtendedTrieNode() + else: + self.root = TrieNode() + +# linked list structure for storing matches in BaselineRuleFactory +cdef struct match_node: + int* match + match_node* next + +cdef class BaselineRuleFactory: + + cdef grammar, context_manager + cdef int max_terminals, max_nonterminals + cdef int max_initial_size, train_max_initial_size + cdef int min_gap_size, train_min_gap_size + cdef int category + cdef int visit + cdef float intersect_time, extract_time + cdef ruleFile, timingFile + cdef int* last_visit1 + cdef int* last_visit2 + cdef match_node** intersector1 + cdef match_node** intersector2 + cdef csuf.SuffixArray sa + cdef cintlist.CIntList sent_id + + def input(self, fwords): + flen = len(fwords) + start_time = monitor.cpu() + self.intersect_time = 0.0 + self.extract_time = 0.0 + + pyro_phrase_count = 0 + hiero_phrase_count = 0 + + frontier = [] + for i in xrange(len(fwords)): + frontier.append((i, (), False)) + + while len(frontier) > 0: + this_iter_intersect_time = self.intersect_time + new_frontier = [] + for i, prefix, is_shadow_path in frontier: + + word_id = fwords[i][0][0] + #print "word_id = %i" % word_id + phrase = prefix + (word_id,) + str_phrase = map(sym.tostring, phrase) + hiero_phrase = rule.Phrase(phrase) + + #log.writeln("pos %2d, '%s'" % (i, hiero_phrase)) + self.lookup(hiero_phrase) + if hiero_phrase.arity() == 0: + pyro_phrase_count = pyro_phrase_count + 1 + else: + hiero_phrase_count = hiero_phrase_count + 1 + + if len(phrase) - hiero_phrase.arity() < self.max_terminals and i+1 < len(fwords): + new_frontier.append((i+1, phrase, is_shadow_path)) + if hiero_phrase.arity() < self.max_nonterminals: + xcat = sym.setindex(self.category, hiero_phrase.arity()+1) + for j in xrange(i+1+self.min_gap_size, min(i+self.max_initial_size, len(fwords))): + new_frontier.append((j, phrase+(xcat,), is_shadow_path)) + log.writeln("This iteration intersect time = %f" % 
(self.intersect_time - this_iter_intersect_time)) + frontier = new_frontier + stop_time = monitor.cpu() + log.writeln("COUNT %d %d" % (pyro_phrase_count, hiero_phrase_count)) + + + def lookup(self, phrase): + cdef int j, g, start, stop, sent_id, num_ranges, arity + cdef match_node** cur_intersector + cdef match_node** next_intersector + cdef match_node** tmp_intersector + cdef match_node* node + cdef match_node* cur_node + cdef match_node* prev_node + cdef match_node** node_ptr + cdef int* cur_visit + cdef int* next_visit + cdef int* tmp_visit + cdef int* chunklen + + #print "\n\nLOOKUP\n\n" + ranges = [] + sizes = [] + arity = phrase.arity() + chunklen = malloc(arity*sizeof(int)) + for i from 0 <= i < arity+1: + chunk = phrase.getchunk(i) + chunklen[i] = len(chunk) + sa_range = None + phr = () + for offset, word_id in enumerate(chunk): + word = sym.tostring(word_id) + sa_range = self.context_manager.fsarray.lookup(word, offset, sa_range[0], sa_range[1]) + if sa_range is None: + #log.writeln("Returned for phrase %s" % rule.Phrase(phr)) + return + #log.writeln("Found range %s for phrase %s" % (sa_range, rule.Phrase(phr))) + ranges.append(sa_range) + sizes.append(sa_range[1]-sa_range[0]) + if phrase.arity() == 0: + return + + cur_intersector = self.intersector1 + next_intersector = self.intersector2 + cur_visit = self.last_visit1 + next_visit = self.last_visit2 + + num_ranges = len(ranges) + for i from 0 <= i < num_ranges: + sa_range = ranges[i] + start_time = monitor.cpu() + self.visit = self.visit + 1 + intersect_count = 0 + + start = sa_range[0] + stop = sa_range[1] + for j from start <= j < stop: + g = self.sa.sa.arr[j] + sent_id = self.sent_id.arr[g] + if i==0: + if next_visit[sent_id] != self.visit: + # clear intersector + node = next_intersector[sent_id] + next_intersector[sent_id] = NULL + while node != NULL: + prev_node = node + node = node.next + free(prev_node.match) + free(prev_node) + next_visit[sent_id] = self.visit + node_ptr = &(next_intersector[sent_id]) + while node_ptr[0] != NULL: + node_ptr = &(node_ptr[0].next) + node_ptr[0] = malloc(sizeof(match_node)) + node_ptr[0].match = malloc(sizeof(int)) + node_ptr[0].match[0] = g + node_ptr[0].next = NULL + intersect_count = intersect_count + 1 + else: + if cur_visit[sent_id] == self.visit-1: + cur_node = cur_intersector[sent_id] + while cur_node != NULL: + if g - cur_node.match[0] + chunklen[i] <= self.train_max_initial_size and g - cur_node.match[i-1] - chunklen[i-1] >= self.train_min_gap_size: + if next_visit[sent_id] != self.visit: + # clear intersector -- note that we only do this if we've got something to put there + node = next_intersector[sent_id] + next_intersector[sent_id] = NULL + while node != NULL: + prev_node = node + node = node.next + free(prev_node.match) + free(prev_node) + next_visit[sent_id] = self.visit + node_ptr = &(next_intersector[sent_id]) + while node_ptr[0] != NULL: + node_ptr = &(node_ptr[0].next) + node_ptr[0] = malloc(sizeof(match_node)) + node_ptr[0].match = malloc((i+1) * sizeof(int)) + memcpy(node_ptr[0].match, cur_node.match, i*sizeof(int)) + node_ptr[0].match[i] = g + node_ptr[0].next = NULL + intersect_count = intersect_count + 1 + cur_node = cur_node.next + tmp_intersector = cur_intersector + cur_intersector = next_intersector + next_intersector = tmp_intersector + + tmp_visit = cur_visit + cur_visit = next_visit + next_visit = tmp_visit + + intersect_time = monitor.cpu() - start_time + if i > 0: + log.writeln("INT %d %d %d %d %f baseline" % + (arity, prev_intersect_count, 
sa_range[1]-sa_range[0], + intersect_count, intersect_time)) + if intersect_count == 0: + return None + prev_intersect_count = intersect_count + free(chunklen) + + + + def __init__(self, max_terminals=5, max_nonterminals=2, + max_initial_size=10, train_max_initial_size=10, + min_gap_size=1, train_min_gap_size=2, + category='[PHRASE]', grammar=None, + ruleFile=None, timingFile=None): + self.grammar = grammar + self.max_terminals = max_terminals + self.max_nonterminals = max_nonterminals + self.max_initial_size = max_initial_size + self.train_max_initial_size = train_max_initial_size + self.min_gap_size = min_gap_size + self.train_min_gap_size = train_min_gap_size + self.category = sym.fromstring(category) + self.ruleFile = ruleFile + self.timingFile = timingFile + self.visit = 0 + + + def registerContext(self, context_manager): + cdef int num_sents + self.context_manager = context_manager + self.sa = context_manager.fsarray + self.sent_id = self.sa.darray.sent_id + + num_sents = len(self.sa.darray.sent_index) + self.last_visit1 = malloc(num_sents * sizeof(int)) + memset(self.last_visit1, 0, num_sents * sizeof(int)) + + self.last_visit2 = malloc(num_sents * sizeof(int)) + memset(self.last_visit2, 0, num_sents * sizeof(int)) + + self.intersector1 = malloc(num_sents * sizeof(match_node*)) + memset(self.intersector1, 0, num_sents * sizeof(match_node*)) + + self.intersector2 = malloc(num_sents * sizeof(match_node*)) + memset(self.intersector2, 0, num_sents * sizeof(match_node*)) + + +# encodes information needed to find a (hierarchical) phrase +# in the text. If phrase is contiguous, that's just a range +# in the suffix array; if discontiguous, it is the set of +# actual locations (packed into an array) +cdef class PhraseLocation: + cdef int sa_low + cdef int sa_high + cdef int arr_low + cdef int arr_high + cdef cintlist.CIntList arr + cdef int num_subpatterns + + # returns true if sent_id is contained + cdef int contains(self, int sent_id): + return 1 + + def __init__(self, sa_low=-1, sa_high=-1, arr_low=-1, arr_high=-1, arr=None, num_subpatterns=1): + self.sa_low = sa_low + self.sa_high = sa_high + self.arr_low = arr_low + self.arr_high = arr_high + self.arr = arr + self.num_subpatterns = num_subpatterns + + + +cdef class Sampler: + '''A Sampler implements a logic for choosing + samples from a population range''' + + cdef int sampleSize + cdef context_manager + cdef cintlist.CIntList sa + + def __init__(self, sampleSize=0): + self.sampleSize = sampleSize + if sampleSize > 0: + log.writeln("Sampling strategy: uniform, max sample size = %d" % sampleSize, 1) + else: + log.writeln("Sampling strategy: no sampling", 1) + + def registerContext(self, context_manager): + self.context_manager = context_manager + self.sa = ( context_manager.fsarray).sa + + + def sample(self, PhraseLocation phrase_location): + '''Returns a sample of the locations for + the phrase. If there are less than self.sampleSize + locations, return all of them; otherwise, return + up to self.sampleSize locations. In the latter case, + we choose to sample UNIFORMLY -- that is, the locations + are chosen at uniform intervals over the entire set, rather + than randomly. 
This makes the algorithm deterministic, which + is good for things like MERT''' + cdef cintlist.CIntList sample + cdef double i, stepsize + cdef int num_locations, val, j + + sample = cintlist.CIntList() + if phrase_location.arr is None: + num_locations = phrase_location.sa_high - phrase_location.sa_low + if self.sampleSize == -1 or num_locations <= self.sampleSize: + sample._extend_arr(self.sa.arr + phrase_location.sa_low, num_locations) + else: + stepsize = float(num_locations)/float(self.sampleSize) + i = phrase_location.sa_low + while i < phrase_location.sa_high and sample.len < self.sampleSize: + '''Note: int(i) not guaranteed to have the desired + effect, according to the python documentation''' + if fmod(i,1.0) > 0.5: + val = int(ceil(i)) + else: + val = int(floor(i)) + sample._append(self.sa.arr[val]) + i = i + stepsize + else: + num_locations = (phrase_location.arr_high - phrase_location.arr_low) / phrase_location.num_subpatterns + if self.sampleSize == -1 or num_locations <= self.sampleSize: + sample = phrase_location.arr + else: + stepsize = float(num_locations)/float(self.sampleSize) + i = phrase_location.arr_low + while i < num_locations and sample.len < self.sampleSize * phrase_location.num_subpatterns: + '''Note: int(i) not guaranteed to have the desired + effect, according to the python documentation''' + if fmod(i,1.0) > 0.5: + val = int(ceil(i)) + else: + val = int(floor(i)) + j = phrase_location.arr_low + (val*phrase_location.num_subpatterns) + sample._extend_arr(phrase_location.arr.arr + j, phrase_location.num_subpatterns) + i = i + stepsize + return sample + + +cdef long nGramCount(PhraseLocation loc): + return (loc.arr_high - loc.arr_low)/ loc.num_subpatterns + + +# struct used to encapsulate a single matching +cdef struct Matching: + int* arr + int start + int end + int sent_id + int size + + +cdef void assign_matching(Matching* m, int* arr, int start, int step, int* sent_id_arr): + m.arr = arr + m.start = start + m.end = start + step + m.sent_id = sent_id_arr[arr[start]] + m.size = step + + +cdef int* append_combined_matching(int* arr, Matching* loc1, Matching* loc2, + int offset_by_one, int num_subpatterns, int* result_len): + cdef int i, new_len + + new_len = result_len[0] + num_subpatterns + arr = realloc(arr, new_len*sizeof(int)) + + for i from 0 <= i < loc1.size: + arr[result_len[0]+i] = loc1.arr[loc1.start+i] + if num_subpatterns > loc1.size: + arr[new_len-1] = loc2.arr[loc2.end-1] + result_len[0] = new_len + return arr + + +cdef int* extend_arr(int* arr, int* arr_len, int* appendix, int appendix_len): + cdef int new_len + + new_len = arr_len[0] + appendix_len + arr = realloc(arr, new_len*sizeof(int)) + memcpy(arr+arr_len[0], appendix, appendix_len*sizeof(int)) + arr_len[0] = new_len + return arr + + +#cdef matching2str(Matching* m): +# cdef int i +# cdef result + +# result = "(" +# for i from m.start <= i < m.end: +# result = result + str(m.arr[i]) + " " +# result = result + ")" +# return result + + +cdef int median(int low, int high, int step): + return low + (((high - low)/step)/2)*step + + +cdef void findComparableMatchings(int low, int high, int* arr, int step, int loc, int* loc_minus, int* loc_plus): + # Returns (minus, plus) indices for the portion of the array + # in which all matchings have the same first index as the one + # starting at loc + loc_plus[0] = loc + step + while loc_plus[0] < high and arr[loc_plus[0]] == arr[loc]: + loc_plus[0] = loc_plus[0] + step + loc_minus[0] = loc + while loc_minus[0]-step >= low and arr[loc_minus[0]-step] == 
arr[loc]: + loc_minus[0] = loc_minus[0] - step + + +cdef class HieroCachingRuleFactory: + '''This RuleFactory implements a caching + method using TrieTable, which makes phrase + generation somewhat speedier -- phrases only + need to be extracted once (however, it is + quite possible they need to be scored + for each input sentence, for contextual models)''' + + cdef rules, grammar, context_manager + + cdef int max_chunks + cdef int max_target_chunks + cdef int max_length + cdef int max_target_length + cdef int max_nonterminals + cdef int max_initial_size + cdef int train_max_initial_size + cdef int min_gap_size + cdef int train_min_gap_size + cdef int category + + cdef cacheBetweenSents + cdef precomputed_index + cdef precomputed_collocations + cdef precompute_file + cdef max_rank + cdef int precompute_rank, precompute_secondary_rank + cdef useBaezaYates + cdef use_index + cdef use_collocations + cdef float by_slack_factor + + cdef per_sentence_grammar + cdef rule_filehandler + cdef rule_file + cdef pruned_rule_file + cdef extract_file + cdef sample_file + cdef search_file + cdef timingFile + cdef log_int_stats + cdef prev_norm_prefix + cdef float intersect_time, extract_time + cdef csuf.SuffixArray fsa + cdef cdat.DataArray fda + cdef cdat.DataArray eda + + cdef calignment.Alignment alignment + cdef cintlist.CIntList eid2symid + cdef cintlist.CIntList fid2symid + cdef int tight_phrases + cdef int require_aligned_terminal + cdef int require_aligned_chunks + + cdef cintlist.CIntList findexes + cdef cintlist.CIntList findexes1 + + cdef int excluded_sent_id # exclude a sentence id + + def __init__(self, + alignment=None, # compiled alignment object (REQUIRED) + by_slack_factor=1.0, # parameter for double-binary search; doesn't seem to matter much + category="[PHRASE]", # name of generic nonterminal used by Hiero + cacheBetweenSents=False, # prevent flushing of tree between sents; use carefully or you'll run out of memory + extract_file=None, # print raw extracted rules to this file + grammar=None, # empty grammar object -- must be supplied from outside (REQUIRED) + log_int_stats=False, # prints timing data on intersections to stderr + max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 + max_initial_size=10, # maximum span of a grammar rule in TEST DATA + max_length=5, # maximum number of symbols (both T and NT) allowed in a rule + max_nonterminals=2, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) + max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 + max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size + min_gap_size=2, # minimum span of a nonterminal in the RHS of a rule in TEST DATA + precompute_file=None, # filename of file containing precomputed collocations + precompute_secondary_rank=20, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). 
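+ # (patterns below this rank form the "super-frequent" subset used for triple collocations; values above precompute_rank have no additional effect)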
+ precompute_rank=100, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) + pruned_rule_file=None, # if specified, pruned grammars will be written to this filename + require_aligned_terminal=True, # require extracted rules to have at least one aligned word + require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word + per_sentence_grammar=True, # generate grammar files for each input segment + rule_file=None, # UNpruned grammars will be written to this filename + sample_file=None, # Sampling statistics will be written to this filename + search_file=None, # lookup statistics will be written to this filename + train_max_initial_size=10, # maximum span of a grammar rule extracted from TRAINING DATA + train_min_gap_size=2, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA + tight_phrases=False, # True if phrases should be tight, False otherwise (False == slower but better results) + timingFile=None, # timing statistics will be written to this filename + useBaezaYates=True, # True to require use of double-binary alg, false otherwise + use_collocations=True, # True to enable used of precomputed collocations + use_index=True # True to enable use of precomputed inverted indices + ): + '''Note: we make a distinction between the min_gap_size + and max_initial_size used in test and train. The latter + are represented by train_min_gap_size and train_max_initial_size, + respectively. This is because Chiang's model does not require + them to be the same, therefore we don't either.''' + self.rules = TrieTable(True) # cache + self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) + self.grammar = grammar + if alignment is None: + raise Exception("Must specify an alignment object") + self.alignment = alignment + + self.excluded_sent_id = -1 + + # grammar parameters and settings + # NOTE: setting max_nonterminals > 2 is not currently supported in Hiero + self.max_length = max_length + self.max_nonterminals = max_nonterminals + self.max_initial_size = max_initial_size + self.train_max_initial_size = train_max_initial_size + self.min_gap_size = min_gap_size + self.train_min_gap_size = train_min_gap_size + self.category = sym.fromstring(category) + + if max_chunks is None: + self.max_chunks = self.max_nonterminals + 1 + else: + self.max_chunks = max_chunks + + if max_target_chunks is None: + self.max_target_chunks = self.max_nonterminals + 1 + else: + self.max_target_chunks = max_target_chunks + + if max_target_length is None: + self.max_target_length = max_initial_size + else: + self.max_target_length = max_target_length + + # algorithmic parameters and settings + self.cacheBetweenSents = not per_sentence_grammar + self.precomputed_collocations = {} + self.precomputed_index = {} + self.use_index = use_index + self.use_collocations = use_collocations + self.max_rank = {} + self.precompute_file = precompute_file + self.precompute_rank = precompute_rank + self.precompute_secondary_rank = precompute_secondary_rank + self.useBaezaYates = useBaezaYates + self.by_slack_factor = by_slack_factor + if tight_phrases: + self.tight_phrases = 1 + else: + self.tight_phrases = 0 + + if require_aligned_chunks: + # one condition is a stronger version of the other. 
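+ # requiring an aligned word in every chunk implies at least one aligned terminal, so both flags are enabled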
+ self.require_aligned_chunks = 1 + self.require_aligned_terminal = 1 + elif require_aligned_terminal: + self.require_aligned_chunks = 0 + self.require_aligned_terminal = 1 + else: + self.require_aligned_chunks = 0 + self.require_aligned_terminal = 0 + + + self.per_sentence_grammar = per_sentence_grammar + if not self.per_sentence_grammar: + self.rule_filehandler = open(rule_file, "w") + # diagnostics + #if rule_file is None: + # self.rule_file = None + self.rule_file = rule_file + if extract_file is None: + self.extract_file = None + else: + self.extract_file = open(extract_file, "w") + if sample_file is None: + self.sample_file = None + else: + self.sample_file = open(sample_file, "w") + if search_file is None: + self.search_file = None + else: + self.search_file = open(search_file, "w") + self.pruned_rule_file = pruned_rule_file + self.timingFile = timingFile + self.log_int_stats = log_int_stats + self.prev_norm_prefix = () + + self.findexes = cintlist.CIntList(initial_len=10) + self.findexes1 = cintlist.CIntList(initial_len=10) + + def registerContext(self, context_manager): + '''This gives the RuleFactory access to the Context object. + Here we also use it to precompute the most expensive intersections + in the corpus quickly.''' + self.context_manager = context_manager + self.fsa = context_manager.fsarray + self.fda = self.fsa.darray + self.eda = context_manager.edarray + self.fid2symid = self.set_idmap(self.fda) + self.eid2symid = self.set_idmap(self.eda) + self.precompute() + + cdef set_idmap(self, cdat.DataArray darray): + cdef int word_id, new_word_id, N + cdef cintlist.CIntList idmap + + N = len(darray.id2word) + idmap = cintlist.CIntList(initial_len=N) + for word_id from 0 <= word_id < N: + new_word_id = sym.fromstring(darray.id2word[word_id], terminal=True) + idmap.arr[word_id] = new_word_id + return idmap + + + def pattern2phrase(self, pattern): + # pattern is a tuple, which we must convert to a hiero rule.Phrase + result = () + arity = 0 + for word_id in pattern: + if word_id == -1: + arity = arity + 1 + new_id = sym.setindex(self.category, arity) + else: + new_id = sym.fromstring(self.fda.id2word[word_id]) + result = result + (new_id,) + return rule.Phrase(result) + + def pattern2phrase_plus(self, pattern): + # returns a list containing both the pattern, and pattern + # suffixed/prefixed with the NT category. + patterns = [] + result = () + arity = 0 + for word_id in pattern: + if word_id == -1: + arity = arity + 1 + new_id = sym.setindex(self.category, arity) + else: + new_id = sym.fromstring(self.fda.id2word[word_id]) + result = result + (new_id,) + patterns.append(rule.Phrase(result)) + patterns.append(rule.Phrase(result + (sym.setindex(self.category, 1),))) + patterns.append(rule.Phrase((sym.setindex(self.category, 1),) + result)) + return patterns + + def precompute(self): + cdef precomputation.Precomputation pre + + if self.precompute_file is not None: + start_time = monitor.cpu() + log.write("Reading precomputed data from file %s... 
" % self.precompute_file, 1) + pre = precomputation.Precomputation(self.precompute_file, from_binary=True) + # check parameters of precomputation -- some are critical and some are not + if pre.max_nonterminals != self.max_nonterminals: + log.writeln("\nWARNING: Precomputation done with max nonterminals %d, decoder uses %d" % (pre.max_nonterminals, self.max_nonterminals)) + if pre.max_length != self.max_length: + log.writeln("\nWARNING: Precomputation done with max terminals %d, decoder uses %d" % (pre.max_length, self.max_length)) + if pre.train_max_initial_size != self.train_max_initial_size: + log.writeln("\nERROR: Precomputation done with max initial size %d, decoder uses %d" % (pre.train_max_initial_size, self.train_max_initial_size)) + raise Exception("Parameter mismatch with precomputed data") + if pre.train_min_gap_size != self.train_min_gap_size: + log.writeln("\nERROR: Precomputation done with min gap size %d, decoder uses %d" % (pre.train_min_gap_size, self.train_min_gap_size)) + raise Exception("Parameter mismatch with precomputed data") + log.writeln("done.", 1) + if self.use_index: + log.write("Converting %d hash keys on precomputed inverted index... " % (len(pre.precomputed_index)), 1) + for pattern, arr in pre.precomputed_index.iteritems(): + phrases = self.pattern2phrase_plus(pattern) + for phrase in phrases: + self.precomputed_index[phrase] = arr + log.writeln("done.", 1) + if self.use_collocations: + log.write("Converting %d hash keys on precomputed collocations... " % (len(pre.precomputed_collocations)), 1) + for pattern, arr in pre.precomputed_collocations.iteritems(): + phrase = self.pattern2phrase(pattern) + self.precomputed_collocations[phrase] = arr + log.writeln("done.", 1) + stop_time = monitor.cpu() + log.writeln("Processing precomputations took %f seconds" % (stop_time - start_time), 1) + + + def getPrecomputedCollocation(self, phrase): + if phrase in self.precomputed_collocations: + arr = self.precomputed_collocations[phrase] + return PhraseLocation(arr=arr, arr_low=0, arr_high=len(arr), num_subpatterns=phrase.arity()+1) + return None + + + cdef int* baezaYatesHelper(self, int low1, int high1, int* arr1, int step1, + int low2, int high2, int* arr2, int step2, + int offset_by_one, int len_last, int num_subpatterns, int* result_len): + cdef int i1, i2, j1, j2, med1, med2, med1_plus, med1_minus, med2_minus, med2_plus + cdef int d_first, qsetsize, dsetsize, tmp, search_low, search_high + cdef int med_result_len, low_result_len, high_result_len + cdef long comparison + cdef int* result + cdef int* low_result + cdef int* med_result + cdef int* high_result + cdef Matching loc1, loc2 + + result = malloc(0*sizeof(int*)) +# log.writeln("%sBY: [%d, %d, %d] [%d, %d, %d]" % (pad, low1, high1, step1, low2, high2, step2,), 5) + + d_first = 0 + if high1 - low1 > high2 - low2: +# log.writeln("%sD first" % (pad), 5) + d_first = 1 +# else: +# log.writeln("%sQ first" % (pad), 5) + +# '''First, check to see if we are at any of the +# recursive base cases''' +# +# '''Case 1: one of the sets is empty''' + if low1 >= high1 or low2 >= high2: +# log.writeln("%sRETURN: set is empty" % (pad), 5) + return result + +# '''Case 2: sets are non-overlapping''' + assign_matching(&loc1, arr1, high1-step1, step1, self.fda.sent_id.arr) + assign_matching(&loc2, arr2, low2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == -1: +# log.writeln("%s %s < %s" % (pad, tuple(arr1[high1-step1:high1]), tuple(arr2[low2:low2+step2])),5) +# log.writeln("%sRETURN: 
non-overlapping sets" % (pad), 5) + return result + + assign_matching(&loc1, arr1, low1, step1, self.fda.sent_id.arr) + assign_matching(&loc2, arr2, high2-step2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: +# log.writeln("%s %s > %s" % (pad, tuple(arr1[low1:low1+step1]), tuple(arr2[high2-step2:high2])),5) +# log.writeln("%sRETURN: non-overlapping sets" % (pad), 5) + return result + + # Case 3: query set and data set do not meet size mismatch constraints; + # We use mergesort instead in this case + qsetsize = (high1-low1) / step1 + dsetsize = (high2-low2) / step2 + if d_first: + tmp = qsetsize + qsetsize = dsetsize + dsetsize = tmp + + if self.by_slack_factor * qsetsize * cmath.log(dsetsize) / cmath.log(2) > dsetsize: + free(result) + return self.mergeHelper(low1, high1, arr1, step1, low2, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, result_len) + + # binary search. There are two flavors, depending on + # whether the queryset or dataset is first + if d_first: + med2 = median(low2, high2, step2) + assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) + + search_low = low1 + search_high = high1 + while search_low < search_high: + med1 = median(search_low, search_high, step1) + findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) + comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) + if comparison == -1: + search_low = med1_plus + elif comparison == 1: + search_high = med1_minus + else: + break + else: + med1 = median(low1, high1, step1) + findComparableMatchings(low1, high1, arr1, step1, med1, &med1_minus, &med1_plus) + + search_low = low2 + search_high = high2 + while search_low < search_high: + med2 = median(search_low, search_high, step2) + assign_matching(&loc2, arr2, med2, step2, self.fda.sent_id.arr) + comparison = self.compareMatchingsSet(med1_minus, med1_plus, arr1, step1, &loc2, offset_by_one, len_last) + if comparison == -1: + search_high = med2 + elif comparison == 1: + search_low = med2 + step2 + else: + break + + med_result_len = 0 + med_result = malloc(0*sizeof(int*)) + if search_high > search_low: +# '''Then there is a match for the median element of Q''' +# +# '''What we want to find is the group of all bindings in the first set +# s.t. their first element == the first element of med1. Then we +# want to store the bindings for all of those elements. 
We can +# subsequently throw all of them away.''' + med2_minus = med2 + med2_plus = med2 + step2 + i1 = med1_minus + while i1 < med1_plus: + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) + while med2_minus-step2 >= low2: + assign_matching(&loc2, arr2, med2_minus-step2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) < 1: + med2_minus = med2_minus - step2 + else: + break + i2 = med2_minus + while i2 < high2: + assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) + comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) + if comparison == 0: + pass + med_result = append_combined_matching(med_result, &loc1, &loc2, offset_by_one, num_subpatterns, &med_result_len) + if comparison == -1: + break + i2 = i2 + step2 + if i2 > med2_plus: + med2_plus = i2 + i1 = i1 + step1 + + tmp = med1_minus + med1_minus = med1_plus + med1_plus = tmp + else: + # No match; need to figure out the point of division in D and Q + med2_minus = med2 + med2_plus = med2 + if d_first: + med2_minus = med2_minus + step2 + if comparison == -1: + med1_minus = med1_plus + if comparison == 1: + med1_plus = med1_minus + else: + tmp = med1_minus + med1_minus = med1_plus + med1_plus = tmp + if comparison == 1: + med2_minus = med2_minus + step2 + med2_plus = med2_plus + step2 + + low_result_len = 0 + low_result = self.baezaYatesHelper(low1, med1_plus, arr1, step1, low2, med2_plus, arr2, step2, offset_by_one, len_last, num_subpatterns, &low_result_len) + high_result_len = 0 + high_result = self.baezaYatesHelper(med1_minus, high1, arr1, step1, med2_minus, high2, arr2, step2, offset_by_one, len_last, num_subpatterns, &high_result_len) + + result = extend_arr(result, result_len, low_result, low_result_len) + result = extend_arr(result, result_len, med_result, med_result_len) + result = extend_arr(result, result_len, high_result, high_result_len) + free(low_result) + free(med_result) + free(high_result) + + return result + + + + cdef long compareMatchingsSet(self, int i1_minus, int i1_plus, int* arr1, int step1, + Matching* loc2, int offset_by_one, int len_last): +# '''Compares a *set* of bindings, all with the same first element, +# to a single binding. 
Returns -1 if all comparisons == -1, 1 if all +# comparisons == 1, and 0 otherwise.''' + cdef int i1, comparison, prev_comparison + cdef Matching l1_stack + cdef Matching* loc1 + + loc1 = &l1_stack + + i1 = i1_minus + while i1 < i1_plus: + assign_matching(loc1, arr1, i1, step1, self.fda.sent_id.arr) + comparison = self.compare_matchings(loc1, loc2, offset_by_one, len_last) + if comparison == 0: + prev_comparison = 0 + break + elif i1 == i1_minus: + prev_comparison = comparison + else: + if comparison != prev_comparison: + prev_comparison = 0 + break + i1 = i1 + step1 + return prev_comparison + + + cdef long compare_matchings(self, Matching* loc1, Matching* loc2, int offset_by_one, int len_last): + cdef int i + + if loc1.sent_id > loc2.sent_id: + return 1 + if loc2.sent_id > loc1.sent_id: + return -1 + + if loc1.size == 1 and loc2.size == 1: + if loc2.arr[loc2.start] - loc1.arr[loc1.start] <= self.train_min_gap_size: + return 1 + + elif offset_by_one: + for i from 1 <= i < loc1.size: + if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i-1]: + return 1 + if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i-1]: + return -1 + + else: + if loc1.arr[loc1.start]+1 > loc2.arr[loc2.start]: + return 1 + if loc1.arr[loc1.start]+1 < loc2.arr[loc2.start]: + return -1 + + for i from 1 <= i < loc1.size: + if loc1.arr[loc1.start+i] > loc2.arr[loc2.start+i]: + return 1 + if loc1.arr[loc1.start+i] < loc2.arr[loc2.start+i]: + return -1 + + if loc2.arr[loc2.end-1] + len_last - loc1.arr[loc1.start] > self.train_max_initial_size: + return -1 + return 0 + + + cdef int* mergeHelper(self, int low1, int high1, int* arr1, int step1, + int low2, int high2, int* arr2, int step2, + int offset_by_one, int len_last, int num_subpatterns, int* result_len): + cdef int i1, i2, j1, j2 + cdef long comparison + cdef int* result + cdef Matching loc1, loc2 +# cdef int i + +# pad = " " +# log.writeln("->mergeHelper", 5) + + result_len[0] = 0 + result = malloc(0*sizeof(int)) + + i1 = low1 + i2 = low2 +# if log.level==5: +# log.writeln("%sMERGE lists [%d,%d,%d] and [%d,%d,%d]" % (pad,low1,high1,step1,low2,high2,step2), 5) +# log.writeln("%soffset_by_one: %d, len_last: %d" % (pad, offset_by_one, len_last), 5) +# log.write("[") +# for i from low1 <= i < high1: +# log.write("%d, " % arr1.arr[i],5) +# log.writeln("]") +# log.write("[") +# for i from low2 <= i < high2: +# log.write("%d, " % arr2.arr[i],5) +# log.writeln("]") + while i1 < high1 and i2 < high2: + +# '''First, pop all unneeded loc2's off the stack''' + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) +# if log.level==5: +# log.writeln("%s TOP1 %s" % (pad,matching2str(loc1)),5) + while i2 < high2: + assign_matching(&loc2, arr2, i2, step2, self.fda.sent_id.arr) + if self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) == 1: +# if log.level==5: +# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) +# log.writeln("%s POP2 %s" % (pad,matching2str(loc2)),5) + i2 = i2 + step2 + else: + break + +# '''Next: process all loc1's with the same starting val''' + j1 = i1 + while i1 < high1 and arr1[j1] == arr1[i1]: + assign_matching(&loc1, arr1, i1, step1, self.fda.sent_id.arr) + j2 = i2 + while j2 < high2: + assign_matching(&loc2, arr2, j2, step2, self.fda.sent_id.arr) + comparison = self.compare_matchings(&loc1, &loc2, offset_by_one, len_last) + if comparison == 0: +# if log.level==5: +# log.writeln("%s %s == %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + result = append_combined_matching(result, &loc1, &loc2, offset_by_one, 
num_subpatterns, result_len) + if comparison == 1: +# if log.level==5: +# log.writeln("%s %s > %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + pass + if comparison == -1: +# if log.level==5: +# log.writeln("%s %s < %s" % (pad,matching2str(loc1),matching2str(loc2)),5) + break + else: + j2 = j2 + step2 +# if log.level==5: +# log.writeln("%s POP1 %s" % (pad,matching2str(loc1)),5) + i1 = i1 + step1 + +# log.writeln("<-mergeHelper", 5) + return result + + + cdef void sortPhraseLoc(self, cintlist.CIntList arr, PhraseLocation loc, rule.Phrase phrase): + cdef int i, j + cdef cveb.VEB veb + cdef cintlist.CIntList result + + if phrase in self.precomputed_index: + loc.arr = self.precomputed_index[phrase] + else: + loc.arr = cintlist.CIntList(initial_len=loc.sa_high-loc.sa_low) + veb = cveb.VEB(arr.len) + for i from loc.sa_low <= i < loc.sa_high: + veb._insert(arr.arr[i]) + i = veb.veb.min_val + for j from 0 <= j < loc.sa_high-loc.sa_low: + loc.arr.arr[j] = i + i = veb._findsucc(i) + loc.arr_low = 0 + loc.arr_high = loc.arr.len + + + cdef intersectHelper(self, rule.Phrase prefix, rule.Phrase suffix, + PhraseLocation prefix_loc, PhraseLocation suffix_loc, int algorithm): + + cdef cintlist.CIntList arr1, arr2, result + cdef int low1, high1, step1, low2, high2, step2, offset_by_one, len_last, num_subpatterns, result_len + cdef int* result_ptr + cdef csuf.SuffixArray suf + + result_len = 0 + + if sym.isvar(suffix[0]): + offset_by_one = 1 + else: + offset_by_one = 0 + + len_last = len(suffix.getchunk(suffix.arity())) + + if prefix_loc.arr is None: + suf = self.context_manager.fsarray + self.sortPhraseLoc(suf.sa, prefix_loc, prefix) + arr1 = prefix_loc.arr + low1 = prefix_loc.arr_low + high1 = prefix_loc.arr_high + step1 = prefix_loc.num_subpatterns + + if suffix_loc.arr is None: + suf = self.context_manager.fsarray + self.sortPhraseLoc(suf.sa, suffix_loc, suffix) + arr2 = suffix_loc.arr + low2 = suffix_loc.arr_low + high2 = suffix_loc.arr_high + step2 = suffix_loc.num_subpatterns + + num_subpatterns = prefix.arity()+1 + + if algorithm == MERGE: + result_ptr = self.mergeHelper(low1, high1, arr1.arr, step1, + low2, high2, arr2.arr, step2, + offset_by_one, len_last, num_subpatterns, &result_len) + else: + result_ptr = self.baezaYatesHelper(low1, high1, arr1.arr, step1, + low2, high2, arr2.arr, step2, + offset_by_one, len_last, num_subpatterns, &result_len) + + if result_len == 0: + free(result_ptr) + return None + else: + result = cintlist.CIntList() + free(result.arr) + result.arr = result_ptr + result.len = result_len + result.size = result_len + return PhraseLocation(arr_low=0, arr_high=result_len, arr=result, num_subpatterns=num_subpatterns) + + cdef loc2str(self, PhraseLocation loc): + cdef int i, j + result = "{" + i = 0 + while i < loc.arr_high: + result = result + "(" + for j from i <= j < i + loc.num_subpatterns: + result = result + ("%d " %loc.arr[j]) + result = result + ")" + i = i + loc.num_subpatterns + result = result + "}" + return result + +# cdef compareResults(self, PhraseLocation loc1, PhraseLocation loc2, phrase, type1, type2): +# cdef i +# if loc1 is None and type1=="pre": +# return +# if loc1 is None: +# if loc2 is None or loc2.arr_high == 0: +# return +# if loc2 is None: +# if loc1.arr_high == 0: +# return +# if loc1.arr_high != loc2.arr_high: +# log.writeln("ERROR: %d vs %d (%s vs %s)" % (loc1.arr_high, loc2.arr_high, type1, type2)) +# #log.writeln(" %s" % self.loc2str(loc2)) +# if loc1.arr_high == 0: +# return +# elif loc1.num_subpatterns != loc2.num_subpatterns: +# 
log.writeln("ERROR 2: %d vs %d (%d v %d) %s" % (loc1.num_subpatterns, loc2.num_subpatterns, loc1.arr_high, loc2.arr_high, phrase)) +# for i from 0 <= i < loc1.arr_high: +# if loc1.arr[i] != loc2.arr[i]: +# log.writeln("ERROR 3") +# + cdef PhraseLocation intersect(self, prefix_node, suffix_node, rule.Phrase phrase): + cdef rule.Phrase prefix, suffix + cdef PhraseLocation prefix_loc, suffix_loc, result + + start_time = monitor.cpu() + prefix = prefix_node.phrase + suffix = suffix_node.phrase + prefix_loc = prefix_node.phrase_location + suffix_loc = suffix_node.phrase_location + + result = self.getPrecomputedCollocation(phrase) + if result is not None: + intersect_method = "precomputed" + + if result is None: + if self.useBaezaYates: + result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, BAEZA_YATES) + intersect_method="double binary" + else: + result = self.intersectHelper(prefix, suffix, prefix_loc, suffix_loc, MERGE) + intersect_method="merge" + stop_time = monitor.cpu() + intersect_time = stop_time - start_time + if self.log_int_stats: + if intersect_method == "precomputed": + sort1 = "none" + sort2 = "none" + else: + if prefix in self.precomputed_index: + sort1 = "index" + else: + sort1 = "veb" + if suffix in self.precomputed_index: + sort2 = "index" + else: + sort2 = "veb" + result_len=0 + if result is not None: + result_len = len(result.arr)/result.num_subpatterns + rank = 0 +# if phrase in self.max_rank: +# rank = self.max_rank[phrase] +# else: +# rank = self.precompute_rank + 10 + log.writeln("INT %d %d %d %d %d %f %d %s %s %s" % + (len(prefix)+1 - prefix.arity(), prefix.arity(), + nGramCount(prefix_node.phrase_location), + nGramCount(suffix_node.phrase_location), + result_len, intersect_time, rank, intersect_method, sort1, sort2)) + return result + + def advance(self, frontier, res, fwords): + nf = [] + for (toskip, (i, alt, pathlen)) in frontier: + spanlen = fwords[i][alt][2] + if (toskip == 0): + #log.writeln("RES: (%d %d %d)" % (i, alt, pathlen), 3) + res.append((i, alt, pathlen)) + ni = i + spanlen + #log.writeln("proc: %d (%d %d %d) sl=%d ni=%d len(fwords)=%d" % (toskip, i, alt, pathlen, spanlen, ni, len(fwords)), 3) + if (ni < len(fwords) and (pathlen + 1) < self.max_initial_size): + for na in xrange(len(fwords[ni])): + nf.append((toskip - 1, (ni, na, pathlen + 1))) + if (len(nf) > 0): + return self.advance(nf, res, fwords) + else: + return res + + def get_all_nodes_isteps_away(self, skip, i, spanlen, pathlen, fwords, next_states, reachable_buffer): + frontier = [] + if (i+spanlen+skip >= len(next_states)): + return frontier + #print "get_all_nodes_isteps_away from %i" % (i) + key = tuple([i,spanlen]) + reachable = [] + if (key in reachable_buffer): + reachable = reachable_buffer[key] + else: + reachable = self.reachable(fwords, i, spanlen) + reachable_buffer[key] = reachable + #print "reachable(from=%i,dist=%i) = " % (i,spanlen) + #print reachable + for nextreachable in reachable: + for next_id in next_states[nextreachable]: + jump = self.shortest(fwords,i,next_id) + #print "checking next_id = %i, pathlen[sofar] = %i, jump = %i" % (next_id,pathlen,jump) + #if (next_id - (i+spanlen)) < skip: + if jump < skip: + continue + #if next_id-(i-pathlen) < self.max_initial_size: + if pathlen+jump <= self.max_initial_size: + for alt_id in xrange(len(fwords[next_id])): + if (fwords[next_id][alt_id][0] != cn.epsilon): + #frontier.append((next_id,alt_id,next_id-(i-pathlen))); + #print "finding the shortest from %i to %i" % (i, next_id) + newel = 
(next_id,alt_id,pathlen+jump) + if newel not in frontier: + frontier.append((next_id,alt_id,pathlen+jump)) + #print "appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) + #else: + #print "NOT appending to frontier = next_id=%i, alt_id=%i, pathlen=%i" % (next_id,alt_id,pathlen+jump) + #else: + #print "next_id = %s is aborted\n" % next_id + #print "returning frontier" + #print frontier + return frontier + + def reachable(self, fwords, ifrom, dist): + #print "inside reachable(%i,%i)" % (ifrom,dist) + ret = [] + if (ifrom >= len(fwords)): + return ret + for alt_id in xrange(len(fwords[ifrom])): + if (fwords[ifrom][alt_id][0] == cn.epsilon): + ret.extend(self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist)) + else: + if (dist==0): + if (ifrom not in ret): + ret.append(ifrom) + else: + for ifromchild in self.reachable(fwords,ifrom+fwords[ifrom][alt_id][2],dist-1): + if (ifromchild not in ret): + ret.append(ifromchild) + + return ret + + def shortest(self, fwords, ifrom, ito): + min = 1000 + #print "shortest ifrom=%i, ito=%i" % (ifrom,ito) + if (ifrom > ito): + return min + if (ifrom == ito): + return 0 + for alt_id in xrange(len(fwords[ifrom])): + currmin = self.shortest(fwords,ifrom+fwords[ifrom][alt_id][2],ito) + if (fwords[ifrom][alt_id][0] != cn.epsilon): + currmin += 1 + if (currmin 0: + curr = candidate.pop() + if curr[0] >= len(_columns): + continue + if curr[0] not in result and min_dist <= curr[1] <= self.max_initial_size: + result.append(curr[0]); + curr_col = _columns[curr[0]] + for alt in curr_col: + next_id = curr[0]+alt[2] + jump = 1 + if (alt[0] == cn.epsilon): + jump = 0 + if next_id not in result and min_dist <= curr[1]+jump <= self.max_initial_size+1: + candidate.append([next_id,curr[1]+jump]) + return sorted(result); + + def input(self, fwords, meta): + '''When this function is called on the RuleFactory, + it looks up all of the rules that can be used to translate + the input sentence''' + cdef int i, j, k, flen, arity, num_subpatterns, num_samples + cdef float start_time + cdef PhraseLocation phrase_location + cdef cintlist.CIntList sample, chunklen + cdef Matching matching + cdef rule.Phrase hiero_phrase + + #fwords = [ ((1,0.0,1),), fwords1 ] #word id for = 1, cost = 0.0, next = 1 + #print fwords + flen = len(fwords) + #print "length = %i" % flen + start_time = monitor.cpu() + self.intersect_time = 0.0 + self.extract_time = 0.0 + nodes_isteps_away_buffer = {} + hit = 0 + reachable_buffer = {} + #print "id = ",meta + #print "rule_file = ",self.rule_file + dattrs = sgml.attrs_to_dict(meta) + id = dattrs.get('id', 'NOID') + if self.per_sentence_grammar: + self.rule_filehandler = open(self.rule_file+'.'+id, 'w') + self.excluded_sent_id = int(dattrs.get('exclude', '-1')) + + #print "max_initial_size = %i" % self.max_initial_size + + if not self.cacheBetweenSents: + self.rules.root = ExtendedTrieNode(phrase_location=PhraseLocation()) + self.grammar.root = [None, {}] + + frontier = [] + for i in xrange(len(fwords)): + for alt in xrange(0, len(fwords[i])): + if fwords[i][alt][0] != cn.epsilon: + frontier.append((i, i, alt, 0, self.rules.root, (), False)) + + xroot = None + x1 = sym.setindex(self.category, 1) + if x1 in self.rules.root.children: + xroot = self.rules.root.children[x1] + else: + xroot = ExtendedTrieNode(suffix_link=self.rules.root, phrase_location=PhraseLocation()) + self.rules.root.children[x1] = xroot + + for i in xrange(self.min_gap_size, len(fwords)): + for alt in xrange(0, len(fwords[i])): + if fwords[i][alt][0] != 
cn.epsilon: + frontier.append((i-self.min_gap_size, i, alt, self.min_gap_size, xroot, (x1,), True)) + '''for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: + if len(prefix)>0: + print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path + else: + print k, i, alt, pathlen, node, prefix, is_shadow_path''' + + #for wid in xrange(1000): + # print "%i = %s" % (wid, sym.tostring(wid)) + next_states = [] + for i in xrange(len(fwords)): + next_states.append(self.get_next_states(fwords,i,self.min_gap_size)) + #print "next state of %i" % i + #print next_states[i] + + while len(frontier) > 0: + #print "frontier = %i" % len(frontier) + this_iter_intersect_time = self.intersect_time + new_frontier = [] + for k, i, alt, pathlen, node, prefix, is_shadow_path in frontier: + #print "looking at: " + #if len(prefix)>0: + # print k, i, alt, pathlen, node, map(sym.tostring,prefix), is_shadow_path + #else: + # print k, i, alt, pathlen, node, prefix, is_shadow_path + word_id = fwords[i][alt][0] + spanlen = fwords[i][alt][2] + #print "word_id = %i, %s" % (word_id, sym.tostring(word_id)) + # to prevent .. [X] + #print "prefix = ",prefix + #if word_id == 2 and len(prefix)>=2: + #print "at the end: %s" % (prefix[len(prefix)-1]) + #if prefix[len(prefix)-1]<0: + #print "break" + #continue + #print "continuing" + #if pathlen + spanlen > self.max_initial_size: + #continue + # TODO get rid of k -- pathlen is replacing it + if word_id == cn.epsilon: + #print "skipping because word_id is epsilon" + if i+spanlen >= len(fwords): + continue + for nualt in xrange(0,len(fwords[i+spanlen])): + frontier.append((k, i+spanlen, nualt, pathlen, node, prefix, is_shadow_path)) + continue + + phrase = prefix + (word_id,) + str_phrase = map(sym.tostring, phrase) + hiero_phrase = rule.Phrase(phrase) + arity = hiero_phrase.arity() + + #print "pos %2d, node %5d, '%s'" % (i, node.id, hiero_phrase) + if self.search_file is not None: + self.search_file.write("%s\n" % hiero_phrase) + + lookup_required = False + if word_id in node.children: + if node.children[word_id] is None: + #print "Path dead-ends at this node\n" + continue + else: + #print "Path continues at this node\n" + node = node.children[word_id] + else: + if node.suffix_link is None: + #print "Current node is root; lookup required\n" + lookup_required = True + else: + if word_id in node.suffix_link.children: + if node.suffix_link.children[word_id] is None: + #print "Suffix link reports path is dead end\n" + node.children[word_id] = None + continue + else: + #print "Suffix link indicates lookup is reqired\n" + lookup_required = True + else: + #print "ERROR: We never get here\n" + raise Exception("Keyword trie error") + #new_frontier.append((k, i, alt, pathlen, node, prefix, is_shadow_path)) + #print "checking whether lookup_required\n" + if lookup_required: + new_node = None + if is_shadow_path: + #print "Extending shadow path for %s \n" + # on the shadow path we don't do any search, we just use info from suffix link + new_node = ExtendedTrieNode(phrase_location=node.suffix_link.children[word_id].phrase_location, + suffix_link=node.suffix_link.children[word_id], + phrase=hiero_phrase) + else: + if arity > 0: + #print "Intersecting for %s because of arity > 0\n" % hiero_phrase + phrase_location = self.intersect(node, node.suffix_link.children[word_id], hiero_phrase) + else: + #print "Suffix array search for %s" % hiero_phrase + phrase_location = node.phrase_location + sa_range = self.context_manager.fsarray.lookup(str_phrase[-1], len(str_phrase)-1, 
phrase_location.sa_low, phrase_location.sa_high) + if sa_range is not None: + phrase_location = PhraseLocation(sa_low=sa_range[0], sa_high=sa_range[1]) + else: + phrase_location = None + + if phrase_location is None: + node.children[word_id] = None + #print "Search failed\n" + continue + #print "Search succeeded\n" + suffix_link = self.rules.root + if node.suffix_link is not None: + suffix_link = node.suffix_link.children[word_id] + new_node = ExtendedTrieNode(phrase_location=phrase_location, + suffix_link=suffix_link, + phrase=hiero_phrase) + node.children[word_id] = new_node + node = new_node + #print "Added node %d with suffix link %d\n" % (node.id, node.suffix_link.id) + + '''Automatically add a trailing X node, if allowed -- + This should happen before we get to extraction (so that + the node will exist if needed)''' + if arity < self.max_nonterminals: + xcat_index = arity+1 + xcat = sym.setindex(self.category, xcat_index) + suffix_link_xcat_index = xcat_index + if is_shadow_path: + suffix_link_xcat_index = xcat_index-1 + suffix_link_xcat = sym.setindex(self.category, suffix_link_xcat_index) + node.children[xcat] = ExtendedTrieNode(phrase_location=node.phrase_location, + suffix_link=node.suffix_link.children[suffix_link_xcat], + phrase= rule.Phrase(phrase + (xcat,))) + #log.writeln("Added node %d with suffix link %d (for X)" % (node.children[xcat].id, node.children[xcat].suffix_link.id), 4) + + # sample from range + if not is_shadow_path: + #print "is_not_shadow_path" + sample = self.context_manager.sampler.sample(node.phrase_location) + #print "node.phrase_location %s" % str(node.phrase_location) + #print "sample.len = %i" % len(sample) + num_subpatterns = ( node.phrase_location).num_subpatterns + chunklen = cintlist.CIntList(initial_len=num_subpatterns) + for j from 0 <= j < num_subpatterns: + chunklen.arr[j] = hiero_phrase.chunklen(j) + extracts = [] + j = 0 + extract_start = monitor.cpu() + '''orig_tight_phrases = self.tight_phrases + orig_require_aligned_terminal = self.require_aligned_terminal + orig_require_aligned_chunks = self.require_aligned_chunks + if k==0 or i==len(fwords)-1: + self.tight_phrases = 0 + self.require_aligned_terminal = 0 + self.require_aligned_chunks = 0''' + while j < sample.len: + extract = [] + + assign_matching(&matching, sample.arr, j, num_subpatterns, self.fda.sent_id.arr) + '''print "tight_phrase = " + print self.tight_phrases + print "require_aligned_terminal = " + print self.require_aligned_terminal + print "require_aligned_chunks = " + print self.require_aligned_chunks''' + + extract = self.extract(hiero_phrase, &matching, chunklen.arr, num_subpatterns) + extracts.extend(extract) + j = j + num_subpatterns + '''self.tight_phrases = orig_tight_phrases + sttice+sa.nw.normelf.require_aligned_terminal = orig_require_aligned_terminal + self.require_aligned_chunks = orig_require_aligned_chunks''' + num_samples = sample.len/num_subpatterns + extract_stop = monitor.cpu() + self.extract_time = self.extract_time + extract_stop - extract_start + #print "extract.size = %i" % len(extracts) + if len(extracts) > 0: + fphrases = {} + fals = {} + fcount = {} + for f, e, count, als in extracts: + fcount.setdefault(f, 0.0) + fcount[f] = fcount[f] + count + fphrases.setdefault(f, {}) + fphrases[f].setdefault(e, {}) + #fphrases[f][e] = fphrases[f][e] + count + fphrases[f][e].setdefault(als,0.0) + fphrases[f][e][als] = fphrases[f][e][als] + count + #print "f,e,als ",f," : ",e," : ",als," count = ",fphrases[f][e][als] + #fals[str(f)+" ||| "+str(e)] = als + for f, elist 
in fphrases.iteritems(): + #print "f = '%s'" % f + #if (str(f) in ['','',' [X,1]','[X,1] ']): + # print "rejected" + # continue + f_margin = fcount[f] + for e, alslist in elist.iteritems(): + alignment = None + count = 0 + for als, currcount in alslist.iteritems(): + #print "als = ",als,", count = ",currcount + if currcount > count: + alignment = als + count = currcount + #alignment = fals[str(f)+" ||| "+str(e)] + #print "selected = ",alignment," with count = ",count + scores = [] + for m in self.context_manager.models: + scores.append(m.compute_contextless_score(f, e, count, fcount[f], num_samples)) + r = rule.Rule(self.category, f, e, scores=scores, owner="context", word_alignments = alignment) + self.grammar.add(r) + if self.rule_filehandler is not None: + self.rule_filehandler.write("%s\n" % r.to_line()) + #print "adding a rule = %s" % r + + #if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+spanlen < self.max_initial_size: + if len(phrase) < self.max_length and i+spanlen < len(fwords) and pathlen+1 <= self.max_initial_size: + #to prevent [X] + #print "lexicalized" + for alt_id in xrange(len(fwords[i+spanlen])): + #if (fwords[i+spanlen][alt_id][2]+pathlen+spanlen <= self.max_initial_size): + #new_frontier.append((k, i+spanlen, alt_id, pathlen + spanlen, node, phrase, is_shadow_path)) + #print "alt_id = %d\n" % alt_id + new_frontier.append((k, i+spanlen, alt_id, pathlen + 1, node, phrase, is_shadow_path)) + #print (k, i+spanlen, alt_id, pathlen + spanlen, node, map(sym.tostring,phrase), is_shadow_path) + #print "end lexicalized" + num_subpatterns = arity + if not is_shadow_path: + num_subpatterns = num_subpatterns + 1 + #to avoid X ... we want next to a lexicalized item + #if k>0 and i X_1 w X_2 / X_1 X_2. This is probably + # not worth the bother, though. 
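The code below is the heart of find_fixpoint: it repeatedly projects a source-side span onto the target side and back, widening both spans until neither changes, while enforcing length, gap, and tightness limits along the way. As a minimal sketch of just that back-and-forth projection (hypothetical helper names, no constraint checks, half-open spans, and assuming the alignment-link arrays are built the same way as in extract below):

    def project(low, high, links_low, links_high):
        # Tightest span on the opposite side covering every link out of [low, high).
        # links_low[i] / links_high[i] hold the min / max+1 aligned position of i, or -1.
        out_low, out_high = -1, -1
        for i in range(low, high):
            if links_low[i] == -1:          # position i is unaligned
                continue
            if out_low == -1 or links_low[i] < out_low:
                out_low = links_low[i]
            if out_high == -1 or links_high[i] > out_high:
                out_high = links_high[i]
        return out_low, out_high

    def consistent_spans(f_low, f_high, f_links_low, f_links_high,
                         e_links_low, e_links_high):
        # Alternate f->e and e->f projections until the f span stops growing.
        e_low, e_high = project(f_low, f_high, f_links_low, f_links_high)
        if e_low == -1:                     # span is completely unaligned
            return None
        while True:
            back_low, back_high = project(e_low, e_high, e_links_low, e_links_high)
            back_low, back_high = min(back_low, f_low), max(back_high, f_high)
            if (back_low, back_high) == (f_low, f_high):
                return (f_low, f_high), (e_low, e_high)
            f_low, f_high = back_low, back_high
            e_low, e_high = project(f_low, f_high, f_links_low, f_links_high)

The actual method additionally enforces the max-initial-size, min-gap, and tight-phrase constraints on every widening step, writes its results through the e_low/e_high and f_back_low/f_back_high out-parameters, and returns 1 on success or 0 as soon as a constraint fails.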
+ #print "find_fixpoint0" + return 0 + elif e_in_low != -1 and e_low[0] != e_in_low: + if e_in_low - e_low[0] < min_ex_size: + e_low[0] = e_in_low - min_ex_size + if e_low[0] < 0: + #print "find_fixpoint1" + return 0 + + if e_high[0] - e_low[0] > max_e_len: + #print "find_fixpoint2" + return 0 + elif e_in_high != -1 and e_high[0] != e_in_high: + if e_high[0] - e_in_high < min_ex_size: + e_high[0] = e_in_high + min_ex_size + if e_high[0] > e_sent_len: + #print "find_fixpoint3" + return 0 + + f_back_low[0] = -1 + f_back_high[0] = -1 + f_low_prev = f_low + f_high_prev = f_high + new_x = 0 + new_low_x = 0 + new_high_x = 0 + + while True: + + if f_back_low[0] == -1: + self.find_projection(e_low[0], e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) + else: + self.find_projection(e_low[0], e_low_prev, e_links_low, e_links_high, f_back_low, f_back_high) + self.find_projection(e_high_prev, e_high[0], e_links_low, e_links_high, f_back_low, f_back_high) + + if f_back_low[0] > f_low: + f_back_low[0] = f_low + + if f_back_high[0] < f_high: + f_back_high[0] = f_high + + if f_back_low[0] == f_low_prev and f_back_high[0] == f_high_prev: + return 1 + + if allow_low_x == 0 and f_back_low[0] < f_low: +# log.writeln(" FAIL: f phrase is not tight") + #print " FAIL: f phrase is not tight" + return 0 + + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: f back projection is too wide") + #print " FAIL: f back projection is too wide" + return 0 + + if allow_high_x == 0 and f_back_high[0] > f_high: +# log.writeln(" FAIL: extension on high side not allowed") + #print " FAIL: extension on high side not allowed" + return 0 + + if f_low != f_back_low[0]: + if new_low_x == 0: + if new_x >= max_new_x: +# log.writeln(" FAIL: extension required on low side violates max # of gaps") + #print " FAIL: extension required on low side violates max # of gaps" + return 0 + else: + new_x = new_x + 1 + new_low_x = 1 + if f_low - f_back_low[0] < min_fx_size: + f_back_low[0] = f_low - min_fx_size + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: extension required on low side violates max initial length") + #print " FAIL: extension required on low side violates max initial length" + return 0 + if f_back_low[0] < 0: +# log.writeln(" FAIL: extension required on low side violates sentence boundary") + #print " FAIL: extension required on low side violates sentence boundary" + return 0 + + if f_high != f_back_high[0]: + if new_high_x == 0: + if new_x >= max_new_x: +# log.writeln(" FAIL: extension required on high side violates max # of gaps") + #print " FAIL: extension required on high side violates max # of gaps" + return 0 + else: + new_x = new_x + 1 + new_high_x = 1 + if f_back_high[0] - f_high < min_fx_size: + f_back_high[0] = f_high + min_fx_size + if f_back_high[0] - f_back_low[0] > max_f_len: +# log.writeln(" FAIL: extension required on high side violates max initial length") + #print " FAIL: extension required on high side violates max initial length" + return 0 + if f_back_high[0] > f_sent_len: +# log.writeln(" FAIL: extension required on high side violates sentence boundary") + #print " FAIL: extension required on high side violates sentence boundary" + return 0 + + e_low_prev = e_low[0] + e_high_prev = e_high[0] + + self.find_projection(f_back_low[0], f_low_prev, f_links_low, f_links_high, e_low, e_high) + self.find_projection(f_high_prev, f_back_high[0], f_links_low, f_links_high, e_low, e_high) + if e_low[0] == e_low_prev and e_high[0] == e_high_prev: + return 1 + if 
allow_arbitrary_x == 0: +# log.writeln(" FAIL: arbitrary expansion not permitted") + #print " FAIL: arbitrary expansion not permitted" + return 0 + if e_high[0] - e_low[0] > max_e_len: +# log.writeln(" FAIL: re-projection violates sentence max phrase length") + #print " FAIL: re-projection violates sentence max phrase length" + return 0 + f_low_prev = f_back_low[0] + f_high_prev = f_back_high[0] + + + cdef find_projection(self, int in_low, int in_high, int* in_links_low, int* in_links_high, + int* out_low, int* out_high): + cdef int i + for i from in_low <= i < in_high: + if in_links_low[i] != -1: + if out_low[0] == -1 or in_links_low[i] < out_low[0]: + out_low[0] = in_links_low[i] + if out_high[0] == -1 or in_links_high[i] > out_high[0]: + out_high[0] = in_links_high[i] + + + cdef int* int_arr_extend(self, int* arr, int* arr_len, int* data, int data_len): + cdef int new_len + new_len = arr_len[0] + data_len + arr = realloc(arr, new_len*sizeof(int)) + memcpy(arr+arr_len[0], data, data_len*sizeof(int)) + arr_len[0] = new_len + return arr + + + cdef extract_phrases(self, int e_low, int e_high, int* e_gap_low, int* e_gap_high, int* e_links_low, int num_gaps, + int f_low, int f_high, int* f_gap_low, int* f_gap_high, int* f_links_low, + int sent_id, int e_sent_len, int e_sent_start): + cdef int i, j, k, m, n, *e_gap_order, e_x_low, e_x_high, e_x_gap_low, e_x_gap_high + cdef int *e_gaps1, *e_gaps2, len1, len2, step, num_chunks + cdef cintlist.CIntList ephr_arr + cdef result + + #print "inside extract_phrases" + #print "f_low=%d, f_high=%d" % (f_low,f_high) + result = [] + len1 = 0 + e_gaps1 = malloc(0) + ephr_arr = cintlist.CIntList() + + e_gap_order = malloc(num_gaps*sizeof(int)) + if num_gaps > 0: + e_gap_order[0] = 0 + for i from 1 <= i < num_gaps: + for j from 0 <= j < i: + if e_gap_low[i] < e_gap_low[j]: + for k from j <= k < i: + e_gap_order[k+1] = e_gap_order[k] + e_gap_order[j] = i + break + else: + e_gap_order[i] = i + + e_x_low = e_low + e_x_high = e_high + if self.tight_phrases == 0: + while e_x_low > 0 and e_high - e_x_low < self.train_max_initial_size and e_links_low[e_x_low-1] == -1: + e_x_low = e_x_low - 1 + while e_x_high < e_sent_len and e_x_high - e_low < self.train_max_initial_size and e_links_low[e_x_high] == -1: + e_x_high = e_x_high + 1 + + for i from e_x_low <= i <= e_low: + e_gaps1 = self.int_arr_extend(e_gaps1, &len1, &i, 1) + + for i from 0 <= i < num_gaps: + e_gaps2 = malloc(0) + len2 = 0 + + j = e_gap_order[i] + e_x_gap_low = e_gap_low[j] + e_x_gap_high = e_gap_high[j] + if self.tight_phrases == 0: + while e_x_gap_low > e_x_low and e_links_low[e_x_gap_low-1] == -1: + e_x_gap_low = e_x_gap_low - 1 + while e_x_gap_high < e_x_high and e_links_low[e_x_gap_high] == -1: + e_x_gap_high = e_x_gap_high + 1 + + k = 0 + step = 1+(i*2) + while k < len1: + for m from e_x_gap_low <= m <= e_gap_low[j]: + if m >= e_gaps1[k+step-1]: + for n from e_gap_high[j] <= n <= e_x_gap_high: + if n-m >= 1: # extractor.py doesn't restrict target-side gap length + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+k, step) + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &m, 1) + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, &n, 1) + k = k + step + free(e_gaps1) + e_gaps1 = e_gaps2 + len1 = len2 + + step = 1+(num_gaps*2) + e_gaps2 = malloc(0) + len2 = 0 + for i from e_high <= i <= e_x_high: + j = 0 + while j < len1: + if i - e_gaps1[j] <= self.train_max_initial_size and i >= e_gaps1[j+step-1]: + e_gaps2 = self.int_arr_extend(e_gaps2, &len2, e_gaps1+j, step) + e_gaps2 = 
self.int_arr_extend(e_gaps2, &len2, &i, 1) + j = j + step + free(e_gaps1) + e_gaps1 = e_gaps2 + len1 = len2 + + step = (num_gaps+1)*2 + i = 0 + + while i < len1: + ephr_arr._clear() + num_chunks = 0 + indexes = [] + for j from 0 <= j < num_gaps+1: + if e_gaps1[i+2*j] < e_gaps1[i+(2*j)+1]: + num_chunks = num_chunks + 1 + for k from e_gaps1[i+2*j] <= k < e_gaps1[i+(2*j)+1]: + indexes.append(k) + ephr_arr._append(self.eid2symid[self.eda.data.arr[e_sent_start+k]]) + if j < num_gaps: + indexes.append(sym.setindex(self.category, e_gap_order[j]+1)) + ephr_arr._append(sym.setindex(self.category, e_gap_order[j]+1)) + i = i + step + if ephr_arr.len <= self.max_target_length and num_chunks <= self.max_target_chunks: + result.append((rule.Phrase(ephr_arr),indexes)) + + free(e_gaps1) + free(e_gap_order) + return result + + cdef create_alignments(self, int* sent_links, int num_links, findexes, eindexes): + #print "create_alignments" + #s = "sent_links = " + #i = 0 + #while (i < num_links*2): + # s = s+"%d-%d " % (sent_links[i],sent_links[i+1]) + # i += 2 + #print s + #print findexes + #print eindexes + + ret = cintlist.CIntList() + for i in xrange(len(findexes)): + s = findexes[i] + if (s<0): + continue + idx = 0 + while (idx < num_links*2): + if (sent_links[idx] == s): + j = eindexes.index(sent_links[idx+1]) + ret.append(i*65536+j) + idx += 2 + return ret + + cdef extract(self, rule.Phrase phrase, Matching* matching, int* chunklen, int num_chunks): + cdef int* sent_links, *e_links_low, *e_links_high, *f_links_low, *f_links_high + cdef int *f_gap_low, *f_gap_high, *e_gap_low, *e_gap_high, num_gaps, gap_start + cdef int i, j, k, e_i, f_i, num_links, num_aligned_chunks, met_constraints + cdef int f_low, f_high, e_low, e_high, f_back_low, f_back_high + cdef int e_sent_start, e_sent_end, f_sent_start, f_sent_end, e_sent_len, f_sent_len + cdef int e_word_count, f_x_low, f_x_high, e_x_low, e_x_high, phrase_len + cdef float pair_count + cdef float available_mass + cdef extracts, phrase_list + cdef cintlist.CIntList fphr_arr + cdef rule.Phrase fphr + cdef reason_for_failure + + fphr_arr = cintlist.CIntList() + phrase_len = phrase.n + extracts = [] + sent_links = self.alignment._get_sent_links(matching.sent_id, &num_links) + + e_sent_start = self.eda.sent_index.arr[matching.sent_id] + e_sent_end = self.eda.sent_index.arr[matching.sent_id+1] + e_sent_len = e_sent_end - e_sent_start - 1 + f_sent_start = self.fda.sent_index.arr[matching.sent_id] + f_sent_end = self.fda.sent_index.arr[matching.sent_id+1] + f_sent_len = f_sent_end - f_sent_start - 1 + available_mass = 1.0 + if matching.sent_id == self.excluded_sent_id: + available_mass = 0.0 + + self.findexes1.reset() + sofar = 0 + for i in xrange(num_chunks): + for j in xrange(chunklen[i]): + self.findexes1.append(matching.arr[matching.start+i]+j-f_sent_start); + sofar += 1 + if (i+1 malloc(e_sent_len*sizeof(int)) + e_links_high = malloc(e_sent_len*sizeof(int)) + f_links_low = malloc(f_sent_len*sizeof(int)) + f_links_high = malloc(f_sent_len*sizeof(int)) + f_gap_low = malloc((num_chunks+1)*sizeof(int)) + f_gap_high = malloc((num_chunks+1)*sizeof(int)) + e_gap_low = malloc((num_chunks+1)*sizeof(int)) + e_gap_high = malloc((num_chunks+1)*sizeof(int)) + memset(f_gap_low, 0, (num_chunks+1)*sizeof(int)) + memset(f_gap_high, 0, (num_chunks+1)*sizeof(int)) + memset(e_gap_low, 0, (num_chunks+1)*sizeof(int)) + memset(e_gap_high, 0, (num_chunks+1)*sizeof(int)) + + reason_for_failure = "" + + for i from 0 <= i < e_sent_len: + e_links_low[i] = -1 + e_links_high[i] = -1 + for i 
from 0 <= i < f_sent_len: + f_links_low[i] = -1 + f_links_high[i] = -1 + + # this is really inefficient -- might be good to + # somehow replace with binary search just for the f + # links that we care about (but then how to look up + # when we want to check something on the e side?) + i = 0 + while i < num_links*2: + f_i = sent_links[i] + e_i = sent_links[i+1] + if f_links_low[f_i] == -1 or f_links_low[f_i] > e_i: + f_links_low[f_i] = e_i + if f_links_high[f_i] == -1 or f_links_high[f_i] < e_i + 1: + f_links_high[f_i] = e_i + 1 + if e_links_low[e_i] == -1 or e_links_low[e_i] > f_i: + e_links_low[e_i] = f_i + if e_links_high[e_i] == -1 or e_links_high[e_i] < f_i + 1: + e_links_high[e_i] = f_i + 1 + i = i + 2 + + als = [] + for x in xrange(matching.start,matching.end): + al = (matching.arr[x]-f_sent_start,f_links_low[matching.arr[x]-f_sent_start]) + als.append(al) + # check all source-side alignment constraints + met_constraints = 1 + if self.require_aligned_terminal: + num_aligned_chunks = 0 + for i from 0 <= i < num_chunks: + for j from 0 <= j < chunklen[i]: + if f_links_low[matching.arr[matching.start+i]+j-f_sent_start] > -1: + num_aligned_chunks = num_aligned_chunks + 1 + break + if num_aligned_chunks == 0: + reason_for_failure = "No aligned terminals" + met_constraints = 0 + if self.require_aligned_chunks and num_aligned_chunks < num_chunks: + reason_for_failure = "Unaligned chunk" + met_constraints = 0 + + if met_constraints and self.tight_phrases: + # outside edge constraints are checked later + for i from 0 <= i < num_chunks-1: + if f_links_low[matching.arr[matching.start+i]+chunklen[i]-f_sent_start] == -1: + reason_for_failure = "Gaps are not tight phrases" + met_constraints = 0 + break + if f_links_low[matching.arr[matching.start+i+1]-1-f_sent_start] == -1: + reason_for_failure = "Gaps are not tight phrases" + met_constraints = 0 + break + + f_low = matching.arr[matching.start] - f_sent_start + f_high = matching.arr[matching.start + matching.size - 1] + chunklen[num_chunks-1] - f_sent_start + if met_constraints: + + if self.find_fixpoint(f_low, f_high, f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, &e_low, &e_high, &f_back_low, &f_back_high, f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + self.train_min_gap_size, 0, + self.max_nonterminals - num_chunks + 1, 1, 1, 0, 0): + gap_error = 0 + num_gaps = 0 + + if f_back_low < f_low: + f_gap_low[0] = f_back_low + f_gap_high[0] = f_low + num_gaps = 1 + gap_start = 0 + phrase_len = phrase_len+1 + if phrase_len > self.max_length: + gap_error = 1 + if self.tight_phrases: + if f_links_low[f_back_low] == -1 or f_links_low[f_low-1] == -1: + gap_error = 1 + reason_for_failure = "Inside edges of preceding subphrase are not tight" + else: + gap_start = 1 + if self.tight_phrases and f_links_low[f_low] == -1: + # this is not a hard error. 
we can't extract this phrase + # but we still might be able to extract a superphrase + met_constraints = 0 + + for i from 0 <= i < matching.size - 1: + f_gap_low[1+i] = matching.arr[matching.start+i] + chunklen[i] - f_sent_start + f_gap_high[1+i] = matching.arr[matching.start+i+1] - f_sent_start + num_gaps = num_gaps + 1 + + if f_high < f_back_high: + f_gap_low[gap_start+num_gaps] = f_high + f_gap_high[gap_start+num_gaps] = f_back_high + num_gaps = num_gaps + 1 + phrase_len = phrase_len+1 + if phrase_len > self.max_length: + gap_error = 1 + if self.tight_phrases: + if f_links_low[f_back_high-1] == -1 or f_links_low[f_high] == -1: + gap_error = 1 + reason_for_failure = "Inside edges of following subphrase are not tight" + else: + if self.tight_phrases and f_links_low[f_high-1] == -1: + met_constraints = 0 + + if gap_error == 0: + e_word_count = e_high - e_low + for i from 0 <= i < num_gaps: # check integrity of subphrases + if self.find_fixpoint(f_gap_low[gap_start+i], f_gap_high[gap_start+i], + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+gap_start+i, e_gap_high+gap_start+i, + f_gap_low+gap_start+i, f_gap_high+gap_start+i, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0) == 0: + gap_error = 1 + reason_for_failure = "Subphrase [%d, %d] failed integrity check" % (f_gap_low[gap_start+i], f_gap_high[gap_start+i]) + break + + if gap_error == 0: + i = 1 + self.findexes.reset() + if f_back_low < f_low: + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.append(sym.setindex(self.category, i)) + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + if f_back_high > f_high: + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + + fphr = rule.Phrase(fphr_arr) + if met_constraints: + phrase_list = self.extract_phrases(e_low, e_high, e_gap_low + gap_start, e_gap_high + gap_start, e_links_low, num_gaps, + f_back_low, f_back_high, f_gap_low + gap_start, f_gap_high + gap_start, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + #print "e_low=%d, e_high=%d, gap_start=%d, num_gaps=%d, f_back_low=%d, f_back_high=%d" & (e_low, e_high, gap_start, num_gaps, f_back_low, f_back_high) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + reason_for_failure = "Didn't extract anything from [%d, %d] -> [%d, %d]" % (f_back_low, f_back_high, e_low, e_high) + for (phrase2,eindexes) in phrase_list: + als1 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als1))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high)) + #print "extract_phrases1: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_back_low, f_back_high, e_low, e_high) + + if (num_gaps < self.max_nonterminals and + phrase_len < self.max_length and + f_back_high - f_back_low + self.train_min_gap_size <= self.train_max_initial_size): + if (f_back_low == f_low and + f_low >= self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_back_high-1] != -1))): + f_x_low = 
f_low-self.train_min_gap_size + met_constraints = 1 + if self.tight_phrases: + while f_x_low >= 0 and f_links_low[f_x_low] == -1: + f_x_low = f_x_low - 1 + if f_x_low < 0 or f_back_high - f_x_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_x_low, f_back_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 1, 1, 0, 1, 0) and + ((not self.tight_phrases) or f_links_low[f_x_low] != -1) and + self.find_fixpoint(f_x_low, f_low, # check integrity of new subphrase + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + self.findexes.append(sym.setindex(self.category, i)) + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + if f_back_high > f_high: + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+1, + f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, matching.sent_id, + e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2,eindexes in phrase_list: + als2 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als2))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases2: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + + if (f_back_high == f_high and + f_sent_len - f_high >= self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_high] != -1 and f_links_low[f_back_low] != -1))): + f_x_high = f_high+self.train_min_gap_size + met_constraints = 1 + if self.tight_phrases: + while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: + f_x_high = f_x_high + 1 + if f_x_high > f_sent_len or f_x_high - f_back_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_back_low, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 1, 0, 1, 1, 0) and + ((not self.tight_phrases) or f_links_low[f_x_high-1] != -1) and + self.find_fixpoint(f_high, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+gap_start+num_gaps, e_gap_high+gap_start+num_gaps, + f_gap_low+gap_start+num_gaps, f_gap_high+gap_start+num_gaps, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + if f_back_low < f_low: + 
fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.append(sym.setindex(self.category, i)) + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + fphr_arr._append(phrase.syms[j]) + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low+gap_start, e_gap_high+gap_start, e_links_low, num_gaps+1, + f_x_low, f_x_high, f_gap_low+gap_start, f_gap_high+gap_start, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2, eindexes in phrase_list: + als3 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als3))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases3: %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + if (num_gaps < self.max_nonterminals - 1 and + phrase_len+1 < self.max_length and + f_back_high == f_high and + f_back_low == f_low and + f_back_high - f_back_low + (2*self.train_min_gap_size) <= self.train_max_initial_size and + f_low >= self.train_min_gap_size and + f_high <= f_sent_len - self.train_min_gap_size and + ((not self.tight_phrases) or (f_links_low[f_low-1] != -1 and f_links_low[f_high] != -1))): + + met_constraints = 1 + f_x_low = f_low-self.train_min_gap_size + if self.tight_phrases: + while f_x_low >= 0 and f_links_low[f_x_low] == -1: + f_x_low = f_x_low - 1 + if f_x_low < 0: + met_constraints = 0 + + f_x_high = f_high+self.train_min_gap_size + if self.tight_phrases: + while f_x_high <= f_sent_len and f_links_low[f_x_high-1] == -1: + f_x_high = f_x_high + 1 + if f_x_high > f_sent_len or f_x_high - f_x_low > self.train_max_initial_size: + met_constraints = 0 + + if (met_constraints and + self.find_fixpoint(f_x_low, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + e_low, e_high, &e_x_low, &e_x_high, &f_x_low, &f_x_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 1, 1, 2, 1, 1, 1, 1) and + ((not self.tight_phrases) or (f_links_low[f_x_low] != -1 and f_links_low[f_x_high-1] != -1)) and + self.find_fixpoint(f_x_low, f_low, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low, e_gap_high, f_gap_low, f_gap_high, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0) and + self.find_fixpoint(f_high, f_x_high, + f_links_low, f_links_high, e_links_low, e_links_high, + -1, -1, e_gap_low+1+num_gaps, e_gap_high+1+num_gaps, + f_gap_low+1+num_gaps, f_gap_high+1+num_gaps, + f_sent_len, e_sent_len, + self.train_max_initial_size, self.train_max_initial_size, + 0, 0, 0, 0, 0, 0, 0)): + fphr_arr._clear() + i = 1 + self.findexes.reset() + self.findexes.append(sym.setindex(self.category, i)) + fphr_arr._append(sym.setindex(self.category, i)) + i = i+1 + self.findexes.extend(self.findexes1) + for j from 0 <= j < phrase.n: + if sym.isvar(phrase.syms[j]): + fphr_arr._append(sym.setindex(self.category, i)) + i = i + 1 + else: + 
fphr_arr._append(phrase.syms[j]) + fphr_arr._append(sym.setindex(self.category, i)) + self.findexes.append(sym.setindex(self.category, i)) + fphr = rule.Phrase(fphr_arr) + phrase_list = self.extract_phrases(e_x_low, e_x_high, e_gap_low, e_gap_high, e_links_low, num_gaps+2, + f_x_low, f_x_high, f_gap_low, f_gap_high, f_links_low, + matching.sent_id, e_sent_len, e_sent_start) + if len(phrase_list) > 0: + pair_count = available_mass / len(phrase_list) + else: + pair_count = 0 + for phrase2, eindexes in phrase_list: + als4 = self.create_alignments(sent_links,num_links,self.findexes,eindexes) + extracts.append((fphr, phrase2, pair_count, tuple(als4))) + if self.extract_file: + self.extract_file.write("%s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high)) + #print "extract_phrases4 %s ||| %s ||| %f ||| %d: [%d, %d] -> [%d, %d]\n" % (fphr, phrase2, pair_count, matching.sent_id+1, f_x_low, f_x_high, e_x_low, e_x_high) + else: + reason_for_failure = "Unable to extract basic phrase" + + free(sent_links) + free(f_links_low) + free(f_links_high) + free(e_links_low) + free(e_links_high) + free(f_gap_low) + free(f_gap_high) + free(e_gap_low) + free(e_gap_high) + + if self.sample_file is not None: + self.sample_file.write("%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure)) + + #print "%s ||| %d: [%d, %d] ||| %d ||| %s\n" % (phrase, matching.sent_id+1, f_low, f_high, len(extracts), reason_for_failure) + + + return extracts + diff --git a/sa-extract/sa-compile.pl b/sa-extract/sa-compile.pl new file mode 100755 index 00000000..1cae83a7 --- /dev/null +++ b/sa-extract/sa-compile.pl @@ -0,0 +1,322 @@ +#!/usr/bin/env perl + +use strict; +use Getopt::Long; + +my $cwd; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $cwd = cwd(); } + +my $rootdir = `dirname $0`; chomp $rootdir; +my $compile = "$rootdir/compile_bin.py"; +my $lcp = "$rootdir/lcp_ops.py"; +die "Can't find $compile" unless -f $compile; +die "Can't execute $compile" unless -x $compile; + +sub print_help; +sub cleanup; + +my $alignment; +my $bitext; +my $catalog; +my $dryrun = 0; +my $group; +my $help = 0; +my $ini = "$rootdir/extract.ini"; +my $lm; +my $precomp; +my $no_ini = 0; +my $remove; +my $type; +my $local_only = 1; +my $output; + +# Process command-line options +if (GetOptions( + "alignment=s" => \$alignment, + "bitext=s" => \$bitext, + "help" => \$help, + "ini=s" => \$ini, + "output=s" => \$output, + "precomp-options=s" => \$precomp, + "no-ini" => \$no_ini, +) == 0 || $help == 1 || @ARGV > 0){ + print_help; + die "\n"; +} + +open(INI, $ini) or die "Can't read $ini: $!"; + +$bitext || die "You must specify a bitext with -b\n"; +$alignment || die "You must specify an alignment with -a\n"; + +my $top_dir; +if (defined $output) { + $top_dir = $output; +} else { + $top_dir = "$cwd/sa-compiled"; +} + +my $type_dir = "$top_dir"; + +my $bitext_name; +my $bitext_f_file; +my $bitext_e_file; +my $bitext_dir; +if ($bitext){ + if ($bitext =~ /(.*)=(.*),(.*)/){ + $bitext_name = $1; + $bitext_f_file = $2; + $bitext_e_file = $3; + -e $bitext_f_file || die "Could not find file $bitext_f_file\n"; + -e $bitext_e_file || die "Could not find file $bitext_e_file\n"; + } else { + $bitext_name = $bitext; + } + + $bitext_dir = "$type_dir/bitext/$bitext_name"; + if ($bitext_f_file){ + if (-e $bitext_dir) { + die "Bitext $bitext_name already exists\n"; + } + } else { + unless (-e $bitext_dir){ + die "No bitext 
$bitext_name. You must specify bitext files with -b\n"; + } + } +} + +my $max_nt = 2; +my $max_len = 5; +my $max_size = 15; +my $min_gap = 1; +my $rank1 = 100; +my $rank2 = 10; +my $precomp_file; +if ($precomp){ + unless ($bitext_name){ + die "You must specify a bitext with -b if using -p\n"; + } + my @precomp_args = split(/,/, $precomp); + my $precomp_arg; + for $precomp_arg (@precomp_args){ + if ($precomp_arg =~ /(.*)=(.*)/){ + my $key = $1; + my $value = $2; + unless ($value =~ /^\d+$/){ + die "Value for -p option must be a positive integer, found $value\n"; + } + if ($key eq "max-len"){ $max_len = $value; } + elsif ($key eq "max-nt"){ $max_nt = $value; } + elsif ($key eq "max-size"){ $max_size = $value; } + elsif ($key eq "min-gap"){ $min_gap = $value; } + elsif ($key eq "rank1"){ $rank1 = $value; } + elsif ($key eq "rank2"){ $rank2 = $value; } + else{ + die "Unknown option $key given for -p\n"; + } + } else { + die "When using -p, you must specify key-value pairs using syntax: =,...,=\n"; + } + } +} +my $precomp_compile_needed = 0; +if ($bitext_name){ + $precomp_file = "$bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; + unless (-e $precomp_file){ + $precomp_compile_needed = 1; + } +} + +my $alignment_name; +my $alignment_file; +my $alignment_dir; +if ($alignment){ + $bitext || die "Specified alignment $alignment without specifying bitext using -b\n"; + if ($alignment =~ /(.*)=(.*)/){ + $alignment_name = $1; + $alignment_file = $2; + -e $alignment_file || die "Could not find file $alignment_file\n"; + } else { + $alignment_name = $alignment; + } + + $alignment_dir = "$bitext_dir/a/$alignment_name"; + if ($alignment_file){ + if (-e $alignment_dir){ + die "Alignment $alignment_name already exists for bitext $bitext_name\n"; + } + } else { + require_top_dirs(); + unless (-e $alignment_dir){ + die "No alignment $alignment_name for bitext $bitext_name\n"; + } + } +} + +if ($bitext_name){ + print STDERR " from files $bitext_f_file and $bitext_e_file\n"; +} else { + print " No bitext\n"; +} +if ($precomp_compile_needed){ + print STDERR " Precompilation needed: max-len=$max_len, max-nt=$max_nt, max-size=$max_size, min-gap=$min_gap, rank1=$rank1, rank2=$rank2\n"; +} +if ($alignment_name){ + print STDERR " Alignment = $alignment_name"; + if ($alignment_file){ + print STDERR " from file $alignment_file\n"; + } +} else { + print STDERR " No alignment\n"; +} + +my $script; +my $compile_dir; +$SIG{INT} = "cleanup"; +$SIG{TERM} = "cleanup"; +$SIG{HUP} = "cleanup"; + + if ($bitext_e_file || $precomp_compile_needed || $alignment_file){ + my $compiled_e_file; + my $compiled_f_file; + + $compile_dir = $top_dir; + my $compile_top_dir = "$compile_dir"; + + my $compile_bitext_dir = "$compile_top_dir/bitext/$bitext_name"; + if ($bitext_e_file){ + `mkdir -p $compile_bitext_dir`; + print STDERR "\nCompiling bitext (f side)...\n"; + `$compile -s $bitext_f_file $compile_bitext_dir/f.sa.bin`; + die "Command failed: $!" unless $? == 0; + print STDERR "\nCompiling bitext (e side)...\n"; + `$compile -d $bitext_e_file $compile_bitext_dir/e.bin`; + die "Command failed: $!" unless $? 
== 0; + + $compiled_f_file = "$compile_bitext_dir/f.sa.bin"; + $compiled_e_file = "$compile_bitext_dir/e.bin"; + } else { # bitext already compiled + $compiled_f_file = "$bitext_dir/f.sa.bin"; + $compiled_e_file = "$bitext_dir/e.bin"; + } + + if ($precomp_compile_needed){ + `mkdir -p $compile_bitext_dir`; + my $top_stats_file = "$compile_bitext_dir/f.top.$rank1"; + my $compiled_precomp_file = "$compile_bitext_dir/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin"; + my $cmd = "$lcp -t 4 $compiled_f_file | sort -nr | head -$rank1 > $top_stats_file"; + print STDERR "$cmd\n"; + `$cmd`; + die "Command failed: $cmd" unless $? == 0; + `$compile -r max-len=$max_len max-nt=$max_nt max-size=$max_size min-gap=$min_gap rank1=$rank1 rank2=$rank2 sa=$compiled_f_file $top_stats_file $compiled_precomp_file`; + die "Command failed: $!" unless $? == 0; + } + + if ($alignment_file){ + my $compile_alignment_dir = "$compile_top_dir/bitext/$bitext_name/a/$alignment_name"; + `mkdir -p $compile_alignment_dir`; + print STDERR "\nCompiling alignment...\n"; + my $cmd= "$compile -a $alignment_file $compile_alignment_dir/a.bin"; + print STDERR " $cmd\n"; + `$cmd`; + die "Command failed: $!" unless $? == 0; + + print STDERR "\nCompiling lexical weights file...\n"; + $cmd="$compile -x $compiled_f_file $compiled_e_file $compile_alignment_dir/a.bin $compile_alignment_dir/lex.bin"; + print STDERR " $cmd\n"; + `$cmd`; + die "Command failed: $!" unless $? == 0; + } + + chdir $compile_dir; + print STDERR "Compiling done: $compile_dir\n"; + } + + unless ($no_ini){ + my $line; + while($line=){ + $line =~ s/^([^#]*a_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/a.bin$3/; + $line =~ s/^([^#]*lex_file\s*=\s*")(.*)("\s*)$/$1$alignment_dir\/lex.bin$3/; + $line =~ s/^([^#]*f_sa_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/f.sa.bin$3/; + $line =~ s/^([^#]*e_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/e.bin$3/; + $line =~ s/^([^#]*precompute_file\s*=\s*")(.*)("\s*)$/$1$bitext_dir\/precomp.$max_len.$max_nt.$max_size.$min_gap.$rank1.$rank2.bin$3/; + + $line =~ s/^([^#]*max_len\s*=\s*)(.*)(\s*)$/$1$max_len$3/; + $line =~ s/^([^#]*max_nt\s*=\s*)(.*)(\s*)$/$1$max_nt$3/; + $line =~ s/^([^#]*max_size\s*=\s*)(.*)(\s*)$/$1$max_size$3/; + $line =~ s/^([^#]*min_gap\s*=\s*)(.*)(\s*)$/$1$min_gap$3/; + $line =~ s/^([^#]*rank1\s*=\s*)(.*)(\s*)$/$1$rank1$3/; + $line =~ s/^([^#]*rank2\s*=\s*)(.*)(\s*)$/$1$rank2$3/; + + print $line; + } + } + +exit(0); + +sub cleanup { + die "Cleanup.\n"; +} + +sub print_help +{ + my $name = `basename $0`; chomp $name; + print << "Help"; + +usage: $name [options] + + Manage compilation of SA-Hiero files and creation of ini files. + In the default usage, the command deploys a set of files needed + to create a system, and writes an ini for the system on stdout. + +options: + + -a, --alignment [=] + Name of an alignment of a bitext (which must be specified + with -b unless using the -c flag). If used with -r, the + alignment is removed from the deployment. If used with -c, + only alignments with this name are listed. If a filename is + given, then the file will be deployed using the name. + + -b, --bitext [=,] + Name of a bitext for a particular system type (which must be + specified with -t unless using the -c flag). If used with -r, + the bitext is removed from the deployment. If used with -c, + only bitexts with the name are listed. If a filename is given, + then the file will be deployed using the name. + + -h, --help + Prints this message. 
+ + -i, --ini + Use a specific ini file as the template for a system, rather than + the default ini file. + + -p, --precomp-options =[,=,...,=] + Set parameters of the grammar. This must be set by $name because + many parameters involve precomputation. There are six keys that can + be set: + max-len: maximum number of symbols (T and NT) in a grammar rule + max-nt: maximum number of nonterminals in a grammar rule + max-size: maximum span of a grammar rule extracted from training + min-gap: minimum gap spanned by a nonterminal in training + rank1: number of frequent words to precompute collocations for. + rank2: number of super-frequent words to precompute triple + collocations for. + All values must be positive integers. If not specified, defaults are: + max-len = 5 + max-nt = 2 (>2 not supported) + max-size = 10 + min-gap = 2 + rank1 = 100 (>300 not recommended) + rank2 = 10 (>10 not recommended) + + -n, --no-ini + Do not generate an ini file on stdout. If this option is used, then + the requirement to specify a full system is relaxed. Therefore, this + option can be used when the sole objective is deployment of files. + + -o, --output-dir + Write the compiled model to this directory. + +Help +} diff --git a/sa-extract/setup.cfg b/sa-extract/setup.cfg new file mode 100644 index 00000000..8f696136 --- /dev/null +++ b/sa-extract/setup.cfg @@ -0,0 +1,2 @@ +[build_ext] +inplace=1 diff --git a/sa-extract/setup.py b/sa-extract/setup.py new file mode 100644 index 00000000..cdcbfb54 --- /dev/null +++ b/sa-extract/setup.py @@ -0,0 +1,45 @@ +from distutils.core import setup, Extension +from distutils.util import get_platform +import os.path + +cstrmap_module = Extension('cstrmap', sources = ['cstrmap.c', 'strmap.cc']) +setup (name = 'CStringMap', version = '1.0', description = 'C string->int map', ext_modules = [cstrmap_module]) + +rule_module = Extension('rule', + sources = ['rule.c', 'strutil.c']) +setup (name = 'Rule', version = '1.0', description = 'rule class', ext_modules = [rule_module]) + +sym_module = Extension('sym', + sources = ['sym.c']) +setup (name = 'Sym', version = '1.0', description = 'symbol class', ext_modules = [sym_module]) + +cdat_module = Extension('cdat', sources = ['cdat.c']) +setup(name = "CDat", version = '1.0', description = 'C Data class', ext_modules = [cdat_module]) + +cintlist_module = Extension('cintlist', sources = ['cintlist.c']) +setup(name = "CIntList", version = '1.0', description = 'C int array/list class', ext_modules = [cintlist_module]) + +cfloatlist_module = Extension('cfloatlist', sources = ['cfloatlist.c']) +setup(name = "CFloatList", version = '1.0', description = 'C float array/list class', ext_modules = [cfloatlist_module]) + +calignment_module = Extension('calignment', sources = ['calignment.c']) +setup(name = "CAlignment", version = '1.0', description = 'C alignment class', ext_modules = [calignment_module]) + +csuf_module = Extension('csuf', sources = ['csuf.c']) +setup(name = "CSuffixArray", version = '1.0', description = 'C suffix array class', ext_modules = [csuf_module]) + +clex_module = Extension('clex', sources = ['clex.c']) +setup(name = "CLex", version = '1.0', description = 'C lexical class', ext_modules = [clex_module]) + +factory_module = Extension('rulefactory', sources = ['rulefactory.c']) +setup(name = "RuleFactory", version = '1.0', description = 'C rule factory classes', ext_modules = [factory_module]) + +cveb_module = Extension('cveb', sources = ['cveb.c']) +setup(name = "CVEB", version = '1.0', description = 'C impl. 
of van Emde Boas tree', ext_modules = [cveb_module]) + +lcp_module = Extension('lcp', sources = ['lcp.c']) +setup(name = "LCP", version = '1.0', description = 'C impl. of LCP', ext_modules = [lcp_module]) + +precomp_module = Extension('precomputation', sources = ['precomputation.c']) +setup(name = "Precomputation", version = '1.0', description = 'Precomputation Algorithm', ext_modules = [precomp_module]) + diff --git a/sa-extract/sgml.py b/sa-extract/sgml.py new file mode 100644 index 00000000..2db8b5dc --- /dev/null +++ b/sa-extract/sgml.py @@ -0,0 +1,194 @@ +import sys, sgmllib, xml.sax.saxutils, sym + +def attrs_to_str(d): + if len(d) == 0: + return "" + l = [""]+["%s=%s" % (name, xml.sax.saxutils.quoteattr(value)) for (name, value) in d] + return " ".join(l) + +def attrs_to_dict(a): + d = {} + for (name, value) in a: + if d.has_key(name.lower()): + raise ValueError, "duplicate attribute names" + d[name.lower()] = value + return d + +class Sentence(object): + def __init__(self, words=None, meta=None): + if words is not None: + self.words = list(words) + else: + self.words = [] + if meta is not None: + self.meta = meta + else: + self.meta = [] + + def copy(self): + return Sentence(self.words, list(self.meta)) + + def mark(self, tag, attrs): + self.meta.append((tag, attrs, 0, len(self.words))) + + def getmark(self): + if len(self.meta) > 0: + (tag, attrs, i, j) = self.meta[-1] + if i == 0 and j == len(self.words): + return (tag, attrs) + else: + return None + else: + return None + + def unmark(self): + mark = self.getmark() + if mark is not None: + self.meta = self.meta[:-1] + return mark + + def __cmp__(self, other): + return cmp((self.words, self.meta), (other.words, other.meta)) + + def __str__(self): + def cmp_spans((tag1,attr1,i1,j1),(tag2,attr2,i2,j2)): + if i1==i2<=j1==j2: + return 0 + elif i2<=i1<=j1<=j2: + return -1 + elif i1<=i2<=j2<=j1: + return 1 + else: + return cmp((i1,j1),(i2,j2)) # don't care + # this guarantees that equal spans will come out nested + # we want the later spans to be outer + # this relies on stable sort + open = [[] for i in xrange(len(self.words)+1)] + # there seems to be a bug still with empty spans + empty = [[] for i in xrange(len(self.words)+1)] + close = [[] for j in xrange(len(self.words)+1)] + for (tag,attrs,i,j) in sorted(self.meta, cmp=cmp_spans): + if i == j: + # do we want these to nest? 
+ empty[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs), tag)) + else: + open[i].append("<%s%s>\n" % (tag, attrs_to_str(attrs))) + close[j].append("\n" % tag) + + result = [] + if len(empty[0]) > 0: + result.extend(empty[0]) + for i in xrange(len(self.words)): + if i > 0: + result.append(" ") + result.extend(reversed(open[i])) + result.append(xml.sax.saxutils.escape(sym.tostring(self.words[i]))) + result.extend(close[i+1]) + if len(empty[i+1]) > 0: + result.extend(empty[i+1]) + + return "".join(result) + + def __add__(self, other): + if type(other) in (list, tuple): + return Sentence(self.words + list(other), self.meta) + else: + othermeta = [(tag, attrs, i+len(self.words), j+len(self.words)) for (tag, attrs, i, j) in other.meta] + return Sentence(self.words + other.words, self.meta+othermeta) + +def read_raw(f): + """Read a raw file into a list of Sentences.""" + if type(f) is str: + f = file(f, "r") + i = 0 + line = f.readline() + while line != "": + sent = process_sgml_line(line, i) + mark = sent.getmark() + if mark is None: + sent.mark('seg', [('id',str(i))]) + else: + (tag, attrs) = mark + if tag == "seg" and not attrs_to_dict(attrs).has_key('id'): + x = ('id',str(i)) + attrs.append(x) + sent.mark('seg', attrs) + if tag != "seg": + sent.mark('seg', [('id',str(i))]) + yield sent + i += 1 + line = f.readline() + +def process_sgml_line(line, id=None): + p = DatasetParser(None) + p.pos = 0 + p.words = [] + p.meta = [] + p.feed(line) + p.close() + sent = Sentence(p.words, p.meta) + return sent + +class DatasetParser(sgmllib.SGMLParser): + def __init__(self, set): + sgmllib.SGMLParser.__init__(self) + self.words = None + self.mystack = [] + self.string = None + self.convref = d = {"amp":"&", "lt":"<", "gt":">", "quot":'"', "squot":"'"} + def close(self): + self.convert() + sgmllib.SGMLParser.close(self) + + def handle_starttag(self, tag, method, attrs): + thing = method(attrs) + self.mystack.append(thing) + + def handle_endtag(self, tag, method): + thing = self.mystack.pop() + method(thing) + + def unknown_starttag(self, tag, attrs): + thing = self.start(tag, attrs) + self.mystack.append(thing) + + def unknown_endtag(self, tag): + thing = self.mystack.pop() + self.end(tag, thing) + + def start(self, tag, attrs): + self.convert() + if self.words is not None: + return (tag, attrs, self.pos, None) + else: + return None + + def convert(self): + if self.words is not None and self.string is not None: + words = self.string.split() + self.pos += len(words) + self.words.extend(words) + self.string = None + + def end(self, tag, thing): + self.convert() + if self.words is not None: + (tag, attrs, i, j) = thing + self.meta.append((tag, attrs, i, self.pos)) + + def handle_data(self, s): + if self.words is not None: + if (self.string is None): + self.string = s + else: + self.string += s + + def handle_entityref(self, ref): + # s=self.convert_entityref(ref) # if python 2.5 + s=self.convref[ref] + if self.words is not None: + if (self.string is None): + self.string = s + else: + self.string += s + diff --git a/sa-extract/strmap.cc b/sa-extract/strmap.cc new file mode 100644 index 00000000..5040477e --- /dev/null +++ b/sa-extract/strmap.cc @@ -0,0 +1,232 @@ +#include "strmap.h" + +#include +#include +#include +#include + +using namespace std; +using namespace std::tr1; + +#undef HAVE_64_BITS + +#if INTPTR_MAX == INT32_MAX +# define HAVE_64_BITS 0 +#elif INTPTR_MAX >= INT64_MAX +# define HAVE_64_BITS 1 +#else +# error "couldn't tell if HAVE_64_BITS from INTPTR_MAX INT32_MAX INT64_MAX" +#endif + +typedef 
uintptr_t MurmurInt; + +// MurmurHash2, by Austin Appleby + +static const uint32_t DEFAULT_SEED=2654435769U; + +#if HAVE_64_BITS +//MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED); + +inline uint64_t MurmurHash64( const void * key, int len, unsigned int seed=DEFAULT_SEED ) +{ + const uint64_t m = 0xc6a4a7935bd1e995ULL; + const int r = 47; + + uint64_t h = seed ^ (len * m); + + const uint64_t * data = (const uint64_t *)key; + const uint64_t * end = data + (len/8); + + while(data != end) + { + uint64_t k = *data++; + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + const unsigned char * data2 = (const unsigned char*)data; + + switch(len & 7) + { + case 7: h ^= uint64_t(data2[6]) << 48; + case 6: h ^= uint64_t(data2[5]) << 40; + case 5: h ^= uint64_t(data2[4]) << 32; + case 4: h ^= uint64_t(data2[3]) << 24; + case 3: h ^= uint64_t(data2[2]) << 16; + case 2: h ^= uint64_t(data2[1]) << 8; + case 1: h ^= uint64_t(data2[0]); + h *= m; + }; + + h ^= h >> r; + h *= m; + h ^= h >> r; + + return h; +} + +inline uint32_t MurmurHash32(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return (uint32_t) MurmurHash64(key,len,seed); +} + +inline MurmurInt MurmurHash(void const *key, int len, uint32_t seed=DEFAULT_SEED) +{ + return MurmurHash64(key,len,seed); +} + +#else +// 32-bit + +// Note - This code makes a few assumptions about how your machine behaves - +// 1. We can read a 4-byte value from any address without crashing +// 2. sizeof(int) == 4 +inline uint32_t MurmurHash32 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + // 'm' and 'r' are mixing constants generated offline. + // They're not really 'magic', they just happen to work well. + + const uint32_t m = 0x5bd1e995; + const int r = 24; + + // Initialize the hash to a 'random' value + + uint32_t h = seed ^ len; + + // Mix 4 bytes at a time into the hash + + const unsigned char * data = (const unsigned char *)key; + + while(len >= 4) + { + uint32_t k = *(uint32_t *)data; + + k *= m; + k ^= k >> r; + k *= m; + + h *= m; + h ^= k; + + data += 4; + len -= 4; + } + + // Handle the last few bytes of the input array + + switch(len) + { + case 3: h ^= data[2] << 16; + case 2: h ^= data[1] << 8; + case 1: h ^= data[0]; + h *= m; + }; + + // Do a few final mixes of the hash to ensure the last few + // bytes are well-incorporated. 
+ + h ^= h >> 13; + h *= m; + h ^= h >> 15; + + return h; +} + +inline MurmurInt MurmurHash ( const void * key, int len, uint32_t seed=DEFAULT_SEED) { + return MurmurHash32(key,len,seed); +} + +// 64-bit hash for 32-bit platforms + +inline uint64_t MurmurHash64 ( const void * key, int len, uint32_t seed=DEFAULT_SEED) +{ + const uint32_t m = 0x5bd1e995; + const int r = 24; + + uint32_t h1 = seed ^ len; + uint32_t h2 = 0; + + const uint32_t * data = (const uint32_t *)key; + + while(len >= 8) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + + uint32_t k2 = *data++; + k2 *= m; k2 ^= k2 >> r; k2 *= m; + h2 *= m; h2 ^= k2; + len -= 4; + } + + if(len >= 4) + { + uint32_t k1 = *data++; + k1 *= m; k1 ^= k1 >> r; k1 *= m; + h1 *= m; h1 ^= k1; + len -= 4; + } + + switch(len) + { + case 3: h2 ^= ((unsigned char*)data)[2] << 16; + case 2: h2 ^= ((unsigned char*)data)[1] << 8; + case 1: h2 ^= ((unsigned char*)data)[0]; + h2 *= m; + }; + + h1 ^= h2 >> 18; h1 *= m; + h2 ^= h1 >> 22; h2 *= m; + h1 ^= h2 >> 17; h1 *= m; + h2 ^= h1 >> 19; h2 *= m; + + uint64_t h = h1; + + h = (h << 32) | h2; + + return h; +} + +#endif +//32bit + +struct MurmurHasher { + size_t operator()(const string& s) const { + return MurmurHash(s.c_str(), s.size()); + } +}; + +struct StrMap { + StrMap() { keys_.reserve(10000); keys_.push_back(""); map_[keys_[0]] = 0; } + unordered_map map_; + vector keys_; +}; + +StrMap* stringmap_new() { + return new StrMap; +} + +void stringmap_delete(StrMap *vocab) { + delete vocab; +} + +int stringmap_index(StrMap *vocab, char *s) { + int& cell = vocab->map_[s]; + if (!cell) { + cell = vocab->keys_.size(); + vocab->keys_.push_back(s); + } + return cell; +} + +char* stringmap_word(StrMap *vocab, int i) { + return const_cast(vocab->keys_[i].c_str()); +} + diff --git a/sa-extract/strmap.h b/sa-extract/strmap.h new file mode 100644 index 00000000..a218a4c0 --- /dev/null +++ b/sa-extract/strmap.h @@ -0,0 +1,22 @@ +#ifndef _STRMAP_H_ +#define _STRMAP_H_ + +#ifdef __cplusplus + extern "C" { +#else + typedef struct StrMap StrMap; /* dummy type to stand in for class */ +#endif + +struct StrMap; + +StrMap* stringmap_new(); +void stringmap_delete(StrMap *vocab); +int stringmap_index(StrMap *vocab, char *s); +char* stringmap_word(StrMap *vocab, int i); + +#ifdef __cplusplus + } +#endif + + +#endif diff --git a/sa-extract/strutil.c b/sa-extract/strutil.c new file mode 100644 index 00000000..456de87a --- /dev/null +++ b/sa-extract/strutil.c @@ -0,0 +1,63 @@ +#include +#include + +/* Like strsep(3) except that the delimiter is a string, not a set of characters. +*/ +char *strstrsep(char **stringp, const char *delim) { + char *match, *save; + save = *stringp; + if (*stringp == NULL) + return NULL; + match = strstr(*stringp, delim); + if (match == NULL) { + *stringp = NULL; + return save; + } + *match = '\0'; + *stringp = match + strlen(delim); + return save; +} + +static char **words = NULL; +static int max_words; +char **split(char *s, const char *delim, int *pn) { + int i; + char *tok, *rest; + + if (words == NULL) { + max_words = 10; + words = malloc(max_words*sizeof(char *)); + } + i = 0; + rest = s; + while ((tok = (delim ? 
strstrsep(&rest, delim) : strsep(&rest, " \t\n"))) != NULL) { + if (!delim && !*tok) // empty token + continue; + while (i+1 >= max_words) { + max_words *= 2; + words = realloc(words, max_words*sizeof(char *)); + } + words[i] = tok; + i++; + } + words[i] = NULL; + if (pn != NULL) + *pn = i; + return words; +} + +inline int isspace(char c) { + return (c == ' ' || c == '\t' || c == '\n'); +} + +char *strip(char *s) { + int n; + while (isspace(*s) && *s != '\0') + s++; + n = strlen(s); + while (n > 0 && isspace(s[n-1])) { + s[n-1] = '\0'; + n--; + } + return s; +} diff --git a/sa-extract/strutil.h b/sa-extract/strutil.h new file mode 100644 index 00000000..94a77033 --- /dev/null +++ b/sa-extract/strutil.h @@ -0,0 +1,8 @@ +#ifndef STRUTIL_H +#define STRUTIL_H + +char *strstrsep(char **stringp, const char *delim); +char *strip(char *s); +char **split(char *s, const char *delim, int *pn); + +#endif diff --git a/sa-extract/sym.pxd b/sa-extract/sym.pxd new file mode 100644 index 00000000..d0650f46 --- /dev/null +++ b/sa-extract/sym.pxd @@ -0,0 +1,17 @@ +cimport cstrmap + +cdef class Alphabet: + cdef readonly cstrmap.StringMap terminals, nonterminals + cdef int first_nonterminal, last_nonterminal + cdef int isvar(self, int sym) + cdef int isword(self, int sym) + cdef int getindex(self, int sym) + cdef int setindex(self, int sym, int ind) + cdef int clearindex(self, int sym) + cdef int match(self, int sym1, int sym2) + cdef char* tocat(self, int sym) + cdef int fromcat(self, char *s) + cdef char* tostring(self, int sym) + cdef int fromstring(self, char *s, int terminal) + + diff --git a/sa-extract/sym.pyx b/sa-extract/sym.pyx new file mode 100644 index 00000000..264cfcd9 --- /dev/null +++ b/sa-extract/sym.pyx @@ -0,0 +1,155 @@ +from libc.string cimport strrchr, strstr, strcpy, strlen +from libc.stdlib cimport malloc, realloc, strtol + +cdef int index_shift, index_mask, n_index +index_shift = 3 +n_index = 1<= 0 + + cdef int getindex(self, int sym): + return -sym & index_mask + + cdef int setindex(self, int sym, int ind): + return -(-sym & ~index_mask | ind) + + cdef int clearindex(self, int sym): + return -(-sym& ~index_mask) + + cdef int match(self, int sym1, int sym2): + return self.clearindex(sym1) == self.clearindex(sym2); + + cdef char* tocat(self, int sym): + return self.nonterminals.word((-sym >> index_shift)-1) + + cdef int fromcat(self, char *s): + cdef int i + i = self.nonterminals.index(s) + if self.first_nonterminal == -1: + self.first_nonterminal = i + if i > self.last_nonterminal: + self.last_nonterminal = i + return -(i+1 << index_shift) + + cdef char* tostring(self, int sym): + cdef int ind + if self.isvar(sym): + if sym in id2sym: + return id2sym[sym] + + ind = self.getindex(sym) + if ind > 0: + id2sym[sym] = "[%s,%d]" % (self.tocat(sym), ind) + else: + id2sym[sym] = "[%s]" % self.tocat(sym) + return id2sym[sym] + + else: + return self.terminals.word(sym) + + cdef int fromstring(self, char *s, int terminal): + """Warning: this method is allowed to alter s.""" + cdef char *comma + cdef int n + n = strlen(s) + cdef char *sep + sep = strstr(s,"_SEP_") + if n >= 3 and s[0] == c'[' and s[n-1] == c']' and sep == NULL: + if terminal: + s1 = "\\"+s + return self.terminals.index(s1) + s[n-1] = c'\0' + s = s + 1 + comma = strrchr(s, c',') + if comma != NULL: + comma[0] = c'\0' + return self.setindex(self.fromcat(s), strtol(comma+1, NULL, 10)) + else: + return self.fromcat(s) + else: + return self.terminals.index(s) + +# Expose Python functions as top-level functions for backward compatibility 
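# --- editor's sketch, not part of the original module ------------------------
# The encoding used by Alphabet above stores a nonterminal as a negative int:
# the low index_shift bits hold the co-index of [X,1], [X,2], ..., and the
# remaining high bits hold 1 + the nonterminal's category id.  Assuming
# index_mask == (1 << index_shift) - 1, the arithmetic can be checked in
# plain Python (the helper name below is made up, for illustration only):
def _illustrate_nt_packing(index_shift=3):
    index_mask = (1 << index_shift) - 1
    pack = lambda cat_id, ind=0: -(((cat_id + 1) << index_shift) | ind)   # fromcat + setindex
    unpack_index = lambda sym: -sym & index_mask                          # getindex
    unpack_cat = lambda sym: (-sym >> index_shift) - 1                    # tocat, as a category id
    assert pack(0, 2) == -10
    assert unpack_cat(-10) == 0 and unpack_index(-10) == 2
    assert pack(1) == -16 and unpack_index(-16) == 0
# -----------------------------------------------------------------------------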
+ +alphabet = Alphabet() + +cdef Alphabet calphabet +calphabet = alphabet + +def isvar(int sym): + return calphabet.isvar(sym) + +def isword(int sym): + return calphabet.isword(sym) + +def getindex(int sym): + return calphabet.getindex(sym) + +def setindex(int sym, int ind): + return calphabet.setindex(sym, ind) + +def clearindex(int sym): + return calphabet.clearindex(sym) + +def match(int sym1, int sym2): + return calphabet.match(sym1, sym2) != 0 + +def totag(int sym): + return calphabet.tocat(sym) + +def fromtag(s): + s = s.upper() + return calphabet.fromcat(s) + +def tostring(sym): + if type(sym) is str: + return sym + else: + return calphabet.tostring(sym) + +cdef int bufsize +cdef char *buf +bufsize = 100 +buf = malloc(bufsize) +cdef ensurebufsize(int size): + global buf, bufsize + if size > bufsize: + buf = realloc(buf, size*sizeof(char)) + bufsize = size + +def fromstring(s, terminal=False): + cdef bytes bs + cdef char* cs + if terminal: + return calphabet.fromstring(s, 1) + else: + bs = s + cs = bs + ensurebufsize(len(s)+1) + strcpy(buf, cs) + return calphabet.fromstring(buf, 0) + +def nonterminals(): + cdef i + l = [] + for i from calphabet.first_nonterminal <= i <= calphabet.last_nonterminal: + l.append(-(i+1 << index_shift)) + return l -- cgit v1.2.3 From e5831933364a4cab5084db49281a855baea8e09c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 12:33:13 -0500 Subject: fix broken build --- gi/pf/Makefile.am | 2 +- gi/pf/base_distributions.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 28367e67..8d43f36d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc align_lexonly_SOURCES = align-lexonly.cc diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc index 4b1863fa..d362fd76 100644 --- a/gi/pf/base_distributions.cc +++ b/gi/pf/base_distributions.cc @@ -1,4 +1,4 @@ -#include "base_measures.h" +#include "base_distributions.h" #include -- cgit v1.2.3 From e158354f893c4720f7f5f1d9b18c62a40ad10f79 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 17:47:00 +0000 Subject: forgotten makefile --- sa-extract/Makefile | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 sa-extract/Makefile diff --git a/sa-extract/Makefile b/sa-extract/Makefile new file mode 100644 index 00000000..e2b6158d --- /dev/null +++ b/sa-extract/Makefile @@ -0,0 +1,18 @@ +PYVER=python2.7 +PYDIR=/usr +PYINCLUDE=$(PYDIR)/include/$(PYVER) +CYTHON=/usr/bin/cython +PYTHON=$(PYDIR)/bin/python + +%.c: %.pyx + $(CYTHON) $< -o $@ + +%.o: %.cc + g++ -O6 -g -fPIC -c $< + +all: cstrmap.c strmap.cc rule.c sym.c cdat.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c + $(PYTHON) setup.py build + +clean: + rm -f cdat.c cstrmap.c sym.c rule.c cintlist.c cfloatlist.c calignment.c csuf.c clex.c rulefactory.c cveb.c lcp.c precomputation.c *.so *.o *.cxx *~ *.pyc + rm -rf build -- cgit v1.2.3 From e1eae0ac941aa76528d4673dbd35f214cdac23fb Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 2 Feb 2012 17:19:40 -0500 Subject: remove some dead code to clean things up --- decoder/1dev.ur | 1 - 
decoder/apply_fsa_models.README | 21 --- decoder/cdec-gz.ini | 7 - decoder/cdec-nolm-tuned.ini | 7 - decoder/decode.sh | 10 -- decoder/do.tests.sh | 1 - decoder/fsa-decode.sh | 3 - decoder/fsa-hiero.ini | 5 - decoder/fsa.ini | 2 - decoder/glue-lda.scfg | 8 -- decoder/grammar.hiero | 151 --------------------- decoder/perro.sh | 1 - decoder/perro.ur | 1 - decoder/short.ur | 1 - decoder/weights-fsa | 14 -- decoder/weights.hiero | 10 -- dpmert/tac.pl | 8 -- expLog | 60 --------- graehl/NOTES | 18 --- graehl/NOTES.beam | 29 ----- graehl/NOTES.earley | 111 ---------------- graehl/NOTES.lm.phrase | 180 -------------------------- graehl/NOTES.partial.binarize | 21 --- graehl/NOTES.wfsa | 16 --- rescore/cdec_kbest_to_zmert.pl | 64 --------- rescore/example/README | 4 - rescore/example/cdec.ini | 2 - rescore/example/hyp.txt | 5 - rescore/example/small.scfg | 9 -- rescore/example/source.txt | 2 - rescore/example/weights | 1 - rescore/generate_zmert_params_from_weights.pl | 26 ---- rescore/rerank.pl | 86 ------------ rescore/rescore_inv_model1.pl | 126 ------------------ rescore/rescore_with_cdec_model.pl | 121 ----------------- 35 files changed, 1132 deletions(-) delete mode 100755 decoder/1dev.ur delete mode 100755 decoder/apply_fsa_models.README delete mode 100755 decoder/cdec-gz.ini delete mode 100755 decoder/cdec-nolm-tuned.ini delete mode 100755 decoder/decode.sh delete mode 100755 decoder/do.tests.sh delete mode 100755 decoder/fsa-decode.sh delete mode 100755 decoder/fsa-hiero.ini delete mode 100755 decoder/fsa.ini delete mode 100755 decoder/glue-lda.scfg delete mode 100755 decoder/grammar.hiero delete mode 100755 decoder/perro.sh delete mode 100755 decoder/perro.ur delete mode 100755 decoder/short.ur delete mode 100644 decoder/weights-fsa delete mode 100755 decoder/weights.hiero delete mode 100755 dpmert/tac.pl delete mode 100644 expLog delete mode 100755 graehl/NOTES delete mode 100755 graehl/NOTES.beam delete mode 100755 graehl/NOTES.earley delete mode 100755 graehl/NOTES.lm.phrase delete mode 100755 graehl/NOTES.partial.binarize delete mode 100755 graehl/NOTES.wfsa delete mode 100755 rescore/cdec_kbest_to_zmert.pl delete mode 100644 rescore/example/README delete mode 100644 rescore/example/cdec.ini delete mode 100644 rescore/example/hyp.txt delete mode 100644 rescore/example/small.scfg delete mode 100644 rescore/example/source.txt delete mode 100644 rescore/example/weights delete mode 100755 rescore/generate_zmert_params_from_weights.pl delete mode 100755 rescore/rerank.pl delete mode 100755 rescore/rescore_inv_model1.pl delete mode 100755 rescore/rescore_with_cdec_model.pl diff --git a/decoder/1dev.ur b/decoder/1dev.ur deleted file mode 100755 index adeaa101..00000000 --- a/decoder/1dev.ur +++ /dev/null @@ -1 +0,0 @@ -krAcy ( AstRAf rpwrtRr ) krAcy myN pyr kw mxtlf HAdvAt myN xAtwn smyt 4 AfrAd hlAk hw gyY jbkh smndr sY Ayk $xS ky lA$ mly . diff --git a/decoder/apply_fsa_models.README b/decoder/apply_fsa_models.README deleted file mode 100755 index 7e116a62..00000000 --- a/decoder/apply_fsa_models.README +++ /dev/null @@ -1,21 +0,0 @@ -trie root and trie lhs2[lhs-nodeid] -> trie node - -trie node edges (adj) - list of w,dest,p. dest==0 means it's a completed rule (note: p is redundant with node e.dest->p-p, except in case of dest=0). we will also use null_wordid (max_int) for dest=0 edges, but that doesn't matter - -we intersect by iterating over adj and scoring w/ fsa. TODO: index for sparse fsa; for now we assume smoothed ngram fsa where all items are scorable. 
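(editor's sketch, not from the decoder: the structure described above, in
throwaway python rather than the real c++ -- names are made up, dest=None
plays the role of dest==0, and weights are treated as additive log-costs.)

  class TrieNode(object):
      def __init__(self):
          self.adj = []   # list of (w, dest, p); dest is None for a completed rule

  def expand(node, fsa_state, fsa_score):
      """try every outgoing edge of one trie node against the fsa scorer."""
      for (w, dest, p) in node.adj:
          if dest is None:
              yield (None, p)                          # completed rule, rule weight only
          else:
              new_state, cost = fsa_score(fsa_state, w)
              yield ((dest, new_state), p + cost)      # advance trie and fsa together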
- -predicted items: we don't make copies of the pending predictions as we scan toward completion; instead, item backpointers are followed until the prediction (where backpointer=0). such backpointer=0 items have a queue of prediction-originating items. - -reusing completed items using a lookup on pair [NT,a] -> all [NT,a,b] lazy best-first. b-next (right state) index in lazy index. - -perhaps predictors need to register the # of items it has already mated with. (b-next index) - -comb-like (cube) t-next (position in trie node edge list), b-next? or just check chart and don't redup. depends on whether we want just 1best or kbest deriv - diff. ways of reaching same result are good in kbest. - -types of chart items: - -A->t.*,a,b (trie node t) with mutable state t-next for generating successor lazily (vs. all at once) - -A->t.B,a,b (t-next of A->t.* points to (B,t')): mutable state b-next for choosing which B->b,? to use. note: such an item can't be queued immediately on its own, but can be added to the pending list of B->b,? ; once any B->b,? is completed then we see if any more b-next are already known; if they're exhausted then we add back to pending list? - -A->a,? - list of all known (b,inside prob) such that A[a,b]. we may also choose to represent this as A->.*,a,a. diff --git a/decoder/cdec-gz.ini b/decoder/cdec-gz.ini deleted file mode 100755 index f9b15420..00000000 --- a/decoder/cdec-gz.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar.gz -weights=weights.tune.nolm diff --git a/decoder/cdec-nolm-tuned.ini b/decoder/cdec-nolm-tuned.ini deleted file mode 100755 index 5ebab747..00000000 --- a/decoder/cdec-nolm-tuned.ini +++ /dev/null @@ -1,7 +0,0 @@ -cubepruning_pop_limit=200 -feature_function=WordPenalty -feature_function=ArityPenalty -add_pass_through_rules=true -formalism=scfg -grammar=mt09.grammar -weights=weights.tune.nolm diff --git a/decoder/decode.sh b/decoder/decode.sh deleted file mode 100755 index 677e64ad..00000000 --- a/decoder/decode.sh +++ /dev/null @@ -1,10 +0,0 @@ -d=$(dirname `readlink -f $0`)/ -decode() { -if [ "$lm" ] ; then - lmargs0=-F - lmargs1="LanguageModel lm.gz -n LM" -fi -set -x -$gdb ${cdec:=$d/cdec} -c $d/${cfg:=cdec-fsa}.ini -i $d/${in:=1dev.ur} $lmargs0 "$lmargs1" --show_features --show_config --show_weights "$@" -set +x -} diff --git a/decoder/do.tests.sh b/decoder/do.tests.sh deleted file mode 100755 index b3ddeb18..00000000 --- a/decoder/do.tests.sh +++ /dev/null @@ -1 +0,0 @@ -for f in *_test; do ./$f; done diff --git a/decoder/fsa-decode.sh b/decoder/fsa-decode.sh deleted file mode 100755 index 66879523..00000000 --- a/decoder/fsa-decode.sh +++ /dev/null @@ -1,3 +0,0 @@ -d=$(dirname `readlink -f $0`)/ -. 
$d/decode.sh -in=1dev.ur cfg=cdec-fsa decode diff --git a/decoder/fsa-hiero.ini b/decoder/fsa-hiero.ini deleted file mode 100755 index 7c7d0347..00000000 --- a/decoder/fsa-hiero.ini +++ /dev/null @@ -1,5 +0,0 @@ -formalism=scfg -scfg_extra_glue_grammar=glue-lda.scfg -grammar=grammar.hiero -show_tree_structure=true -weights=weights.hiero diff --git a/decoder/fsa.ini b/decoder/fsa.ini deleted file mode 100755 index 571a2e34..00000000 --- a/decoder/fsa.ini +++ /dev/null @@ -1,2 +0,0 @@ -feature_function=ShorterThanPrev -feature_function=LongerThanPrev diff --git a/decoder/glue-lda.scfg b/decoder/glue-lda.scfg deleted file mode 100755 index 27489817..00000000 --- a/decoder/glue-lda.scfg +++ /dev/null @@ -1,8 +0,0 @@ -[S] ||| [S,1] [X0,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X0,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X1,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X1,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X2,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X2,1] ||| [1] ||| GlueTop=1 -[S] ||| [S,1] [X3,2] ||| [1] [2] ||| Glue=1 -[S] ||| [X3,1] ||| [1] ||| GlueTop=1 diff --git a/decoder/grammar.hiero b/decoder/grammar.hiero deleted file mode 100755 index 79adf33a..00000000 --- a/decoder/grammar.hiero +++ /dev/null @@ -1,151 +0,0 @@ -[X] ||| . ||| . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] . ||| [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] anciano ||| [1] old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] anciano . ||| [1] old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] anciano [X,2] ||| [1] old man [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| [X,1] feo ||| ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] feo . ||| ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] feo [X,2] ||| ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato ||| [1] cat ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato . ||| [1] cat . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] ||| [1] [2] cat ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] ||| [1] cat [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato [X,2] . ||| [1] [2] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro ||| [1] black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro . ||| [1] black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] gato negro [X,2] ||| [1] black cat [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande ||| big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande . ||| big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] grande [X,2] ||| big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro ||| black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro . ||| black [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] negro [X,2] ||| black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga ||| [1] caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga . ||| [1] caterpiller . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] oruga [X,2] ||| [1] caterpiller [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito [X,2] ||| [1] [2] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito [X,2] . ||| [1] [2] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo ||| [1] ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo . ||| [1] ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] patito feo [X,2] ||| [1] ugly duckling [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces ||| [1] fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces . ||| [1] fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] peces [X,2] ||| [1] fish [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro ||| [1] dog ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro . ||| [1] dog . ||| EgivenF=0.405465 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] ||| [1] dog [2] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] ||| [1] [2] dog ||| EgivenF=0 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro [X,2] . ||| [1] [2] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande ||| [1] big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande . ||| [1] big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] perro grande [X,2] ||| [1] big dog [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro [X,2] ||| [1] [2] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro [X,2] . ||| [1] [2] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro ||| [1] black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro . ||| [1] black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| [X,1] pájaro negro [X,2] ||| [1] black bird [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| anciano ||| old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| anciano . ||| old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| anciano [X,1] ||| old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| el ||| the ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] ||| the [1] ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] . ||| the [1] . ||| EgivenF=0.287682 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo ||| the ugly [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo . ||| the ugly [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] feo [X,2] ||| the ugly [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande ||| the big [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande . ||| the big [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] grande [X,2] ||| the big [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro ||| the black [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro . ||| the black [1] . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el [X,1] negro [X,2] ||| the black [1] [2] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato ||| the cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato . ||| the cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] ||| the [1] cat ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] ||| the cat [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato [X,1] . ||| the [1] cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro ||| the black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro . ||| the black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el gato negro [X,1] ||| the black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito [X,1] ||| the [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito [X,1] . ||| the [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo ||| the ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo . ||| the ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el patito feo [X,1] ||| the ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro ||| the dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro . ||| the dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] ||| the [1] dog ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] ||| the dog [1] ||| EgivenF=0 FgivenE=0.693147 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro [X,1] . ||| the [1] dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande ||| the big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande . ||| the big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el perro grande [X,1] ||| the big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro [X,1] ||| the [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro [X,1] . ||| the [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro ||| the black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro . ||| the black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| el pájaro negro [X,1] ||| the black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0.287682 LexFgivenE=0 -[X] ||| eso ||| that ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso [X,1] ||| that [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso [X,1] . ||| that [1] . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro ||| that dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro . ||| that dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| eso perro [X,1] ||| that dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este ||| this ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este [X,1] ||| this [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este [X,1] . ||| this [1] . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este anciano ||| this old man ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este anciano . ||| this old man . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este anciano [X,1] ||| this old man [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=1.38629 -[X] ||| este gato ||| this cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este gato . ||| this cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| este gato [X,1] ||| this cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| feo ||| ugly ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato ||| cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato . ||| cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] ||| [1] cat ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] ||| cat [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato [X,1] . ||| [1] cat . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro ||| black cat ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro . ||| black cat . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| gato negro [X,1] ||| black cat [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| grande ||| big ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| la ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga ||| the caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga . ||| the caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| la oruga [X,1] ||| the caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los ||| the ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los [X,1] ||| the [1] ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los [X,1] . ||| the [1] . ||| EgivenF=2.07944 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces ||| the fish ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces . ||| the fish . ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| los peces [X,1] ||| the fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=2.07944 LexFgivenE=0 -[X] ||| negro ||| black ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga ||| caterpiller ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga . ||| caterpiller . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| oruga [X,1] ||| caterpiller [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito ||| duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito [X,1] ||| [1] duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito [X,1] . ||| [1] duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo ||| ugly duckling ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo . ||| ugly duckling . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| patito feo [X,1] ||| ugly duckling [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces ||| fish ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces . ||| fish . 
||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| peces [X,1] ||| fish [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro ||| dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro . ||| dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] ||| [1] dog ||| EgivenF=1.09861 FgivenE=1.09861 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] ||| dog [1] ||| EgivenF=0 FgivenE=0.405465 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro [X,1] . ||| [1] dog . ||| EgivenF=1.09861 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande ||| big dog ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande . ||| big dog . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| perro grande [X,1] ||| big dog [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro ||| bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro [X,1] ||| [1] bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro [X,1] . ||| [1] bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro ||| black bird ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro . ||| black bird . ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 -[X] ||| pájaro negro [X,1] ||| black bird [1] ||| EgivenF=0 FgivenE=0 LexEgivenF=0 LexFgivenE=0 diff --git a/decoder/perro.sh b/decoder/perro.sh deleted file mode 100755 index 3e54ac71..00000000 --- a/decoder/perro.sh +++ /dev/null @@ -1 +0,0 @@ -$gdb $cdec "$@" -k 30 --show_features -c fsa-hiero.ini -i perro.ur diff --git a/decoder/perro.ur b/decoder/perro.ur deleted file mode 100755 index 6c5da6d7..00000000 --- a/decoder/perro.ur +++ /dev/null @@ -1 +0,0 @@ -eso perro feo diff --git a/decoder/short.ur b/decoder/short.ur deleted file mode 100755 index 48612801..00000000 --- a/decoder/short.ur +++ /dev/null @@ -1 +0,0 @@ -krAcy myN pyr kw mxtlf HAdvAt diff --git a/decoder/weights-fsa b/decoder/weights-fsa deleted file mode 100644 index 3cc96c2f..00000000 --- a/decoder/weights-fsa +++ /dev/null @@ -1,14 +0,0 @@ -Arity_0 1.70741473606976 -Arity_1 1.12426238048012 -Arity_2 1.14986187839554 -Glue -0.04589037041388 -LanguageModel 1.09051 -LM 1.09051 -PassThrough -3.66226367902928 -PhraseModel_0 -1.94633451863252 -PhraseModel_1 -0.1475347695476 -PhraseModel_2 -1.614818994946 -WordPenalty -3.0 -WordPenaltyFsa -0.56028442964748 -ShorterThanPrev -10 -LongerThanPrev -10 diff --git a/decoder/weights.hiero b/decoder/weights.hiero deleted file mode 100755 index 6747f059..00000000 --- a/decoder/weights.hiero +++ /dev/null @@ -1,10 +0,0 @@ -SameFirstLetter 1 -LongerThanPrev 1 -ShorterThanPrev 1 -GlueTop 0.0 -Glue -1.0 -EgivenF -0.5 -FgivenE -0.5 -LexEgivenF -0.5 -LexFgivenE -0.5 -LM 1 diff --git a/dpmert/tac.pl b/dpmert/tac.pl deleted file mode 100755 index 9fb525c1..00000000 --- a/dpmert/tac.pl +++ /dev/null @@ -1,8 +0,0 @@ -#!/usr/bin/perl - -while(<>) { - chomp; - $|=1; - print (scalar reverse($_)); - print "\n"; -} diff --git a/expLog b/expLog deleted file mode 100644 index 2070ac98..00000000 --- a/expLog +++ /dev/null @@ -1,60 +0,0 @@ -TIME MEASURES AFTER MERGE WITH cdec: -8/July/2011 -commit ed8a6e81d87f6e917ecf - -./runEval -Fri Jul 8 13:28:23 CEST 2011 -Fri Jul 8 13:30:24 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. 
-Loaded 919 references for scoring with ibm_bleu -BLEU = 32.25, 76.5|43.1|24.3|13.9 (brev=0.993) -0.322487 -Fri Jul 8 13:30:24 CEST 2011 ------------- -Fri Jul 8 15:04:00 CEST 2011 -Fri Jul 8 15:05:58 CEST 2011 -Time required for Cube Pruning execution: 77.61 seconds. ------------- -Fri Jul 8 15:24:39 CEST 2011 -Fri Jul 8 15:26:36 CEST 2011 -Time required for Cube Pruning execution: 79.01 seconds. ------------- - -./runEvalFCP -Fri Jul 8 13:33:17 CEST 2011 -Fri Jul 8 13:35:06 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. -Loaded 919 references for scoring with ibm_bleu -BLEU = 32.39, 76.5|43.1|24.5|14.0 (brev=0.994) -0.323857 -Fri Jul 8 13:35:07 CEST 2011 ------------- -Fri Jul 8 15:08:17 CEST 2011 -Fri Jul 8 15:10:05 CEST 2011 -Time required for Cube Pruning execution: 69.36 seconds. ------------- -Fri Jul 8 15:21:48 CEST 2011 -Fri Jul 8 15:23:35 CEST 2011 -Time required for Cube Pruning execution: 69.71 seconds. ------------- - -./runEvalFCP2 -Fri Jul 8 13:53:38 CEST 2011 -Fri Jul 8 13:55:29 CEST 2011 -Loading references (4 files) -Loaded reference translations for 919 sentences. -Loaded 919 references for scoring with ibm_bleu -BLEU = 32.49, 76.6|43.2|24.5|14.1 (brev=0.994) -0.324901 -Fri Jul 8 13:55:29 CEST 2011 ------------- -Fri Jul 8 15:12:52 CEST 2011 -Fri Jul 8 15:14:42 CEST 2011 -Time required for Cube Pruning execution: 72.66 seconds. ------------- -Fri Jul 8 15:19:13 CEST 2011 -Fri Jul 8 15:21:03 CEST 2011 -Time required for Cube Pruning execution: 72.06 seconds. ------------- diff --git a/graehl/NOTES b/graehl/NOTES deleted file mode 100755 index 77e99fee..00000000 --- a/graehl/NOTES +++ /dev/null @@ -1,18 +0,0 @@ -BUG: tune is bad - urdu conf=baseline tuning (16 dev bleu score???) - -conf=baseline force=1 ./tune.sh - - decode is good. - - UPDATE: maybe tuning is fine; chris never gave me a dev-corpus-filtered grammar and so a bleu of 16 may be what we always got; i just never checked. this means i need to redo tuned-first-pass experiments - -valgrind is ok - - dist-vest? - - (changes made to scoring? plusequals? shared_ptr? small_vector?) - - scorer_test is good - - - line_optimer fast_score scorer diff --git a/graehl/NOTES.beam b/graehl/NOTES.beam deleted file mode 100755 index a48d1ab7..00000000 --- a/graehl/NOTES.beam +++ /dev/null @@ -1,29 +0,0 @@ -(graehl, comments on code) - -passive chart: completion of actual translation rules (X or S NT in Hiero), have -rule features. Hyperedge inserted with copy of rule feature vector -(non-sparse). Inefficient; should be postponed on intermediate parses with -global pruning; just keep pointer to rules and models must provide an interface -to build a (sparse) feat. vector on demand later for the stuff we keep. - -multithreading: none. list of hyperarcs for refinement would need to be -segregated into subforest blocks and have own output lists for later merging. -e.g. bottom up count number of tail-reachable nodes under each hypernode, then -assign to workers. - -ngram caching: trie, no locks, for example. for threading, LRU hashing w/ locks per bucket is probably better, or per-thread caches. probably cache is reset per sentence? - -randlm worth using? guess not. - -actually get all 0-state models in 1st pass parse and prune passive edges per span. - -allocate cube pruning budget per prev pass - -(has been tried in ISI decoder) models with nonstandard state comparison, -typically (partially) greedy forest scoring, some part of the state is excluded -from equality/hashing. 
Within virtual ff interface, would just add equals, hash -to vtable (rather than the faster raw state compare). If this is done often, -then add a nonvirtual flag to interface instead, saying whether to use the -virt. methods or not. or: simple flag by user of ApplyModels (per feature?) -saying whether to be 100% greedy or 0% - no halfway e.g. item name uses bigram -context, but score using 5gram state. diff --git a/graehl/NOTES.earley b/graehl/NOTES.earley deleted file mode 100755 index 0953708c..00000000 --- a/graehl/NOTES.earley +++ /dev/null @@ -1,111 +0,0 @@ -1. fsts (modify target string produced) are quite feasible. augment fsa ff to not just emit features, but also new target words. composition or intersection is no problem (can always bunch into a single FSA/FST lazily by wrapping) - -2. sparse fsas (many transitions have -inf score) aren't efficiently supported presently (they are in early_composer where the fsa is a phrase table); the fsa ff interface doesn't even provide a way to query the non-0 transitions (you have to emit a -inf feature). if sparse fsas were expected often and we wanted exact search, then a similar index of the tcfg as in earley_composer would make sense. however, with prob. beam search, we prune out bad-scoring stuff anyway - -3. binarization of rhs isn't usually considered necessary in earley, but i liked the idea of optimal binarization making the most sharing possible. however, this means what would have just been a scan is now a scan+complete. - -4. prefix (forward) + inside cost. this is phrased in a way so that prefix includes inside. but there's no reason not to think of it in exclusive terms (outside,inside) where prefix=outside*inside when using viterbi. on the other hand, we usually use the outside*inside as the beam score. and furthermore, it looks like when summing over all derivations, there would be some difficulty calculating, as the total inside wouldn't be known at first. - -(a,i) r => (+=a*r,r) would become (o,i) r => (+=[(o*i*r)/r],r) = (+=o*i,r) -(_,b'') (a,b) => (+=a*b'',+=b*b'') would become (_,b'') (o,b) => (?????) - -==== - - -the target CFG (tcfg) is finite - absolutely no cycles. conceptually we're intersecting it with a wfsa (weights are feature vectors), which is a lot like parsing a lattice, in that states are (source state, dest state) pairs and you're covering some string(s) that go from source->dest in the wfsa. - -Chris' paper http://www.ling.umd.edu/~redpony/forest-reordering.pdf - apparently (figure 5) already contains the exact concept we're going for, albeit with only inside scores. http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz describes a nice way of computing sums over derivations given a string by keeping a tuple of ("forward","inner") scores while Earley parsing. I'm not sure yet if this is applicable (because we'll already have the exact outside scores from the -LM forest already, and plan on using cost pushing toward the top so we don't have to explicitly consider them). - -normally in earley, one word is consumed at a time, left to right. completions happen from shortest to longest, then (repeated) predictions, and finally scans. i'm sure this has the usual obvious extension to parsing lattices (proceed in some topological order). 
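(editor's aside, not part of the original notes: for reference, the plain
left-to-right string case mentioned above looks roughly like the toy python
below -- item representation and names are my own, and none of the wfsa/tcfg
machinery discussed in these notes is included.)

  def earley_recognize(grammar, start, words):
      # grammar: dict mapping a nonterminal to a list of rhs tuples
      # an item is (lhs, rhs, dot, origin); no epsilon rules assumed
      chart = [set() for _ in range(len(words) + 1)]
      chart[0].update((start, rhs, 0, 0) for rhs in grammar[start])
      for i in range(len(words) + 1):
          agenda = list(chart[i])
          while agenda:
              lhs, rhs, dot, origin = agenda.pop()
              if dot < len(rhs):
                  sym = rhs[dot]
                  if sym in grammar:                              # predict
                      for r in grammar[sym]:
                          new = (sym, r, 0, i)
                          if new not in chart[i]:
                              chart[i].add(new); agenda.append(new)
                  elif i < len(words) and sym == words[i]:        # scan
                      chart[i + 1].add((lhs, rhs, dot + 1, origin))
              else:                                               # complete
                  for plhs, prhs, pdot, porigin in list(chart[origin]):
                      if pdot < len(prhs) and prhs[pdot] == lhs:
                          new = (plhs, prhs, pdot + 1, porigin)
                          if new not in chart[i]:
                              chart[i].add(new); agenda.append(new)
      return any((start, rhs, len(rhs), 0) in chart[len(words)]
                 for rhs in grammar[start])

  # e.g. earley_recognize({'S': [('NP','VP')], 'NP': [('john',)],
  #                        'VP': [('runs',)]}, 'S', ['john', 'runs']) -> True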
- -but because the wfsa (ngram lm) has cycles and forgets the length of the string (at some point), it's slightly more complicated than lattice parsing the tcfg - there's no topological order over the wfsa states and so you can't finish all the items [x,j] for j from left->right. best first with monotonic total scores (admissable heuristics) is an easy way to avoid generating the whole space; otherwise I can't think of a fixed order that would allow for alternative beaming. as we discussed, arbitrary predicates filtering candidate items can be added if exact best-first is too slow - -if the wfsa were just a single string t[0...n-1], then any time you have an item [i,j]X->a.b that means that there's some derivation in the tCFG of S =>* t[0...i-1]Xc => t[0....i-1]abc =>* t[0...j-1]bc , for a FSA, the analog is S =>* W(0,i)Xc => W(0,i)abc =>* W(0,i)W(i,j)bc where W(a,b) means any string in the wfsa language with a as the start state and b as the final state. - - -http://www.isi.edu/natural-language/teaching/cs544/cs544-huang-3-Earley.pdf - -http://www.isi.edu/~lhuang/dp-final.pdf (variation on stolcke 1995 prefix cost) - -http://acl.ldc.upenn.edu/P/P07/P07-1019.pdf - phrase based lazy priority queue "cube growing" descendants (p149) - - - - - -http://www.speech.sri.com/cgi-bin/run-distill?ftp:papers/stolcke-cl95.ps.gz - -http://www.icsi.berkeley.edu/~stolcke/papers/cl95/node10.html#SECTION00042000000000000000 - -a) An (unconstrained) Earley path, or simply path, is a sequence of Earley -states linked by prediction, scanning, or completion. For the purpose of -this definition, we allow scanning to operate in “generation mode,” i.e., all -states with terminals to the right of the dot can be scanned, not just those -matching the input. (For completed states, the predecessor state is defined -to be the complete state from the same state set contributing to the -completion.) -b) A path is said to be constrained by, or generate a string x if the terminals -immediately to the left of the dot in all scanned states, in sequence, form -the string x. -c) A path is complete if the last state on it matches the first, except that the -dot has moved to the end of the RHS. -d) We say that a path starts with nonterminal X if the first state on it is a -predicted statewith X on the LHS. -e) The length of a path is defined as the number of scanned states on it. - -Note that the definition of path length is somewhat counter-intuitive, but is motivated -by the fact that only scanned states correspond directly to input symbols. Thus, -the length of a path is always the same as the length of the input string it generates. - -A constrained path starting with the initial state contains a sequence of states from -state set 0 derived by repeated prediction, followed by a single state from set 1 produced -by scanning the first symbol, followed by a sequence of states produced by completion, -followed by a sequence of predicted states, followed by a state scanning the second -symbol, and so on. The significance of Earley paths is that they are in a one-to-one -correspondence with left-most derivations. - - -============= - -The forward probability alpha_i(X[k]->x.y) is the sum of the probabilities of all -constrained paths of length that end in state X[k]->x.y - -b) The inner probability beta_i(X[k]->x.y) is the sum of the probabilities of all -paths of length i-k that start in state X[k,k]->.xy and end in X[k,i]->x.y, and generate the input symbols x[k,...,i-1] - -(forward,inner) [i.e. (outside,inside)?] 
- unchanged by scan (rule cost is paid up front when predicting) - -if X[k,i] -> x.Yz (a,b) and rule Y -> r (p) -then Y[i,i] -> .r (a',p) with a' += a*p - -if Y[j,i]->y. (a'',b'') and X[k,j]->r.Ys (a,b) -then X[k,i]->rY.s (a',b') with a' += a*b'', b' += b*b'' - -(this is summing over all derivations) - - -========== - -is forward cost viterbi fine? i.e. can i have items whose names ignore the lhs NT (look up predictions that i finish lazily / graph structured?) -====== - -1) A -> x . * (trie) - -this is somewhat nice. cost pushed for best first, of course. similar benefit as left-branching binarization without the explicit predict/complete steps? - -vs. just - -2) * -> x . y - -here you have to potentially list out all A -> . x y as items * -> . x y immediately, and shared rhs seqs won't be shared except at the usual single-NT predict/complete. of course, the prediction of items -> . x y can occur lazy best-first. - -vs. - -3) * -> x . * - -with 3, we predict all sorts of useless items - that won't give us our goal A and may not partcipate in any parse. this is not a good option at all. - -====== - --LM forest may have many in-edges per V. (many rules per NT lhs). so instead of generating all successors for scan/predict, i wanted to have them in sorted (admissable) -LM cost order and postpone once the prefix+rule part is more expensive than something else in the agenda. question: how many such postponed successor things diff --git a/graehl/NOTES.lm.phrase b/graehl/NOTES.lm.phrase deleted file mode 100755 index e87cc6fb..00000000 --- a/graehl/NOTES.lm.phrase +++ /dev/null @@ -1,180 +0,0 @@ -possibly the most direct solution is to print every individual probability from LM (to global fstream?). since the difference happens even w/o shortening, disable shortening to remove the possible effect of floor(p+bo) vs floor(p)+bo disagreeing - -+LM forest (nodes/edges): 2163/11293 - +LM forest (paths): 7.14685e+14 - +LM forest Viterbi logp: -490.21 - +LM forest Viterbi: karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found dead body of a person from the sea . - +LM forest derivation: ({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. 
sea) end=[sea .]->{} LanguageModelFsa=-5.74766; }({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found dead : [sea .]] r->l(dead found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.93027 h=-5.83552); }({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); }({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); }({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); }({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); }({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); }({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)}({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)}({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)}) ) ) ) ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)}({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)}) ) ) ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)}) ) ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)}({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)}({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)}) ) ) ) ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)}({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)}({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)}) ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)}) ) ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)}({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)}) ) ) ) ({<20,28>[1] the sea . ||| [found dead : [ from]] r->l(dead found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found dead : [sea .]] LanguageModelFsa=-4.84745 h=-7.62839); r->l(. 
sea the|)}({<21,27>found [1] from ||| [dead body : []] r->l(dead found|) r->l(body|dead found) rule-phrase[from] r->l(from|) = [found dead : [ from]] LanguageModelFsa=-3.42491 h=-7.62839); r->l(found|) r->l(from|)}({<22,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : []] r->l(person a|of body) = [dead body : []] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)}({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)}({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)}) ) ) ) ) ) ) - +LM forest features: Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.49;LmFsa=-446.17;PassThrough=5;PhraseModel_0=12.2199;PhraseModel_1=11.6391;PhraseModel_2=10.9878;WordPenalty=-13.0288;Unigram=-462.696;UnigramFsa=-462.696 -Output kbest to - -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.49;LmFsa=-446.17;PassThrough=5;PhraseModel_0=12.2199;PhraseModel_1=11.6391;PhraseModel_2=10.9878;WordPenalty=-13.0288;Unigram=-462.696;UnigramFsa=-462.696 ||| -490.21 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. sea) end=[sea .]->{} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found dead : [sea .]] r->l(dead found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.93027 h=-5.83552); } - ({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); 
r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)} - ({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)} - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ) - ({<20,28>[1] the sea . ||| [found dead : [ from]] r->l(dead found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found dead : [sea .]] LanguageModelFsa=-4.84745 h=-7.62839); r->l(. sea the|)} - ({<21,27>found [1] from ||| [dead body : []] r->l(dead found|) r->l(body|dead found) rule-phrase[from] r->l(from|) = [found dead : [ from]] LanguageModelFsa=-3.42491 h=-7.62839); r->l(found|) r->l(from|)} - ({<22,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : []] r->l(person a|of body) = [dead body : []] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)} - ({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) -) -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY and killed 4 people including a woman found the dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-446.828;LmFsa=-446.508;PassThrough=5;PhraseModel_0=12.697;PhraseModel_1=11.6391;PhraseModel_2=11.5728;WordPenalty=-13.4631;Unigram=-463.765;UnigramFsa=-463.765 ||| -490.295 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. 
sea) end=[sea .]->{} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [found the : [sea .]] r->l(the found|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-3.6217 h=-5.83552); } - ({<0,20>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY and : [a woman]] r->l(and gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-101.72 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,20>[2] killed [1] ||| [gyY and : [gyY and]] r->l(and gyY|) rule-phrase[killed] r->l(killed|and gyY) [4 people : [a woman]] r->l(people 4|killed and) = [gyY and : [a woman]] LanguageModelFsa=-5.57026 h=-101.72); r->l(killed|)} - ({<12,16>[2] people including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,20>[1] and ||| [gyY] r->l(and gyY|) = [gyY and : [gyY and]] LanguageModelFsa=0 h=-101.72); r->l(and|)} - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ) - ({<20,28>[1] the sea . ||| [found the : [ from]] r->l(the found|) rule-phrase[the sea .] r->l(. sea the|from ) = [found the : [sea .]] LanguageModelFsa=-4.84745 h=-5.31983); r->l(. 
sea the|)} - ({<21,27>found [1] from ||| [the dead : []] r->l(the found|) r->l(dead|the found) rule-phrase[from] r->l(from|) = [found the : [ from]] LanguageModelFsa=-5.34421 h=-5.31983); r->l(found|) r->l(from|)} - ({<22,26>the dead body of [1] ||| r->l(dead the|) rule-phrase[body of] r->l(of body|dead the) [a person : []] r->l(person a|of body) = [the dead : []] LanguageModelFsa=-3.7205 h=-4.97373); r->l(of body dead the|)} - ({<22,24>a [1] ||| [person] r->l(person a|) = [a person : []] LanguageModelFsa=0 h=-4.90016); r->l(a|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) -) -0 ||| karachi ( AstRAf rpwrtRr ) in karachi on monday in different HAdvAt gyY killed 4 people including a woman while dead body of a person from the sea . ||| Arity_0=-3.47436;Arity_1=-4.77724;Arity_2=-3.04006;Glue=5;LanguageModel=-445.419;LmFsa=-445.099;PassThrough=5;PhraseModel_0=12.5687;PhraseModel_1=12.5781;PhraseModel_2=9.61571;WordPenalty=-12.5945;Unigram=-461.303;UnigramFsa=-461.303 ||| -490.646 - -sent_id=0 -({<0,28>[1] ||| (final r->l(( karachi| ) start=[ ]->{karachi (} r->l(|. sea) end=[sea .]->{} LanguageModelFsa=-5.74766; } - ({<0,28>[1] [2] ||| [karachi ( : [a woman]] r->l(( karachi|) [while dead : [sea .]] r->l(dead while|woman a) = [karachi ( : [sea .]] LanguageModelFsa=-5.71074 h=-5.83552); } - ({<0,19>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [gyY killed : [a woman]] r->l(killed gyY|) = [karachi ( : [a woman]] LanguageModelFsa=-103.345 h=-5.83552); } - ({<0,12>[1] [2] ||| [karachi ( : [in karachi]] r->l(( karachi|) [on monday : []] r->l(monday on|karachi in) = [karachi ( : []] LanguageModelFsa=-1.99946 h=-5.83552); } - ({<0,7>[1] [2] ||| [karachi ( : [rpwrtRr )]] r->l(( karachi|) [in karachi : [in karachi]] r->l(karachi in|) rpwrtRr) = [karachi ( : [in karachi]] LanguageModelFsa=-3.40247 h=-5.83552); } - ({<0,5>[1] [2] ||| [karachi ( : []] r->l(( karachi|) [rpwrtRr ) : [rpwrtRr )]] r->l() rpwrtRr|) = [karachi ( : [rpwrtRr )]] LanguageModelFsa=-102.623 h=-5.83552); } - ({<0,3>[1] ||| [karachi ( : []] r->l(( karachi|) = [karachi ( : []] LanguageModelFsa=0 h=-5.83552); } - ({<0,3>karachi [1] ||| [( AstRAf : []] r->l(( karachi|) r->l(AstRAf|( karachi) = [karachi ( : []] LanguageModelFsa=-100 h=-5.83552); r->l(karachi|)} - ({<1,3>( [1] ||| [AstRAf] r->l(AstRAf (|) = [( AstRAf : []] LanguageModelFsa=0 h=-102.641); r->l((|)} - ({<2,3>AstRAf ||| r->l(AstRAf|) = [AstRAf] LanguageModelFsa=0 h=-100); r->l(AstRAf|)} - ) - ) - ) - ) - ({<3,5>[1] ) ||| [rpwrtRr] r->l() rpwrtRr|) = [rpwrtRr ) : [rpwrtRr )]] LanguageModelFsa=0 h=-102.623); r->l()|)} - ({<3,4>rpwrtRr ||| r->l(rpwrtRr|) = [rpwrtRr] LanguageModelFsa=0 h=-100); r->l(rpwrtRr|)} - ) - ) - ) - ({<5,7>in karachi ||| r->l(karachi in|) = [in karachi : [in karachi]] LanguageModelFsa=0 h=-3.80404); r->l(karachi in|)} - ) - ) - ({<7,12>on monday in [1] ||| r->l(monday on|) rule-phrase[in] r->l(in|monday on) [different HAdvAt : []] r->l(HAdvAt different|in monday) = [on monday : []] LanguageModelFsa=-103.918 h=-3.91305); r->l(in monday on|)} - ({<9,11>different [1] ||| [HAdvAt] r->l(HAdvAt different|) = [different HAdvAt : []] LanguageModelFsa=0 h=-103.573); r->l(different|)} - ({<10,11>HAdvAt ||| r->l(HAdvAt|) = [HAdvAt] LanguageModelFsa=0 h=-100); r->l(HAdvAt|)} - ) - ) - ) - ) - ({<12,19>[2] killed [1] ||| [gyY] r->l(killed gyY|) [4 people : [a woman]] r->l(people 4|killed gyY) = [gyY killed : [a woman]] LanguageModelFsa=-2.98475 h=-103.345); r->l(killed|)} - ({<12,16>[2] people 
including a [1] ||| [4] r->l(people 4|) rule-phrase[including a] r->l(a including|people 4) [woman] r->l(woman|a including) = [4 people : [a woman]] LanguageModelFsa=-3.99305 h=-6.22734); r->l(a including people|)} - ({<12,13>woman ||| r->l(woman|) = [woman] LanguageModelFsa=0 h=-3.82934); r->l(woman|)} - ) - ({<14,15>4 ||| r->l(4|) = [4] LanguageModelFsa=0 h=-3.62974); r->l(4|)} - ) - ) - ({<18,19>gyY ||| r->l(gyY|) = [gyY] LanguageModelFsa=0 h=-100); r->l(gyY|)} - ) - ) - ) - ({<19,28>while [1] ||| [dead body : [sea .]] r->l(dead while|) r->l(body|dead while) = [while dead : [sea .]] LanguageModelFsa=-1.20144 h=-6.25144); r->l(while|)} - ({<20,28>[1] . ||| [dead body : [the sea]] r->l(body dead|) rule-phrase[.] r->l(.|sea the) = [dead body : [sea .]] LanguageModelFsa=-0.45297 h=-4.63222); r->l(.|)} - ({<20,26>[1] the sea ||| [dead body : [ from]] r->l(body dead|) rule-phrase[the sea] r->l(sea the|from ) = [dead body : [the sea]] LanguageModelFsa=-4.39448 h=-4.63222); r->l(sea the|)} - ({<21,26>dead body of [1] ||| r->l(body dead|) rule-phrase[of] r->l(of|body dead) [a person : [ from]] r->l(person a|of body) = [dead body : [ from]] LanguageModelFsa=-2.9934 h=-4.63222); r->l(of body dead|)} - ({<21,24>a [1] from ||| [person] r->l(person a|) rule-phrase[from] r->l(from|) = [a person : [ from]] LanguageModelFsa=-2.33299 h=-4.90016); r->l(a|) r->l(from|)} - ({<23,24>person ||| r->l(person|) = [person] LanguageModelFsa=0 h=-3.50165); r->l(person|)} - ) - ) - ) - ) - ) - ) - ) -) diff --git a/graehl/NOTES.partial.binarize b/graehl/NOTES.partial.binarize deleted file mode 100755 index a9985891..00000000 --- a/graehl/NOTES.partial.binarize +++ /dev/null @@ -1,21 +0,0 @@ -Earley doesn't require binarized rules. - -But a (partially) binarized grammar may lead to smaller (exhaustive or heuristic) charts. The tradeoff is mostly more reduce steps (the # of NTs should be similar or less than the usual dotted-item binarization0. - -Optionally collapse a rule rhs to unary as well (normal binarization would stop when an rhs is binary), if the rule to collapse it exists or is frequent enough. - -Greedy binarization schemes: - -1) (repeatedly) for the most frequent rhs bigram "X a" create a binary rule "V -> X a" and replace "X a" in all rules' rhs with V. stop if the most frequent bigram has count lower than some threshold (e.g. 3), because each instance of it saves one symbol, but the new rule has 3 symbols. - -2) (repeatedly) for each rule, pick the most frequent bigram in its rhs and binarize it (2a for that rule only, 2b everywhere that bigram occurs). again, some frequency threshold. optionally allow collapsing an rhs to unary. this fails to use some substitutions that are available "for free" based on actions taken at earlier rules w/ no frequent bigrams in common with this one. - -3) (DeNero) (for complete binarization only?) for each rule until binarized, pick a split point k of L->r[0..n) to make rules L->V1 V2, V1->r[0..k) V2->r[k..n), to minimize the number of new rules created. If no prefix or suffix of r already exists as a virtual rule, then choose k=floor(n/2). To amend this to consider frequency of rhs, use the frequency of rhs-prefix/suffixes to decide where to split? - -4?) Song, Chin-Yew Lin - seems to require collecting stats from a larged parsed corpus - interesting idea: make rules that don't match fail early (that's 1 way you get a speedup), and pick V1 -> ... based on some kind of expected utility. - -5) l2r, r2l. yawn. - -1) seems the most sensible. 
don't just keep a count for each bigram, keep a set of left and right adjacent partially overlapping bigrams (i.e. the words left and right). for "a b" if "c" and "d" occur to the right, then "b c" and "b d" would be the right adjacent bigrams. when replacing a bigram, follow the left and right adjacencies to decrement the count of those bigrams, and add a (bidirectional) link to the new bigram. - -Further, partial-1) can be followed by complete-3) or 5) - although i see no reason not to just continue 1) until the grammar is binary if you want a full binarization. diff --git a/graehl/NOTES.wfsa b/graehl/NOTES.wfsa deleted file mode 100755 index b74dc810..00000000 --- a/graehl/NOTES.wfsa +++ /dev/null @@ -1,16 +0,0 @@ -left-to-right finite-state models (with heuristic) that depend only on the target string. - -http://github.com/jganitkevitch/cdec.git has some progress toward this: - -earley_generator.*: make a trie of earley dotted items (from first pass finite parse projected to target side?) and rules for each earley deduction step (is the predict step actually making a hyperedge? or is it marked "active" and so doesn't appear in the result?) - -ff_ltor.*: interface for l2r models; needless scoring of "complete" action (only heuristic changes there and heuristics can just be precomputed for all dot-items -ff_lm.*: ugly clone of regular LM model with l2r interface - -apply_models.*: ApplyLeftToRightModelSet - -l2r features: - -multiple feature ids from single model? - -declare markov bound for bottom-up scoring (inside items) wrapper, and "backoff start" state (i.e. empty context, not context) diff --git a/rescore/cdec_kbest_to_zmert.pl b/rescore/cdec_kbest_to_zmert.pl deleted file mode 100755 index 88bc9682..00000000 --- a/rescore/cdec_kbest_to_zmert.pl +++ /dev/null @@ -1,64 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -use Getopt::Long; - -my $feature_file; -my $hyp_file; -my $help; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "feature_file|f=s" => \$feature_file, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$feature_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$feature_file" or die "Can't read $feature_file: $!"; -my %weights; -my @all_feats; -while() { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - push @all_feats, $fname; - $weights{$fname} = 1; -} -close W; - -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - my @afeats = split /\s+/, $feats; - my $tot = 0; - my %fvaldict; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - $fvaldict{$fname} = $fval; - my $weight = $weights{$fname}; - warn "Feature '$fname' not mentioned in feature file $feature_file" unless defined $weight; - $weights{$fname} = 1; - } - my @trans; - for my $feat (@all_feats) { - my $v = $fvaldict{$feat}; - if (!defined $v) { $v = '0.0'; } - push @trans, $v; - } - print "$id ||| $hyp ||| @trans\n"; -} -close HYP; - -sub usage { - print <) { - next if /^#/; - chomp; - next if /^\s*$/; - s/^\s+//; - s/\s+$//; - my ($a,$b) = split /\s+/; - next unless ($a && $b); - my $line = $DEFAULT; - if ($defaults{$a}) { $line = $defaults{$a}; } - print "$a\t|||\t$b\t$line\n"; -} - -print "normalization = none\n"; - diff --git a/rescore/rerank.pl b/rescore/rerank.pl deleted file mode 100755 index 4a0c5750..00000000 --- a/rescore/rerank.pl +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use 
utf8; -use Getopt::Long; - -my $weights_file; -my $hyp_file; -my $help; -my $kbest; # flag to extract reranked list - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "weights_file|w=s" => \$weights_file, - "hypothesis_file|h=s" => \$hyp_file, - "kbest" => \$kbest, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$weights_file || !$hyp_file) { - usage(); - exit(1); -} - -open W, "<$weights_file" or die "Can't read $weights_file: $!"; -my %weights; -while() { - chomp; - next if /^#/; - next if /^\s*$/; - my ($fname, $w) = split /\s+/; - $weights{$fname} = $w; -} -close W; - -my $cur = undef; -my %hyps = (); -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - if ($cur ne $id) { - extract_1best($cur, \%hyps); - $cur = $id; - %hyps = (); - } - my @afeats = split /\s+/, $feats; - my $tot = 0; - for my $featpair (@afeats) { - my ($fname,$fval) = split /=/, $featpair; - my $weight = $weights{$fname}; - die "Unweighted feature '$fname'" unless defined $weight; - $tot += ($weight * $fval); - } - $hyps{"$hyp ||| $feats"} = $tot; -} -extract_1best($cur, \%hyps) if defined $cur; -close HYP; - -sub extract_1best { - my ($id, $rh) = @_; - my %hyps = %$rh; - if ($kbest) { - for my $hyp (sort { $hyps{$b} <=> $hyps{$a} } keys %hyps) { - print "$id ||| $hyp\n"; - } - } else { - my $best_score = undef; - my $best_hyp = undef; - for my $hyp (keys %hyps) { - if (!defined $best_score || $hyps{$hyp} > $best_score) { - $best_score = $hyps{$hyp}; - $best_hyp = $hyp; - } - } - $best_hyp =~ s/ \|\|\|.*$//; - print "$best_hyp\n"; - } -} - -sub usage { - print < \$model_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$model_file || !$src_file || !$hyp_file) { - usage(); - exit; -} - -binmode STDIN, ":utf8"; -binmode STDOUT, ":utf8"; -binmode STDERR, ":utf8"; - -print STDERR "Reading Model 1 probabilities from $model_file...\n"; -open M, "<$model_file" or die "Couldn't read $model_file: $!"; -binmode M, ":utf8"; -my %m1; -while(){ - chomp; - my ($e,$f,$lp) = split /\s+/; - die unless defined $e; - die unless defined $f; - die unless defined $lp; - $m1{$f}->{$e} = $lp; -} -close M; - -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -print STDERR "Rescoring...\n"; - -my $cur = undef; -my @hyps = (); -my @feats = (); -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - rescore($cur, $source[$cur], \@hyps, \@feats); - $cur = $id; - @hyps = (); - @feats = (); - } - push @hyps, $hyp; - push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -sub rescore { - my ($id, $src, $rh, $rf) = @_; - my @hyps = @$rh; - my @feats = @$rf; - my $nhyps = scalar @hyps; - my %cache = (); - print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; - for (my $i=0; $i < $nhyps; $i++) { - my $score = $cache{$hyps[$i]}; - if (!defined $score) { - if ($reverse_model) { - die "not implemented"; - } else { 
- $score = m1_prob($src, $hyps[$i]); - } - $cache{$hyps[$i]} = $score; - } - print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; - } - -} - -sub m1_prob { - my ($fsent, $esent) = @_; - die unless defined $fsent; - die unless defined $esent; - my @fwords = split /\s+/, $fsent; - my @ewords = split /\s+/, $esent; - push @ewords, ""; - my $tp = 0; - for my $f (@fwords) { - my $m1f = $m1{$f}; - if (!defined $m1f) { $m1f = {}; } - my $tfp = 0; - for my $e (@ewords) { - my $lp = $m1f->{$e}; - if (!defined $lp) { $lp = -100; } - #print "P($f|$e) = $lp\n"; - my $prob = exp($lp); - #if ($prob > $tfp) { $tfp = $prob; } - $tfp += $prob; - } - $tp += log($tfp); - $tp -= log(scalar @ewords); # uniform probability of each generating word - } - return $tp; -} - -sub usage { - print STDERR "Usage: $0 -m model_file.txt -h hypothesis.nbest -s source.txt\n Adds the back-translation probability under Model 1\n Use training/model1 to generate the required parameter file\n"; -} - - diff --git a/rescore/rescore_with_cdec_model.pl b/rescore/rescore_with_cdec_model.pl deleted file mode 100755 index cdd8c217..00000000 --- a/rescore/rescore_with_cdec_model.pl +++ /dev/null @@ -1,121 +0,0 @@ -#!/usr/bin/perl -w - -use strict; -use utf8; -my @ORIG_ARGV=@ARGV; -use Cwd qw(getcwd); -my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR, "$SCRIPT_DIR/../environment"; } -use LocalConfig; -use Getopt::Long; -use IPC::Open2; -use POSIX ":sys_wait_h"; - -my $decoder = "$SCRIPT_DIR/../decoder/cdec"; -my $help; -my $cdec_ini; -my $src_file; -my $hyp_file; -my $reverse_model; -my $weights_file; -my $feature_name='NewModel'; - -sub catch_pipe { - my $signame = shift; - die "$0 received SIGPIPE: did the decoder die?\n"; -} -$SIG{PIPE} = \&catch_pipe; - -Getopt::Long::Configure("no_auto_abbrev"); -if (GetOptions( - "config|c=s" => \$cdec_ini, - "weights|w=s" => \$weights_file, - "source_file|s=s" => \$src_file, - "feature_name|f=s" => \$feature_name, - "hypothesis_file|h=s" => \$hyp_file, - "reverse" => \$reverse_model, # if true translate hyp -> src - "decoder=s" => \$decoder, - "help" => \$help, -) == 0 || @ARGV!=0 || $help || !$cdec_ini || !$src_file || !$hyp_file) { - usage(); - exit; -} -die "Can't find $decoder" unless -f $decoder; -die "Can't run $decoder" unless -x $decoder; -my $weights = ''; -if (defined $weights_file) { - die "Can't read $weights_file" unless -f $weights_file; - $weights = "-w $weights_file"; -} -my $decoder_command = "$decoder -c $cdec_ini --quiet $weights --show_conditional_prob"; -print STDERR "DECODER COMMAND: $decoder_command\n"; -my $cdec_pid = open2(\*CDEC_IN, \*CDEC_OUT, $decoder_command) - or die "Couldn't run $decoder: $!"; -sleep 1; - -die "Can't find $cdec_ini" unless -f $cdec_ini; -open SRC, "<$src_file" or die "Can't read $src_file: $!"; -open HYP, "<$hyp_file" or die "Can't read $hyp_file: $!"; -binmode(SRC,":utf8"); -binmode(HYP,":utf8"); -binmode(STDOUT,":utf8"); -my @source; while(){chomp; push @source, $_; } -close SRC; -my $src_len = scalar @source; -print STDERR "Read $src_len sentences...\n"; -binmode(CDEC_IN, ":utf8"); -binmode(CDEC_OUT, ":utf8"); - -my $cur = undef; -my @hyps = (); -my @feats = (); -while() { - chomp; - my ($id, $hyp, $feats) = split / \|\|\| /; - unless (defined $cur) { $cur = $id; } - die "sentence ids in k-best list file must be between 0 and $src_len" if $id < 0 || $id > $src_len; - if ($cur ne $id) { - rescore($cur, $source[$cur], \@hyps, \@feats); - $cur = $id; - @hyps = 
(); - @feats = (); - } - push @hyps, $hyp; - push @feats, $feats; -} -rescore($cur, $source[$cur], \@hyps, \@feats) if defined $cur; - -close CDEC_IN; -close CDEC_OUT; -close HYP; -waitpid($cdec_pid, 0); -my $status = $? >> 8; -if ($status != 0) { - print STDERR "Decoder returned bad status!\n"; -} - -sub rescore { - my ($id, $src, $rh, $rf) = @_; - my @hyps = @$rh; - my @feats = @$rf; - my $nhyps = scalar @hyps; - print STDERR "RESCORING SENTENCE id=$id (# hypotheses=$nhyps)...\n"; - for (my $i=0; $i < $nhyps; $i++) { - if ($reverse_model) { - print CDEC_OUT "$hyps[$i] ||| $src\n"; - } else { - print CDEC_OUT "$src ||| $hyps[$i]\n"; - } - my $score = ; - chomp $score; - my @words = split /\s+/, $hyps[$i]; - print "$id ||| $hyps[$i] ||| $feats[$i] $feature_name=$score\n"; - } -} - -sub usage { - print < Date: Fri, 3 Feb 2012 17:19:16 -0500 Subject: make pro use new interface --- .gitignore | 77 ++++++++++++++++++++++++++++++++++++++++--------- mteval/ns.cc | 4 +++ mteval/ns.h | 4 +++ mteval/ns_ter.h | 1 + pro-train/dist-pro.pl | 4 +-- pro-train/mr_pro_map.cc | 37 +++++++++++++++--------- 6 files changed, 98 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index 5efe37b0..ab8bf2c7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,46 @@ +mira/kbest_mira +sa-extract/calignment.c +sa-extract/calignment.so +sa-extract/cdat.c +sa-extract/cdat.so +sa-extract/cfloatlist.c +sa-extract/cfloatlist.so +sa-extract/cintlist.c +sa-extract/cintlist.so +sa-extract/clex.c +sa-extract/clex.so +sa-extract/cn.pyc +sa-extract/context_model.pyc +sa-extract/cstrmap.c +sa-extract/cstrmap.so +sa-extract/csuf.c +sa-extract/csuf.so +sa-extract/cveb.c +sa-extract/cveb.so +sa-extract/lcp.c +sa-extract/lcp.so +sa-extract/log.pyc +sa-extract/manager.pyc +sa-extract/model.pyc +sa-extract/monitor.pyc +sa-extract/precomputation.c +sa-extract/precomputation.so +sa-extract/rule.c +sa-extract/rule.so +sa-extract/rulefactory.c +sa-extract/rulefactory.so +sa-extract/sgml.pyc +sa-extract/sym.c +sa-extract/sym.so +training/mpi_flex_optimize +training/test_ngram +utils/dict_test +utils/logval_test +utils/mfcr_test +utils/phmt +utils/small_vector_test +utils/ts +utils/weights_test pro-train/.deps pro-train/mr_pro_map pro-train/mr_pro_reduce @@ -38,8 +81,8 @@ utils/.deps/ utils/libutils.a *swp *.o -vest/sentserver -vest/sentclient +dpmert/sentserver +dpmert/sentclient gi/pyp-topics/src/contexts_lexer.cc config.guess config.sub @@ -61,12 +104,12 @@ training/mr_em_map_adapter training/mr_reduce_to_weights training/optimize_test training/plftools -vest/fast_score -vest/lo_test -vest/mr_vest_map -vest/mr_vest_reduce -vest/scorer_test -vest/union_forests +dpmert/fast_score +dpmert/lo_test +dpmert/mr_dpmert_map +dpmert/mr_dpmert_reduce +dpmert/scorer_test +dpmert/union_forests Makefile Makefile.in aclocal.m4 @@ -99,11 +142,11 @@ training/Makefile.in training/*.o training/grammar_convert training/model1 -vest/.deps/ -vest/Makefile -vest/Makefile.in -vest/mr_vest_generate_mapper_input -vest/*.o +dpmert/.deps/ +dpmert/Makefile +dpmert/Makefile.in +dpmert/mr_dpmert_generate_mapper_input +dpmert/*.o decoder/logval_test extools/build_lexical_translation extools/filter_grammar @@ -124,7 +167,6 @@ m4/ltoptions.m4 m4/ltsugar.m4 m4/ltversion.m4 m4/lt~obsolete.m4 -vest/mbr_kbest extools/featurize_grammar extools/filter_score_grammar gi/posterior-regularisation/prjava/build/ @@ -143,3 +185,10 @@ gi/posterior-regularisation/prjava/lib/prjava-20100715.jar *.ps *.toc *~ +gi/pf/align-lexonly +gi/pf/align-lexonly-pyp +gi/pf/condnaive 
+mteval/scorer_test +phrasinator/gibbs_train_plm +phrasinator/gibbs_train_plm_notables +.* diff --git a/mteval/ns.cc b/mteval/ns.cc index da678b84..788f809a 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -21,6 +21,10 @@ map EvaluationMetric::instances_; SegmentEvaluator::~SegmentEvaluator() {} EvaluationMetric::~EvaluationMetric() {} +bool EvaluationMetric::IsErrorMetric() const { + return false; +} + struct DefaultSegmentEvaluator : public SegmentEvaluator { DefaultSegmentEvaluator(const vector >& refs, const EvaluationMetric* em) : refs_(refs), em_(em) {} void Evaluate(const vector& hyp, SufficientStats* out) const { diff --git a/mteval/ns.h b/mteval/ns.h index d88c263b..4e4c6975 100644 --- a/mteval/ns.h +++ b/mteval/ns.h @@ -94,6 +94,10 @@ class EvaluationMetric { public: const std::string& MetricId() const { return name_; } + // returns true for metrics like WER and TER where lower scores are better + // false for metrics like BLEU and METEOR where higher scores are better + virtual bool IsErrorMetric() const; + virtual unsigned SufficientStatisticsVectorSize() const; virtual float ComputeScore(const SufficientStats& stats) const = 0; virtual std::string DetailedScore(const SufficientStats& stats) const; diff --git a/mteval/ns_ter.h b/mteval/ns_ter.h index 3190fc1b..c5c25413 100644 --- a/mteval/ns_ter.h +++ b/mteval/ns_ter.h @@ -9,6 +9,7 @@ class TERMetric : public EvaluationMetric { TERMetric() : EvaluationMetric("TER") {} public: + virtual bool IsErrorMetric() const; virtual unsigned SufficientStatisticsVectorSize() const; virtual std::string DetailedScore(const SufficientStats& stats) const; virtual void ComputeSufficientStatistics(const std::vector& hyp, diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl index ba9cdc06..31258fa6 100755 --- a/pro-train/dist-pro.pl +++ b/pro-train/dist-pro.pl @@ -12,7 +12,7 @@ use POSIX ":sys_wait_h"; my $QSUB_CMD = qsub_args(mert_memory()); my $default_jobs = env_default_jobs(); -my $VEST_DIR="$SCRIPT_DIR/../vest"; +my $VEST_DIR="$SCRIPT_DIR/../dpmert"; require "$VEST_DIR/libcall.pl"; # Default settings @@ -338,7 +338,7 @@ while (1){ $mapoutput =~ s/mapinput/mapoutput/; push @mapoutputs, "$dir/splag.$im1/$mapoutput"; $o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard"; - my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; + my $script = "$MAPPER -s $srcFile -m $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput"; if ($use_make) { my $script_file = "$dir/scripts/map.$shard"; open F, ">$script_file" or die "Can't write $script_file: $!"; diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc index 0a9b75d7..52b67f32 100644 --- a/pro-train/mr_pro_map.cc +++ b/pro-train/mr_pro_map.cc @@ -13,11 +13,12 @@ #include "filelib.h" #include "stringlib.h" #include "weights.h" -#include "scorer.h" #include "inside_outside.h" #include "hg_io.h" #include "kbest.h" #include "viterbi.h" +#include "ns.h" +#include "ns_docscorer.h" // This is Figure 4 (Algorithm Sampler) from Hopkins&May (2011) @@ -80,7 +81,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("kbest_repository,K",po::value()->default_value("./kbest"),"K-best list repository (directory)") ("input,i",po::value()->default_value("-"), "Input file to map (- is STDIN)") ("source,s",po::value()->default_value(""), "Source file (ignored, except for AER)") - ("loss_function,l",po::value()->default_value("ibm_bleu"), 
"Loss function being optimized") + ("evaluation_metric,m",po::value()->default_value("IBM_BLEU"), "Evaluation metric (ibm_bleu, koehn_bleu, nist_bleu, ter, meteor, etc.)") ("kbest_size,k",po::value()->default_value(1500u), "Top k-hypotheses to extract") ("candidate_pairs,G", po::value()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)") ("best_pairs,X", po::value()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)") @@ -109,9 +110,12 @@ struct HypInfo { HypInfo(const vector& h, const SparseVector& feats) : hyp(h), g_(-100.0f), x(feats) {} // lazy evaluation - double g(const SentenceScorer& scorer) const { - if (g_ == -100.0f) - g_ = scorer.ScoreCandidate(hyp)->ComputeScore(); + double g(const SegmentEvaluator& scorer, const EvaluationMetric* metric) const { + if (g_ == -100.0f) { + SufficientStats ss; + scorer.Evaluate(hyp, &ss); + g_ = metric->ComputeScore(ss); + } return g_; } vector hyp; @@ -233,15 +237,21 @@ struct DiffOrder { } }; -void Sample(const unsigned gamma, const unsigned xi, const vector& J_i, const SentenceScorer& scorer, const bool invert_score, vector* pv) { +void Sample(const unsigned gamma, + const unsigned xi, + const vector& J_i, + const SegmentEvaluator& scorer, + const EvaluationMetric* metric, + vector* pv) { + const bool invert_score = metric->IsErrorMetric(); vector v1, v2; float avg_diff = 0; for (unsigned i = 0; i < gamma; ++i) { const size_t a = rng->inclusive(0, J_i.size() - 1)(); const size_t b = rng->inclusive(0, J_i.size() - 1)(); if (a == b) continue; - float ga = J_i[a].g(scorer); - float gb = J_i[b].g(scorer); + float ga = J_i[a].g(scorer, metric); + float gb = J_i[b].g(scorer, metric); bool positive = gb < ga; if (invert_score) positive = !positive; const float gdiff = fabs(ga - gb); @@ -288,11 +298,12 @@ int main(int argc, char** argv) { rng.reset(new MT19937(conf["random_seed"].as())); else rng.reset(new MT19937); - const string loss_function = conf["loss_function"].as(); + const string evaluation_metric = conf["evaluation_metric"].as(); + + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); + DocumentScorer ds(metric, conf["reference"].as >()); + cerr << "Loaded " << ds.size() << " references for scoring with " << evaluation_metric << endl; - ScoreType type = ScoreTypeFromString(loss_function); - DocScorer ds(type, conf["reference"].as >(), conf["source"].as()); - cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl; Hypergraph hg; string last_file; ReadFile in_read(conf["input"].as()); @@ -335,7 +346,7 @@ int main(int argc, char** argv) { Dedup(&J_i); WriteKBest(kbest_file, J_i); - Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v); + Sample(gamma, xi, J_i, *ds[sent_id], metric, &v); for (unsigned i = 0; i < v.size(); ++i) { const TrainingInstance& vi = v[i]; cout << vi.y << "\t" << vi.x << endl; -- cgit v1.2.3 From 3a2fc36378337147a956e439db31baf91bfb95c8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 18:03:49 -0500 Subject: escaping tool for grammar extractor --- mteval/ns_ter.cc | 4 ++++ sa-extract/Makefile | 4 ++-- sa-extract/README | 14 +++++++++++++- sa-extract/escape-testset.pl | 35 +++++++++++++++++++++++++++++++++++ sa-extract/example/README | 2 +- 5 files changed, 55 insertions(+), 4 deletions(-) create mode 100755 sa-extract/escape-testset.pl diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc index 91a17f0d..0e1008db 100644 --- a/mteval/ns_ter.cc +++ b/mteval/ns_ter.cc @@ -22,6 +22,10 @@ 
static const unsigned kDUMMY_LAST_ENTRY = 5; using namespace std; using namespace std::tr1; +bool TERMetric::IsErrorMetric() const { + return true; +} + namespace NewScorer { struct COSTS { diff --git a/sa-extract/Makefile b/sa-extract/Makefile index e2b6158d..7b39ae4d 100644 --- a/sa-extract/Makefile +++ b/sa-extract/Makefile @@ -1,7 +1,7 @@ PYVER=python2.7 -PYDIR=/usr +PYDIR=/usr/local/Cellar/python/2.7.2 PYINCLUDE=$(PYDIR)/include/$(PYVER) -CYTHON=/usr/bin/cython +CYTHON=/usr/local/share/python/cython PYTHON=$(PYDIR)/bin/python %.c: %.pyx diff --git a/sa-extract/README b/sa-extract/README index f43e58cc..e4022c7e 100644 --- a/sa-extract/README +++ b/sa-extract/README @@ -28,10 +28,22 @@ COMPILING A PARALLEL CORPUS AND WORD ALIGNMENT -a alignment_name=alignment.txt > extract.ini + The training data should be in two parallel text files (source.fr,source.en) + and the alignments are expected in "0-0 1-2 2-1 ..." format produced by + most alignment toolkits. The text files should NOT be escaped for non-XML + characters. + + EXTRACTION OF PER-SENTENCE GRAMMARS ============================================================================== +The most common use-case we support is extraction of "per-sentence" grammars +for each segment in a testset. You may run the extractor on test set, but it +will try to interpret tags as SGML markup, so we provide a script that does +escaping: ./escape-testset.pl. + - Example: - cat test.fr | extractor.py -c extract.ini + + cat test.fr | ./escape-testset.pl | ./extractor.py -c extract.ini EXTRACTION OF COMPLETE TEST-SET GRAMMARS diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl new file mode 100755 index 00000000..02fd7445 --- /dev/null +++ b/sa-extract/escape-testset.pl @@ -0,0 +1,35 @@ +#!/usr/bin/perl -w + +use utf8; +use strict; + +binmode(STDIN,":utf8"); +binmode(STDOUT,":utf8"); + +my @fh = (); +if (scalar @ARGV == 0) { + push @fh, \*STDIN; +} else { + for my $file (@ARGV) { + my $f; + open $f, "<$file" or die "Can't read $file: $!\n"; + binmode $f, ":utf8"; + push @fh, $f; + } +} + +my $id = -1; +for my $f (@fh) { + while(<$f>) { + chomp; + die "Empty line in test set" if /^\s*$/; + die "Please remove tags from input:\n$_" if /^\s*/\>/g; + print " $_ \n"; + } +} + + diff --git a/sa-extract/example/README b/sa-extract/example/README index 9819ba5f..f6eac52b 100644 --- a/sa-extract/example/README +++ b/sa-extract/example/README @@ -4,5 +4,5 @@ Commands to compile a corpus and extract some grammars # compile ../sa-compile.pl -b nc=corpus.de.gz,corpus.en.gz -a gdfa=corpus.align.gz > extract.ini # extract -cat test.de | ../extractor.py -c extract.ini +cat test.de | ../escape-testset.pl | ../extractor.py -c extract.ini -- cgit v1.2.3 From f740c843d6ea88484f84152e5fb02a481192ae41 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 20:55:10 -0500 Subject: use interface properly --- dpmert/mr_dpmert_reduce.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dpmert/mr_dpmert_reduce.cc b/dpmert/mr_dpmert_reduce.cc index dda61f88..31512a03 100644 --- a/dpmert/mr_dpmert_reduce.cc +++ b/dpmert/mr_dpmert_reduce.cc @@ -34,10 +34,10 @@ int main(int argc, char** argv) { po::variables_map conf; InitCommandLine(argc, argv, &conf); const string evaluation_metric = conf["evaluation_metric"].as(); + EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE; - if (UppercaseString(evaluation_metric) == "TER") + if (metric->IsErrorMetric()) 
opt_type = LineOptimizer::MINIMIZE_SCORE; - EvaluationMetric* metric = EvaluationMetric::Instance(evaluation_metric); vector esv; string last_key, line, key, val; -- cgit v1.2.3 From 7da167c93ca2c10649cb0292fa2818ca8d5c76ff Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 20:59:00 -0500 Subject: remove dead code --- dpmert/cat.pl | 4 ---- 1 file changed, 4 deletions(-) delete mode 100755 dpmert/cat.pl diff --git a/dpmert/cat.pl b/dpmert/cat.pl deleted file mode 100755 index 2ecba3f9..00000000 --- a/dpmert/cat.pl +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/perl - -$|=1; -print while(<>); -- cgit v1.2.3 From d91750f35d4d7edfc77a589ae92100d523068ad7 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 3 Feb 2012 21:11:40 -0500 Subject: fix broken build --- gi/pf/align-lexonly-pyp.cc | 2 +- gi/pf/align-lexonly.cc | 2 +- gi/pf/conditional_pseg.h | 2 +- gi/pf/condnaive.cc | 2 +- gi/pf/dpnaive.cc | 2 +- gi/pf/monotonic_pseg.h | 2 +- gi/pf/pfdist.cc | 2 +- gi/pf/pfnaive.cc | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index d2630a2b..e24cb457 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -7,7 +7,7 @@ #include #include "array2d.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "conditional_pseg.h" #include "trule.h" diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 76e2e009..8c1d689f 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -7,7 +7,7 @@ #include #include "array2d.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "conditional_pseg.h" #include "trule.h" diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index db951d15..0aa5e8e0 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -10,7 +10,7 @@ #include "ccrp_nt.h" #include "mfcr.h" #include "trule.h" -#include "base_measures.h" +#include "base_distributions.h" #include "tdict.h" template diff --git a/gi/pf/condnaive.cc b/gi/pf/condnaive.cc index 52ddbbfe..3ea88016 100644 --- a/gi/pf/condnaive.cc +++ b/gi/pf/condnaive.cc @@ -6,7 +6,7 @@ #include #include -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "conditional_pseg.h" #include "trule.h" diff --git a/gi/pf/dpnaive.cc b/gi/pf/dpnaive.cc index db1c43c7..469dff5c 100644 --- a/gi/pf/dpnaive.cc +++ b/gi/pf/dpnaive.cc @@ -6,7 +6,7 @@ #include #include -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include "trule.h" #include "tdict.h" diff --git a/gi/pf/monotonic_pseg.h b/gi/pf/monotonic_pseg.h index 301aa6d8..10d171fe 100644 --- a/gi/pf/monotonic_pseg.h +++ b/gi/pf/monotonic_pseg.h @@ -6,7 +6,7 @@ #include "prob.h" #include "ccrp_nt.h" #include "trule.h" -#include "base_measures.h" +#include "base_distributions.h" template struct MonotonicParallelSegementationModel { diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index aae5f798..ef08a165 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -7,7 +7,7 @@ #include #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "reachability.h" #include "viterbi.h" #include "hg.h" diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index 728ec00d..acba9d22 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -7,7 +7,7 @@ #include #include "pf.h" -#include "base_measures.h" +#include "base_distributions.h" #include "monotonic_pseg.h" #include 
"reachability.h" #include "viterbi.h" -- cgit v1.2.3 From 648fd70ec05997003e801e113d825c84e55e01ca Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 8 Feb 2012 16:22:55 -0500 Subject: move widely duplicated math functions into m.h header --- .gitignore | 1 + gi/pf/base_distributions.cc | 22 +++++------ gi/pf/base_distributions.h | 21 +--------- gi/pf/conditional_pseg.h | 3 +- gi/pf/pfdist.cc | 6 +-- gi/pf/pfnaive.cc | 4 +- phrasinator/gibbs_train_plm.cc | 8 +--- utils/Makefile.am | 5 ++- utils/m.h | 89 ++++++++++++++++++++++++++++++++++++++++++ utils/m_test.cc | 75 +++++++++++++++++++++++++++++++++++ utils/mfcr.h | 22 ++--------- 11 files changed, 194 insertions(+), 62 deletions(-) create mode 100644 utils/m.h create mode 100644 utils/m_test.cc diff --git a/.gitignore b/.gitignore index ab8bf2c7..4f75d153 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ mira/kbest_mira +utils/m_test sa-extract/calignment.c sa-extract/calignment.so sa-extract/cdat.c diff --git a/gi/pf/base_distributions.cc b/gi/pf/base_distributions.cc index d362fd76..d9761005 100644 --- a/gi/pf/base_distributions.cc +++ b/gi/pf/base_distributions.cc @@ -59,7 +59,7 @@ prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector& vsrc, const int flen = vsrc.size() - start_src; const int elen = vtrg.size() - start_trg; prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) //p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) p *= u(vtrg[i + start_trg]); // draw e_i ~Uniform @@ -73,7 +73,7 @@ prob_t PhraseConditionalUninformativeBase::p0(const vector& vsrc, const int elen = vtrg.size() - start_trg; prob_t p; //p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) - p.logeq(log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, 1)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) p *= kUNIFORM_TARGET; // draw e_i ~Uniform return p; @@ -113,7 +113,7 @@ prob_t PhraseConditionalBase::p0(const vector& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) + p.logeq(Md::log_poisson(elen, flen + 0.01)); // elen | flen ~Pois(flen + 0.01) for (int i = 0; i < elen; ++i) { // for each position i in e-RHS const WordID trg = vtrg[i + start_trg]; prob_t tp = prob_t::Zero(); @@ -139,9 +139,9 @@ prob_t PhraseJointBase::p0(const vector& vsrc, const int elen = vtrg.size() - start_trg; prob_t uniform_src_alignment; uniform_src_alignment.logeq(-log(flen + 1)); prob_t p; - p.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p *= ptrglen; p *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -171,9 +171,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, prob_t uniform_trg_alignment; uniform_trg_alignment.logeq(-log(elen + 1)); prob_t p1; - p1.logeq(log_poisson(flen, 1.0)); // flen ~Pois(1) + p1.logeq(Md::log_poisson(flen, 1.0)); // flen ~Pois(1) // elen | flen ~Pois(flen + 0.01) - prob_t ptrglen; ptrglen.logeq(log_poisson(elen, flen + 0.01)); + prob_t ptrglen; 
ptrglen.logeq(Md::log_poisson(elen, flen + 0.01)); p1 *= ptrglen; p1 *= kUNIFORM_SOURCE.pow(flen); // each f in F ~Uniform for (int i = 0; i < elen; ++i) { // for each position i in E @@ -193,9 +193,9 @@ prob_t PhraseJointBase_BiDir::p0(const vector& vsrc, } prob_t p2; - p2.logeq(log_poisson(elen, 1.0)); // elen ~Pois(1) + p2.logeq(Md::log_poisson(elen, 1.0)); // elen ~Pois(1) // flen | elen ~Pois(flen + 0.01) - prob_t psrclen; psrclen.logeq(log_poisson(flen, elen + 0.01)); + prob_t psrclen; psrclen.logeq(Md::log_poisson(flen, elen + 0.01)); p2 *= psrclen; p2 *= kUNIFORM_TARGET.pow(elen); // each f in F ~Uniform for (int i = 0; i < flen; ++i) { // for each position i in E @@ -227,9 +227,9 @@ JumpBase::JumpBase() : p(200) { for (int j = min_jump; j <= max_jump; ++j) { prob_t& cp = cpd[j]; if (j < 0) - cp.logeq(log_poisson(1.5-j, 1)); + cp.logeq(Md::log_poisson(1.5-j, 1)); else if (j > 0) - cp.logeq(log_poisson(j, 1)); + cp.logeq(Md::log_poisson(j, 1)); cp.poweq(0.2); z += cp; } diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h index a23ac32b..0d597c5c 100644 --- a/gi/pf/base_distributions.h +++ b/gi/pf/base_distributions.h @@ -13,24 +13,7 @@ #include "prob.h" #include "tdict.h" #include "sampler.h" - -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - -inline double log_binom_coeff(unsigned n, unsigned k) { - assert(n >= k); - if (n == k) return 0.0; - return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); -} - -// http://en.wikipedia.org/wiki/Negative_binomial_distribution -inline double log_negative_binom(unsigned x, unsigned r, double p) { - assert(p > 0.0); - assert(p < 1.0); - return log_binom_coeff(x + r - 1, x) + r * log(1 - p) + x * log(p); -} +#include "m.h" inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { os << '['; @@ -68,7 +51,7 @@ struct Model1 { struct PoissonUniformUninformativeBase { explicit PoissonUniformUninformativeBase(const unsigned ves) : kUNIFORM(1.0 / ves) {} prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(log_poisson(r.e_.size(), 1.0)); + prob_t p; p.logeq(Md::log_poisson(r.e_.size(), 1.0)); prob_t q = kUNIFORM; q.poweq(r.e_.size()); p *= q; return p; diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 0aa5e8e0..2e9e38fc 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -6,6 +6,7 @@ #include #include +#include "m.h" #include "prob.h" #include "ccrp_nt.h" #include "mfcr.h" @@ -210,7 +211,7 @@ struct ConditionalParallelSegementationModel { prob_t AlignProbability(unsigned span) const { prob_t p; - p.logeq(aligns.logprob(span, log_poisson(span, 1.0))); + p.logeq(aligns.logprob(span, Md::log_poisson(span, 1.0))); return p; } diff --git a/gi/pf/pfdist.cc b/gi/pf/pfdist.cc index ef08a165..3d578db2 100644 --- a/gi/pf/pfdist.cc +++ b/gi/pf/pfdist.cc @@ -315,7 +315,7 @@ struct BackwardEstimate { for (int i = 0; i < src_cov.size(); ++i) if (!src_cov[i]) r.push_back(src_[i]); const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -352,7 +352,7 @@ struct BackwardEstimateSym { if (!src_cov[i]) r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / 
r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -367,7 +367,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/gi/pf/pfnaive.cc b/gi/pf/pfnaive.cc index acba9d22..e1a53f5c 100644 --- a/gi/pf/pfnaive.cc +++ b/gi/pf/pfnaive.cc @@ -77,7 +77,7 @@ struct BackwardEstimateSym { r.push_back(src_[i]); r.push_back(0); // NULL word const prob_t uniform_alignment(1.0 / r.size()); - e.logeq(log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) + e.logeq(Md::log_poisson(trg_.size() - trg_cov, r.size() - 1)); // p(trg len remaining | src len remaining) for (unsigned j = trg_cov; j < trg_.size(); ++j) { prob_t p; for (unsigned i = 0; i < r.size(); ++i) @@ -92,7 +92,7 @@ struct BackwardEstimateSym { r.pop_back(); const prob_t inv_uniform(1.0 / (trg_.size() - trg_cov + 1.0)); prob_t inv; - inv.logeq(log_poisson(r.size(), trg_.size() - trg_cov)); + inv.logeq(Md::log_poisson(r.size(), trg_.size() - trg_cov)); for (unsigned i = 0; i < r.size(); ++i) { prob_t p; for (unsigned j = trg_cov - 1; j < trg_.size(); ++j) diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 29b3d7ea..66b46011 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -8,6 +8,7 @@ #include "dict.h" #include "sampler.h" #include "ccrp.h" +#include "m.h" using namespace std; using namespace std::tr1; @@ -95,11 +96,6 @@ void ReadCorpus(const string& filename, vector >* c, set* vocab if (in != &cin) delete in; } -double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - struct UniphraseLM { UniphraseLM(const vector >& corpus, const set& vocab, @@ -128,7 +124,7 @@ struct UniphraseLM { double log_p0(const vector& phrase) const { double len_logprob; if (use_poisson_) - len_logprob = log_poisson(phrase.size(), 1.0); + len_logprob = Md::log_poisson(phrase.size(), 1.0); else len_logprob = log(1 - p_end_) * (phrase.size() -1) + log(p_end_); return log(uniform_word_) * phrase.size() + len_logprob; diff --git a/utils/Makefile.am b/utils/Makefile.am index 3e559c75..a1ea8270 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -7,11 +7,12 @@ TESTS = ts phmt mfcr_test if HAVE_GTEST noinst_PROGRAMS += \ dict_test \ + m_test \ weights_test \ logval_test \ small_vector_test -TESTS += small_vector_test logval_test weights_test dict_test +TESTS += small_vector_test logval_test weights_test dict_test m_test endif reconstruct_weights_SOURCES = reconstruct_weights.cc @@ -38,6 +39,8 @@ endif phmt_SOURCES = phmt.cc ts_SOURCES = ts.cc +m_test_SOURCES = m_test.cc +m_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) dict_test_SOURCES = dict_test.cc dict_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) mfcr_test_SOURCES = mfcr_test.cc diff --git a/utils/m.h b/utils/m.h new file mode 100644 index 00000000..b25248c2 --- /dev/null +++ b/utils/m.h @@ -0,0 +1,89 @@ +#ifndef _M_H_ +#define _M_H_ + +#include +#include + 
+template +struct M { + // support [0, 1, 2 ...) + static inline F log_poisson(unsigned x, const F& lambda) { + assert(lambda > 0.0); + return std::log(lambda) * x - lgamma(x + 1) - lambda; + } + + // support [0, 1, 2 ...) + static inline F log_geometric(unsigned x, const F& p) { + assert(p > 0.0); + assert(p < 1.0); + return std::log(1 - p) * x + std::log(p); + } + + // log of the binomial coefficient + static inline F log_binom_coeff(unsigned n, unsigned k) { + assert(n >= k); + if (n == k) return 0.0; + return lgamma(n + 1) - lgamma(k + 1) - lgamma(n - k + 1); + } + + // http://en.wikipedia.org/wiki/Negative_binomial_distribution + // support [0, 1, 2 ...) + static inline F log_negative_binom(unsigned x, unsigned r, const F& p) { + assert(p > 0.0); + assert(p < 1.0); + return log_binom_coeff(x + r - 1u, x) + r * std::log(F(1) - p) + x * std::log(p); + } + + // this is the Beta function, *not* the beta probability density + // http://mathworld.wolfram.com/BetaFunction.html + static inline F log_beta_fn(const F& x, const F& y) { + return lgamma(x) + lgamma(y) - lgamma(x + y); + } + + // support x >= 0.0 + static F log_gamma_density(const F& x, const F& shape, const F& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + return (shape-1)*std::log(x) - shape*std::log(rate) - x/rate - lgamma(shape); + } + + // this is the Beta *density* p(x ; alpha, beta) + // support x \in (0,1) + static inline F log_beta_density(const F& x, const F& alpha, const F& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + return (alpha-1)*std::log(x)+(beta-1)*std::log(1-x) - log_beta_fn(alpha, beta); + } + + // note: this has been adapted so that 0 is in the support of the distribution + // support [0, 1, 2 ...) 
+ static inline F log_yule_simon(unsigned x, const F& rho) { + assert(rho > 0.0); + return std::log(rho) + log_beta_fn(x + 1, rho + 1); + } + + // see http://www.gatsby.ucl.ac.uk/~ywteh/research/compling/hpylm.pdf + // when y=1, sometimes written x^{\overline{n}} or x^{(n)} "Pochhammer symbol" + static inline F log_generalized_factorial(const F& x, const F& n, const F& y = 1.0) { + assert(x > 0.0); + assert(y >= 0.0); + assert(n > 0.0); + if (!n) return 0.0; + if (y == F(1)) { + return lgamma(x + n) - lgamma(x); + } else if (y) { + return n * std::log(y) + lgamma(x/y + n) - lgamma(x/y); + } else { // y == 0.0 + return n * std::log(x); + } + } + +}; + +typedef M Md; +typedef M Mf; + +#endif diff --git a/utils/m_test.cc b/utils/m_test.cc new file mode 100644 index 00000000..fca8f895 --- /dev/null +++ b/utils/m_test.cc @@ -0,0 +1,75 @@ +#include "m.h" + +#include +#include +#include + +using namespace std; + +class MTest : public testing::Test { + public: + MTest() {} + protected: + virtual void SetUp() { } + virtual void TearDown() { } +}; + +TEST_F(MTest, Poisson) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_poisson(i, 0.99); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, YuleSimon) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_yule_simon(i, 1.0); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, LogGeometric) { + double prev = 1.0; + double tot = 0; + for (int i = 0; i < 10; ++i) { + double p = Md::log_geometric(i, 0.5); + cerr << "p(i=" << i << ") = " << exp(p) << endl; + EXPECT_LT(p, prev); + tot += exp(p); + prev = p; + } + cerr << " tot=" << tot << endl; + EXPECT_LE(tot, 1.0); +} + +TEST_F(MTest, GeneralizedFactorial) { + for (double i = 0.3; i < 10000; i += 0.4) { + double a = Md::log_generalized_factorial(1.0, i); + double b = lgamma(1.0 + i); + EXPECT_FLOAT_EQ(a,b); + } + double gf_3_6 = 3.0 * 4.0 * 5.0 * 6.0 * 7.0 * 8.0; + EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.0, 6.0), std::log(gf_3_6)); + double gf_314_6 = 3.14 * 4.14 * 5.14 * 6.14 * 7.14 * 8.14; + EXPECT_FLOAT_EQ(Md::log_generalized_factorial(3.14, 6.0), std::log(gf_314_6)); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} + diff --git a/utils/mfcr.h b/utils/mfcr.h index 3eb133fc..396d0205 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -12,6 +12,7 @@ #include #include "sampler.h" #include "slice_sampler.h" +#include "m.h" struct TableCount { TableCount() : count(), floor() {} @@ -218,31 +219,14 @@ class MFCR { return log_crp_prob(d_, alpha_); } - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include draws from G_w's 
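The m_test.cc checks above exercise the new helpers numerically; as a further stand-alone illustration of how the Md typedef is meant to be called (the parameter values below are made up, and this program is not part of the patch):

#include <cmath>
#include <iostream>
#include "m.h"  // the header added above; Md is M<double>

int main() {
  // Poisson(lambda = 1) pmf should put essentially all of its mass on x < 20.
  double mass = 0;
  for (unsigned x = 0; x < 20; ++x)
    mass += std::exp(Md::log_poisson(x, 1.0));
  std::cout << "Poisson(1) mass on x < 20: " << mass << std::endl;  // ~1.0

  // Log prior of a discount d ~ Beta(1,1) plus a strength alpha ~ Gamma(1,1):
  // the same two densities that log_crp_prob below adds to the CRP likelihood.
  const double d = 0.5, alpha = 1.0;
  std::cout << Md::log_beta_density(d, 1.0, 1.0) +
               Md::log_gamma_density(alpha, 1.0, 1.0) << std::endl;
  return 0;
}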
double log_crp_prob(const double& d, const double& alpha) const { double lp = 0.0; if (has_d_prior()) - lp = log_beta_density(d, d_prior_alpha_, d_prior_beta_); + lp = Md::log_beta_density(d, d_prior_alpha_, d_prior_beta_); if (has_alpha_prior()) - lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (d > 0.0) { -- cgit v1.2.3 From 77d35a1475adf7144b1109680377d17bff4233f7 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 8 Feb 2012 18:32:12 -0500 Subject: oops, broke mbr when i switched to the new scoring API --- mteval/mbr_kbest.cc | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index b5e4750c..2bd31566 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -77,6 +77,7 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); const string smetric = conf["evaluation_metric"].as(); EvaluationMetric* metric = EvaluationMetric::Instance(smetric); + const bool is_loss = (UppercaseString(smetric) == "TER"); const bool output_list = conf.count("output_list") > 0; const string file = conf["input"].as(); @@ -101,12 +102,14 @@ int main(int argc, char** argv) { double mbr_loss = numeric_limits::max(); for (int i = 0 ; i < list.size(); ++i) { const vector > refs(1, list[i].first); + boost::shared_ptr segeval = metric-> + CreateSegmentEvaluator(refs); double wl_acc = 0; for (int j = 0; j < list.size(); ++j) { if (i != j) { SufficientStats ss; - metric->ComputeSufficientStatistics(list[j].first, refs, &ss); + segeval->Evaluate(list[j].first, &ss); double loss = 1.0 - metric->ComputeScore(ss); if (is_loss) loss = 1.0 - loss; double weighted_loss = loss * (joints[j] / marginal).as_float(); -- cgit v1.2.3 From 21a4b6629fedae575583f0d1e34c97dba8de2511 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 10 Feb 2012 13:17:12 -0500 Subject: clean up alignment tools --- training/atools.cc | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/training/atools.cc b/training/atools.cc index 42579627..82e30c38 100644 --- a/training/atools.cc +++ b/training/atools.cc @@ -8,7 +8,6 @@ #include #include "filelib.h" -#include "aligner.h" #include "alignment_pharaoh.h" namespace po = boost::program_options; @@ -79,7 +78,7 @@ struct FMeasureCommand : public Command { struct DisplayCommand : public Command { string Name() const { return "display"; } bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D& in, const Array2D¬_used, Array2D* x) { + void Apply(const Array2D& in, const Array2D&, Array2D* x) { *x = in; cout << *x << endl; } @@ -88,7 +87,7 @@ struct DisplayCommand : public Command { struct ConvertCommand : public Command { string Name() const { return "convert"; } bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D& in, const Array2D¬_used, Array2D* x) { + void Apply(const Array2D& in, const Array2D&, Array2D* x) { *x = in; } }; @@ -96,7 +95,7 @@ struct ConvertCommand : public Command { struct InvertCommand : public Command { string Name() const { return "invert"; } bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D& in, const Array2D¬_used, Array2D* x) { + void Apply(const Array2D& in, const Array2D&, Array2D* x) { Array2D& res = *x; res.resize(in.height(), in.width()); for (int i = 0; i < in.height(); ++i) @@ -275,8 +274,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* 
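The mbr_kbest.cc change a few hunks above routes per-segment scoring through CreateSegmentEvaluator but keeps the same minimum-Bayes-risk selection rule. A generic sketch of that rule, with an abstract loss functor and posterior weights standing in for the real EvaluationMetric and the joint/marginal scores (MbrSelect and both parameters are illustrative names, not cdec API):

#include <cstddef>
#include <limits>
#include <string>
#include <vector>

// Pick the hypothesis with the smallest expected loss, where every other
// hypothesis acts as a pseudo-reference weighted by its posterior probability.
template <class Loss>
std::size_t MbrSelect(const std::vector<std::string>& kbest,
                      const std::vector<double>& posterior,  // normalized, sums to 1
                      const Loss& loss) {                    // loss(hyp, ref) in [0,1]
  std::size_t best = 0;
  double best_risk = std::numeric_limits<double>::max();
  for (std::size_t i = 0; i < kbest.size(); ++i) {           // candidate = pseudo-reference
    double risk = 0;
    for (std::size_t j = 0; j < kbest.size(); ++j)
      if (i != j) risk += posterior[j] * loss(kbest[j], kbest[i]);
    if (risk < best_risk) { best_risk = risk; best = i; }
  }
  return best;
}

In the fixed code the loss is 1 - ComputeScore(ss) for similarity metrics, and flipped once more for TER, which is already an error rate.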
conf) { } string cstr = os.str(); opts.add_options() - ("input_1,i", po::value(), "[REQ] Alignment 1 file, - for STDIN") - ("input_2,j", po::value(), "[OPT] Alignment 2 file, - for STDIN") + ("input_1,i", po::value(), "[REQUIRED] Alignment 1 file, - for STDIN") + ("input_2,j", po::value(), "Alignment 2 file, - for STDIN") ("command,c", po::value()->default_value("convert"), cstr.c_str()) ("help,h", "Print this help message and exit"); po::options_description clo("Command line options"); -- cgit v1.2.3 From 50105660d8c18889e8908cf3e4c583b551dc05af Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 10 Feb 2012 13:18:59 -0500 Subject: move atools to utils directory --- training/Makefile.am | 4 - training/atools.cc | 369 --------------------------------------------------- utils/Makefile.am | 4 +- utils/atools.cc | 369 +++++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 372 insertions(+), 374 deletions(-) delete mode 100644 training/atools.cc create mode 100644 utils/atools.cc diff --git a/training/Makefile.am b/training/Makefile.am index 2a11ae52..d2f1ccc5 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -6,7 +6,6 @@ bin_PROGRAMS = \ mr_reduce_to_weights \ mr_optimize_reduce \ grammar_convert \ - atools \ plftools \ collapse_weights \ mpi_extract_reachable \ @@ -47,9 +46,6 @@ augment_grammar_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/lib test_ngram_SOURCES = test_ngram.cc test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -atools_SOURCES = atools.cc -atools_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz - model1_SOURCES = model1.cc ttables.cc model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/training/atools.cc b/training/atools.cc deleted file mode 100644 index 82e30c38..00000000 --- a/training/atools.cc +++ /dev/null @@ -1,369 +0,0 @@ -#include -#include -#include - -#include -#include -#include -#include - -#include "filelib.h" -#include "alignment_pharaoh.h" - -namespace po = boost::program_options; -using namespace std; -using boost::shared_ptr; - -struct Command { - virtual ~Command() {} - virtual string Name() const = 0; - - // returns 1 for alignment grid output [default] - // returns 2 if Summary() should be called [for AER, etc] - virtual int Result() const { return 1; } - - virtual bool RequiresTwoOperands() const { return true; } - virtual void Apply(const Array2D& a, const Array2D& b, Array2D* x) = 0; - void EnsureSize(const Array2D& a, const Array2D& b, Array2D* x) { - x->resize(max(a.width(), b.width()), max(a.height(), b.height())); - } - static bool Safe(const Array2D& a, int i, int j) { - if (i >= 0 && j >= 0 && i < a.width() && j < a.height()) - return a(i,j); - else - return false; - } - virtual void Summary() { assert(!"Summary should have been overridden"); } -}; - -// compute fmeasure, second alignment is reference, first is hyp -struct FMeasureCommand : public Command { - FMeasureCommand() : matches(), num_predicted(), num_in_ref() {} - int Result() const { return 2; } - string Name() const { return "fmeasure"; } - bool RequiresTwoOperands() const { return true; } - void Apply(const Array2D& hyp, const Array2D& ref, Array2D* x) { - (void) x; // AER just computes statistics, not an alignment - int i_len = ref.width(); - int j_len = ref.height(); - for (int i = 0; i < i_len; ++i) { - for (int j = 0; j < j_len; ++j) { - if (ref(i,j)) { - 
++num_in_ref; - if (Safe(hyp, i, j)) ++matches; - } - } - } - for (int i = 0; i < hyp.width(); ++i) - for (int j = 0; j < hyp.height(); ++j) - if (hyp(i,j)) ++num_predicted; - } - void Summary() { - if (num_predicted == 0 || num_in_ref == 0) { - cerr << "Insufficient statistics to compute f-measure!\n"; - abort(); - } - const double prec = static_cast(matches) / num_predicted; - const double rec = static_cast(matches) / num_in_ref; - cout << "P: " << prec << endl; - cout << "R: " << rec << endl; - const double f = (2.0 * prec * rec) / (rec + prec); - cout << "F: " << f << endl; - } - int matches; - int num_predicted; - int num_in_ref; -}; - -struct DisplayCommand : public Command { - string Name() const { return "display"; } - bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D& in, const Array2D&, Array2D* x) { - *x = in; - cout << *x << endl; - } -}; - -struct ConvertCommand : public Command { - string Name() const { return "convert"; } - bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D& in, const Array2D&, Array2D* x) { - *x = in; - } -}; - -struct InvertCommand : public Command { - string Name() const { return "invert"; } - bool RequiresTwoOperands() const { return false; } - void Apply(const Array2D& in, const Array2D&, Array2D* x) { - Array2D& res = *x; - res.resize(in.height(), in.width()); - for (int i = 0; i < in.height(); ++i) - for (int j = 0; j < in.width(); ++j) - res(i, j) = in(j, i); - } -}; - -struct IntersectCommand : public Command { - string Name() const { return "intersect"; } - bool RequiresTwoOperands() const { return true; } - void Apply(const Array2D& a, const Array2D& b, Array2D* x) { - EnsureSize(a, b, x); - Array2D& res = *x; - for (int i = 0; i < a.width(); ++i) - for (int j = 0; j < a.height(); ++j) - res(i, j) = Safe(a, i, j) && Safe(b, i, j); - } -}; - -struct UnionCommand : public Command { - string Name() const { return "union"; } - bool RequiresTwoOperands() const { return true; } - void Apply(const Array2D& a, const Array2D& b, Array2D* x) { - EnsureSize(a, b, x); - Array2D& res = *x; - for (int i = 0; i < res.width(); ++i) - for (int j = 0; j < res.height(); ++j) - res(i, j) = Safe(a, i, j) || Safe(b, i, j); - } -}; - -struct RefineCommand : public Command { - RefineCommand() { - neighbors_.push_back(make_pair(1,0)); - neighbors_.push_back(make_pair(-1,0)); - neighbors_.push_back(make_pair(0,1)); - neighbors_.push_back(make_pair(0,-1)); - } - bool RequiresTwoOperands() const { return true; } - - void Align(int i, int j) { - res_(i, j) = true; - is_i_aligned_[i] = true; - is_j_aligned_[j] = true; - } - - bool IsNeighborAligned(int i, int j) const { - for (int k = 0; k < neighbors_.size(); ++k) { - const int di = neighbors_[k].first; - const int dj = neighbors_[k].second; - if (Safe(res_, i + di, j + dj)) - return true; - } - return false; - } - - bool IsNeitherAligned(int i, int j) const { - return !(is_i_aligned_[i] || is_j_aligned_[j]); - } - - bool IsOneOrBothUnaligned(int i, int j) const { - return !(is_i_aligned_[i] && is_j_aligned_[j]); - } - - bool KoehnAligned(int i, int j) const { - return IsOneOrBothUnaligned(i, j) && IsNeighborAligned(i, j); - } - - typedef bool (RefineCommand::*Predicate)(int i, int j) const; - - protected: - void InitRefine( - const Array2D& a, - const Array2D& b) { - res_.clear(); - EnsureSize(a, b, &res_); - in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear(); - EnsureSize(a, b, &in_); - EnsureSize(a, b, &un_); - is_i_aligned_.resize(res_.width(), 
false); - is_j_aligned_.resize(res_.height(), false); - for (int i = 0; i < in_.width(); ++i) - for (int j = 0; j < in_.height(); ++j) { - un_(i, j) = Safe(a, i, j) || Safe(b, i, j); - in_(i, j) = Safe(a, i, j) && Safe(b, i, j); - if (in_(i, j)) Align(i, j); - } - } - // "grow" the resulting alignment using the points in adds - // if they match the constraints determined by pred - void Grow(Predicate pred, bool idempotent, const Array2D& adds) { - if (idempotent) { - for (int i = 0; i < adds.width(); ++i) - for (int j = 0; j < adds.height(); ++j) { - if (adds(i, j) && !res_(i, j) && - (this->*pred)(i, j)) Align(i, j); - } - return; - } - set > p; - for (int i = 0; i < adds.width(); ++i) - for (int j = 0; j < adds.height(); ++j) - if (adds(i, j) && !res_(i, j)) - p.insert(make_pair(i, j)); - bool keep_going = !p.empty(); - while (keep_going) { - keep_going = false; - for (set >::iterator pi = p.begin(); - pi != p.end(); ++pi) { - if ((this->*pred)(pi->first, pi->second)) { - Align(pi->first, pi->second); - p.erase(pi); - keep_going = true; - } - } - } - } - Array2D res_; // refined alignment - Array2D in_; // intersection alignment - Array2D un_; // union alignment - vector is_i_aligned_; - vector is_j_aligned_; - vector > neighbors_; -}; - -struct DiagCommand : public RefineCommand { - DiagCommand() { - neighbors_.push_back(make_pair(1,1)); - neighbors_.push_back(make_pair(-1,1)); - neighbors_.push_back(make_pair(1,-1)); - neighbors_.push_back(make_pair(-1,-1)); - } -}; - -struct GDCommand : public DiagCommand { - string Name() const { return "grow-diag"; } - void Apply(const Array2D& a, const Array2D& b, Array2D* x) { - InitRefine(a, b); - Grow(&RefineCommand::KoehnAligned, false, un_); - *x = res_; - } -}; - -struct GDFCommand : public DiagCommand { - string Name() const { return "grow-diag-final"; } - void Apply(const Array2D& a, const Array2D& b, Array2D* x) { - InitRefine(a, b); - Grow(&RefineCommand::KoehnAligned, false, un_); - Grow(&RefineCommand::IsOneOrBothUnaligned, true, a); - Grow(&RefineCommand::IsOneOrBothUnaligned, true, b); - *x = res_; - } -}; - -struct GDFACommand : public DiagCommand { - string Name() const { return "grow-diag-final-and"; } - void Apply(const Array2D& a, const Array2D& b, Array2D* x) { - InitRefine(a, b); - Grow(&RefineCommand::KoehnAligned, false, un_); - Grow(&RefineCommand::IsNeitherAligned, true, a); - Grow(&RefineCommand::IsNeitherAligned, true, b); - *x = res_; - } -}; - -map > commands; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - ostringstream os; - os << "[REQ] Operation to perform:"; - for (map >::iterator it = commands.begin(); - it != commands.end(); ++it) { - os << ' ' << it->first; - } - string cstr = os.str(); - opts.add_options() - ("input_1,i", po::value(), "[REQUIRED] Alignment 1 file, - for STDIN") - ("input_2,j", po::value(), "Alignment 2 file, - for STDIN") - ("command,c", po::value()->default_value("convert"), cstr.c_str()) - ("help,h", "Print this help message and exit"); - po::options_description clo("Command line options"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - - if (conf->count("help") || conf->count("input_1") == 0 || conf->count("command") == 0) { - cerr << dcmdline_options << endl; - exit(1); - } - const string cmd = (*conf)["command"].as(); - if (commands.count(cmd) == 0) { - cerr << "Don't understand 
command: " << cmd << endl; - exit(1); - } - if (commands[cmd]->RequiresTwoOperands()) { - if (conf->count("input_2") == 0) { - cerr << "Command '" << cmd << "' requires two alignment files\n"; - exit(1); - } - if ((*conf)["input_1"].as() == "-" && (*conf)["input_2"].as() == "-") { - cerr << "Both inputs cannot be STDIN\n"; - exit(1); - } - } else { - if (conf->count("input_2") != 0) { - cerr << "Command '" << cmd << "' requires only one alignment file\n"; - exit(1); - } - } -} - -template static void AddCommand() { - C* c = new C; - commands[c->Name()].reset(c); -} - -int main(int argc, char **argv) { - AddCommand(); - AddCommand(); - AddCommand(); - AddCommand(); - AddCommand(); - AddCommand(); - AddCommand(); - AddCommand(); - AddCommand(); - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - Command& cmd = *commands[conf["command"].as()]; - boost::shared_ptr rf1(new ReadFile(conf["input_1"].as())); - boost::shared_ptr rf2; - if (cmd.RequiresTwoOperands()) - rf2.reset(new ReadFile(conf["input_2"].as())); - istream* in1 = rf1->stream(); - istream* in2 = NULL; - if (rf2) in2 = rf2->stream(); - while(*in1) { - string line1; - string line2; - getline(*in1, line1); - if (in2) { - getline(*in2, line2); - if ((*in1 && !*in2) || (*in2 && !*in1)) { - cerr << "Mismatched number of lines!\n"; - exit(1); - } - } - if (line1.empty() && !*in1) break; - shared_ptr > out(new Array2D); - shared_ptr > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1); - if (in2) { - shared_ptr > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2); - cmd.Apply(*a1, *a2, out.get()); - } else { - Array2D dummy; - cmd.Apply(*a1, dummy, out.get()); - } - - if (cmd.Result() == 1) { - AlignmentPharaoh::SerializePharaohFormat(*out, &cout); - } - } - if (cmd.Result() == 2) - cmd.Summary(); - return 0; -} - diff --git a/utils/Makefile.am b/utils/Makefile.am index a1ea8270..6e0678de 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,5 +1,5 @@ -bin_PROGRAMS = reconstruct_weights +bin_PROGRAMS = reconstruct_weights atools noinst_PROGRAMS = ts phmt mfcr_test TESTS = ts phmt mfcr_test @@ -17,6 +17,8 @@ endif reconstruct_weights_SOURCES = reconstruct_weights.cc +atools_SOURCES = atools.cc + noinst_LIBRARIES = libutils.a libutils_a_SOURCES = \ diff --git a/utils/atools.cc b/utils/atools.cc new file mode 100644 index 00000000..c0a91731 --- /dev/null +++ b/utils/atools.cc @@ -0,0 +1,369 @@ +#include +#include +#include + +#include +#include +#include +#include + +#include "filelib.h" +#include "alignment_pharaoh.h" + +namespace po = boost::program_options; +using namespace std; +using boost::shared_ptr; + +struct Command { + virtual ~Command() {} + virtual string Name() const = 0; + + // returns 1 for alignment grid output [default] + // returns 2 if Summary() should be called [for AER, etc] + virtual int Result() const { return 1; } + + virtual bool RequiresTwoOperands() const { return true; } + virtual void Apply(const Array2D& a, const Array2D& b, Array2D* x) = 0; + void EnsureSize(const Array2D& a, const Array2D& b, Array2D* x) { + x->resize(max(a.width(), b.width()), max(a.height(), b.height())); + } + static bool Safe(const Array2D& a, int i, int j) { + if (i >= 0 && j >= 0 && i < a.width() && j < a.height()) + return a(i,j); + else + return false; + } + virtual void Summary() { assert(!"Summary should have been overridden"); } +}; + +// compute fmeasure, second alignment is reference, first is hyp +struct FMeasureCommand : public Command { + FMeasureCommand() : matches(), num_predicted(), num_in_ref() 
{} + int Result() const { return 2; } + string Name() const { return "fmeasure"; } + bool RequiresTwoOperands() const { return true; } + void Apply(const Array2D& hyp, const Array2D& ref, Array2D* x) { + (void) x; // AER just computes statistics, not an alignment + int i_len = ref.width(); + int j_len = ref.height(); + for (int i = 0; i < i_len; ++i) { + for (int j = 0; j < j_len; ++j) { + if (ref(i,j)) { + ++num_in_ref; + if (Safe(hyp, i, j)) ++matches; + } + } + } + for (int i = 0; i < hyp.width(); ++i) + for (int j = 0; j < hyp.height(); ++j) + if (hyp(i,j)) ++num_predicted; + } + void Summary() { + if (num_predicted == 0 || num_in_ref == 0) { + cerr << "Insufficient statistics to compute f-measure!\n"; + abort(); + } + const double prec = static_cast(matches) / num_predicted; + const double rec = static_cast(matches) / num_in_ref; + cout << "P: " << prec << endl; + cout << "R: " << rec << endl; + const double f = (2.0 * prec * rec) / (rec + prec); + cout << "F: " << f << endl; + } + int matches; + int num_predicted; + int num_in_ref; +}; + +struct DisplayCommand : public Command { + string Name() const { return "display"; } + bool RequiresTwoOperands() const { return false; } + void Apply(const Array2D& in, const Array2D&, Array2D* x) { + *x = in; + cout << *x << endl; + } +}; + +struct ConvertCommand : public Command { + string Name() const { return "convert"; } + bool RequiresTwoOperands() const { return false; } + void Apply(const Array2D& in, const Array2D&, Array2D* x) { + *x = in; + } +}; + +struct InvertCommand : public Command { + string Name() const { return "invert"; } + bool RequiresTwoOperands() const { return false; } + void Apply(const Array2D& in, const Array2D&, Array2D* x) { + Array2D& res = *x; + res.resize(in.height(), in.width()); + for (int i = 0; i < in.height(); ++i) + for (int j = 0; j < in.width(); ++j) + res(i, j) = in(j, i); + } +}; + +struct IntersectCommand : public Command { + string Name() const { return "intersect"; } + bool RequiresTwoOperands() const { return true; } + void Apply(const Array2D& a, const Array2D& b, Array2D* x) { + EnsureSize(a, b, x); + Array2D& res = *x; + for (int i = 0; i < a.width(); ++i) + for (int j = 0; j < a.height(); ++j) + res(i, j) = Safe(a, i, j) && Safe(b, i, j); + } +}; + +struct UnionCommand : public Command { + string Name() const { return "union"; } + bool RequiresTwoOperands() const { return true; } + void Apply(const Array2D& a, const Array2D& b, Array2D* x) { + EnsureSize(a, b, x); + Array2D& res = *x; + for (int i = 0; i < res.width(); ++i) + for (int j = 0; j < res.height(); ++j) + res(i, j) = Safe(a, i, j) || Safe(b, i, j); + } +}; + +struct RefineCommand : public Command { + RefineCommand() { + neighbors_.push_back(make_pair(1,0)); + neighbors_.push_back(make_pair(-1,0)); + neighbors_.push_back(make_pair(0,1)); + neighbors_.push_back(make_pair(0,-1)); + } + bool RequiresTwoOperands() const { return true; } + + void Align(int i, int j) { + res_(i, j) = true; + is_i_aligned_[i] = true; + is_j_aligned_[j] = true; + } + + bool IsNeighborAligned(int i, int j) const { + for (int k = 0; k < neighbors_.size(); ++k) { + const int di = neighbors_[k].first; + const int dj = neighbors_[k].second; + if (Safe(res_, i + di, j + dj)) + return true; + } + return false; + } + + bool IsNeitherAligned(int i, int j) const { + return !(is_i_aligned_[i] || is_j_aligned_[j]); + } + + bool IsOneOrBothUnaligned(int i, int j) const { + return !(is_i_aligned_[i] && is_j_aligned_[j]); + } + + bool KoehnAligned(int i, int j) const { + return 
IsOneOrBothUnaligned(i, j) && IsNeighborAligned(i, j); + } + + typedef bool (RefineCommand::*Predicate)(int i, int j) const; + + protected: + void InitRefine( + const Array2D& a, + const Array2D& b) { + res_.clear(); + EnsureSize(a, b, &res_); + in_.clear(); un_.clear(); is_i_aligned_.clear(); is_j_aligned_.clear(); + EnsureSize(a, b, &in_); + EnsureSize(a, b, &un_); + is_i_aligned_.resize(res_.width(), false); + is_j_aligned_.resize(res_.height(), false); + for (int i = 0; i < in_.width(); ++i) + for (int j = 0; j < in_.height(); ++j) { + un_(i, j) = Safe(a, i, j) || Safe(b, i, j); + in_(i, j) = Safe(a, i, j) && Safe(b, i, j); + if (in_(i, j)) Align(i, j); + } + } + // "grow" the resulting alignment using the points in adds + // if they match the constraints determined by pred + void Grow(Predicate pred, bool idempotent, const Array2D& adds) { + if (idempotent) { + for (int i = 0; i < adds.width(); ++i) + for (int j = 0; j < adds.height(); ++j) { + if (adds(i, j) && !res_(i, j) && + (this->*pred)(i, j)) Align(i, j); + } + return; + } + set > p; + for (int i = 0; i < adds.width(); ++i) + for (int j = 0; j < adds.height(); ++j) + if (adds(i, j) && !res_(i, j)) + p.insert(make_pair(i, j)); + bool keep_going = !p.empty(); + while (keep_going) { + keep_going = false; + for (set >::iterator pi = p.begin(); + pi != p.end(); ++pi) { + if ((this->*pred)(pi->first, pi->second)) { + Align(pi->first, pi->second); + p.erase(pi); + keep_going = true; + } + } + } + } + Array2D res_; // refined alignment + Array2D in_; // intersection alignment + Array2D un_; // union alignment + vector is_i_aligned_; + vector is_j_aligned_; + vector > neighbors_; +}; + +struct DiagCommand : public RefineCommand { + DiagCommand() { + neighbors_.push_back(make_pair(1,1)); + neighbors_.push_back(make_pair(-1,1)); + neighbors_.push_back(make_pair(1,-1)); + neighbors_.push_back(make_pair(-1,-1)); + } +}; + +struct GDCommand : public DiagCommand { + string Name() const { return "grow-diag"; } + void Apply(const Array2D& a, const Array2D& b, Array2D* x) { + InitRefine(a, b); + Grow(&RefineCommand::KoehnAligned, false, un_); + *x = res_; + } +}; + +struct GDFCommand : public DiagCommand { + string Name() const { return "grow-diag-final"; } + void Apply(const Array2D& a, const Array2D& b, Array2D* x) { + InitRefine(a, b); + Grow(&RefineCommand::KoehnAligned, false, un_); + Grow(&RefineCommand::IsOneOrBothUnaligned, true, a); + Grow(&RefineCommand::IsOneOrBothUnaligned, true, b); + *x = res_; + } +}; + +struct GDFACommand : public DiagCommand { + string Name() const { return "grow-diag-final-and"; } + void Apply(const Array2D& a, const Array2D& b, Array2D* x) { + InitRefine(a, b); + Grow(&RefineCommand::KoehnAligned, false, un_); + Grow(&RefineCommand::IsNeitherAligned, true, a); + Grow(&RefineCommand::IsNeitherAligned, true, b); + *x = res_; + } +}; + +map > commands; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + ostringstream os; + os << "Operation to perform:"; + for (map >::iterator it = commands.begin(); + it != commands.end(); ++it) { + os << ' ' << it->first; + } + string cstr = os.str(); + opts.add_options() + ("input_1,i", po::value(), "[REQUIRED] Alignment 1 file, - for STDIN") + ("input_2,j", po::value(), "Alignment 2 file, - for STDIN") + ("command,c", po::value()->default_value("convert"), cstr.c_str()) + ("help,h", "Print this help message and exit"); + po::options_description clo("Command line options"); + po::options_description 
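The refinement commands above (grow-diag, grow-diag-final, grow-diag-final-and) all follow the same pattern: seed with the intersection of the two directional alignments, then grow with union points that pass a coverage test. A compressed sketch of the grow-diag case on plain bool grids, assuming (unlike EnsureSize above) that both inputs already have the same dimensions; GrowDiag is an illustrative name rather than anything in atools:

#include <cstddef>
#include <utility>
#include <vector>

typedef std::vector<std::vector<bool> > Grid;  // [i][j]: source word i aligned to target word j

Grid GrowDiag(const Grid& a, const Grid& b,
              const std::vector<std::pair<int,int> >& nbrs) {  // 8-neighborhood for "diag"
  const std::size_t I = a.size(), J = a.empty() ? 0 : a[0].size();
  Grid res(I, std::vector<bool>(J, false));
  std::vector<bool> cov_i(I, false), cov_j(J, false);          // word coverage flags
  for (std::size_t i = 0; i < I; ++i)                          // seed: intersection points
    for (std::size_t j = 0; j < J; ++j)
      if (a[i][j] && b[i][j]) { res[i][j] = true; cov_i[i] = true; cov_j[j] = true; }
  bool added = true;
  while (added) {                                              // grow to a fixpoint
    added = false;
    for (std::size_t i = 0; i < I; ++i)
      for (std::size_t j = 0; j < J; ++j) {
        if (res[i][j] || !(a[i][j] || b[i][j])) continue;      // only consider union points
        if (cov_i[i] && cov_j[j]) continue;                    // KoehnAligned: one word must be free
        bool touches = false;
        for (std::size_t k = 0; k < nbrs.size() && !touches; ++k) {
          const long ni = (long)i + nbrs[k].first, nj = (long)j + nbrs[k].second;
          touches = ni >= 0 && nj >= 0 && ni < (long)I && nj < (long)J && res[ni][nj];
        }
        if (touches) { res[i][j] = true; cov_i[i] = true; cov_j[j] = true; added = true; }
      }
  }
  return res;
}

The -final and -final-and variants then make one further idempotent pass over each directional alignment with the looser or stricter coverage test, as in GDFCommand and GDFACommand above.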
dcmdline_options; + dcmdline_options.add(opts); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + po::notify(*conf); + + if (conf->count("help") || conf->count("input_1") == 0 || conf->count("command") == 0) { + cerr << dcmdline_options << endl; + exit(1); + } + const string cmd = (*conf)["command"].as(); + if (commands.count(cmd) == 0) { + cerr << "Don't understand command: " << cmd << endl; + exit(1); + } + if (commands[cmd]->RequiresTwoOperands()) { + if (conf->count("input_2") == 0) { + cerr << "Command '" << cmd << "' requires two alignment files\n"; + exit(1); + } + if ((*conf)["input_1"].as() == "-" && (*conf)["input_2"].as() == "-") { + cerr << "Both inputs cannot be STDIN\n"; + exit(1); + } + } else { + if (conf->count("input_2") != 0) { + cerr << "Command '" << cmd << "' requires only one alignment file\n"; + exit(1); + } + } +} + +template static void AddCommand() { + C* c = new C; + commands[c->Name()].reset(c); +} + +int main(int argc, char **argv) { + AddCommand(); + AddCommand(); + AddCommand(); + AddCommand(); + AddCommand(); + AddCommand(); + AddCommand(); + AddCommand(); + AddCommand(); + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + Command& cmd = *commands[conf["command"].as()]; + boost::shared_ptr rf1(new ReadFile(conf["input_1"].as())); + boost::shared_ptr rf2; + if (cmd.RequiresTwoOperands()) + rf2.reset(new ReadFile(conf["input_2"].as())); + istream* in1 = rf1->stream(); + istream* in2 = NULL; + if (rf2) in2 = rf2->stream(); + while(*in1) { + string line1; + string line2; + getline(*in1, line1); + if (in2) { + getline(*in2, line2); + if ((*in1 && !*in2) || (*in2 && !*in1)) { + cerr << "Mismatched number of lines!\n"; + exit(1); + } + } + if (line1.empty() && !*in1) break; + shared_ptr > out(new Array2D); + shared_ptr > a1 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line1); + if (in2) { + shared_ptr > a2 = AlignmentPharaoh::ReadPharaohAlignmentGrid(line2); + cmd.Apply(*a1, *a2, out.get()); + } else { + Array2D dummy; + cmd.Apply(*a1, dummy, out.get()); + } + + if (cmd.Result() == 1) { + AlignmentPharaoh::SerializePharaohFormat(*out, &cout); + } + } + if (cmd.Result() == 2) + cmd.Summary(); + return 0; +} + -- cgit v1.2.3 From f15bbfbf105ff873e89a8bdf55e845f3ac7b030e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 10 Feb 2012 18:31:05 +0000 Subject: better error checking --- gi/pf/guess-translits.pl | 1 + 1 file changed, 1 insertion(+) diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl index ab737121..aafec13a 100755 --- a/gi/pf/guess-translits.pl +++ b/gi/pf/guess-translits.pl @@ -28,6 +28,7 @@ while() { my %b2a; for my $ap (@as) { my ($a,$b) = split /-/, $ap; + die "BAD INPUT: $_\n" unless defined $a && defined $b; $a2b{$a}->{$b} = 1; $b2a{$b}->{$a} = 1; } -- cgit v1.2.3 From a97f481dda269c1474722ce7dc987fb5868951b6 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 10 Feb 2012 14:13:24 -0500 Subject: Dear windows users, code is not executable --- decoder/apply_fsa_models.h | 0 decoder/cfg.cc | 0 decoder/cfg.h | 0 decoder/cfg_binarize.h | 0 decoder/cfg_format.h | 0 decoder/cfg_options.h | 0 decoder/cfg_test.cc | 0 decoder/ff_register.h | 0 decoder/ff_sample_fsa.h | 0 decoder/hg_cfg.h | 0 decoder/hg_test.h | 0 decoder/nt_span.h | 0 decoder/oracle_bleu.h | 0 decoder/program_options.h | 0 decoder/sentences.h | 0 15 files changed, 0 insertions(+), 0 deletions(-) mode change 100755 => 100644 decoder/apply_fsa_models.h mode change 100755 => 100644 decoder/cfg.cc mode change 100755 => 100644 
decoder/cfg.h mode change 100755 => 100644 decoder/cfg_binarize.h mode change 100755 => 100644 decoder/cfg_format.h mode change 100755 => 100644 decoder/cfg_options.h mode change 100755 => 100644 decoder/cfg_test.cc mode change 100755 => 100644 decoder/ff_register.h mode change 100755 => 100644 decoder/ff_sample_fsa.h mode change 100755 => 100644 decoder/hg_cfg.h mode change 100755 => 100644 decoder/hg_test.h mode change 100755 => 100644 decoder/nt_span.h mode change 100755 => 100644 decoder/oracle_bleu.h mode change 100755 => 100644 decoder/program_options.h mode change 100755 => 100644 decoder/sentences.h diff --git a/decoder/apply_fsa_models.h b/decoder/apply_fsa_models.h old mode 100755 new mode 100644 diff --git a/decoder/cfg.cc b/decoder/cfg.cc old mode 100755 new mode 100644 diff --git a/decoder/cfg.h b/decoder/cfg.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_binarize.h b/decoder/cfg_binarize.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_format.h b/decoder/cfg_format.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_options.h b/decoder/cfg_options.h old mode 100755 new mode 100644 diff --git a/decoder/cfg_test.cc b/decoder/cfg_test.cc old mode 100755 new mode 100644 diff --git a/decoder/ff_register.h b/decoder/ff_register.h old mode 100755 new mode 100644 diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h old mode 100755 new mode 100644 diff --git a/decoder/hg_cfg.h b/decoder/hg_cfg.h old mode 100755 new mode 100644 diff --git a/decoder/hg_test.h b/decoder/hg_test.h old mode 100755 new mode 100644 diff --git a/decoder/nt_span.h b/decoder/nt_span.h old mode 100755 new mode 100644 diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h old mode 100755 new mode 100644 diff --git a/decoder/program_options.h b/decoder/program_options.h old mode 100755 new mode 100644 diff --git a/decoder/sentences.h b/decoder/sentences.h old mode 100755 new mode 100644 -- cgit v1.2.3 From feab7be095f6454f9ce3021190939ca64bf41e62 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 12 Feb 2012 16:46:46 -0500 Subject: Target-side only output format --- decoder/decoder.cc | 3 +++ decoder/hg_io.cc | 27 +++++++++++++++++++++++++++ decoder/hg_io.h | 3 +++ 3 files changed, 33 insertions(+) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 3b53fd6b..3394e0b8 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -408,6 +408,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream ("show_partition,z", "Compute and show the partition (inside score)") ("show_conditional_prob", "Output the conditional log prob to STDOUT instead of a translation") ("show_cfg_search_space", "Show the search space as a CFG") + ("show_target_graph", "Output the target hypergraph") ("coarse_to_fine_beam_prune", po::value(), "Prune paths from coarse parse forest before fine parse, keeping paths within exp(alpha>=0)") ("ctf_beam_widen", po::value()->default_value(2.0), "Expand coarse pass beam by this factor if no fine parse is found") ("ctf_num_widenings", po::value()->default_value(2), "Widen coarse beam this many times before backing off to full parse") @@ -1017,6 +1018,8 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } if (conf.count("show_cfg_search_space")) HypergraphIO::WriteAsCFG(forest); + if (conf.count("show_target_graph")) + HypergraphIO::WriteTarget(forest); if (has_ref) { if (HG::Intersect(ref, &forest)) { // if (crf_uniform_empirical) { diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 
c1c93933..0283ec3c 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -624,3 +624,30 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) { } } +/* Output format: + * #vertices + * for each vertex in bottom-up topological order: + * #downward_edges + * for each downward edge: + * RHS with [vertex_index] for NTs ||| scores + */ +void HypergraphIO::WriteTarget(const Hypergraph& hg) { + cout << hg.nodes_.size() << '\n'; + for (unsigned int i = 0; i < hg.nodes_.size(); ++i) { + const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_; + cout << edges.size() << '\n'; + for (unsigned int j = 0; j < edges.size(); ++j) { + const Hypergraph::Edge &edge = hg.edges_[edges[j]]; + const std::vector &e = edge.rule_->e(); + for (std::vector::const_iterator word = e.begin(); word != e.end(); ++word) { + if (*word <= 0) { + cout << '[' << edge.tail_nodes_[-*word] << "] "; + } else { + cout << TD::Convert(*word) << ' '; + } + } + cout << "||| " << edge.rule_->scores_ << '\n'; + } + } +} + diff --git a/decoder/hg_io.h b/decoder/hg_io.h index 082489d8..44817157 100644 --- a/decoder/hg_io.h +++ b/decoder/hg_io.h @@ -23,6 +23,9 @@ struct HypergraphIO { static void WriteAsCFG(const Hypergraph& hg); + // Write only the target size information in bottom-up order. + static void WriteTarget(const Hypergraph& hg); + // serialization utils static void ReadFromPLF(const std::string& in, Hypergraph* out, int line = 0); // return PLF string representation (undefined behavior on non-lattices) -- cgit v1.2.3 From e8583574e25c8ef09c9cd21cbc7421d9d12cf75f Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Sun, 12 Feb 2012 17:40:03 -0500 Subject: Might as well provide the edge count as well --- decoder/hg_io.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/decoder/hg_io.cc b/decoder/hg_io.cc index 0283ec3c..9f0f50fa 100644 --- a/decoder/hg_io.cc +++ b/decoder/hg_io.cc @@ -632,7 +632,7 @@ void HypergraphIO::WriteAsCFG(const Hypergraph& hg) { * RHS with [vertex_index] for NTs ||| scores */ void HypergraphIO::WriteTarget(const Hypergraph& hg) { - cout << hg.nodes_.size() << '\n'; + cout << hg.nodes_.size() << ' ' << hg.edges_.size() << '\n'; for (unsigned int i = 0; i < hg.nodes_.size(); ++i) { const Hypergraph::EdgesVector &edges = hg.nodes_[i].in_edges_; cout << edges.size() << '\n'; -- cgit v1.2.3 From a38b3fa383412e56eb958db998662c026bc08f4b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 17 Feb 2012 13:01:54 -0500 Subject: boost version checking, check for Eigen, get rid of old digamma stuff --- configure.ac | 21 +++++++++++++++------ training/em_utils.h | 24 ------------------------ training/model1.cc | 1 - training/mr_em_adapted_reduce.cc | 6 +++--- training/ttables.h | 4 ++-- utils/m.h | 6 ++++++ 6 files changed, 26 insertions(+), 36 deletions(-) delete mode 100644 training/em_utils.h diff --git a/configure.ac b/configure.ac index cd78ee72..aa79027f 100644 --- a/configure.ac +++ b/configure.ac @@ -9,7 +9,7 @@ esac AC_PROG_CC AC_PROG_CXX AC_LANG_CPLUSPLUS -BOOST_REQUIRE +BOOST_REQUIRE([1.44]) BOOST_PROGRAM_OPTIONS AC_ARG_ENABLE(mpi, [ --enable-mpi Build MPI binaries, assumes mpi.h is present ], @@ -38,7 +38,7 @@ then CPPFLAGS="$CPPFLAGS -I${with_cmph}/include" AC_CHECK_HEADER(cmph.h, - [AC_DEFINE([HAVE_CMPH], [], [flag for cmph perfect hashing library])], + [AC_DEFINE([HAVE_CMPH], [1], [flag for cmph perfect hashing library])], [AC_MSG_ERROR([Cannot find cmph library!])]) LDFLAGS="$LDFLAGS -L${with_cmph}/lib" @@ -46,6 +46,18 @@ then AM_CONDITIONAL([HAVE_CMPH], true) fi +if test 
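The WriteTarget output documented above is deliberately easy to parse: a header line with vertex and edge counts (after the follow-up commit), then for each vertex in bottom-up order a line with its incoming-edge count followed by one "RHS ||| scores" line per edge. A hypothetical stand-alone consumer, not part of cdec, might look like:

#include <iostream>
#include <string>

int main() {
  unsigned num_vertices = 0, num_edges = 0;
  std::cin >> num_vertices >> num_edges;
  std::cin.ignore();                                   // consume end of header line
  for (unsigned v = 0; v < num_vertices; ++v) {
    unsigned in_edges = 0;
    std::cin >> in_edges;
    std::cin.ignore();
    for (unsigned e = 0; e < in_edges; ++e) {
      std::string line;
      std::getline(std::cin, line);
      const std::string::size_type sep = line.find("|||");
      if (sep == std::string::npos) continue;          // malformed line; skipped in this sketch
      const std::string rhs = line.substr(0, sep);     // terminals and [tail_vertex] references
      const std::string scores = line.substr(sep + 3); // the rule's feature vector as text
      // ... build an edge for vertex v out of rhs and scores here ...
    }
  }
  std::cerr << "read " << num_vertices << " vertices, " << num_edges << " edges\n";
  return 0;
}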
"x$with_eigen" != 'xno' +then + SAVE_CPPFLAGS="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS -I${with_eigen}" + + AC_CHECK_HEADER(Eigen, + [AC_DEFINE([HAVE_EIGEN], [1], [flag for Eigen linear algebra library])], + [AC_MSG_ERROR([Cannot find Eigen!])]) + + AM_CONDITIONAL([HAVE_EIGEN], true) +fi + #BOOST_THREADS CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS" LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS" @@ -53,11 +65,8 @@ LDFLAGS="$LDFLAGS $BOOST_PROGRAM_OPTIONS_LDFLAGS" LIBS="$LIBS $BOOST_PROGRAM_OPTIONS_LIBS" # $BOOST_THREAD_LIBS" -AC_CHECK_HEADER(boost/math/special_functions/digamma.hpp, - [AC_DEFINE([HAVE_BOOST_DIGAMMA], [], [flag for boost::math::digamma])]) - AC_CHECK_HEADER(google/dense_hash_map, - [AC_DEFINE([HAVE_SPARSEHASH], [], [flag for google::dense_hash_map])]) + [AC_DEFINE([HAVE_SPARSEHASH], [1], [flag for google::dense_hash_map])]) AC_PROG_INSTALL GTEST_LIB_CHECK(1.0) diff --git a/training/em_utils.h b/training/em_utils.h deleted file mode 100644 index 37762978..00000000 --- a/training/em_utils.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef _EM_UTILS_H_ -#define _EM_UTILS_H_ - -#include "config.h" -#ifdef HAVE_BOOST_DIGAMMA -#include -using boost::math::digamma; -#else -#warning Using Mark Johnsons digamma() -#include -inline double digamma(double x) { - double result = 0, xx, xx2, xx4; - assert(x > 0); - for ( ; x < 7; ++x) - result -= 1/x; - x -= 1.0/2.0; - xx = 1.0/x; - xx2 = xx*xx; - xx4 = xx2*xx2; - result += log(x)+(1./24.)*xx2-(7.0/960.0)*xx4+(31.0/8064.0)*xx4*xx2-(127.0/30720.0)*xx4*xx4; - return result; -} -#endif -#endif diff --git a/training/model1.cc b/training/model1.cc index 40249aa3..a87d388f 100644 --- a/training/model1.cc +++ b/training/model1.cc @@ -9,7 +9,6 @@ #include "filelib.h" #include "ttables.h" #include "tdict.h" -#include "em_utils.h" namespace po = boost::program_options; using namespace std; diff --git a/training/mr_em_adapted_reduce.cc b/training/mr_em_adapted_reduce.cc index d4c16a2f..f65b5440 100644 --- a/training/mr_em_adapted_reduce.cc +++ b/training/mr_em_adapted_reduce.cc @@ -10,7 +10,7 @@ #include "fdict.h" #include "weights.h" #include "sparse_vector.h" -#include "em_utils.h" +#include "m.h" using namespace std; namespace po = boost::program_options; @@ -63,11 +63,11 @@ void Maximize(const bool use_vb, assert(tot > 0.0); double ltot = log(tot); if (use_vb) - ltot = digamma(tot + total_event_types * alpha); + ltot = Md::digamma(tot + total_event_types * alpha); for (SparseVector::const_iterator it = counts.begin(); it != counts.end(); ++it) { if (use_vb) { - pc->set_value(it->first, NoZero(digamma(it->second + alpha) - ltot)); + pc->set_value(it->first, NoZero(Md::digamma(it->second + alpha) - ltot)); } else { pc->set_value(it->first, NoZero(log(it->second) - ltot)); } diff --git a/training/ttables.h b/training/ttables.h index 50d85a68..bf3351d2 100644 --- a/training/ttables.h +++ b/training/ttables.h @@ -4,9 +4,9 @@ #include #include +#include "m.h" #include "wordid.h" #include "tdict.h" -#include "em_utils.h" class TTable { public: @@ -39,7 +39,7 @@ class TTable { for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) tot += it->second + alpha; for (Word2Double::iterator it = cpd.begin(); it != cpd.end(); ++it) - it->second = exp(digamma(it->second + alpha) - digamma(tot)); + it->second = exp(Md::digamma(it->second + alpha) - Md::digamma(tot)); } counts.clear(); } diff --git a/utils/m.h b/utils/m.h index b25248c2..5e45efee 100644 --- a/utils/m.h +++ b/utils/m.h @@ -3,6 +3,7 @@ #include #include +#include template struct M { @@ -81,6 +82,11 @@ 
struct M { } } + // digamma is the first derivative of the log-gamma function + static inline F digamma(const F& x) { + return boost::math::digamma(x); + } + }; typedef M Md; -- cgit v1.2.3 From d3ccf26cf501cb15ed300bc0ad17596a4e59fbeb Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 18 Feb 2012 15:16:17 -0500 Subject: fix diagonal model --- configure.ac | 2 +- training/model1.cc | 29 +++++++++++++++++------------ 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/configure.ac b/configure.ac index aa79027f..026dad01 100644 --- a/configure.ac +++ b/configure.ac @@ -51,7 +51,7 @@ then SAVE_CPPFLAGS="$CPPFLAGS" CPPFLAGS="$CPPFLAGS -I${with_eigen}" - AC_CHECK_HEADER(Eigen, + AC_CHECK_HEADER(Eigen/Dense, [AC_DEFINE([HAVE_EIGEN], [1], [flag for Eigen linear algebra library])], [AC_MSG_ERROR([Cannot find Eigen!])]) diff --git a/training/model1.cc b/training/model1.cc index a87d388f..73104304 100644 --- a/training/model1.cc +++ b/training/model1.cc @@ -4,6 +4,7 @@ #include #include +#include "m.h" #include "lattice.h" #include "stringlib.h" #include "filelib.h" @@ -13,11 +14,6 @@ namespace po = boost::program_options; using namespace std; -inline double log_poisson(unsigned x, const double& lambda) { - assert(lambda > 0.0); - return log(lambda) * x - lgamma(x + 1) - lambda; -} - bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -82,6 +78,7 @@ int main(int argc, char** argv) { TTable::Word2Word2Double was_viterbi; double tot_len_ratio = 0; double mean_srclen_multiplier = 0; + vector unnormed_a_i; for (int iter = 0; iter < ITERATIONS; ++iter) { const bool final_iteration = (iter == (ITERATIONS - 1)); cerr << "ITERATION " << (iter + 1) << (final_iteration ? 
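For reference, the update that the new Md::digamma wrapper supports in mr_em_adapted_reduce.cc and ttables.h above is the standard mean-field M-step for a multinomial with a symmetric Dirichlet(alpha) prior. A stand-alone version on a plain vector (VBMaximize is an illustrative name; the real code operates on SparseVector and Word2Double):

#include <cmath>
#include <cstddef>
#include <vector>
#include <boost/math/special_functions/digamma.hpp>

// E_q[log theta_i] = psi(c_i + alpha) - psi(sum_j c_j + T * alpha), exponentiated.
// The result is exp of an expected log probability, so it sums to slightly less than 1 by design.
std::vector<double> VBMaximize(const std::vector<double>& counts, double alpha) {
  using boost::math::digamma;
  double tot = counts.size() * alpha;
  for (std::size_t i = 0; i < counts.size(); ++i) tot += counts[i];
  std::vector<double> theta(counts.size());
  for (std::size_t i = 0; i < counts.size(); ++i)
    theta[i] = std::exp(digamma(counts[i] + alpha) - digamma(tot));
  return theta;
}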
" (FINAL)" : "") << endl; @@ -108,6 +105,8 @@ int main(int argc, char** argv) { assert(src.size() > 0); assert(trg.size() > 0); } + if (src.size() > unnormed_a_i.size()) + unnormed_a_i.resize(src.size()); if (iter == 0) tot_len_ratio += static_cast(trg.size()) / static_cast(src.size()); denom += trg.size(); @@ -125,13 +124,15 @@ int main(int argc, char** argv) { } double az = 0; if (favor_diagonal) { - for (int ta = 0; ta < src.size(); ++ta) - az += exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + for (int ta = 0; ta < src.size(); ++ta) { + unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az += unnormed_a_i[ta]; + } az /= prob_align_not_null; } for (int i = 1; i <= src.size(); ++i) { if (favor_diagonal) - prob_a_i = exp(-fabs(double(i) / src.size() - j_over_ts) * diagonal_tension) / az; + prob_a_i = unnormed_a_i[i-1] / az; probs[i] = tt.prob(src[i-1][0].label, f_j) * prob_a_i; sum += probs[i]; } @@ -202,7 +203,9 @@ int main(int argc, char** argv) { Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); LatticeTools::ConvertTextToLattice(strg, &trg); - double log_prob = log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); + double log_prob = Md::log_poisson(trg.size(), 0.05 + src.size() * mean_srclen_multiplier); + if (src.size() > unnormed_a_i.size()) + unnormed_a_i.resize(src.size()); // compute likelihood for (int j = 0; j < trg.size(); ++j) { @@ -216,13 +219,15 @@ int main(int argc, char** argv) { } double az = 0; if (favor_diagonal) { - for (int ta = 0; ta < src.size(); ++ta) - az += exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + for (int ta = 0; ta < src.size(); ++ta) { + unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az += unnormed_a_i[ta]; + } az /= prob_align_not_null; } for (int i = 1; i <= src.size(); ++i) { if (favor_diagonal) - prob_a_i = exp(-fabs(double(i) / src.size() - j_over_ts) * diagonal_tension) / az; + prob_a_i = unnormed_a_i[i-1] / az; sum += tt.prob(src[i-1][0].label, f_j) * prob_a_i; } log_prob += log(sum); -- cgit v1.2.3 From 63d14a96b62163ea1cb1b42a5af009ebb82a1e4d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 18 Feb 2012 20:22:42 +0000 Subject: fix eigen option --- configure.ac | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/configure.ac b/configure.ac index 026dad01..869f7cf9 100644 --- a/configure.ac +++ b/configure.ac @@ -46,6 +46,13 @@ then AM_CONDITIONAL([HAVE_CMPH], true) fi +AM_CONDITIONAL([HAVE_EIGEN], false) +AC_ARG_WITH(eigen, + [AC_HELP_STRING([--with-eigen=PATH], [(optional) path to Eigen linear algebra library])], + [with_eigen=$withval], + [with_eigen=no] + ) + if test "x$with_eigen" != 'xno' then SAVE_CPPFLAGS="$CPPFLAGS" -- cgit v1.2.3 From 2903e0a0daf941b20da812149f647cd4e0f4dd66 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 18 Feb 2012 22:09:47 -0500 Subject: initial lbl_model stub --- training/Makefile.am | 4 ++ training/lbl_model.cc | 131 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 135 insertions(+) create mode 100644 training/lbl_model.cc diff --git a/training/Makefile.am b/training/Makefile.am index d2f1ccc5..330341ac 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,5 +1,6 @@ bin_PROGRAMS = \ model1 \ + lbl_model \ test_ngram \ mr_em_map_adapter \ mr_em_adapted_reduce \ @@ -49,6 +50,9 @@ test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteva model1_SOURCES = model1.cc ttables.cc model1_LDADD = 
$(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz +lbl_model_SOURCES = lbl_model.cc ttables.cc +lbl_model_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz + grammar_convert_SOURCES = grammar_convert.cc grammar_convert_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz diff --git a/training/lbl_model.cc b/training/lbl_model.cc new file mode 100644 index 00000000..72d80a56 --- /dev/null +++ b/training/lbl_model.cc @@ -0,0 +1,131 @@ +#include + +#include "config.h" +#ifndef HAVE_EIGEN + int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } +#else + +#include + +#include +#include +#include + +#include "m.h" +#include "lattice.h" +#include "stringlib.h" +#include "filelib.h" +#include "tdict.h" + +namespace po = boost::program_options; +using namespace std; + +bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("iterations,i",po::value()->default_value(5),"Number of iterations of training") + ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") + ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (argc < 2 || conf->count("help")) { + cerr << "Usage " << argv[0] << " [OPTIONS] corpus.fr-en\n"; + cerr << dcmdline_options << endl; + return false; + } + return true; +} + +int main(int argc, char** argv) { + po::variables_map conf; + if (!InitCommandLine(argc, argv, &conf)) return 1; + const string fname = argv[argc - 1]; + const int ITERATIONS = conf["iterations"].as(); + const double diagonal_tension = conf["diagonal_tension"].as(); + string testset; + if (conf.count("testset")) testset = conf["testset"].as(); + + double tot_len_ratio = 0; + double mean_srclen_multiplier = 0; + vector unnormed_a_i; + for (int iter = 0; iter < ITERATIONS; ++iter) { + cerr << "ITERATION " << (iter + 1) << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + double likelihood = 0; + double denom = 0.0; + int lc = 0; + bool flag = false; + string line; + string ssrc, strg; + while(true) { + getline(in, line); + if (!in) break; + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } + ParseTranslatorInput(line, &ssrc, &strg); + Lattice src, trg; + LatticeTools::ConvertTextToLattice(ssrc, &src); + LatticeTools::ConvertTextToLattice(strg, &trg); + if (src.size() == 0 || trg.size() == 0) { + cerr << "Error: " << lc << "\n" << line << endl; + assert(src.size() > 0); + assert(trg.size() > 0); + } + if (src.size() > unnormed_a_i.size()) + unnormed_a_i.resize(src.size()); + if (iter == 0) + tot_len_ratio += static_cast(trg.size()) / static_cast(src.size()); + denom += trg.size(); + vector probs(src.size() + 1); + bool 
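The model1.cc fix above caches the unnormalized diagonal terms so exp() is evaluated once per source position instead of twice. Pulling just that alignment prior out into a stand-alone function (DiagonalPrior is an illustrative name; the real code additionally reserves an index for the NULL word and divides the normalizer by prob_align_not_null):

#include <cmath>
#include <vector>

// p(a_j = i) for target position j of m and source positions i = 0..n-1,
// proportional to exp(-|i/n - j/m| * tension): mass concentrates near the diagonal.
std::vector<double> DiagonalPrior(unsigned n, unsigned j, unsigned m, double tension) {
  std::vector<double> p(n);
  const double j_over_m = double(j) / m;
  double z = 0;
  for (unsigned i = 0; i < n; ++i) {
    p[i] = std::exp(-std::fabs(double(i) / n - j_over_m) * tension);  // computed once, as in the fix
    z += p[i];
  }
  for (unsigned i = 0; i < n; ++i) p[i] /= z;
  return p;
}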
first_al = true; // used for write_alignments + for (int j = 0; j < trg.size(); ++j) { + const WordID& f_j = trg[j][0].label; + double sum = 0; + const double j_over_ts = double(j) / trg.size(); + double prob_a_i = 1.0 / src.size(); + double az = 0; + for (int ta = 0; ta < src.size(); ++ta) { + unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); + az += unnormed_a_i[ta]; + } + for (int i = 1; i <= src.size(); ++i) { + prob_a_i = unnormed_a_i[i-1] / az; + probs[i] = 1; // tt.prob(src[i-1][0].label, f_j) * prob_a_i; + sum += probs[i]; + } + } + } + + // log(e) = 1.0 + double base2_likelihood = likelihood / log(2); + + if (flag) { cerr << endl; } + if (iter == 0) { + mean_srclen_multiplier = tot_len_ratio / lc; + cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; + } + cerr << " log_e likelihood: " << likelihood << endl; + cerr << " log_2 likelihood: " << base2_likelihood << endl; + cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; + cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + } + return 0; +} + +#endif + -- cgit v1.2.3 From 0498227db2f45bcf7ac44809106846866a6f85e1 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 18 Feb 2012 22:10:29 -0500 Subject: .gitignore lbl_model --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 4f75d153..327f7261 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,7 @@ sa-extract/rulefactory.so sa-extract/sgml.pyc sa-extract/sym.c sa-extract/sym.so +training/lbl_model training/mpi_flex_optimize training/test_ngram utils/dict_test -- cgit v1.2.3 From c4ffa6df1fdd89e3db9c6d3829b7b84edac20bcf Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 19 Feb 2012 04:27:55 -0500 Subject: lbl preliminary clean up --- decoder/lattice.cc | 1 + training/lbl_model.cc | 84 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 55 insertions(+), 30 deletions(-) diff --git a/decoder/lattice.cc b/decoder/lattice.cc index e3631e59..89da3cd0 100644 --- a/decoder/lattice.cc +++ b/decoder/lattice.cc @@ -46,6 +46,7 @@ void LatticeTools::ConvertTextToLattice(const string& text, Lattice* pl) { Lattice& l = *pl; vector ids; TD::ConvertSentence(text, &ids); + l.clear(); l.resize(ids.size()); for (int i = 0; i < l.size(); ++i) l[i].push_back(LatticeArc(ids[i], 0.0, 1)); diff --git a/training/lbl_model.cc b/training/lbl_model.cc index 72d80a56..ccd29255 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -6,6 +6,7 @@ #else #include +#include #include #include @@ -20,10 +21,17 @@ namespace po = boost::program_options; using namespace std; +#define kDIMENSIONS 10 +typedef Eigen::Matrix RVector; +typedef Eigen::Matrix RTVector; +typedef Eigen::Matrix TMatrix; +vector r_src, r_trg; + bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("iterations,i",po::value()->default_value(5),"Number of iterations of training") + ("input,i",po::value(),"Input file") + ("iterations,I",po::value()->default_value(1000),"Number of iterations of training") ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); po::options_description clo("Command line options"); @@ -42,7 +50,7 @@ bool 
InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::notify(*conf); if (argc < 2 || conf->count("help")) { - cerr << "Usage " << argv[0] << " [OPTIONS] corpus.fr-en\n"; + cerr << "Usage " << argv[0] << " [OPTIONS] -i corpus.fr-en\n"; cerr << dcmdline_options << endl; return false; } @@ -52,33 +60,32 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; - const string fname = argv[argc - 1]; + const string fname = conf["input"].as(); const int ITERATIONS = conf["iterations"].as(); const double diagonal_tension = conf["diagonal_tension"].as(); + if (diagonal_tension < 0.0) { + cerr << "Invalid value for diagonal_tension: must be >= 0\n"; + return 1; + } string testset; if (conf.count("testset")) testset = conf["testset"].as(); - double tot_len_ratio = 0; - double mean_srclen_multiplier = 0; + int lc = 0; vector unnormed_a_i; - for (int iter = 0; iter < ITERATIONS; ++iter) { - cerr << "ITERATION " << (iter + 1) << endl; + string line; + string ssrc, strg; + bool flag = false; + Lattice src, trg; + set vocab_e; + { // read through corpus, initialize int map, check lines are good + cerr << "INITIAL READ OF " << fname << endl; ReadFile rf(fname); istream& in = *rf.stream(); - double likelihood = 0; - double denom = 0.0; - int lc = 0; - bool flag = false; - string line; - string ssrc, strg; - while(true) { - getline(in, line); - if (!in) break; + while(getline(in, line)) { ++lc; if (lc % 1000 == 0) { cerr << '.'; flag = true; } if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } ParseTranslatorInput(line, &ssrc, &strg); - Lattice src, trg; LatticeTools::ConvertTextToLattice(ssrc, &src); LatticeTools::ConvertTextToLattice(strg, &trg); if (src.size() == 0 || trg.size() == 0) { @@ -88,37 +95,54 @@ int main(int argc, char** argv) { } if (src.size() > unnormed_a_i.size()) unnormed_a_i.resize(src.size()); - if (iter == 0) - tot_len_ratio += static_cast(trg.size()) / static_cast(src.size()); + for (unsigned i = 0; i < trg.size(); ++i) { + assert(trg[i].size() == 1); + vocab_e.insert(trg[i][0].label); + } + } + } + if (flag) cerr << endl; + + // do optimization + for (int iter = 0; iter < ITERATIONS; ++iter) { + cerr << "ITERATION " << (iter + 1) << endl; + ReadFile rf(fname); + istream& in = *rf.stream(); + double likelihood = 0; + double denom = 0.0; + lc = 0; + flag = false; + while(true) { + getline(in, line); + if (!in) break; + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } + ParseTranslatorInput(line, &ssrc, &strg); + LatticeTools::ConvertTextToLattice(ssrc, &src); + LatticeTools::ConvertTextToLattice(strg, &trg); denom += trg.size(); vector probs(src.size() + 1); - bool first_al = true; // used for write_alignments for (int j = 0; j < trg.size(); ++j) { const WordID& f_j = trg[j][0].label; double sum = 0; const double j_over_ts = double(j) / trg.size(); - double prob_a_i = 1.0 / src.size(); double az = 0; for (int ta = 0; ta < src.size(); ++ta) { unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); az += unnormed_a_i[ta]; } for (int i = 1; i <= src.size(); ++i) { - prob_a_i = unnormed_a_i[i-1] / az; + const double prob_a_i = unnormed_a_i[i-1] / az; + // TODO probs[i] = 1; // tt.prob(src[i-1][0].label, f_j) * prob_a_i; sum += probs[i]; } } } - - // log(e) = 1.0 - double base2_likelihood = likelihood / log(2); 
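The likelihood bookkeeping above follows the same convention as model1.cc: accumulate a natural-log corpus likelihood, convert it to bits, and report per-target-word cross entropy and perplexity. With made-up numbers:

#include <cmath>
#include <iostream>

int main() {
  const double log_e_likelihood = -52891.7;  // hypothetical corpus log likelihood (nats)
  const double num_target_words = 10000.0;   // hypothetical denominator (target tokens)
  const double log_2_likelihood = log_e_likelihood / std::log(2.0);
  const double cross_entropy = -log_2_likelihood / num_target_words;  // bits per word
  const double perplexity = std::pow(2.0, cross_entropy);
  std::cout << "H = " << cross_entropy << " bits/word, perplexity = " << perplexity << std::endl;
  return 0;
}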
- if (flag) { cerr << endl; } - if (iter == 0) { - mean_srclen_multiplier = tot_len_ratio / lc; - cerr << "expected target length = source length * " << mean_srclen_multiplier << endl; - } + + const double base2_likelihood = likelihood / log(2); cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; -- cgit v1.2.3 From 9e45f895aaec5c7a2f362aa532ca5ca4325e102b Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 21 Feb 2012 11:53:01 -0500 Subject: basic lbl model, nothing to see here --- training/lbl_model.cc | 147 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 126 insertions(+), 21 deletions(-) diff --git a/training/lbl_model.cc b/training/lbl_model.cc index ccd29255..4759eedc 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -5,13 +5,18 @@ int main() { std::cerr << "Please rebuild with --with-eigen PATH\n"; return 1; } #else +#include +#include #include #include +#include // memset +#include #include #include #include +#include "array2d.h" #include "m.h" #include "lattice.h" #include "stringlib.h" @@ -21,7 +26,7 @@ namespace po = boost::program_options; using namespace std; -#define kDIMENSIONS 10 +#define kDIMENSIONS 25 typedef Eigen::Matrix RVector; typedef Eigen::Matrix RTVector; typedef Eigen::Matrix TMatrix; @@ -32,6 +37,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("input,i",po::value(),"Input file") ("iterations,I",po::value()->default_value(1000),"Number of iterations of training") + ("eta,e", po::value()->default_value(0.1f), "Eta for SGD") + ("random_seed", po::value(), "Random seed") ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); po::options_description clo("Command line options"); @@ -57,12 +64,19 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { return true; } +void Normalize(RVector* v) { + float norm = v->norm(); + *v /= norm; +} + int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; const string fname = conf["input"].as(); const int ITERATIONS = conf["iterations"].as(); + const float eta = conf["eta"].as(); const double diagonal_tension = conf["diagonal_tension"].as(); + bool SGD = true; if (diagonal_tension < 0.0) { cerr << "Invalid value for diagonal_tension: must be >= 0\n"; return 1; @@ -70,14 +84,15 @@ int main(int argc, char** argv) { string testset; if (conf.count("testset")) testset = conf["testset"].as(); - int lc = 0; + unsigned lc = 0; vector unnormed_a_i; string line; string ssrc, strg; bool flag = false; Lattice src, trg; - set vocab_e; + vector vocab_e; { // read through corpus, initialize int map, check lines are good + set svocab_e; cerr << "INITIAL READ OF " << fname << endl; ReadFile rf(fname); istream& in = *rf.stream(); @@ -97,13 +112,39 @@ int main(int argc, char** argv) { unnormed_a_i.resize(src.size()); for (unsigned i = 0; i < trg.size(); ++i) { assert(trg[i].size() == 1); - vocab_e.insert(trg[i][0].label); + svocab_e.insert(trg[i][0].label); } } + copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e)); } if (flag) cerr << endl; + cerr << "Number of target word types: " << vocab_e.size() << endl; + const float 
num_examples = lc; + + r_trg.resize(TD::NumWords() + 1); + r_src.resize(TD::NumWords() + 1); + if (conf.count("random_seed")) { + srand(conf["random_seed"].as()); + } else { + unsigned seed = time(NULL); + cerr << "Random seed: " << seed << endl; + srand(seed); + } + TMatrix t = TMatrix::Random() / 100.0; + for (unsigned i = 1; i < r_trg.size(); ++i) { + r_trg[i] = RVector::Random(); + r_src[i] = RVector::Random(); + r_trg[i][i % kDIMENSIONS] = 0.5; + r_src[i][(i-1) % kDIMENSIONS] = 0.5; + Normalize(&r_trg[i]); + Normalize(&r_src[i]); + } + vector > trg_pos(TD::NumWords() + 1); // do optimization + TMatrix g; + vector exp_src; + vector z_src; for (int iter = 0; iter < ITERATIONS; ++iter) { cerr << "ITERATION " << (iter + 1) << endl; ReadFile rf(fname); @@ -112,9 +153,8 @@ int main(int argc, char** argv) { double denom = 0.0; lc = 0; flag = false; - while(true) { - getline(in, line); - if (!in) break; + g *= 0; + while(getline(in, line)) { ++lc; if (lc % 1000 == 0) { cerr << '.'; flag = true; } if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } @@ -122,23 +162,86 @@ int main(int argc, char** argv) { LatticeTools::ConvertTextToLattice(ssrc, &src); LatticeTools::ConvertTextToLattice(strg, &trg); denom += trg.size(); - vector probs(src.size() + 1); - for (int j = 0; j < trg.size(); ++j) { - const WordID& f_j = trg[j][0].label; - double sum = 0; - const double j_over_ts = double(j) / trg.size(); - double az = 0; - for (int ta = 0; ta < src.size(); ++ta) { - unnormed_a_i[ta] = exp(-fabs(double(ta) / src.size() - j_over_ts) * diagonal_tension); - az += unnormed_a_i[ta]; + + exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); + z_src.clear(); z_src.resize(src.size(), 0.0); + Array2D exp_refs(src.size(), trg.size(), TMatrix::Zero()); + Array2D z_refs(src.size(), trg.size(), 0.0); + for (unsigned j = 0; j < trg.size(); ++j) + trg_pos[trg[j][0].label].insert(j); + + for (unsigned i = 0; i < src.size(); ++i) { + const RVector& r_s = r_src[src[i][0].label]; + const RTVector pred = r_s.transpose() * t; + TMatrix& exp_m = exp_src[i]; + double& z = z_src[i]; + for (unsigned k = 0; k < vocab_e.size(); ++k) { + const WordID v_k = vocab_e[k]; + const RVector& r_t = r_trg[v_k]; + const double dot_prod = pred * r_t; + const double u = exp(dot_prod); + z += u; + const TMatrix v = r_s * r_t.transpose() * u; + exp_m += v; + set& ref_locs = trg_pos[v_k]; + if (!ref_locs.empty()) { + for (set::iterator it = ref_locs.begin(); it != ref_locs.end(); ++it) { + TMatrix& exp_ref_ij = exp_refs(i, *it); + double& z_ref_ij = z_refs(i, *it); + z_ref_ij += u; + exp_ref_ij += v; + } + } + } + } + for (unsigned j = 0; j < trg.size(); ++j) + trg_pos[trg[j][0].label].clear(); + + // model expectations for a single target generation with + // uniform alignment prior + double m_z = 0; + TMatrix m_exp = TMatrix::Zero(); + for (unsigned i = 0; i < src.size(); ++i) { + m_exp += exp_src[i]; + m_z += z_src[i]; + } + m_exp /= m_z; + + Array2D al(src.size(), trg.size(), false); + for (unsigned j = 0; j < trg.size(); ++j) { + double ref_z = 0; + TMatrix ref_exp = TMatrix::Zero(); + int max_i = 0; + double max_s = -9999999; + for (unsigned i = 0; i < src.size(); ++i) { + ref_exp += exp_refs(i, j); + ref_z += z_refs(i, j); + if (log(z_refs(i, j)) > max_s) { + max_s = log(z_refs(i, j)); + max_i = i; + } + // TODO handle alignment prob + } + if (ref_z <= 0) { + cerr << "TRG=" << TD::Convert(trg[j][0].label) << endl; + cerr << " LINE=" << line << endl; + cerr << " REF_EXP=\n" << ref_exp << endl; + cerr << " 
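// What the loops above and below compute: each source/target word pair is
// scored with a bilinear form, score(f, e) = r_f^T * t * r_e, and under a
// uniform alignment p(e_j | f) is proportional to sum_i exp(score(f_i, e_j)).
// m_exp / m_z hold the model expectation of r_f * r_e^T and its normalizer
// taken over the whole target vocabulary, while ref_exp / ref_z are the same
// quantities restricted to the observed target word e_j (an expectation over
// its possible alignments). Hence g += m_exp - ref_exp accumulates the
// gradient of -log p(e_j | f) with respect to t, and
// likelihood += log(ref_z) - log(m_z) adds the per-word log likelihood.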
M_EXP=\n" << m_exp << endl; + abort(); } - for (int i = 1; i <= src.size(); ++i) { - const double prob_a_i = unnormed_a_i[i-1] / az; - // TODO - probs[i] = 1; // tt.prob(src[i-1][0].label, f_j) * prob_a_i; - sum += probs[i]; + al(max_i, j) = true; + ref_exp /= ref_z; + g += m_exp - ref_exp; + likelihood += log(ref_z) - log(m_z); + if (SGD) { + t -= g * eta / num_examples; + g *= 0; + } else { + assert(!"not implemented"); } } + + if (iter == (ITERATIONS - 1) || lc == 28) { cerr << al << endl; } } if (flag) { cerr << endl; } @@ -147,7 +250,9 @@ int main(int argc, char** argv) { cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + cerr << t << endl; } + cerr << "TRANSLATION MATRIX:" << endl << t << endl; return 0; } -- cgit v1.2.3 From c0e9dc2889b6beb039c5365ebd0af6486b7ec574 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 21 Feb 2012 17:51:44 -0500 Subject: use lbfgs --- training/Makefile.am | 2 +- training/lbl_model.cc | 33 ++++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 6 deletions(-) diff --git a/training/Makefile.am b/training/Makefile.am index 330341ac..991ac210 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -50,7 +50,7 @@ test_ngram_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteva model1_SOURCES = model1.cc ttables.cc model1_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz -lbl_model_SOURCES = lbl_model.cc ttables.cc +lbl_model_SOURCES = lbl_model.cc optimize.cc lbl_model_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/utils/libutils.a -lz grammar_convert_SOURCES = grammar_convert.cc diff --git a/training/lbl_model.cc b/training/lbl_model.cc index 4759eedc..eb3e194d 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -16,6 +16,7 @@ #include #include +#include "optimize.h" #include "array2d.h" #include "m.h" #include "lattice.h" @@ -26,7 +27,7 @@ namespace po = boost::program_options; using namespace std; -#define kDIMENSIONS 25 +#define kDIMENSIONS 8 typedef Eigen::Matrix RVector; typedef Eigen::Matrix RTVector; typedef Eigen::Matrix TMatrix; @@ -69,6 +70,21 @@ void Normalize(RVector* v) { *v /= norm; } +void Flatten(const TMatrix& m, vector* v) { + unsigned c = 0; + v->resize(kDIMENSIONS * kDIMENSIONS); + for (unsigned i = 0; i < kDIMENSIONS; ++i) + for (unsigned j = 0; j < kDIMENSIONS; ++j) + (*v)[c++] = m(i,j); +} + +void Unflatten(const vector& v, TMatrix* m) { + unsigned c = 0; + for (unsigned i = 0; i < kDIMENSIONS; ++i) + for (unsigned j = 0; j < kDIMENSIONS; ++j) + (*m)(i, j) = v[c++]; +} + int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; @@ -76,7 +92,7 @@ int main(int argc, char** argv) { const int ITERATIONS = conf["iterations"].as(); const float eta = conf["eta"].as(); const double diagonal_tension = conf["diagonal_tension"].as(); - bool SGD = true; + bool SGD = false; if (diagonal_tension < 0.0) { cerr << "Invalid value for diagonal_tension: must be >= 0\n"; return 1; @@ -121,6 +137,7 @@ int main(int argc, char** argv) { cerr << "Number of target word types: " << vocab_e.size() << endl; const float num_examples = lc; + LBFGSOptimizer lbfgs(kDIMENSIONS * kDIMENSIONS, 100); r_trg.resize(TD::NumWords() + 1); r_src.resize(TD::NumWords() + 1); if (conf.count("random_seed")) { @@ -130,7 +147,7 @@ int main(int argc, char** argv) { cerr << "Random seed: " << seed << 
endl; srand(seed); } - TMatrix t = TMatrix::Random() / 100.0; + TMatrix t = TMatrix::Random() / 1024.0; for (unsigned i = 1; i < r_trg.size(); ++i) { r_trg[i] = RVector::Random(); r_src[i] = RVector::Random(); @@ -145,6 +162,8 @@ int main(int argc, char** argv) { TMatrix g; vector exp_src; vector z_src; + vector flat_g, flat_t; + Flatten(t, &flat_t); for (int iter = 0; iter < ITERATIONS; ++iter) { cerr << "ITERATION " << (iter + 1) << endl; ReadFile rf(fname); @@ -236,8 +255,6 @@ int main(int argc, char** argv) { if (SGD) { t -= g * eta / num_examples; g *= 0; - } else { - assert(!"not implemented"); } } @@ -250,6 +267,12 @@ int main(int argc, char** argv) { cerr << " log_2 likelihood: " << base2_likelihood << endl; cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + if (!SGD) { + Flatten(g, &flat_g); + lbfgs.Optimize(-likelihood, flat_g, &flat_t); + Unflatten(flat_t, &t); + if (lbfgs.HasConverged()) break; + } cerr << t << endl; } cerr << "TRANSLATION MATRIX:" << endl << t << endl; -- cgit v1.2.3 From dd16e83d4a593392465ee317c43ffc2c490add2e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 22 Feb 2012 16:10:56 +0000 Subject: add regularization --- training/lbl_model.cc | 50 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 9 deletions(-) diff --git a/training/lbl_model.cc b/training/lbl_model.cc index eb3e194d..a114bba7 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -12,6 +12,7 @@ #include // memset #include +#include #include #include #include @@ -27,7 +28,7 @@ namespace po = boost::program_options; using namespace std; -#define kDIMENSIONS 8 +#define kDIMENSIONS 110 typedef Eigen::Matrix RVector; typedef Eigen::Matrix RTVector; typedef Eigen::Matrix TMatrix; @@ -38,8 +39,9 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("input,i",po::value(),"Input file") ("iterations,I",po::value()->default_value(1000),"Number of iterations of training") + ("regularization_strength,C",po::value()->default_value(0.1),"L2 regularization strength (0 for no regularization)") ("eta,e", po::value()->default_value(0.1f), "Eta for SGD") - ("random_seed", po::value(), "Random seed") + ("random_seed,s", po::value(), "Random seed") ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); po::options_description clo("Command line options"); @@ -67,6 +69,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { void Normalize(RVector* v) { float norm = v->norm(); + assert(norm > 0.0f); *v /= norm; } @@ -74,21 +77,42 @@ void Flatten(const TMatrix& m, vector* v) { unsigned c = 0; v->resize(kDIMENSIONS * kDIMENSIONS); for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) + for (unsigned j = 0; j < kDIMENSIONS; ++j) { + assert(boost::math::isnormal(m(i, j))); (*v)[c++] = m(i,j); + } } void Unflatten(const vector& v, TMatrix* m) { unsigned c = 0; for (unsigned i = 0; i < kDIMENSIONS; ++i) - for (unsigned j = 0; j < kDIMENSIONS; ++j) + for (unsigned j = 0; j < kDIMENSIONS; ++j) { + assert(boost::math::isnormal(v[c])); (*m)(i, j) = v[c++]; + } +} + +double ApplyRegularization(const double C, + const vector& weights, + vector* g) { + 
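// Adds the L2 penalty C * sum_i w_i^2 to the gradient (2 * C * w_i per
// component) and returns the penalty itself, so the caller can add it to the
// negative log likelihood before handing objective and gradient to L-BFGS.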
assert(weights.size() == g->size()); + double reg = 0; + for (size_t i = 0; i < weights.size(); ++i) { + const double& w_i = weights[i]; + double& g_i = (*g)[i]; + reg += C * w_i * w_i; + g_i += 2 * C * w_i; + } + return reg; } int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; const string fname = conf["input"].as(); + const float reg_strength = conf["regularization_strength"].as(); + const bool has_l2 = reg_strength; + assert(reg_strength >= 0.0f); const int ITERATIONS = conf["iterations"].as(); const float eta = conf["eta"].as(); const double diagonal_tension = conf["diagonal_tension"].as(); @@ -147,7 +171,7 @@ int main(int argc, char** argv) { cerr << "Random seed: " << seed << endl; srand(seed); } - TMatrix t = TMatrix::Random() / 1024.0; + TMatrix t = TMatrix::Random() / 50.0; for (unsigned i = 1; i < r_trg.size(); ++i) { r_trg[i] = RVector::Random(); r_src[i] = RVector::Random(); @@ -159,7 +183,7 @@ int main(int argc, char** argv) { vector > trg_pos(TD::NumWords() + 1); // do optimization - TMatrix g; + TMatrix g = TMatrix::Zero(); vector exp_src; vector z_src; vector flat_g, flat_t; @@ -265,11 +289,19 @@ int main(int argc, char** argv) { const double base2_likelihood = likelihood / log(2); cerr << " log_e likelihood: " << likelihood << endl; cerr << " log_2 likelihood: " << base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; + cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; if (!SGD) { Flatten(g, &flat_g); - lbfgs.Optimize(-likelihood, flat_g, &flat_t); + double obj = -likelihood; + if (has_l2) { + const double r = ApplyRegularization(reg_strength, + flat_t, + &flat_g); + obj += r; + cerr << " regularization: " << r << endl; + } + lbfgs.Optimize(obj, flat_g, &flat_t); Unflatten(flat_t, &t); if (lbfgs.HasConverged()) break; } -- cgit v1.2.3 From 2faca3e7b3b8e4eba6c036c635a5b23883e72337 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 24 Feb 2012 00:47:48 -0500 Subject: load embeddings from file --- training/lbl_model.cc | 69 ++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/training/lbl_model.cc b/training/lbl_model.cc index a114bba7..2af848b5 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -28,7 +28,7 @@ namespace po = boost::program_options; using namespace std; -#define kDIMENSIONS 110 +#define kDIMENSIONS 100 typedef Eigen::Matrix RVector; typedef Eigen::Matrix RTVector; typedef Eigen::Matrix TMatrix; @@ -40,7 +40,9 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("input,i",po::value(),"Input file") ("iterations,I",po::value()->default_value(1000),"Number of iterations of training") ("regularization_strength,C",po::value()->default_value(0.1),"L2 regularization strength (0 for no regularization)") - ("eta,e", po::value()->default_value(0.1f), "Eta for SGD") + ("eta", po::value()->default_value(0.1f), "Eta for SGD") + ("source_embeddings,f", po::value(), "File containing source embeddings (if unset, random vectors will be used)") + ("target_embeddings,e", po::value(), "File containing target embeddings (if unset, random vectors will be used)") ("random_seed,s", po::value(), "Random seed") ("diagonal_tension,T", po::value()->default_value(4.0), "How sharp or flat around the diagonal is the alignment 
distribution (0 = uniform, >0 sharpens)") ("testset,x", po::value(), "After training completes, compute the log likelihood of this set of sentence pairs under the learned model"); @@ -106,6 +108,59 @@ double ApplyRegularization(const double C, return reg; } +void LoadEmbeddings(const string& filename, vector* pv) { + vector& v = *pv; + cerr << "Reading embeddings from " << filename << " ...\n"; + ReadFile rf(filename); + istream& in = *rf.stream(); + string line; + unsigned lc = 0; + while(getline(in, line)) { + ++lc; + size_t cur = line.find(' '); + if (cur == string::npos || cur == 0) { + cerr << "Parse error reading line " << lc << ":\n" << line << endl; + abort(); + } + WordID w = TD::Convert(line.substr(0, cur)); + if (w >= v.size()) continue; + RVector& curv = v[w]; + line[cur] = 0; + size_t start = cur + 1; + cur = start + 1; + size_t c = 0; + while(cur < line.size()) { + if (line[cur] == ' ') { + line[cur] = 0; + curv[c++] = strtod(&line[start], NULL); + start = cur + 1; + cur = start; + if (c == kDIMENSIONS) break; + } + ++cur; + } + if (c < kDIMENSIONS && cur != start) { + if (cur < line.size()) line[cur] = 0; + curv[c++] = strtod(&line[start], NULL); + } + if (c != kDIMENSIONS) { + static bool first = true; + if (first) { + cerr << " read " << c << " dimensions from embedding file, but built with " << kDIMENSIONS << " (filling in with random values)\n"; + first = false; + } + for (; c < kDIMENSIONS; ++c) curv[c] = rand(); + } + if (c == kDIMENSIONS && cur != line.size()) { + static bool first = true; + if (first) { + cerr << " embedding file contains more dimensions than configured with, truncating.\n"; + first = false; + } + } + } +} + int main(int argc, char** argv) { po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; @@ -175,11 +230,11 @@ int main(int argc, char** argv) { for (unsigned i = 1; i < r_trg.size(); ++i) { r_trg[i] = RVector::Random(); r_src[i] = RVector::Random(); - r_trg[i][i % kDIMENSIONS] = 0.5; - r_src[i][(i-1) % kDIMENSIONS] = 0.5; - Normalize(&r_trg[i]); - Normalize(&r_src[i]); } + if (conf.count("source_embeddings")) + LoadEmbeddings(conf["source_embeddings"].as(), &r_src); + if (conf.count("target_embeddings")) + LoadEmbeddings(conf["target_embeddings"].as(), &r_trg); vector > trg_pos(TD::NumWords() + 1); // do optimization @@ -242,6 +297,8 @@ int main(int argc, char** argv) { // model expectations for a single target generation with // uniform alignment prior + // TODO: when using a non-uniform alignment, m_exp will be + // a function of j (below) double m_z = 0; TMatrix m_exp = TMatrix::Zero(); for (unsigned i = 0; i < src.size(); ++i) { -- cgit v1.2.3 From 9007216a43c5572c2c343a1700ac79fb35b7d82f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 25 Feb 2012 21:22:27 -0500 Subject: really slow hiero lm --- gi/pf/Makefile.am | 4 +- gi/pf/hierolm.cc | 309 +++++++++++++++++++++++++++++++++++++++++++++ phrasinator/ccrp.h | 294 ------------------------------------------- utils/ccrp.h | 340 ++++++++++++++++++++++++++++++++++++++++++++++++++ utils/ccrp_onetable.h | 12 ++ utils/sampler.h | 2 +- 6 files changed, 665 insertions(+), 296 deletions(-) create mode 100644 gi/pf/hierolm.cc delete mode 100644 phrasinator/ccrp.h create mode 100644 utils/ccrp.h diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 8d43f36d..ed5b6fd3 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp +bin_PROGRAMS = cbgi brat dpnaive pfbrat 
pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp hierolm noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc @@ -9,6 +9,8 @@ align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc itg_SOURCES = itg.cc +hierolm_SOURCES = hierolm.cc + condnaive_SOURCES = condnaive.cc dpnaive_SOURCES = dpnaive.cc diff --git a/gi/pf/hierolm.cc b/gi/pf/hierolm.cc new file mode 100644 index 00000000..afb12fef --- /dev/null +++ b/gi/pf/hierolm.cc @@ -0,0 +1,309 @@ +#include +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +void ReadCorpus(const string& filename, + vector >* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + vector& le = e->back(); + TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + } + if (in != &cin) delete in; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {} + + prob_t Prob(const TRule& r) const { + return x.probT(r, p0(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + return x.incrementT(r, p0(r), rng); + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + return 
x.decrement(r, rng); + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p; + p.logeq(x.log_crp_prob()); + for (CCRP::const_iterator it = x.begin(); it != x.end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= p0(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + x.resample_hyperparameters(rng); + cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl; + } + + const BaseRuleModel p0; + CCRP x; + //CCRP_OneTable x; +}; + +vector tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 1 : 0)) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + static const int kLHS = -TD::Convert("X"); + r.reset(new TRule); + r->lhs_ = kLHS; + } + TRule& rr = *r; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + virtual int GetNumRules() const { + if (r) return 1; else return 0; + } + virtual TRulePtr GetIthRule(int) const { + return r; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + return new NPGrammarIter(r, arity, symbol); + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv, HieroLMModel* plm) { + HieroLMModel& lm = *plm; + vector node_probs; + const prob_t total_prob = Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 3); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, 
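// How SampleDerivation above works: it draws a derivation from the packed
// forest with probability proportional to the product of its edge
// probabilities. Inside() first computes the inside (bottom-up) score of
// every node; then, walking top-down from the goal node, each incoming edge
// is selected with probability proportional to
// edge_prob * prod(inside score of its tail nodes), and the chosen edge's
// tail nodes are pushed onto the queue to be expanded in turn.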
HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + vector grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + + vector > corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser("X", grammars); + + Hypergraph hg; + const int kX = -TD::Convert("X"); + const int kLP = FD::Convert("LogProb"); + SparseVector v; v.set_value(kLP, 1.0); + vector > derivs(corpuse.size()); + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector& src = corpuse[ci]; + Lattice lat(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + DecrementDerivation(hg, derivs[ci], &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kX) + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + vector d; + SampleDerivation(hg, &rng, &d, &lm); + derivs[ci] = d; + IncrementDerivation(hg, derivs[ci], &lm, &rng); + if (tofreelist.size() > 100000) { + cerr << "Freeing ... "; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + cerr << "LLH=" << lm.Likelihood() << endl; + } + return 0; +} + diff --git a/phrasinator/ccrp.h b/phrasinator/ccrp.h deleted file mode 100644 index 9acf12ab..00000000 --- a/phrasinator/ccrp.h +++ /dev/null @@ -1,294 +0,0 @@ -#ifndef _CCRP_H_ -#define _CCRP_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "sampler.h" -#include "slice_sampler.h" - -// Chinese restaurant process (Pitman-Yor parameters) with table tracking. 
- -template > -class CCRP { - public: - CCRP(double disc, double conc) : - num_tables_(), - num_customers_(), - discount_(disc), - concentration_(conc), - discount_prior_alpha_(std::numeric_limits::quiet_NaN()), - discount_prior_beta_(std::numeric_limits::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} - - CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.1, double c = 10.0) : - num_tables_(), - num_customers_(), - discount_(d), - concentration_(c), - discount_prior_alpha_(d_alpha), - discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} - - double discount() const { return discount_; } - double concentration() const { return concentration_; } - - bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); - } - - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); - } - - void clear() { - num_tables_ = 0; - num_customers_ = 0; - dish_locs_.clear(); - } - - unsigned num_tables() const { - return num_tables_; - } - - unsigned num_tables(const Dish& dish) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->second.table_counts_.size(); - } - - unsigned num_customers() const { - return num_customers_; - } - - unsigned num_customers(const Dish& dish) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - if (it == dish_locs_.end()) return 0; - return it->total_dish_count_; - } - - // returns +1 or 0 indicating whether a new table was opened - int increment(const Dish& dish, const double& p0, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - bool share_table = false; - if (loc.total_dish_count_) { - const double p_empty = (concentration_ + num_tables_ * discount_) * p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - share_table = rng->SelectSample(p_empty, p_share); - } - if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= (*ti - discount_); - if (r <= 0.0) { - ++(*ti); - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } else { - loc.table_counts_.push_back(1u); - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? 0 : 1); - } - - // returns -1 or 0, indicating whether a table was closed - int decrement(const Dish& dish, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - assert(loc.total_dish_count_); - if (loc.total_dish_count_ == 1) { - dish_locs_.erase(dish); - --num_tables_; - --num_customers_; - return -1; - } else { - int delta = 0; - // sample customer to remove UNIFORMLY. that is, do NOT use the discount - // here. if you do, it will introduce (unwanted) bias! 
- double r = rng->next() * loc.total_dish_count_; - --loc.total_dish_count_; - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= *ti; - if (r <= 0.0) { - if ((--(*ti)) == 0) { - --num_tables_; - delta = -1; - loc.table_counts_.erase(ti); - } - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - --num_customers_; - return delta; - } - } - - double prob(const Dish& dish, const double& p0) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + concentration_; - if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + concentration_); - } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + concentration_); - } - } - - double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process - // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { - double lp = 0.0; - if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); - assert(lp <= 0.0); - if (num_customers_) { - if (discount > 0.0) { - const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); - assert(std::isfinite(lp)); - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - const DishLocations& cur = it->second; - for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(*ti - discount) - r; - } - } - } else { - assert(!"not implemented yet"); - } - } - assert(std::isfinite(lp)); - return lp; - } - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); - DiscountResampler dr(*this); - ConcentrationResampler cr(*this); - for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - } - if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), - 1.0, 0.0, niterations, 100*niterations); - } - } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - } - - struct DiscountResampler { - 
DiscountResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); - } - }; - - struct ConcentrationResampler { - ConcentrationResampler(const CCRP& crp) : crp_(crp) {} - const CCRP& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); - } - }; - - struct DishLocations { - DishLocations() : total_dish_count_() {} - unsigned total_dish_count_; // customers at all tables with this dish - std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want - // .size() is the number of tables for this dish - }; - - void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; - for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); - it != dish_locs_.end(); ++it) { - (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; - for (typename std::list::const_iterator i = it->second.table_counts_.begin(); - i != it->second.table_counts_.end(); ++i) { - (*out) << " " << *i; - } - (*out) << std::endl; - } - } - - typedef typename std::tr1::unordered_map::const_iterator const_iterator; - const_iterator begin() const { - return dish_locs_.begin(); - } - const_iterator end() const { - return dish_locs_.end(); - } - - unsigned num_tables_; - unsigned num_customers_; - std::tr1::unordered_map dish_locs_; - - double discount_; - double concentration_; - - // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; - double discount_prior_beta_; - - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; -}; - -template -std::ostream& operator<<(std::ostream& o, const CCRP& c) { - c.Print(&o); - return o; -} - -#endif diff --git a/utils/ccrp.h b/utils/ccrp.h new file mode 100644 index 00000000..1a9e3ed5 --- /dev/null +++ b/utils/ccrp.h @@ -0,0 +1,340 @@ +#ifndef _CCRP_H_ +#define _CCRP_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "sampler.h" +#include "slice_sampler.h" + +// Chinese restaurant process (Pitman-Yor parameters) with table tracking. 
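// The predictive probability implemented by prob()/probT() below is the usual
// Pitman-Yor CRP rule: with discount a, concentration theta, N customers and
// T tables in total, and c_d customers at t_d tables serving dish d,
//   p(d) = ( c_d - a * t_d + (theta + a * T) * P0(d) ) / ( N + theta ).
// The new probT/incrementT members are the same computations templated on the
// numeric type, so callers can pass a log-domain probability type (e.g.
// prob_t, as hierolm.cc does) and avoid underflow for tiny base probabilities.
// A minimal standalone sketch of the predictive formula, with illustrative
// names only:

inline double PYPPredictiveProb(double c_d, double t_d, double T, double N,
                                double a, double theta, double p0) {
  return (c_d - a * t_d + (theta + a * T) * p0) / (N + theta);
}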
+ +template > +class CCRP { + public: + CCRP(double disc, double conc) : + num_tables_(), + num_customers_(), + discount_(disc), + concentration_(conc), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), + concentration_prior_shape_(std::numeric_limits::quiet_NaN()), + concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + + CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : + num_tables_(), + num_customers_(), + discount_(d), + concentration_(c), + discount_prior_alpha_(d_alpha), + discount_prior_beta_(d_beta), + concentration_prior_shape_(c_shape), + concentration_prior_rate_(c_rate) {} + + double discount() const { return discount_; } + double concentration() const { return concentration_; } + + bool has_discount_prior() const { + return !std::isnan(discount_prior_alpha_); + } + + bool has_concentration_prior() const { + return !std::isnan(concentration_prior_shape_); + } + + void clear() { + num_tables_ = 0; + num_customers_ = 0; + dish_locs_.clear(); + } + + unsigned num_tables() const { + return num_tables_; + } + + unsigned num_tables(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->second.table_counts_.size(); + } + + unsigned num_customers() const { + return num_customers_; + } + + unsigned num_customers(const Dish& dish) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + if (it == dish_locs_.end()) return 0; + return it->total_dish_count_; + } + + // returns +1 or 0 indicating whether a new table was opened + int increment(const Dish& dish, const double& p0, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + bool share_table = false; + if (loc.total_dish_count_) { + const double p_empty = (concentration_ + num_tables_ * discount_) * p0; + const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= (*ti - discount_); + if (r <= 0.0) { + ++(*ti); + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { + loc.table_counts_.push_back(1u); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? 
0 : 1); + } + + // returns +1 or 0 indicating whether a new table was opened + template + int incrementT(const Dish& dish, const T& p0, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + bool share_table = false; + if (loc.total_dish_count_) { + const T p_empty = T(concentration_ + num_tables_ * discount_) * p0; + const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); + share_table = rng->SelectSample(p_empty, p_share); + } + if (share_table) { + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= (*ti - discount_); + if (r <= 0.0) { + ++(*ti); + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + } else { + loc.table_counts_.push_back(1u); + ++num_tables_; + } + ++loc.total_dish_count_; + ++num_customers_; + return (share_table ? 0 : 1); + } + + // returns -1 or 0, indicating whether a table was closed + int decrement(const Dish& dish, MT19937* rng) { + DishLocations& loc = dish_locs_[dish]; + assert(loc.total_dish_count_); + if (loc.total_dish_count_ == 1) { + dish_locs_.erase(dish); + --num_tables_; + --num_customers_; + return -1; + } else { + int delta = 0; + // sample customer to remove UNIFORMLY. that is, do NOT use the discount + // here. if you do, it will introduce (unwanted) bias! + double r = rng->next() * loc.total_dish_count_; + --loc.total_dish_count_; + for (typename std::list::iterator ti = loc.table_counts_.begin(); + ti != loc.table_counts_.end(); ++ti) { + r -= *ti; + if (r <= 0.0) { + if ((--(*ti)) == 0) { + --num_tables_; + delta = -1; + loc.table_counts_.erase(ti); + } + break; + } + } + if (r > 0.0) { + std::cerr << "Serious error: r=" << r << std::endl; + Print(&std::cerr); + assert(r <= 0.0); + } + --num_customers_; + return delta; + } + } + + double prob(const Dish& dish, const double& p0) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + const double r = num_tables_ * discount_ + concentration_; + if (it == dish_locs_.end()) { + return r * p0 / (num_customers_ + concentration_); + } else { + return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / + (num_customers_ + concentration_); + } + } + + template + T probT(const Dish& dish, const T& p0) const { + const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); + const T r = T(num_tables_ * discount_ + concentration_); + if (it == dish_locs_.end()) { + return r * p0 / T(num_customers_ + concentration_); + } else { + return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / + T(num_customers_ + concentration_); + } + } + + double log_crp_prob() const { + return log_crp_prob(discount_, concentration_); + } + + static double log_beta_density(const double& x, const double& alpha, const double& beta) { + assert(x > 0.0); + assert(x < 1.0); + assert(alpha > 0.0); + assert(beta > 0.0); + const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); + return lp; + } + + static double log_gamma_density(const double& x, const double& shape, const double& rate) { + assert(x >= 0.0); + assert(shape > 0.0); + assert(rate > 0.0); + const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); + return lp; + } + + // taken from 
http://en.wikipedia.org/wiki/Chinese_restaurant_process + // does not include P_0's + double log_crp_prob(const double& discount, const double& concentration) const { + double lp = 0.0; + if (has_discount_prior()) + lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); + if (has_concentration_prior()) + lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + assert(lp <= 0.0); + if (num_customers_) { + if (discount > 0.0) { + const double r = lgamma(1.0 - discount); + lp += lgamma(concentration) - lgamma(concentration + num_customers_) + + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) + - lgamma(concentration / discount); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { + lp += lgamma(*ti - discount) - r; + } + } + } else { + assert(!"not implemented yet"); + } + } + assert(std::isfinite(lp)); + return lp; + } + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + assert(has_discount_prior() || has_concentration_prior()); + DiscountResampler dr(*this); + ConcentrationResampler cr(*this); + for (int iter = 0; iter < nloop; ++iter) { + if (has_concentration_prior()) { + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + if (has_discount_prior()) { + discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + } + concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + + struct DiscountResampler { + DiscountResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_discount) const { + return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + } + }; + + struct ConcentrationResampler { + ConcentrationResampler(const CCRP& crp) : crp_(crp) {} + const CCRP& crp_; + double operator()(const double& proposed_concentration) const { + return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + } + }; + + struct DishLocations { + DishLocations() : total_dish_count_() {} + unsigned total_dish_count_; // customers at all tables with this dish + std::list table_counts_; // list<> gives O(1) deletion and insertion, which we want + // .size() is the number of tables for this dish + }; + + void Print(std::ostream* out) const { + std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; + for (typename std::list::const_iterator i = it->second.table_counts_.begin(); + i != it->second.table_counts_.end(); ++i) { + (*out) << " " << *i; + } + (*out) << std::endl; + } + } + + typedef typename std::tr1::unordered_map::const_iterator const_iterator; + const_iterator begin() const { + return dish_locs_.begin(); + } + const_iterator end() const { + return dish_locs_.end(); + } + + unsigned num_tables_; + unsigned num_customers_; + 
std::tr1::unordered_map dish_locs_; + + double discount_; + double concentration_; + + // optional beta prior on discount_ (NaN if no prior) + double discount_prior_alpha_; + double discount_prior_beta_; + + // optional gamma prior on concentration_ (NaN if no prior) + double concentration_prior_shape_; + double concentration_prior_rate_; +}; + +template +std::ostream& operator<<(std::ostream& o, const CCRP& c) { + c.Print(&o); + return o; +} + +#endif diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h index a868af9a..b63737d1 100644 --- a/utils/ccrp_onetable.h +++ b/utils/ccrp_onetable.h @@ -117,6 +117,18 @@ class CCRP_OneTable { } } + template + T probT(const Dish& dish, const T& p0) const { + const typename DishMapType::const_iterator it = dish_counts_.find(dish); + const T r(num_tables_ * discount_ + concentration_); + if (it == dish_counts_.end()) { + return r * p0 / T(num_customers_ + concentration_); + } else { + return (T(it->second - discount_) + r * p0) / + T(num_customers_ + concentration_); + } + } + double log_crp_prob() const { return log_crp_prob(discount_, concentration_); } diff --git a/utils/sampler.h b/utils/sampler.h index 153e7ef1..22c873d4 100644 --- a/utils/sampler.h +++ b/utils/sampler.h @@ -48,7 +48,7 @@ struct RandomNumberGenerator { template size_t SelectSample(const F& a, const F& b, double T = 1.0) { if (T == 1.0) { - if (this->next() > (a / (a + b))) return 1; else return 0; + if (F(this->next()) > (a / (a + b))) return 1; else return 0; } else { assert(!"not implemented"); } -- cgit v1.2.3 From d87220030b82fed860efee40487502e9ee8f0651 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 27 Feb 2012 02:19:34 +0000 Subject: generic bayesian cfg learner with a bunch of cfg grammar types --- .gitignore | 1 + decoder/trule.cc | 16 +-- gi/pf/Makefile.am | 4 +- gi/pf/hierolm.cc | 309 ----------------------------------------- gi/pf/learn_cfg.cc | 394 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 398 insertions(+), 326 deletions(-) delete mode 100644 gi/pf/hierolm.cc create mode 100644 gi/pf/learn_cfg.cc diff --git a/.gitignore b/.gitignore index 327f7261..28d5a60a 100644 --- a/.gitignore +++ b/.gitignore @@ -57,6 +57,7 @@ training/mpi_extract_reachable klm/lm/build_binary extools/extractor_monolingual gi/pf/.deps +gi/pf/learn_cfg gi/pf/brat gi/pf/cbgi gi/pf/dpnaive diff --git a/decoder/trule.cc b/decoder/trule.cc index 40235542..141b8faa 100644 --- a/decoder/trule.cc +++ b/decoder/trule.cc @@ -232,16 +232,6 @@ void TRule::ComputeArity() { arity_ = 1 - min; } -static string AnonymousStrVar(int i) { - string res("[v]"); - if(!(i <= 0 && i >= -8)) { - cerr << "Can't handle more than 9 non-terminals: index=" << (-i) << endl; - abort(); - } - res[1] = '1' - i; - return res; -} - string TRule::AsString(bool verbose) const { ostringstream os; int idx = 0; @@ -259,15 +249,11 @@ string TRule::AsString(bool verbose) const { } } os << " ||| "; - if (idx > 9) { - cerr << "Too many non-terminals!\n partial: " << os.str() << endl; - exit(1); - } for (int i =0; i -#include -#include - -#include -#include -#include - -#include "inside_outside.h" -#include "hg.h" -#include "bottom_up_parser.h" -#include "fdict.h" -#include "grammar.h" -#include "m.h" -#include "trule.h" -#include "tdict.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp.h" -#include "ccrp_onetable.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -shared_ptr prng; - -void InitCommandLine(int argc, char** argv, 
po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -void ReadCorpus(const string& filename, - vector >* e, - set* vocab_e) { - e->clear(); - vocab_e->clear(); - istream* in; - if (filename == "-") - in = &cin; - else - in = new ifstream(filename.c_str()); - assert(*in); - string line; - while(*in) { - getline(*in, line); - if (line.empty() && !*in) break; - e->push_back(vector()); - vector& le = e->back(); - TD::ConvertSentence(line, &le); - for (unsigned i = 0; i < le.size(); ++i) - vocab_e->insert(le[i]); - } - if (in != &cin) delete in; -} - -struct Grid { - // a b c d e - // 0 - 0 - - - vector grid; -}; - -struct BaseRuleModel { - explicit BaseRuleModel(unsigned term_size, - unsigned nonterm_size = 1) : - unif_term(1.0 / term_size), - unif_nonterm(1.0 / nonterm_size) {} - prob_t operator()(const TRule& r) const { - prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); - const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); - const prob_t nonterm_prob(1.0 - term_prob.as_float()); - for (unsigned i = 0; i < r.f_.size(); ++i) { - if (r.f_[i] <= 0) { // nonterminal - p *= nonterm_prob; - p *= unif_nonterm; - } else { // terminal - p *= term_prob; - p *= unif_term; - } - } - return p; - } - const prob_t unif_term, unif_nonterm; -}; - -struct HieroLMModel { - explicit HieroLMModel(unsigned vocab_size) : p0(vocab_size), x(1,1,1,1) {} - - prob_t Prob(const TRule& r) const { - return x.probT(r, p0(r)); - } - - int Increment(const TRule& r, MT19937* rng) { - return x.incrementT(r, p0(r), rng); - // return x.increment(r); - } - - int Decrement(const TRule& r, MT19937* rng) { - return x.decrement(r, rng); - //return x.decrement(r); - } - - prob_t Likelihood() const { - prob_t p; - p.logeq(x.log_crp_prob()); - for (CCRP::const_iterator it = x.begin(); it != x.end(); ++it) { - prob_t tp = p0(it->first); - tp.poweq(it->second.table_counts_.size()); - p *= tp; - } - //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) - // p *= p0(it->first); - return p; - } - - void ResampleHyperparameters(MT19937* rng) { - x.resample_hyperparameters(rng); - cerr << " d=" << x.discount() << ", alpha=" << x.concentration() << endl; - } - - const BaseRuleModel p0; - CCRP x; - //CCRP_OneTable x; -}; - -vector tofreelist; - -HieroLMModel* plm; - -struct NPGrammarIter : public GrammarIter, public RuleBin { - NPGrammarIter() : arity() { tofreelist.push_back(this); } - NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a + (symbol < 0 ? 
1 : 0)) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - static const int kLHS = -TD::Convert("X"); - r.reset(new TRule); - r->lhs_ = kLHS; - } - TRule& rr = *r; - rr.f_.push_back(symbol); - rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); - tofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (r) return 1; else return 0; - } - virtual TRulePtr GetIthRule(int) const { - return r; - } - virtual int Arity() const { - return arity; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - return new NPGrammarIter(r, arity, symbol); - } - const unsigned char arity; - TRulePtr r; -}; - -struct NPGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new NPGrammarIter; - } -}; - -void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv, HieroLMModel* plm) { - HieroLMModel& lm = *plm; - vector node_probs; - const prob_t total_prob = Inside(hg, &node_probs); - queue q; - q.push(hg.nodes_.size() - 3); - while(!q.empty()) { - unsigned cur_node_id = q.front(); -// cerr << "NODE=" << cur_node_id << endl; - q.pop(); - const Hypergraph::Node& node = hg.nodes_[cur_node_id]; - const unsigned num_in_edges = node.in_edges_.size(); - unsigned sampled_edge = 0; - if (num_in_edges == 1) { - sampled_edge = node.in_edges_[0]; - } else { - //prob_t z; - assert(num_in_edges > 1); - SampleSet ss; - for (unsigned j = 0; j < num_in_edges; ++j) { - const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; - prob_t p = edge.edge_prob_; - for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) - p *= node_probs[edge.tail_nodes_[k]]; - ss.add(p); -// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; - //z += p; - } -// for (unsigned j = 0; j < num_in_edges; ++j) { -// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; -// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; -// } -// cerr << " --- \n"; - sampled_edge = node.in_edges_[rng->SelectSample(ss)]; - } - sampled_deriv->push_back(sampled_edge); - const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; - for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { - q.push(edge.tail_nodes_[j]); - } - } - for (unsigned i = 0; i < sampled_deriv->size(); ++i) { - cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; - } -} - -void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Increment(*hg.edges_[d[i]].rule_, rng); -} - -void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { - for (unsigned i = 0; i < d.size(); ++i) - plm->Decrement(*hg.edges_[d[i]].rule_, rng); -} - -int main(int argc, char** argv) { - po::variables_map conf; - vector grammars; - grammars.push_back(GrammarPtr(new NPGrammar)); - - InitCommandLine(argc, argv, &conf); - const unsigned samples = conf["samples"].as(); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); - MT19937& rng = *prng; - - vector > corpuse; - set vocabe; - cerr << "Reading corpus...\n"; - ReadCorpus(conf["input"].as(), &corpuse, &vocabe); - cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; - HieroLMModel lm(vocabe.size()); - - plm = &lm; - ExhaustiveBottomUpParser parser("X", grammars); - - Hypergraph hg; - const int kX = -TD::Convert("X"); - const int 
kLP = FD::Convert("LogProb"); - SparseVector v; v.set_value(kLP, 1.0); - vector > derivs(corpuse.size()); - for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < corpuse.size(); ++ci) { - vector& src = corpuse[ci]; - Lattice lat(src.size()); - for (unsigned i = 0; i < src.size(); ++i) - lat[i].push_back(LatticeArc(src[i], 0.0, 1)); - cerr << TD::GetString(src) << endl; - hg.clear(); - parser.Parse(lat, &hg); // exhaustive parse - DecrementDerivation(hg, derivs[ci], &lm, &rng); - for (unsigned i = 0; i < hg.edges_.size(); ++i) { - TRule& r = *hg.edges_[i].rule_; - if (r.lhs_ == kX) - hg.edges_[i].edge_prob_ = lm.Prob(r); - } - vector d; - SampleDerivation(hg, &rng, &d, &lm); - derivs[ci] = d; - IncrementDerivation(hg, derivs[ci], &lm, &rng); - if (tofreelist.size() > 100000) { - cerr << "Freeing ... "; - for (unsigned i = 0; i < tofreelist.size(); ++i) - delete tofreelist[i]; - tofreelist.clear(); - cerr << "Freed.\n"; - } - } - cerr << "LLH=" << lm.Likelihood() << endl; - } - return 0; -} - diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc new file mode 100644 index 00000000..3d202816 --- /dev/null +++ b/gi/pf/learn_cfg.cc @@ -0,0 +1,394 @@ +#include +#include +#include + +#include +#include +#include + +#include "inside_outside.h" +#include "hg.h" +#include "bottom_up_parser.h" +#include "fdict.h" +#include "grammar.h" +#include "m.h" +#include "trule.h" +#include "tdict.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; +vector nt_vocab; +vector nt_id_to_index; +static unsigned kMAX_RULE_SIZE = 0; +static unsigned kMAX_ARITY = 0; +static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("max_rule_size,m", po::value()->default_value(0), "Maximum rule size (0 for unlimited)") + ("max_arity,a", po::value()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") + ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") + ("nonterminals,n", po::value()->default_value(1), "Size of nonterminal vocabulary") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +unsigned ReadCorpus(const string& filename, + vector >* e, + set* vocab_e) { + e->clear(); + vocab_e->clear(); + istream* in; + if (filename == "-") + in = &cin; + else + in = new ifstream(filename.c_str()); + assert(*in); + string line; + unsigned toks = 0; + while(*in) { + getline(*in, line); + if (line.empty() && !*in) break; + e->push_back(vector()); + vector& le = e->back(); + 
TD::ConvertSentence(line, &le); + for (unsigned i = 0; i < le.size(); ++i) + vocab_e->insert(le[i]); + toks += le.size(); + } + if (in != &cin) delete in; + return toks; +} + +struct Grid { + // a b c d e + // 0 - 0 - - + vector grid; +}; + +struct BaseRuleModel { + explicit BaseRuleModel(unsigned term_size, + unsigned nonterm_size = 1) : + unif_term(1.0 / term_size), + unif_nonterm(1.0 / nonterm_size) {} + prob_t operator()(const TRule& r) const { + prob_t p; p.logeq(Md::log_poisson(1.0, r.f_.size())); + const prob_t term_prob((2.0 + 0.01*r.f_.size()) / (r.f_.size() + 2)); + const prob_t nonterm_prob(1.0 - term_prob.as_float()); + for (unsigned i = 0; i < r.f_.size(); ++i) { + if (r.f_[i] <= 0) { // nonterminal + p *= nonterm_prob; + p *= unif_nonterm; + } else { // terminal + p *= term_prob; + p *= unif_term; + } + } + return p; + } + const prob_t unif_term, unif_nonterm; +}; + +struct HieroLMModel { + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP(1,1,1,1)) {} + + prob_t Prob(const TRule& r) const { + return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); + } + + int Increment(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + // return x.increment(r); + } + + int Decrement(const TRule& r, MT19937* rng) { + return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + //return x.decrement(r); + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (unsigned i = 0; i < nts.size(); ++i) { + prob_t q; q.logeq(nts[i].log_crp_prob()); + p *= q; + for (CCRP::const_iterator it = nts[i].begin(); it != nts[i].end(); ++it) { + prob_t tp = p0(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } + //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) + // p *= p0(it->first); + return p; + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < nts.size(); ++i) + nts[i].resample_hyperparameters(rng); + cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; + } + + const BaseRuleModel p0; + vector > nts; + //CCRP_OneTable x; +}; + +vector tofreelist; + +HieroLMModel* plm; + +struct NPGrammarIter : public GrammarIter, public RuleBin { + NPGrammarIter() : arity() { tofreelist.push_back(this); } + NPGrammarIter(const TRulePtr& inr, const int a, int symbol) : arity(a) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = nt_vocab[0]; + rr.f_.push_back(symbol); + rr.e_.push_back(symbol < 0 ? (1-int(arity)) : symbol); + tofreelist.push_back(this); + } + inline static unsigned NextArity(int cur_a, int symbol) { + return cur_a + (symbol <= 0 ? 
1 : 0); + } + virtual int GetNumRules() const { + if (r) return nt_vocab.size(); else return 0; + } + virtual TRulePtr GetIthRule(int i) const { + if (i == 0) return r; + TRulePtr nr(new TRule(*r)); + nr->lhs_ = nt_vocab[i]; + return nr; + } + virtual int Arity() const { + return arity; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + const int next_arity = NextArity(arity, symbol); + if (kMAX_ARITY && next_arity > kMAX_ARITY) + return NULL; + if (!kALLOW_MIXED && r) { + bool t1 = r->f_.front() <= 0; + bool t2 = symbol <= 0; + if (t1 != t2) return NULL; + } + if (!kMAX_RULE_SIZE || !r || (r->f_.size() < kMAX_RULE_SIZE)) + return new NPGrammarIter(r, next_arity, symbol); + else + return NULL; + } + const unsigned char arity; + TRulePtr r; +}; + +struct NPGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new NPGrammarIter; + } +}; + +prob_t TotalProb(const Hypergraph& hg) { + return Inside(hg); +} + +void SampleDerivation(const Hypergraph& hg, MT19937* rng, vector* sampled_deriv) { + vector node_probs; + Inside(hg, &node_probs); + queue q; + q.push(hg.nodes_.size() - 2); + while(!q.empty()) { + unsigned cur_node_id = q.front(); +// cerr << "NODE=" << cur_node_id << endl; + q.pop(); + const Hypergraph::Node& node = hg.nodes_[cur_node_id]; + const unsigned num_in_edges = node.in_edges_.size(); + unsigned sampled_edge = 0; + if (num_in_edges == 1) { + sampled_edge = node.in_edges_[0]; + } else { + //prob_t z; + assert(num_in_edges > 1); + SampleSet ss; + for (unsigned j = 0; j < num_in_edges; ++j) { + const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; + prob_t p = edge.edge_prob_; + for (unsigned k = 0; k < edge.tail_nodes_.size(); ++k) + p *= node_probs[edge.tail_nodes_[k]]; + ss.add(p); +// cerr << log(ss[j]) << " ||| " << edge.rule_->AsString() << endl; + //z += p; + } +// for (unsigned j = 0; j < num_in_edges; ++j) { +// const Hypergraph::Edge& edge = hg.edges_[node.in_edges_[j]]; +// cerr << exp(log(ss[j] / z)) << " ||| " << edge.rule_->AsString() << endl; +// } +// cerr << " --- \n"; + sampled_edge = node.in_edges_[rng->SelectSample(ss)]; + } + sampled_deriv->push_back(sampled_edge); + const Hypergraph::Edge& edge = hg.edges_[sampled_edge]; + for (unsigned j = 0; j < edge.tail_nodes_.size(); ++j) { + q.push(edge.tail_nodes_[j]); + } + } + for (unsigned i = 0; i < sampled_deriv->size(); ++i) { + cerr << *hg.edges_[(*sampled_deriv)[i]].rule_ << endl; + } +} + +void IncrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Increment(*hg.edges_[d[i]].rule_, rng); +} + +void DecrementDerivation(const Hypergraph& hg, const vector& d, HieroLMModel* plm, MT19937* rng) { + for (unsigned i = 0; i < d.size(); ++i) + plm->Decrement(*hg.edges_[d[i]].rule_, rng); +} + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + nt_vocab.resize(conf["nonterminals"].as()); + assert(nt_vocab.size() > 0); + assert(nt_vocab.size() < 26); + { + string nt = "X"; + for (unsigned i = 0; i < nt_vocab.size(); ++i) { + if (nt_vocab.size() > 1) nt[0] = ('A' + i); + int pid = TD::Convert(nt); + nt_vocab[i] = -pid; + if (pid >= nt_id_to_index.size()) { + nt_id_to_index.resize(pid + 1, -1); + } + nt_id_to_index[pid] = i; + } + } + vector grammars; + grammars.push_back(GrammarPtr(new NPGrammar)); + + const unsigned samples = conf["samples"].as(); + 
kMAX_RULE_SIZE = conf["max_rule_size"].as(); + if (kMAX_RULE_SIZE == 1) { + cerr << "Invalid maximum rule size: must be 0 or >1\n"; + return 1; + } + kMAX_ARITY = conf["max_arity"].as(); + if (kMAX_ARITY == 1) { + cerr << "Invalid maximum arity: must be 0 or >1\n"; + return 1; + } + kALLOW_MIXED = !conf.count("no_mixed_rules"); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + cerr << "Reading corpus...\n"; + const unsigned toks = ReadCorpus(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; + HieroLMModel lm(vocabe.size(), nt_vocab.size()); + + plm = &lm; + ExhaustiveBottomUpParser parser(TD::Convert(-nt_vocab[0]), grammars); + + Hypergraph hg; + const int kGoal = -TD::Convert("Goal"); + const int kLP = FD::Convert("LogProb"); + SparseVector v; v.set_value(kLP, 1.0); + vector > derivs(corpuse.size()); + vector cl(corpuse.size()); + for (int ci = 0; ci < corpuse.size(); ++ci) { + vector& src = corpuse[ci]; + Lattice& lat = cl[ci]; + lat.resize(src.size()); + for (unsigned i = 0; i < src.size(); ++i) + lat[i].push_back(LatticeArc(src[i], 0.0, 1)); + } + for (int SS=0; SS < samples; ++SS) { + const bool is_last = ((samples - 1) == SS); + prob_t dlh = prob_t::One(); + for (int ci = 0; ci < corpuse.size(); ++ci) { + const vector& src = corpuse[ci]; + const Lattice& lat = cl[ci]; + cerr << TD::GetString(src) << endl; + hg.clear(); + parser.Parse(lat, &hg); // exhaustive parse + vector& d = derivs[ci]; + if (!is_last) DecrementDerivation(hg, d, &lm, &rng); + for (unsigned i = 0; i < hg.edges_.size(); ++i) { + TRule& r = *hg.edges_[i].rule_; + if (r.lhs_ == kGoal) + hg.edges_[i].edge_prob_ = prob_t::One(); + else + hg.edges_[i].edge_prob_ = lm.Prob(r); + } + if (!is_last) { + d.clear(); + SampleDerivation(hg, &rng, &d); + IncrementDerivation(hg, derivs[ci], &lm, &rng); + } else { + prob_t p = TotalProb(hg); + dlh *= p; + cerr << " p(sentence) = " << log(p) << "\t" << log(dlh) << endl; + } + if (tofreelist.size() > 200000) { + cerr << "Freeing ... 
"; + for (unsigned i = 0; i < tofreelist.size(); ++i) + delete tofreelist[i]; + tofreelist.clear(); + cerr << "Freed.\n"; + } + } + double llh = log(lm.Likelihood()); + cerr << "LLH=" << llh << "\tENTROPY=" << (-llh / log(2) / toks) << "\tPPL=" << pow(2, -llh / log(2) / toks) << endl; + if (SS % 10 == 9) lm.ResampleHyperparameters(&rng); + if (is_last) { + double z = log(dlh); + cerr << "TOTAL_PROB=" << z << "\tENTROPY=" << (-z / log(2) / toks) << "\tPPL=" << pow(2, -z / log(2) / toks) << endl; + } + } + for (unsigned i = 0; i < nt_vocab.size(); ++i) + cerr << lm.nts[i] << endl; + return 0; +} + -- cgit v1.2.3 From d4c89a181635b8ca1286bcc8887dfb368000cc6f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 27 Feb 2012 02:40:00 +0000 Subject: fix base distribution, partially --- gi/pf/learn_cfg.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index 3d202816..6e574035 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -106,10 +106,10 @@ struct BaseRuleModel { const prob_t nonterm_prob(1.0 - term_prob.as_float()); for (unsigned i = 0; i < r.f_.size(); ++i) { if (r.f_[i] <= 0) { // nonterminal - p *= nonterm_prob; + if (kALLOW_MIXED) p *= nonterm_prob; p *= unif_nonterm; } else { // terminal - p *= term_prob; + if (kALLOW_MIXED) p *= term_prob; p *= unif_term; } } -- cgit v1.2.3 From 1f0ded1e7f59b13d7512111dd910d0f4b2f82d02 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 28 Feb 2012 00:47:20 -0500 Subject: optional hierarchical prior --- gi/pf/learn_cfg.cc | 46 ++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 6 deletions(-) diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index 6e574035..b2ca029a 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -30,6 +30,7 @@ vector nt_id_to_index; static unsigned kMAX_RULE_SIZE = 0; static unsigned kMAX_ARITY = 0; static bool kALLOW_MIXED = true; // allow rules with mixed terminals and NTs +static bool kHIERARCHICAL_PRIOR = false; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); @@ -40,11 +41,12 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("max_arity,a", po::value()->default_value(0), "Maximum number of nonterminals in a rule (0 for unlimited)") ("no_mixed_rules,M", "Do not mix terminals and nonterminals in a rule RHS") ("nonterminals,n", po::value()->default_value(1), "Size of nonterminal vocabulary") + ("hierarchical_prior,h", "Use hierarchical prior") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); + ("help", "Print this help message and exit"); po::options_description dconfig_options, dcmdline_options; dconfig_options.add(opts); dcmdline_options.add(opts).add(clo); @@ -119,19 +121,35 @@ struct BaseRuleModel { }; struct HieroLMModel { - explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : p0(vocab_size, num_nts), nts(num_nts, CCRP(1,1,1,1)) {} + explicit HieroLMModel(unsigned vocab_size, unsigned num_nts = 1) : + base(vocab_size, num_nts), + q0(1,1,1,1), + nts(num_nts, CCRP(1,1,1,1)) {} prob_t Prob(const TRule& r) const { return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); } + inline prob_t p0(const TRule& r) const { + if (kHIERARCHICAL_PRIOR) + return q0.probT(r, base(r)); + else + return base(r); + } + int Increment(const TRule& r, MT19937* rng) { - 
return nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + const int delta = nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.incrementT(r, base(r), rng); + return delta; // return x.increment(r); } int Decrement(const TRule& r, MT19937* rng) { - return nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + const int delta = nts[nt_id_to_index[-r.lhs_]].decrement(r, rng); + if (kHIERARCHICAL_PRIOR && delta) + q0.decrement(r, rng); + return delta; //return x.decrement(r); } @@ -146,18 +164,32 @@ struct HieroLMModel { p *= tp; } } + if (kHIERARCHICAL_PRIOR) { + prob_t q; q.logeq(q0.log_crp_prob()); + p *= q; + for (CCRP::const_iterator it = q0.begin(); it != q0.end(); ++it) { + prob_t tp = base(it->first); + tp.poweq(it->second.table_counts_.size()); + p *= tp; + } + } //for (CCRP_OneTable::const_iterator it = x.begin(); it != x.end(); ++it) - // p *= p0(it->first); + // p *= base(it->first); return p; } void ResampleHyperparameters(MT19937* rng) { for (unsigned i = 0; i < nts.size(); ++i) nts[i].resample_hyperparameters(rng); + if (kHIERARCHICAL_PRIOR) { + q0.resample_hyperparameters(rng); + cerr << "[base d=" << q0.discount() << ", alpha=" << q0.discount() << "]"; + } cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; } - const BaseRuleModel p0; + const BaseRuleModel base; + CCRP q0; vector > nts; //CCRP_OneTable x; }; @@ -316,6 +348,8 @@ int main(int argc, char** argv) { } kALLOW_MIXED = !conf.count("no_mixed_rules"); + kHIERARCHICAL_PRIOR = conf.count("hierarchical_prior"); + if (conf.count("random_seed")) prng.reset(new MT19937(conf["random_seed"].as())); else -- cgit v1.2.3 From 89238977fc9d8f8d9a6421b0d4f35afc200f08e7 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Tue, 28 Feb 2012 17:23:55 -0500 Subject: Subject: where's my kenlm update?? 
From: Chris Dyer --- klm/lm/bhiksha.cc | 7 +- klm/lm/bhiksha.hh | 2 +- klm/lm/binary_format.cc | 139 ++++++++++++---------- klm/lm/binary_format.hh | 14 +-- klm/lm/blank.hh | 2 +- klm/lm/build_binary.cc | 35 ++++-- klm/lm/config.cc | 1 + klm/lm/config.hh | 8 ++ klm/lm/left_test.cc | 11 +- klm/lm/model.cc | 17 ++- klm/lm/model.hh | 11 +- klm/lm/model_test.cc | 24 +++- klm/lm/ngram_query.cc | 145 ++++++----------------- klm/lm/ngram_query.hh | 103 +++++++++++++++++ klm/lm/quantize.cc | 38 +++--- klm/lm/quantize.hh | 2 +- klm/lm/read_arpa.cc | 2 +- klm/lm/return.hh | 2 +- klm/lm/search_hashed.cc | 22 ++-- klm/lm/search_hashed.hh | 63 +++++++--- klm/lm/search_trie.cc | 88 +++++--------- klm/lm/search_trie.hh | 10 +- klm/lm/trie.hh | 2 +- klm/lm/trie_sort.cc | 217 ++++++++++++++++++++--------------- klm/lm/trie_sort.hh | 55 ++++++--- klm/lm/vocab.cc | 53 +++++---- klm/lm/vocab.hh | 36 ++++-- klm/util/bit_packing.hh | 66 ++++++++--- klm/util/exception.cc | 2 +- klm/util/file.cc | 222 +++++++++++++++++++++++++++++++++--- klm/util/file.hh | 42 ++++++- klm/util/file_piece.cc | 62 +++++----- klm/util/file_piece.hh | 31 +++-- klm/util/file_piece_test.cc | 41 +++++-- klm/util/getopt.c | 78 +++++++++++++ klm/util/getopt.hh | 33 ++++++ klm/util/key_value_packing.hh | 126 -------------------- klm/util/key_value_packing_test.cc | 75 ------------ klm/util/mmap.cc | 123 ++++++++++++++++---- klm/util/mmap.hh | 14 ++- klm/util/murmur_hash.cc | 39 ++++++- klm/util/murmur_hash.hh | 2 +- klm/util/probing_hash_table.hh | 33 +++--- klm/util/probing_hash_table_test.cc | 27 ++++- klm/util/sized_iterator.hh | 2 +- klm/util/sorted_uniform.hh | 95 +-------------- klm/util/sorted_uniform_test.cc | 85 ++++++++------ klm/util/tokenize_piece.hh | 64 +++-------- klm/util/tokenize_piece_test.cc | 50 +------- 49 files changed, 1387 insertions(+), 1034 deletions(-) create mode 100644 klm/lm/ngram_query.hh create mode 100644 klm/util/getopt.c create mode 100644 klm/util/getopt.hh delete mode 100644 klm/util/key_value_packing.hh delete mode 100644 klm/util/key_value_packing_test.cc diff --git a/klm/lm/bhiksha.cc b/klm/lm/bhiksha.cc index bf86fd4b..cdeafb47 100644 --- a/klm/lm/bhiksha.cc +++ b/klm/lm/bhiksha.cc @@ -1,5 +1,6 @@ #include "lm/bhiksha.hh" #include "lm/config.hh" +#include "util/file.hh" #include @@ -12,12 +13,12 @@ DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_ const uint8_t kArrayBhikshaVersion = 0; +// TODO: put this in binary file header instead when I change the binary file format again. 
void ArrayBhiksha::UpdateConfigFromBinary(int fd, Config &config) { uint8_t version; uint8_t configured_bits; - if (read(fd, &version, 1) != 1 || read(fd, &configured_bits, 1) != 1) { - UTIL_THROW(util::ErrnoException, "Could not read from binary file"); - } + util::ReadOrThrow(fd, &version, 1); + util::ReadOrThrow(fd, &configured_bits, 1); if (version != kArrayBhikshaVersion) UTIL_THROW(FormatLoadException, "This file has sorted array compression version " << (unsigned) version << " but the code expects version " << (unsigned)kArrayBhikshaVersion); config.pointer_bhiksha_bits = configured_bits; } diff --git a/klm/lm/bhiksha.hh b/klm/lm/bhiksha.hh index 3df43dda..5182ee2e 100644 --- a/klm/lm/bhiksha.hh +++ b/klm/lm/bhiksha.hh @@ -13,7 +13,7 @@ #ifndef LM_BHIKSHA__ #define LM_BHIKSHA__ -#include +#include #include #include "lm/model_type.hh" diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc index 27cada13..4796f6d1 100644 --- a/klm/lm/binary_format.cc +++ b/klm/lm/binary_format.cc @@ -1,19 +1,15 @@ #include "lm/binary_format.hh" #include "lm/lm_exception.hh" +#include "util/file.hh" #include "util/file_piece.hh" +#include +#include #include #include -#include -#include -#include -#include -#include -#include -#include -#include +#include namespace lm { namespace ngram { @@ -24,14 +20,16 @@ const char kMagicBytes[] = "mmap lm http://kheafield.com/code format version 5\n const char kMagicIncomplete[] = "mmap lm http://kheafield.com/code incomplete\n"; const long int kMagicVersion = 5; -// Test values. -struct Sanity { +// Old binary files built on 32-bit machines have this header. +// TODO: eliminate with next binary release. +struct OldSanity { char magic[sizeof(kMagicBytes)]; float zero_f, one_f, minus_half_f; WordIndex one_word_index, max_word_index; uint64_t one_uint64; void SetToReference() { + std::memset(this, 0, sizeof(OldSanity)); std::memcpy(magic, kMagicBytes, sizeof(magic)); zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; one_word_index = 1; @@ -40,27 +38,35 @@ struct Sanity { } }; -const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; -std::size_t TotalHeaderSize(unsigned char order) { - return Align8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); -} +// Test values aligned to 8 bytes. 
+struct Sanity { + char magic[ALIGN8(sizeof(kMagicBytes))]; + float zero_f, one_f, minus_half_f; + WordIndex one_word_index, max_word_index, padding_to_8; + uint64_t one_uint64; -void ReadLoop(int fd, void *to_void, std::size_t size) { - uint8_t *to = static_cast(to_void); - while (size) { - ssize_t ret = read(fd, to, size); - if (ret == -1) UTIL_THROW(util::ErrnoException, "Failed to read from binary file"); - if (ret == 0) UTIL_THROW(util::ErrnoException, "Binary file too short"); - to += ret; - size -= ret; + void SetToReference() { + std::memset(this, 0, sizeof(Sanity)); + std::memcpy(magic, kMagicBytes, sizeof(kMagicBytes)); + zero_f = 0.0; one_f = 1.0; minus_half_f = -0.5; + one_word_index = 1; + max_word_index = std::numeric_limits::max(); + padding_to_8 = 0; + one_uint64 = 1; } +}; + +const char *kModelNames[6] = {"hashed n-grams with probing", "hashed n-grams with sorted uniform find", "trie", "trie with quantization", "trie with array-compressed pointers", "trie with quantization and array-compressed pointers"}; + +std::size_t TotalHeaderSize(unsigned char order) { + return ALIGN8(sizeof(Sanity) + sizeof(FixedWidthParameters) + sizeof(uint64_t) * order); } void WriteHeader(void *to, const Parameters ¶ms) { Sanity header = Sanity(); header.SetToReference(); - memcpy(to, &header, sizeof(Sanity)); + std::memcpy(to, &header, sizeof(Sanity)); char *out = reinterpret_cast(to) + sizeof(Sanity); *reinterpret_cast(out) = params.fixed; @@ -74,14 +80,6 @@ void WriteHeader(void *to, const Parameters ¶ms) { } // namespace -void SeekOrThrow(int fd, off_t off) { - if ((off_t)-1 == lseek(fd, off, SEEK_SET)) UTIL_THROW(util::ErrnoException, "Seek failed"); -} - -void AdvanceOrThrow(int fd, off_t off) { - if ((off_t)-1 == lseek(fd, off, SEEK_CUR)) UTIL_THROW(util::ErrnoException, "Seek failed"); -} - uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing) { if (config.write_mmap) { std::size_t total = TotalHeaderSize(order) + memory_size; @@ -89,7 +87,7 @@ uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_ strncpy(reinterpret_cast(backing.vocab.get()), kMagicIncomplete, TotalHeaderSize(order)); return reinterpret_cast(backing.vocab.get()) + TotalHeaderSize(order); } else { - backing.vocab.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED); + util::MapAnonymous(memory_size, backing.vocab); return reinterpret_cast(backing.vocab.get()); } } @@ -98,42 +96,58 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t std::size_t adjusted_vocab = backing.vocab.size() + vocab_pad; if (config.write_mmap) { // Grow the file to accomodate the search, using zeros. - if (-1 == ftruncate(backing.file.get(), adjusted_vocab + memory_size)) - UTIL_THROW(util::ErrnoException, "ftruncate on " << config.write_mmap << " to " << (adjusted_vocab + memory_size) << " failed"); + try { + util::ResizeOrThrow(backing.file.get(), adjusted_vocab + memory_size); + } catch (util::ErrnoException &e) { + e << " for file " << config.write_mmap; + throw e; + } + if (config.write_method == Config::WRITE_AFTER) { + util::MapAnonymous(memory_size, backing.search); + return reinterpret_cast(backing.search.get()); + } + // mmap it now. // We're skipping over the header and vocab for the search space mmap. mmap likes page aligned offsets, so some arithmetic to round the offset down. 
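The page-rounding arithmetic described in the comment above is easy to get backwards, so here is a standalone sketch of it with made-up sizes (illustrative only, not part of the patch): the desired offset is rounded down to a page boundary for mmap, and the slack is added back to the pointer handed to the caller.

// Illustrative sketch of the page-alignment arithmetic in GrowForSearch.
#include <cstddef>
#include <iostream>

int main() {
  const std::size_t page_size = 4096;           // stand-in for util::SizePage()
  const std::size_t adjusted_vocab = 10000;     // desired file offset (not page aligned)
  const std::size_t alignment_cruft = adjusted_vocab % page_size;   // 1808
  const std::size_t map_offset = adjusted_vocab - alignment_cruft;  // 8192, page aligned
  // mmap alignment_cruft + memory_size bytes starting at map_offset, then skip
  // the cruft so the caller sees memory that begins exactly at adjusted_vocab:
  //   uint8_t *search_base = static_cast<uint8_t*>(mapped) + alignment_cruft;
  std::cout << "map at " << map_offset << ", skip " << alignment_cruft << " bytes\n";
  return 0;
}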
- off_t page_size = sysconf(_SC_PAGE_SIZE); - off_t alignment_cruft = adjusted_vocab % page_size; + std::size_t page_size = util::SizePage(); + std::size_t alignment_cruft = adjusted_vocab % page_size; backing.search.reset(util::MapOrThrow(alignment_cruft + memory_size, true, util::kFileFlags, false, backing.file.get(), adjusted_vocab - alignment_cruft), alignment_cruft + memory_size, util::scoped_memory::MMAP_ALLOCATED); - return reinterpret_cast(backing.search.get()) + alignment_cruft; } else { - backing.search.reset(util::MapAnonymous(memory_size), memory_size, util::scoped_memory::MMAP_ALLOCATED); + util::MapAnonymous(memory_size, backing.search); return reinterpret_cast(backing.search.get()); } } -void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, Backing &backing) { - if (config.write_mmap) { - if (msync(backing.search.get(), backing.search.size(), MS_SYNC) || msync(backing.vocab.get(), backing.vocab.size(), MS_SYNC)) - UTIL_THROW(util::ErrnoException, "msync failed for " << config.write_mmap); - // header and vocab share the same mmap. The header is written here because we know the counts. - Parameters params; - params.counts = counts; - params.fixed.order = counts.size(); - params.fixed.probing_multiplier = config.probing_multiplier; - params.fixed.model_type = model_type; - params.fixed.has_vocabulary = config.include_vocab; - params.fixed.search_version = search_version; - WriteHeader(backing.vocab.get(), params); +void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, std::size_t vocab_pad, Backing &backing) { + if (!config.write_mmap) return; + util::SyncOrThrow(backing.vocab.get(), backing.vocab.size()); + switch (config.write_method) { + case Config::WRITE_MMAP: + util::SyncOrThrow(backing.search.get(), backing.search.size()); + break; + case Config::WRITE_AFTER: + util::SeekOrThrow(backing.file.get(), backing.vocab.size() + vocab_pad); + util::WriteOrThrow(backing.file.get(), backing.search.get(), backing.search.size()); + util::FSyncOrThrow(backing.file.get()); + break; } + // header and vocab share the same mmap. The header is written here because we know the counts. + Parameters params = Parameters(); + params.counts = counts; + params.fixed.order = counts.size(); + params.fixed.probing_multiplier = config.probing_multiplier; + params.fixed.model_type = model_type; + params.fixed.has_vocabulary = config.include_vocab; + params.fixed.search_version = search_version; + WriteHeader(backing.vocab.get(), params); } namespace detail { bool IsBinaryFormat(int fd) { - const off_t size = util::SizeFile(fd); - if (size == util::kBadSize || (size <= static_cast(sizeof(Sanity)))) return false; + const uint64_t size = util::SizeFile(fd); + if (size == util::kBadSize || (size <= static_cast(sizeof(Sanity)))) return false; // Try reading the header. util::scoped_memory memory; try { @@ -154,19 +168,23 @@ bool IsBinaryFormat(int fd) { if ((end_ptr != begin_version) && version != kMagicVersion) { UTIL_THROW(FormatLoadException, "Binary file has version " << version << " but this implementation expects version " << kMagicVersion << " so you'll have to use the ARPA to rebuild your binary"); } + + OldSanity old_sanity = OldSanity(); + old_sanity.SetToReference(); + UTIL_THROW_IF(!memcmp(memory.get(), &old_sanity, sizeof(OldSanity)), FormatLoadException, "Looks like this is an old 32-bit format. 
The old 32-bit format has been removed so that 64-bit and 32-bit files are exchangeable."); UTIL_THROW(FormatLoadException, "File looks like it should be loaded with mmap, but the test values don't match. Try rebuilding the binary format LM using the same code revision, compiler, and architecture"); } return false; } void ReadHeader(int fd, Parameters &out) { - SeekOrThrow(fd, sizeof(Sanity)); - ReadLoop(fd, &out.fixed, sizeof(out.fixed)); + util::SeekOrThrow(fd, sizeof(Sanity)); + util::ReadOrThrow(fd, &out.fixed, sizeof(out.fixed)); if (out.fixed.probing_multiplier < 1.0) UTIL_THROW(FormatLoadException, "Binary format claims to have a probing multiplier of " << out.fixed.probing_multiplier << " which is < 1.0."); out.counts.resize(static_cast(out.fixed.order)); - ReadLoop(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); + if (out.fixed.order) util::ReadOrThrow(fd, &*out.counts.begin(), sizeof(uint64_t) * out.fixed.order); } void MatchCheck(ModelType model_type, unsigned int search_version, const Parameters ¶ms) { @@ -179,11 +197,11 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet } void SeekPastHeader(int fd, const Parameters ¶ms) { - SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); + util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); } uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing) { - const off_t file_size = util::SizeFile(backing.file.get()); + const uint64_t file_size = util::SizeFile(backing.file.get()); // The header is smaller than a page, so we have to map the whole header as well. std::size_t total_map = TotalHeaderSize(params.counts.size()) + memory_size; if (file_size != util::kBadSize && static_cast(file_size) < total_map) @@ -194,9 +212,8 @@ uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t if (config.enumerate_vocab && !params.fixed.has_vocabulary) UTIL_THROW(FormatLoadException, "The decoder requested all the vocabulary strings, but this binary file does not have them. You may need to rebuild the binary file with an updated version of build_binary."); - if (config.enumerate_vocab) { - SeekOrThrow(backing.file.get(), total_map); - } + // Seek to vocabulary words + util::SeekOrThrow(backing.file.get(), total_map); return reinterpret_cast(backing.search.get()) + TotalHeaderSize(params.counts.size()); } diff --git a/klm/lm/binary_format.hh b/klm/lm/binary_format.hh index e9df0892..dd795f62 100644 --- a/klm/lm/binary_format.hh +++ b/klm/lm/binary_format.hh @@ -12,7 +12,7 @@ #include #include -#include +#include namespace lm { namespace ngram { @@ -33,10 +33,8 @@ struct FixedWidthParameters { unsigned int search_version; }; -inline std::size_t Align8(std::size_t in) { - std::size_t off = in % 8; - return off ? (in + 8 - off) : in; -} +// This is a macro instead of an inline function so constants can be assigned using it. +#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8) // Parameters stored in the header of a binary file. struct Parameters { @@ -53,10 +51,6 @@ struct Backing { util::scoped_memory search; }; -void SeekOrThrow(int fd, off_t off); -// Seek forward -void AdvanceOrThrow(int fd, off_t off); - // Create just enough of a binary file to write vocabulary to it. uint8_t *SetupJustVocab(const Config &config, uint8_t order, std::size_t memory_size, Backing &backing); // Grow the binary file for the search data structure and set backing.search, returning the memory address where the search data structure should begin. 
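A quick sanity check of the ALIGN8 macro introduced in this header may help: it rounds a byte count up to the next multiple of 8 and leaves exact multiples alone, which is what TotalHeaderSize relies on. This is only an illustrative test program, not part of the patch.

#include <cassert>
#include <cstddef>

// Same definition as in the binary_format.hh hunk above.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)

int main() {
  assert(ALIGN8(1) == 8);    // rounds up to the next multiple of 8
  assert(ALIGN8(13) == 16);  // rounds up
  assert(ALIGN8(16) == 16);  // exact multiples are unchanged
  return 0;
}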
@@ -64,7 +58,7 @@ uint8_t *GrowForSearch(const Config &config, std::size_t vocab_pad, std::size_t // Write header to binary file. This is done last to prevent incomplete files // from loading. -void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, Backing &backing); +void FinishFile(const Config &config, ModelType model_type, unsigned int search_version, const std::vector &counts, std::size_t vocab_pad, Backing &backing); namespace detail { diff --git a/klm/lm/blank.hh b/klm/lm/blank.hh index 2fb64cd0..4da81209 100644 --- a/klm/lm/blank.hh +++ b/klm/lm/blank.hh @@ -3,7 +3,7 @@ #include -#include +#include #include namespace lm { diff --git a/klm/lm/build_binary.cc b/klm/lm/build_binary.cc index fdb62a71..8cbb69d0 100644 --- a/klm/lm/build_binary.cc +++ b/klm/lm/build_binary.cc @@ -8,18 +8,24 @@ #include #include -#include + +#ifdef WIN32 +#include "util/getopt.hh" +#endif namespace lm { namespace ngram { namespace { void Usage(const char *name) { - std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" + std::cerr << "Usage: " << name << " [-u log10_unknown_probability] [-s] [-i] [-w mmap|after] [-p probing_multiplier] [-t trie_temporary] [-m trie_building_megabytes] [-q bits] [-b bits] [-a bits] [type] input.arpa [output.mmap]\n\n" "-u sets the log10 probability for if the ARPA file does not have one.\n" " Default is -100. The ARPA file will always take precedence.\n" "-s allows models to be built even if they do not have and .\n" -"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n\n" +"-i allows buggy models from IRSTLM by mapping positive log probability to 0.\n" +"-w mmap|after determines how writing is done.\n" +" mmap maps the binary file and writes to it. Default for trie.\n" +" after allocates anonymous memory, builds, and writes. Default for probing.\n\n" "type is either probing or trie. Default is probing.\n\n" "probing uses a probing hash table. It is the fastest but uses the most memory.\n" "-p sets the space multiplier and must be >1.0. The default is 1.5.\n\n" @@ -55,7 +61,7 @@ uint8_t ParseBitCount(const char *from) { unsigned long val = ParseUInt(from); if (val > 25) { util::ParseNumberException e(from); - e << " bit counts are limited to 256."; + e << " bit counts are limited to 25."; } return val; } @@ -87,7 +93,7 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) { prefix = 'G'; divide = 1 << 30; } - long int length = std::max(2, lrint(ceil(log10(max_length / divide)))); + long int length = std::max(2, static_cast(ceil(log10((double) max_length / divide)))); std::cout << "Memory estimate:\ntype "; // right align bytes. 
for (long int i = 0; i < length - 2; ++i) std::cout << ' '; @@ -112,10 +118,10 @@ int main(int argc, char *argv[]) { using namespace lm::ngram; try { - bool quantize = false, set_backoff_bits = false, bhiksha = false; + bool quantize = false, set_backoff_bits = false, bhiksha = false, set_write_method = false; lm::ngram::Config config; int opt; - while ((opt = getopt(argc, argv, "siu:p:t:m:q:b:a:")) != -1) { + while ((opt = getopt(argc, argv, "q:b:a:u:p:t:m:w:si")) != -1) { switch(opt) { case 'q': config.prob_bits = ParseBitCount(optarg); @@ -129,6 +135,7 @@ int main(int argc, char *argv[]) { case 'a': config.pointer_bhiksha_bits = ParseBitCount(optarg); bhiksha = true; + break; case 'u': config.unknown_missing_logprob = ParseFloat(optarg); break; @@ -141,6 +148,16 @@ int main(int argc, char *argv[]) { case 'm': config.building_memory = ParseUInt(optarg) * 1048576; break; + case 'w': + set_write_method = true; + if (!strcmp(optarg, "mmap")) { + config.write_method = Config::WRITE_MMAP; + } else if (!strcmp(optarg, "after")) { + config.write_method = Config::WRITE_AFTER; + } else { + Usage(argv[0]); + } + break; case 's': config.sentence_marker_missing = lm::SILENT; break; @@ -166,9 +183,11 @@ int main(int argc, char *argv[]) { const char *from_file = argv[optind + 1]; config.write_mmap = argv[optind + 2]; if (!strcmp(model_type, "probing")) { + if (!set_write_method) config.write_method = Config::WRITE_AFTER; if (quantize || set_backoff_bits) ProbingQuantizationUnsupported(); ProbingModel(from_file, config); } else if (!strcmp(model_type, "trie")) { + if (!set_write_method) config.write_method = Config::WRITE_MMAP; if (quantize) { if (bhiksha) { QuantArrayTrieModel(from_file, config); @@ -191,7 +210,9 @@ int main(int argc, char *argv[]) { } catch (const std::exception &e) { std::cerr << e.what() << std::endl; + std::cerr << "ERROR" << std::endl; return 1; } + std::cerr << "SUCCESS" << std::endl; return 0; } diff --git a/klm/lm/config.cc b/klm/lm/config.cc index 297589a4..dbe762b3 100644 --- a/klm/lm/config.cc +++ b/klm/lm/config.cc @@ -17,6 +17,7 @@ Config::Config() : temporary_directory_prefix(NULL), arpa_complain(ALL), write_mmap(NULL), + write_method(WRITE_AFTER), include_vocab(true), prob_bits(8), backoff_bits(8), diff --git a/klm/lm/config.hh b/klm/lm/config.hh index 8564661b..01b75632 100644 --- a/klm/lm/config.hh +++ b/klm/lm/config.hh @@ -70,9 +70,17 @@ struct Config { // to NULL to disable. const char *write_mmap; + typedef enum { + WRITE_MMAP, // Map the file directly. + WRITE_AFTER // Write after we're done. + } WriteMethod; + WriteMethod write_method; + // Include the vocab in the binary file? Only effective if write_mmap != NULL. bool include_vocab; + + // Quantization options. Only effective for QuantTrieModel. One value is // reserved for each of prob and backoff, so 2^bits - 1 buckets will be used // to quantize (and one of the remaining backoffs will be 0). 
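For concreteness, the MakeBins calls in the quantize.cc hunk further down request the following bucket counts given the default 8 prob bits and 8 backoff bits set in config.cc above (two backoff codes are reserved for the no-extension and extension sentinels). This arithmetic is only a reading of those hunks for illustration, not an authoritative statement of the binary format.

#include <cstdint>
#include <iostream>

int main() {
  const unsigned prob_bits = 8, backoff_bits = 8;               // defaults from config.cc
  const uint64_t prob_buckets = 1ULL << prob_bits;              // 256
  const uint64_t backoff_buckets = (1ULL << backoff_bits) - 2;  // 254 after the two
  // reserved backoff codes (kNoExtensionBackoff, kExtensionBackoff).
  std::cout << prob_buckets << " prob buckets, "
            << backoff_buckets << " backoff buckets\n";
  return 0;
}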
diff --git a/klm/lm/left_test.cc b/klm/lm/left_test.cc index 8bb91cb3..c85e5efa 100644 --- a/klm/lm/left_test.cc +++ b/klm/lm/left_test.cc @@ -142,7 +142,7 @@ template float TreeMiddle(const M &m, const std::vector &wo template void LookupVocab(const M &m, const StringPiece &str, std::vector &out) { out.clear(); - for (util::PieceIterator<' '> i(str); i; ++i) { + for (util::TokenIter i(str, ' '); i; ++i) { out.push_back(m.GetVocabulary().Index(*i)); } } @@ -326,10 +326,17 @@ template void FullGrow(const M &m) { } } +const char *FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "test.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[1]; +} + template void Everything() { Config config; config.messages = NULL; - M m("test.arpa", config); + M m(FileLocation(), config); Short(m); Charge(m); diff --git a/klm/lm/model.cc b/klm/lm/model.cc index e4c1ec1d..478ebed1 100644 --- a/klm/lm/model.cc +++ b/klm/lm/model.cc @@ -46,7 +46,7 @@ template GenericModel::Ge template void GenericModel::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { SetupMemory(start, params.counts, config); - vocab_.LoadedBinary(fd, config.enumerate_vocab); + vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); search_.LoadedBinary(); } @@ -82,13 +82,18 @@ template void GenericModel void GenericModel::UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { + util::AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config)); + Search::UpdateConfigFromBinary(fd, counts, config); +} + template FullScoreReturn GenericModel::FullScore(const State &in_state, const WordIndex new_word, State &out_state) const { FullScoreReturn ret = ScoreExceptBackoff(in_state.words, in_state.words + in_state.length, new_word, out_state); for (const float *i = in_state.backoff + ret.ngram_length - 1; i < in_state.backoff + in_state.length; ++i) { @@ -114,7 +119,7 @@ template FullScoreReturn GenericModel void GenericModel FullScoreReturn GenericModel FullScoreReturn GenericModel class GenericModel : public base::Mod * TrieModel. To classify binary files, call RecognizeBinary in * lm/binary_format.hh. */ - GenericModel(const char *file, const Config &config = Config()); + explicit GenericModel(const char *file, const Config &config = Config()); /* Score p(new_word | in_state) and incorporate new_word into out_state. 
* Note that in_state and out_state must be different references: @@ -137,14 +137,9 @@ template class GenericModel : public base::Mod unsigned char &next_use) const; private: - friend void LoadLM<>(const char *file, const Config &config, GenericModel &to); + friend void lm::ngram::LoadLM<>(const char *file, const Config &config, GenericModel &to); - static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { - AdvanceOrThrow(fd, VocabularyT::Size(counts[0], config)); - Search::UpdateConfigFromBinary(fd, counts, config); - } - - float SlowBackoffLookup(const WordIndex *const context_rbegin, const WordIndex *const context_rend, unsigned char start) const; + static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config); FullScoreReturn ScoreExceptBackoff(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const; diff --git a/klm/lm/model_test.cc b/klm/lm/model_test.cc index 2654071f..461704d4 100644 --- a/klm/lm/model_test.cc +++ b/klm/lm/model_test.cc @@ -19,6 +19,20 @@ std::ostream &operator<<(std::ostream &o, const State &state) { namespace { +const char *TestLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "test.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[1]; +} +const char *TestNoUnkLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 3) { + return "test_nounk.arpa"; + } + return boost::unit_test::framework::master_test_suite().argv[2]; + +} + #define StartTest(word, ngram, score, indep_left) \ ret = model.FullScore( \ state, \ @@ -307,7 +321,7 @@ template void LoadingTest() { { ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; - ModelT m("test.arpa", config); + ModelT m(TestLocation(), config); enumerate.Check(m.GetVocabulary()); BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); Everything(m); @@ -315,7 +329,7 @@ template void LoadingTest() { { ExpectEnumerateVocab enumerate; config.enumerate_vocab = &enumerate; - ModelT m("test_nounk.arpa", config); + ModelT m(TestNoUnkLocation(), config); enumerate.Check(m.GetVocabulary()); BOOST_CHECK_EQUAL((WordIndex)37, m.GetVocabulary().Bound()); NoUnkCheck(m); @@ -346,7 +360,7 @@ template void BinaryTest() { config.enumerate_vocab = &enumerate; { - ModelT copy_model("test.arpa", config); + ModelT copy_model(TestLocation(), config); enumerate.Check(copy_model.GetVocabulary()); enumerate.Clear(); Everything(copy_model); @@ -370,14 +384,14 @@ template void BinaryTest() { config.messages = NULL; enumerate.Clear(); { - ModelT copy_model("test_nounk.arpa", config); + ModelT copy_model(TestNoUnkLocation(), config); enumerate.Check(copy_model.GetVocabulary()); enumerate.Clear(); NoUnkCheck(copy_model); } config.write_mmap = NULL; { - ModelT binary("test_nounk.binary", config); + ModelT binary(TestNoUnkLocation(), config); enumerate.Check(binary.GetVocabulary()); NoUnkCheck(binary); } diff --git a/klm/lm/ngram_query.cc b/klm/lm/ngram_query.cc index d9db4aa2..8f7a0e1c 100644 --- a/klm/lm/ngram_query.cc +++ b/klm/lm/ngram_query.cc @@ -1,87 +1,4 @@ -#include "lm/enumerate_vocab.hh" -#include "lm/model.hh" - -#include -#include -#include -#include - -#include - -#include -#include - -float FloatSec(const struct timeval &tv) { - return static_cast(tv.tv_sec) + (static_cast(tv.tv_usec) / 1000000000.0); -} - -void PrintUsage(const char *message) { - struct rusage usage; - if (getrusage(RUSAGE_SELF, &usage)) { - perror("getrusage"); - 
return; - } - std::cerr << message; - std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n'; - - // Linux doesn't set memory usage :-(. - std::ifstream status("/proc/self/status", std::ios::in); - std::string line; - while (getline(status, line)) { - if (!strncmp(line.c_str(), "VmRSS:\t", 7)) { - std::cerr << "rss " << (line.c_str() + 7) << '\n'; - break; - } - } -} - -template void Query(const Model &model, bool sentence_context) { - PrintUsage("Loading statistics:\n"); - typename Model::State state, out; - lm::FullScoreReturn ret; - std::string word; - - while (std::cin) { - state = sentence_context ? model.BeginSentenceState() : model.NullContextState(); - float total = 0.0; - bool got = false; - unsigned int oov = 0; - while (std::cin >> word) { - got = true; - lm::WordIndex vocab = model.GetVocabulary().Index(word); - if (vocab == 0) ++oov; - ret = model.FullScore(state, vocab, out); - total += ret.prob; - std::cout << word << '=' << vocab << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; - state = out; - char c; - while (true) { - c = std::cin.get(); - if (!std::cin) break; - if (c == '\n') break; - if (!isspace(c)) { - std::cin.unget(); - break; - } - } - if (c == '\n') break; - } - if (!got && !std::cin) break; - if (sentence_context) { - ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out); - total += ret.prob; - std::cout << "=" << model.GetVocabulary().EndSentence() << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; - } - std::cout << "Total: " << total << " OOV: " << oov << '\n'; - } - PrintUsage("After queries:\n"); -} - -template void Query(const char *name) { - lm::ngram::Config config; - Model model(name, config); - Query(model); -} +#include "lm/ngram_query.hh" int main(int argc, char *argv[]) { if (!(argc == 2 || (argc == 3 && !strcmp(argv[2], "null")))) { @@ -89,34 +6,40 @@ int main(int argc, char *argv[]) { std::cerr << "Input is wrapped in and unless null is passed." 
<< std::endl; return 1; } - bool sentence_context = (argc == 2); - lm::ngram::ModelType model_type; - if (lm::ngram::RecognizeBinary(argv[1], model_type)) { - switch(model_type) { - case lm::ngram::HASH_PROBING: - Query(argv[1], sentence_context); - break; - case lm::ngram::TRIE_SORTED: - Query(argv[1], sentence_context); - break; - case lm::ngram::QUANT_TRIE_SORTED: - Query(argv[1], sentence_context); - break; - case lm::ngram::ARRAY_TRIE_SORTED: - Query(argv[1], sentence_context); - break; - case lm::ngram::QUANT_ARRAY_TRIE_SORTED: - Query(argv[1], sentence_context); - break; - case lm::ngram::HASH_SORTED: - default: - std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; - abort(); + try { + bool sentence_context = (argc == 2); + using namespace lm::ngram; + ModelType model_type; + if (RecognizeBinary(argv[1], model_type)) { + switch(model_type) { + case HASH_PROBING: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case TRIE_SORTED: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_TRIE_SORTED: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case ARRAY_TRIE_SORTED: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case QUANT_ARRAY_TRIE_SORTED: + Query(argv[1], sentence_context, std::cin, std::cout); + break; + case HASH_SORTED: + default: + std::cerr << "Unrecognized kenlm model type " << model_type << std::endl; + abort(); + } + } else { + Query(argv[1], sentence_context, std::cin, std::cout); } - } else { - Query(argv[1], sentence_context); - } - PrintUsage("Total time including destruction:\n"); + PrintUsage("Total time including destruction:\n"); + } catch (const std::exception &e) { + std::cerr << e.what() << std::endl; + return 1; + } return 0; } diff --git a/klm/lm/ngram_query.hh b/klm/lm/ngram_query.hh new file mode 100644 index 00000000..4990df22 --- /dev/null +++ b/klm/lm/ngram_query.hh @@ -0,0 +1,103 @@ +#ifndef LM_NGRAM_QUERY__ +#define LM_NGRAM_QUERY__ + +#include "lm/enumerate_vocab.hh" +#include "lm/model.hh" + +#include +#include +#include +#include + +#include +#if !defined(_WIN32) && !defined(_WIN64) +#include +#include +#endif + +namespace lm { +namespace ngram { + +#if !defined(_WIN32) && !defined(_WIN64) +float FloatSec(const struct timeval &tv) { + return static_cast(tv.tv_sec) + (static_cast(tv.tv_usec) / 1000000000.0); +} +#endif + +void PrintUsage(const char *message) { +#if !defined(_WIN32) && !defined(_WIN64) + struct rusage usage; + if (getrusage(RUSAGE_SELF, &usage)) { + perror("getrusage"); + return; + } + std::cerr << message; + std::cerr << "user\t" << FloatSec(usage.ru_utime) << "\nsys\t" << FloatSec(usage.ru_stime) << '\n'; + + // Linux doesn't set memory usage :-(. + std::ifstream status("/proc/self/status", std::ios::in); + std::string line; + while (getline(status, line)) { + if (!strncmp(line.c_str(), "VmRSS:\t", 7)) { + std::cerr << "rss " << (line.c_str() + 7) << '\n'; + break; + } + } +#endif +} + +template void Query(const Model &model, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) { + PrintUsage("Loading statistics:\n"); + typename Model::State state, out; + lm::FullScoreReturn ret; + std::string word; + + while (in_stream) { + state = sentence_context ? 
model.BeginSentenceState() : model.NullContextState(); + float total = 0.0; + bool got = false; + unsigned int oov = 0; + while (in_stream >> word) { + got = true; + lm::WordIndex vocab = model.GetVocabulary().Index(word); + if (vocab == 0) ++oov; + ret = model.FullScore(state, vocab, out); + total += ret.prob; + out_stream << word << '=' << vocab << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; + state = out; + char c; + while (true) { + c = in_stream.get(); + if (!in_stream) break; + if (c == '\n') break; + if (!isspace(c)) { + in_stream.unget(); + break; + } + } + if (c == '\n') break; + } + if (!got && !in_stream) break; + if (sentence_context) { + ret = model.FullScore(state, model.GetVocabulary().EndSentence(), out); + total += ret.prob; + out_stream << "=" << model.GetVocabulary().EndSentence() << ' ' << static_cast(ret.ngram_length) << ' ' << ret.prob << '\t'; + } + out_stream << "Total: " << total << " OOV: " << oov << '\n'; + } + PrintUsage("After queries:\n"); +} + +template void Query(const char *file, bool sentence_context, std::istream &in_stream, std::ostream &out_stream) { + Config config; +// config.load_method = util::LAZY; + M model(file, config); + Query(model, sentence_context, in_stream, out_stream); +} + +} // namespace ngram +} // namespace lm + +#endif // LM_NGRAM_QUERY__ + + diff --git a/klm/lm/quantize.cc b/klm/lm/quantize.cc index 98a5d048..a8e0cb21 100644 --- a/klm/lm/quantize.cc +++ b/klm/lm/quantize.cc @@ -1,31 +1,30 @@ +/* Quantize into bins of equal size as described in + * M. Federico and N. Bertoldi. 2006. How many bits are needed + * to store probabilities for phrase-based translation? In Proc. + * of the Workshop on Statistical Machine Translation, pages + * 94–101, New York City, June. Association for Computa- + * tional Linguistics. + */ + #include "lm/quantize.hh" #include "lm/binary_format.hh" #include "lm/lm_exception.hh" +#include "util/file.hh" #include #include -#include - namespace lm { namespace ngram { -/* Quantize into bins of equal size as described in - * M. Federico and N. Bertoldi. 2006. How many bits are needed - * to store probabilities for phrase-based translation? In Proc. - * of the Workshop on Statistical Machine Translation, pages - * 94–101, New York City, June. Association for Computa- - * tional Linguistics. - */ - namespace { -void MakeBins(float *values, float *values_end, float *centers, uint32_t bins) { - std::sort(values, values_end); - const float *start = values, *finish; +void MakeBins(std::vector &values, float *centers, uint32_t bins) { + std::sort(values.begin(), values.end()); + std::vector::const_iterator start = values.begin(), finish; for (uint32_t i = 0; i < bins; ++i, ++centers, start = finish) { - finish = values + (((values_end - values) * static_cast(i + 1)) / bins); + finish = values.begin() + ((values.size() * static_cast(i + 1)) / bins); if (finish == start) { // zero length bucket. *centers = i ? 
*(centers - 1) : -std::numeric_limits::infinity(); @@ -41,10 +40,11 @@ const char kSeparatelyQuantizeVersion = 2; void SeparatelyQuantize::UpdateConfigFromBinary(int fd, const std::vector &/*counts*/, Config &config) { char version; - if (read(fd, &version, 1) != 1 || read(fd, &config.prob_bits, 1) != 1 || read(fd, &config.backoff_bits, 1) != 1) - UTIL_THROW(util::ErrnoException, "Failed to read header for quantization."); + util::ReadOrThrow(fd, &version, 1); + util::ReadOrThrow(fd, &config.prob_bits, 1); + util::ReadOrThrow(fd, &config.backoff_bits, 1); if (version != kSeparatelyQuantizeVersion) UTIL_THROW(FormatLoadException, "This file has quantization version " << (unsigned)version << " but the code expects version " << (unsigned)kSeparatelyQuantizeVersion); - AdvanceOrThrow(fd, -3); + util::AdvanceOrThrow(fd, -3); } void SeparatelyQuantize::SetupMemory(void *start, const Config &config) { @@ -66,12 +66,12 @@ void SeparatelyQuantize::Train(uint8_t order, std::vector &prob, std::vec float *centers = start_ + TableStart(order) + ProbTableLength(); *(centers++) = kNoExtensionBackoff; *(centers++) = kExtensionBackoff; - MakeBins(&*backoff.begin(), &*backoff.end(), centers, (1ULL << backoff_bits_) - 2); + MakeBins(backoff, centers, (1ULL << backoff_bits_) - 2); } void SeparatelyQuantize::TrainProb(uint8_t order, std::vector &prob) { float *centers = start_ + TableStart(order); - MakeBins(&*prob.begin(), &*prob.end(), centers, (1ULL << prob_bits_)); + MakeBins(prob, centers, (1ULL << prob_bits_)); } void SeparatelyQuantize::FinishedLoading(const Config &config) { diff --git a/klm/lm/quantize.hh b/klm/lm/quantize.hh index 4cf4236e..6d130a57 100644 --- a/klm/lm/quantize.hh +++ b/klm/lm/quantize.hh @@ -9,7 +9,7 @@ #include #include -#include +#include #include diff --git a/klm/lm/read_arpa.cc b/klm/lm/read_arpa.cc index dce73f77..05f761be 100644 --- a/klm/lm/read_arpa.cc +++ b/klm/lm/read_arpa.cc @@ -8,7 +8,7 @@ #include #include -#include +#include namespace lm { diff --git a/klm/lm/return.hh b/klm/lm/return.hh index 15571960..1b55091b 100644 --- a/klm/lm/return.hh +++ b/klm/lm/return.hh @@ -1,7 +1,7 @@ #ifndef LM_RETURN__ #define LM_RETURN__ -#include +#include namespace lm { /* Structure returned by scoring routines. */ diff --git a/klm/lm/search_hashed.cc b/klm/lm/search_hashed.cc index 247832b0..1d6fb5be 100644 --- a/klm/lm/search_hashed.cc +++ b/klm/lm/search_hashed.cc @@ -30,7 +30,7 @@ template class ActivateLowerMiddle { // TODO: somehow get text of n-gram for this error message. 
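Stepping back to the MakeBins rewrite in quantize.cc a few hunks up: the Federico & Bertoldi scheme it cites sorts the values and cuts them into slices of (nearly) equal count, emitting one representative per slice. The sketch below illustrates that idea in isolation; using the slice mean as the representative is an assumption made for the example, since the hunk only shows how the slice boundaries are chosen.

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

// Illustrative equal-frequency binning: not the library's MakeBins, just the idea.
std::vector<float> EqualFrequencyCenters(std::vector<float> values, uint32_t bins) {
  std::sort(values.begin(), values.end());
  std::vector<float> centers;
  std::size_t start = 0;
  for (uint32_t i = 0; i < bins; ++i) {
    const std::size_t finish = (values.size() * static_cast<uint64_t>(i + 1)) / bins;
    if (finish == start) {
      // Zero-length bucket: fall back to the previous center (cf. the hunk above).
      centers.push_back(centers.empty() ? 0.0f : centers.back());
    } else {
      const float sum = std::accumulate(values.begin() + start, values.begin() + finish, 0.0f);
      centers.push_back(sum / static_cast<float>(finish - start));  // assumed representative
    }
    start = finish;
  }
  return centers;
}

int main() {
  std::vector<float> v;
  for (int i = 1; i <= 8; ++i) v.push_back(static_cast<float>(i));
  const std::vector<float> c = EqualFrequencyCenters(v, 4);
  for (std::size_t i = 0; i < c.size(); ++i) std::printf("%g ", c[i]);  // 1.5 3.5 5.5 7.5
  std::printf("\n");
  return 0;
}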
if (!modify_.UnsafeMutableFind(hash, i)) UTIL_THROW(FormatLoadException, "The context of every " << n << "-gram should appear as a " << (n-1) << "-gram"); - SetExtension(i->MutableValue().backoff); + SetExtension(i->value.backoff); } private: @@ -65,7 +65,7 @@ template void FixSRI(int lower, float negative_lower_prob, unsign blank.prob -= unigrams[vocab_ids[1]].backoff; SetExtension(unigrams[vocab_ids[1]].backoff); // Bigram including a unigram's backoff - middle[0].Insert(Middle::Packing::Make(keys[0], blank)); + middle[0].Insert(detail::ProbBackoffEntry::Make(keys[0], blank)); fix = 1; } else { for (unsigned int i = 3; i < fix + 2; ++i) backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[i]); @@ -74,22 +74,24 @@ template void FixSRI(int lower, float negative_lower_prob, unsign for (; fix <= n - 3; ++fix) { typename Middle::MutableIterator gotit; if (middle[fix - 1].UnsafeMutableFind(backoff_hash, gotit)) { - float &backoff = gotit->MutableValue().backoff; + float &backoff = gotit->value.backoff; SetExtension(backoff); blank.prob -= backoff; } - middle[fix].Insert(Middle::Packing::Make(keys[fix], blank)); + middle[fix].Insert(detail::ProbBackoffEntry::Make(keys[fix], blank)); backoff_hash = detail::CombineWordHash(backoff_hash, vocab_ids[fix + 2]); } } template void ReadNGrams(util::FilePiece &f, const unsigned int n, const size_t count, const Voc &vocab, ProbBackoff *unigrams, std::vector &middle, Activate activate, Store &store, PositiveProbWarn &warn) { + assert(n >= 2); ReadNGramHeader(f, n); - // vocab ids of words in reverse order + // Both vocab_ids and keys are non-empty because n >= 2. + // vocab ids of words in reverse order. std::vector vocab_ids(n); std::vector keys(n-1); - typename Store::Packing::Value value; + typename Store::Entry::Value value; typename Middle::MutableIterator found; for (size_t i = 0; i < count; ++i) { ReadNGram(f, n, vocab, &*vocab_ids.begin(), value, warn); @@ -100,7 +102,7 @@ template void ReadNGrams( } // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0. util::SetSign(value.prob); - store.Insert(Store::Packing::Make(keys[n-2], value)); + store.Insert(Store::Entry::Make(keys[n-2], value)); // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb. int lower; util::FloatEnc fix_prob; @@ -113,9 +115,9 @@ template void ReadNGrams( } if (middle[lower].UnsafeMutableFind(keys[lower], found)) { // Turn off sign bit to indicate that it extends left. - fix_prob.f = found->MutableValue().prob; + fix_prob.f = found->value.prob; fix_prob.i &= ~util::kSignBit; - found->MutableValue().prob = fix_prob.f; + found->value.prob = fix_prob.f; // We don't need to recurse further down because this entry already set the bits for lower entries. break; } @@ -147,7 +149,7 @@ template uint8_t *TemplateHashedSearch template void TemplateHashedSearch::InitializeFromARPA(const char * /*file*/, util::FilePiece &f, const std::vector &counts, const Config &config, Voc &vocab, Backing &backing) { // TODO: fix sorted. 
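The "extends left" bookkeeping in ReadNGrams above leans on a bit trick: the sign bit of the stored log probability doubles as the flag. The standalone sketch below shows the mechanics; FloatEnc and kSignBit here are stand-ins for util::FloatEnc and util::kSignBit (assumed to be a float/uint32_t union and the IEEE-754 sign mask), so treat it as an illustration rather than the library's exact definitions.

#include <cstdint>
#include <cstdio>

union FloatEnc { float f; uint32_t i; };   // stand-in for util::FloatEnc (assumed layout)
const uint32_t kSignBit = 0x80000000u;     // assumed IEEE-754 sign mask

int main() {
  FloatEnc enc;
  enc.f = -0.25f;      // stored log prob; sign bit set = "does not extend left"
  enc.i &= ~kSignBit;  // clear the flag: entry now marked as extending left
  std::printf("%f\n", enc.f);   // prints 0.250000: the magnitude is untouched
  const bool extends_left = !(enc.i & kSignBit);
  return extends_left ? 0 : 1;
}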
- SetupMemory(GrowForSearch(config, 0, Size(counts, config), backing), counts, config); + SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), Size(counts, config), backing), counts, config); PositiveProbWarn warn(config.positive_log_probability); diff --git a/klm/lm/search_hashed.hh b/klm/lm/search_hashed.hh index e289fd11..4352c72d 100644 --- a/klm/lm/search_hashed.hh +++ b/klm/lm/search_hashed.hh @@ -8,7 +8,6 @@ #include "lm/weights.hh" #include "util/bit_packing.hh" -#include "util/key_value_packing.hh" #include "util/probing_hash_table.hh" #include @@ -92,8 +91,10 @@ template class TemplateHashedSearch : public Has template void InitializeFromARPA(const char *file, util::FilePiece &f, const std::vector &counts, const Config &config, Voc &vocab, Backing &backing); - const Middle *MiddleBegin() const { return &*middle_.begin(); } - const Middle *MiddleEnd() const { return &*middle_.end(); } + typedef typename std::vector::const_iterator MiddleIter; + + MiddleIter MiddleBegin() const { return middle_.begin(); } + MiddleIter MiddleEnd() const { return middle_.end(); } Node Unpack(uint64_t extend_pointer, unsigned char extend_length, float &prob) const { util::FloatEnc val; @@ -105,7 +106,7 @@ template class TemplateHashedSearch : public Has std::cerr << "Extend pointer " << extend_pointer << " should have been found for length " << (unsigned) extend_length << std::endl; abort(); } - val.f = found->GetValue().prob; + val.f = found->value.prob; } val.i |= util::kSignBit; prob = val.f; @@ -117,12 +118,12 @@ template class TemplateHashedSearch : public Has typename Middle::ConstIterator found; if (!middle.Find(node, found)) return false; util::FloatEnc enc; - enc.f = found->GetValue().prob; + enc.f = found->value.prob; ret.independent_left = (enc.i & util::kSignBit); ret.extend_left = node; enc.i |= util::kSignBit; ret.prob = enc.f; - backoff = found->GetValue().backoff; + backoff = found->value.backoff; return true; } @@ -132,7 +133,7 @@ template class TemplateHashedSearch : public Has node = CombineWordHash(node, word); typename Middle::ConstIterator found; if (!middle.Find(node, found)) return false; - backoff = found->GetValue().backoff; + backoff = found->value.backoff; return true; } @@ -141,7 +142,7 @@ template class TemplateHashedSearch : public Has node = CombineWordHash(node, word); typename Longest::ConstIterator found; if (!longest.Find(node, found)) return false; - prob = found->GetValue().prob; + prob = found->value.prob; return true; } @@ -160,14 +161,50 @@ template class TemplateHashedSearch : public Has std::vector middle_; }; -// std::identity is an SGI extension :-( -struct IdentityHash : public std::unary_function { - size_t operator()(uint64_t arg) const { return static_cast(arg); } +/* These look like perfect candidates for a template, right? Ancient gcc (4.1 + * on RedHat stale linux) doesn't pack templates correctly. ProbBackoffEntry + * is a multiple of 8 bytes anyway. ProbEntry is 12 bytes so it's set to pack. 
+ */ +struct ProbBackoffEntry { + uint64_t key; + ProbBackoff value; + typedef uint64_t Key; + typedef ProbBackoff Value; + uint64_t GetKey() const { + return key; + } + static ProbBackoffEntry Make(uint64_t key, ProbBackoff value) { + ProbBackoffEntry ret; + ret.key = key; + ret.value = value; + return ret; + } }; +#pragma pack(push) +#pragma pack(4) +struct ProbEntry { + uint64_t key; + Prob value; + typedef uint64_t Key; + typedef Prob Value; + uint64_t GetKey() const { + return key; + } + static ProbEntry Make(uint64_t key, Prob value) { + ProbEntry ret; + ret.key = key; + ret.value = value; + return ret; + } +}; + +#pragma pack(pop) + + struct ProbingHashedSearch : public TemplateHashedSearch< - util::ProbingHashTable, IdentityHash>, - util::ProbingHashTable, IdentityHash> > { + util::ProbingHashTable, + util::ProbingHashTable > { static const ModelType kModelType = HASH_PROBING; }; diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 4bd3f4ee..ffadfa94 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -13,6 +13,7 @@ #include "lm/weights.hh" #include "lm/word_index.hh" #include "util/ersatz_progress.hh" +#include "util/mmap.hh" #include "util/proxy_iterator.hh" #include "util/scoped.hh" #include "util/sized_iterator.hh" @@ -20,14 +21,15 @@ #include #include #include +#include #include #include #include #include -#include -#include -#include +#if defined(_WIN32) || defined(_WIN64) +#include +#endif namespace lm { namespace ngram { @@ -195,7 +197,7 @@ class SRISucks { void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) { for (unsigned char i = 0; i < kMaxOrder - 1; ++i) { - it_[i] = &*values_[i].begin(); + it_[i] = values_[i].empty() ? NULL : &*values_[i].begin(); } messages_[0].Apply(it_, unigram_file); BackoffMessages *messages = messages_ + 1; @@ -227,8 +229,8 @@ class SRISucks { class FindBlanks { public: - FindBlanks(uint64_t *counts, unsigned char order, const ProbBackoff *unigrams, SRISucks &messages) - : counts_(counts), longest_counts_(counts + order - 1), unigrams_(unigrams), sri_(messages) {} + FindBlanks(unsigned char order, const ProbBackoff *unigrams, SRISucks &messages) + : counts_(order), unigrams_(unigrams), sri_(messages) {} float UnigramProb(WordIndex index) const { return unigrams_[index].prob; @@ -248,7 +250,7 @@ class FindBlanks { } void Longest(const void * /*data*/) { - ++*longest_counts_; + ++counts_.back(); } // Unigrams wrote one past. 
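ProbBackoffEntry and ProbEntry above are deliberately plain structs rather than instantiations of a template: as the comment notes, ancient gcc ignores #pragma pack for template-dependent types, and packing is what keeps an 8-byte key plus a 4-byte probability at 12 bytes instead of 16. A rough sketch of the size difference, using a hypothetical Prob stand-in and none of the hash-table plumbing:

#include <cstdio>
#include <stdint.h>

struct Prob { float prob; };          // hypothetical stand-in for lm::ngram::Prob

struct PaddedEntry {                  // default alignment: tail padding after value
  uint64_t key;
  Prob value;
  uint64_t GetKey() const { return key; }
};

#pragma pack(push)
#pragma pack(4)
struct PackedEntry {                  // same layout the diff uses for ProbEntry
  uint64_t key;
  Prob value;
  uint64_t GetKey() const { return key; }
};
#pragma pack(pop)

int main() {
  // On a typical x86_64 ABI this prints "padded 16, packed 12"; the bytes
  // saved per bucket matter because the probing table has one bucket per n-gram.
  std::printf("padded %lu, packed %lu\n",
              (unsigned long)sizeof(PaddedEntry), (unsigned long)sizeof(PackedEntry));
  return 0;
}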
@@ -256,8 +258,12 @@ class FindBlanks { --counts_[0]; } + const std::vector &Counts() const { + return counts_; + } + private: - uint64_t *const counts_, *const longest_counts_; + std::vector counts_; const ProbBackoff *unigrams_; @@ -375,7 +381,7 @@ template class BlankManager { template void RecursiveInsert(const unsigned char total_order, const WordIndex unigram_count, RecordReader *input, std::ostream *progress_out, const char *message, Doing &doing) { util::ErsatzProgress progress(progress_out, message, unigram_count + 1); - unsigned int unigram = 0; + WordIndex unigram = 0; std::priority_queue grams; grams.push(Gram(&unigram, 1)); for (unsigned char i = 2; i <= total_order; ++i) { @@ -461,42 +467,33 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c } // namespace -template void BuildTrie(const std::string &file_prefix, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { +template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { RecordReader inputs[kMaxOrder - 1]; RecordReader contexts[kMaxOrder - 1]; for (unsigned char i = 2; i <= counts.size(); ++i) { - std::stringstream assembled; - assembled << file_prefix << static_cast(i) << "_merged"; - inputs[i-2].Init(assembled.str(), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff))); - util::RemoveOrThrow(assembled.str().c_str()); - assembled << kContextSuffix; - contexts[i-2].Init(assembled.str(), (i-1) * sizeof(WordIndex)); - util::RemoveOrThrow(assembled.str().c_str()); + inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff))); + contexts[i-2].Init(files.Context(i), (i-1) * sizeof(WordIndex)); } SRISucks sri; - std::vector fixed_counts(counts.size()); + std::vector fixed_counts; + util::scoped_FILE unigram_file; + util::scoped_fd unigram_fd(files.StealUnigram()); { - std::string temp(file_prefix); temp += "unigrams"; - util::scoped_fd unigram_file(util::OpenReadOrThrow(temp.c_str())); util::scoped_memory unigrams; - MapRead(util::POPULATE_OR_READ, unigram_file.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); - FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast(unigrams.get()), sri); + MapRead(util::POPULATE_OR_READ, unigram_fd.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); + FindBlanks finder(counts.size(), reinterpret_cast(unigrams.get()), sri); RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder); + fixed_counts = finder.Counts(); } + unigram_file.reset(util::FDOpenOrThrow(unigram_fd)); for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) { if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading"); } SanityCheckCounts(counts, fixed_counts); counts = fixed_counts; - util::scoped_FILE unigram_file; - { - std::string name(file_prefix + "unigrams"); - unigram_file.reset(OpenOrThrow(name.c_str(), "r+")); - util::RemoveOrThrow(name.c_str()); - } sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch::Size(fixed_counts, config), backing), fixed_counts, config); @@ -587,42 +584,19 @@ template void TrieSearch::LoadedBin longest.LoadedBinary(); } 
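BuildTrie above now receives already-open temporary files and converts the stolen unigram descriptor into a stdio stream with util::FDOpenOrThrow, whose implementation appears later in this patch. A self-contained sketch of that ownership handoff, assuming only POSIX fdopen/close and a scoped_fd-like holder:

#include <cstdio>
#include <stdexcept>
#include <unistd.h>

// Minimal stand-in for util::scoped_fd.
class ScopedFD {
 public:
  explicit ScopedFD(int fd) : fd_(fd) {}
  ~ScopedFD() { if (fd_ != -1) close(fd_); }
  int get() const { return fd_; }
  int release() { int ret = fd_; fd_ = -1; return ret; }
 private:
  int fd_;
};

// Wrap a descriptor in a FILE* and, only on success, release the scoped
// holder so that fclose() later closes the underlying descriptor exactly once.
std::FILE *FDOpenOrThrow(ScopedFD &fd) {
  std::FILE *ret = fdopen(fd.get(), "r+b");
  if (!ret) throw std::runtime_error("Could not fdopen");
  fd.release();
  return ret;
}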
-namespace { -bool IsDirectory(const char *path) { - struct stat info; - if (0 != stat(path, &info)) return false; - return S_ISDIR(info.st_mode); -} -} // namespace - template void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) { - std::string temporary_directory; + std::string temporary_prefix; if (config.temporary_directory_prefix) { - temporary_directory = config.temporary_directory_prefix; - if (!temporary_directory.empty() && temporary_directory[temporary_directory.size() - 1] != '/' && IsDirectory(temporary_directory.c_str())) - temporary_directory += '/'; + temporary_prefix = config.temporary_directory_prefix; } else if (config.write_mmap) { - temporary_directory = config.write_mmap; + temporary_prefix = config.write_mmap; } else { - temporary_directory = file; - } - // Null on end is kludge to ensure null termination. - temporary_directory += "_trie_tmp_XXXXXX"; - temporary_directory += '\0'; - if (!mkdtemp(&temporary_directory[0])) { - UTIL_THROW(util::ErrnoException, "Failed to make a temporary directory based on the name " << temporary_directory.c_str()); + temporary_prefix = file; } - // Chop off null kludge. - temporary_directory.resize(strlen(temporary_directory.c_str())); - // Add directory delimiter. Assumes a real operating system. - temporary_directory += '/'; // At least 1MB sorting memory. - ARPAToSortedFiles(config, f, counts, std::max(config.building_memory, 1048576), temporary_directory.c_str(), vocab); + SortedFiles sorted(config, f, counts, std::max(config.building_memory, 1048576), temporary_prefix, vocab); - BuildTrie(temporary_directory, counts, config, *this, quant_, vocab, backing); - if (rmdir(temporary_directory.c_str()) && config.messages) { - *config.messages << "Failed to delete " << temporary_directory << std::endl; - } + BuildTrie(sorted, counts, config, *this, quant_, vocab, backing); } template class TrieSearch; diff --git a/klm/lm/search_trie.hh b/klm/lm/search_trie.hh index 33ae8cff..5155ca02 100644 --- a/klm/lm/search_trie.hh +++ b/klm/lm/search_trie.hh @@ -7,6 +7,7 @@ #include "lm/trie.hh" #include "lm/weights.hh" +#include "util/file.hh" #include "util/file_piece.hh" #include @@ -20,7 +21,8 @@ class SortedVocabulary; namespace trie { template class TrieSearch; -template void BuildTrie(const std::string &file_prefix, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); +class SortedFiles; +template void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); template class TrieSearch { public: @@ -40,7 +42,7 @@ template class TrieSearch { static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config) { Quant::UpdateConfigFromBinary(fd, counts, config); - AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0])); + util::AdvanceOrThrow(fd, Quant::Size(counts.size(), config) + Unigram::Size(counts[0])); Bhiksha::UpdateConfigFromBinary(fd, config); } @@ -60,6 +62,8 @@ template class TrieSearch { void LoadedBinary(); + typedef const Middle *MiddleIter; + const Middle *MiddleBegin() const { return middle_begin_; } const Middle *MiddleEnd() const { return middle_end_; } @@ -108,7 +112,7 @@ template class TrieSearch { } private: - friend void BuildTrie(const std::string &file_prefix, std::vector &counts, const Config &config, 
TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); + friend void BuildTrie(SortedFiles &files, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing); // Middles are managed manually so we can delay construction and they don't have to be copyable. void FreeMiddles() { diff --git a/klm/lm/trie.hh b/klm/lm/trie.hh index 06cc96ac..ebe9910f 100644 --- a/klm/lm/trie.hh +++ b/klm/lm/trie.hh @@ -1,7 +1,7 @@ #ifndef LM_TRIE__ #define LM_TRIE__ -#include +#include #include diff --git a/klm/lm/trie_sort.cc b/klm/lm/trie_sort.cc index bb126f18..b80fed02 100644 --- a/klm/lm/trie_sort.cc +++ b/klm/lm/trie_sort.cc @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -22,14 +23,6 @@ namespace lm { namespace ngram { namespace trie { -const char *kContextSuffix = "_contexts"; - -FILE *OpenOrThrow(const char *name, const char *mode) { - FILE *ret = fopen(name, mode); - if (!ret) UTIL_THROW(util::ErrnoException, "Could not open " << name << " for " << mode); - return ret; -} - void WriteOrThrow(FILE *to, const void *data, size_t size) { assert(size); if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); @@ -78,28 +71,29 @@ class PartialViewProxy { typedef util::ProxyIterator PartialIter; -std::string DiskFlush(const void *mem_begin, const void *mem_end, const std::string &file_prefix, std::size_t batch, unsigned char order) { - std::stringstream assembled; - assembled << file_prefix << static_cast(order) << '_' << batch; - std::string ret(assembled.str()); - util::scoped_fd out(util::CreateOrThrow(ret.c_str())); - util::WriteOrThrow(out.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); - return ret; +FILE *DiskFlush(const void *mem_begin, const void *mem_end, const util::TempMaker &maker) { + util::scoped_fd file(maker.Make()); + util::WriteOrThrow(file.get(), mem_begin, (uint8_t*)mem_end - (uint8_t*)mem_begin); + return util::FDOpenOrThrow(file); } -void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_file_name, std::size_t entry_size, unsigned char order) { +FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &maker, std::size_t entry_size, unsigned char order) { const size_t context_size = sizeof(WordIndex) * (order - 1); // Sort just the contexts using the same memory. PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size)); PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size)); - std::sort(context_begin, context_end, util::SizedCompare(EntryCompare(order - 1))); +#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (context_begin, context_end, util::SizedCompare(EntryCompare(order - 1))); - std::string name(ngram_file_name + kContextSuffix); - util::scoped_FILE out(OpenOrThrow(name.c_str(), "w")); + util::scoped_FILE out(maker.MakeFile()); // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. 
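The "write out and uniqueify at the same time" step mentioned above reduces to walking the sorted context records and emitting one only when it differs from the previously written record. A standalone sketch over a raw byte range (the real code streams PartialViewProxy records and record_size is (order - 1) * sizeof(WordIndex)):

#include <cstdio>
#include <cstring>
#include <stdint.h>

// Emit each distinct record once; the input range must already be sorted and
// every record is `record_size` bytes.  Error checking on fwrite is omitted.
void WriteUnique(const uint8_t *begin, const uint8_t *end,
                 std::size_t record_size, std::FILE *out) {
  if (begin == end) return;
  std::fwrite(begin, record_size, 1, out);           // first record always goes out
  const uint8_t *previous = begin;
  for (const uint8_t *i = begin + record_size; i < end; i += record_size) {
    if (std::memcmp(previous, i, record_size)) {     // differs from last written
      std::fwrite(i, record_size, 1, out);
      previous = i;
    }
  }
}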
- if (context_begin == context_end) return; + if (context_begin == context_end) return out.release(); PartialIter i(context_begin); WriteOrThrow(out.get(), i->Data(), context_size); const void *previous = i->Data(); @@ -110,6 +104,7 @@ void WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &ngram_fil previous = i->Data(); } } + return out.release(); } struct ThrowCombine { @@ -125,14 +120,12 @@ struct FirstCombine { } }; -template void MergeSortedFiles(const std::string &first_name, const std::string &second_name, const std::string &out, std::size_t weights_size, unsigned char order, const Combine &combine = ThrowCombine()) { +template FILE *MergeSortedFiles(FILE *first_file, FILE *second_file, const util::TempMaker &maker, std::size_t weights_size, unsigned char order, const Combine &combine) { std::size_t entry_size = sizeof(WordIndex) * order + weights_size; RecordReader first, second; - first.Init(first_name.c_str(), entry_size); - util::RemoveOrThrow(first_name.c_str()); - second.Init(second_name.c_str(), entry_size); - util::RemoveOrThrow(second_name.c_str()); - util::scoped_FILE out_file(OpenOrThrow(out.c_str(), "w")); + first.Init(first_file, entry_size); + second.Init(second_file, entry_size); + util::scoped_FILE out_file(maker.MakeFile()); EntryCompare less(order); while (first && second) { if (less(first.Data(), second.Data())) { @@ -149,67 +142,14 @@ template void MergeSortedFiles(const std::string &first_name, co for (RecordReader &remains = (first ? first : second); remains; ++remains) { WriteOrThrow(out_file.get(), remains.Data(), entry_size); } -} - -void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, util::scoped_memory &mem, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn) { - ReadNGramHeader(f, order); - const size_t count = counts[order - 1]; - // Size of weights. Does it include backoff? - const size_t words_size = sizeof(WordIndex) * order; - const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float)); - const size_t entry_size = words_size + weights_size; - const size_t batch_size = std::min(count, mem.size() / entry_size); - uint8_t *const begin = reinterpret_cast(mem.get()); - std::deque files; - for (std::size_t batch = 0, done = 0; done < count; ++batch) { - uint8_t *out = begin; - uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size; - if (order == counts.size()) { - for (; out != out_end; out += entry_size) { - ReadNGram(f, order, vocab, reinterpret_cast(out), *reinterpret_cast(out + words_size), warn); - } - } else { - for (; out != out_end; out += entry_size) { - ReadNGram(f, order, vocab, reinterpret_cast(out), *reinterpret_cast(out + words_size), warn); - } - } - // Sort full records by full n-gram. - util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size); - // parallel_sort uses too much RAM - std::sort(NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare(EntryCompare(order))); - files.push_back(DiskFlush(begin, out_end, file_prefix, batch, order)); - WriteContextFile(begin, out_end, files.back(), entry_size, order); - - done += (out_end - begin) / entry_size; - } - - // All individual files created. Merge them. 
- - std::size_t merge_count = 0; - while (files.size() > 1) { - std::stringstream assembled; - assembled << file_prefix << static_cast(order) << "_merge_" << (merge_count++); - files.push_back(assembled.str()); - MergeSortedFiles(files[0], files[1], files.back(), weights_size, order, ThrowCombine()); - MergeSortedFiles(files[0] + kContextSuffix, files[1] + kContextSuffix, files.back() + kContextSuffix, 0, order - 1, FirstCombine()); - files.pop_front(); - files.pop_front(); - } - if (!files.empty()) { - std::stringstream assembled; - assembled << file_prefix << static_cast(order) << "_merged"; - std::string merged_name(assembled.str()); - if (std::rename(files[0].c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << files[0].c_str() << " to " << merged_name.c_str()); - std::string context_name = files[0] + kContextSuffix; - merged_name += kContextSuffix; - if (std::rename(context_name.c_str(), merged_name.c_str())) UTIL_THROW(util::ErrnoException, "Could not rename " << context_name << " to " << merged_name.c_str()); - } + return out_file.release(); } } // namespace -void RecordReader::Init(const std::string &name, std::size_t entry_size) { - file_.reset(OpenOrThrow(name.c_str(), "r+")); +void RecordReader::Init(FILE *file, std::size_t entry_size) { + rewind(file); + file_ = file; data_.reset(malloc(entry_size)); UTIL_THROW_IF(!data_.get(), util::ErrnoException, "Failed to malloc read buffer"); remains_ = true; @@ -219,20 +159,29 @@ void RecordReader::Init(const std::string &name, std::size_t entry_size) { void RecordReader::Overwrite(const void *start, std::size_t amount) { long internal = (uint8_t*)start - (uint8_t*)data_.get(); - UTIL_THROW_IF(fseek(file_.get(), internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); - WriteOrThrow(file_.get(), start, amount); + UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); + WriteOrThrow(file_, start, amount); long forward = entry_size_ - internal - amount; - if (forward) UTIL_THROW_IF(fseek(file_.get(), forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision"); +#if !defined(_WIN32) && !defined(_WIN64) + if (forward) +#endif + UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision"); } -void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { +void RecordReader::Rewind() { + rewind(file_); + remains_ = true; + ++*this; +} + +SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab) { + util::TempMaker maker(file_prefix); PositiveProbWarn warn(config.positive_log_probability); + unigram_.reset(maker.Make()); { - std::string unigram_name = file_prefix + "unigrams"; - util::scoped_fd unigram_file; // In case appears. 
- size_t file_out = (counts[0] + 1) * sizeof(ProbBackoff); - util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_name.c_str(), file_out, unigram_file), file_out); + size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff); + util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out); Read1Grams(f, counts[0], vocab, reinterpret_cast(unigram_mmap.get()), warn); CheckSpecials(config, vocab); if (!vocab.SawUnk()) ++counts[0]; @@ -246,16 +195,96 @@ void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector(buffer_use, static_cast((sizeof(WordIndex) * counts.size() + sizeof(float)) * counts.back())); buffer = std::min(buffer, buffer_use); - util::scoped_memory mem; - mem.reset(malloc(buffer), buffer, util::scoped_memory::MALLOC_ALLOCATED); + util::scoped_malloc mem; + mem.reset(malloc(buffer)); if (!mem.get()) UTIL_THROW(util::ErrnoException, "malloc failed for sort buffer size " << buffer); for (unsigned char order = 2; order <= counts.size(); ++order) { - ConvertToSorted(f, vocab, counts, mem, file_prefix, order, warn); + ConvertToSorted(f, vocab, counts, maker, order, warn, mem.get(), buffer); } ReadEnd(f); } +namespace { +class Closer { + public: + explicit Closer(std::deque &files) : files_(files) {} + + ~Closer() { + for (std::deque::iterator i = files_.begin(); i != files_.end(); ++i) { + util::scoped_FILE deleter(*i); + } + } + + void PopFront() { + util::scoped_FILE deleter(files_.front()); + files_.pop_front(); + } + private: + std::deque &files_; +}; +} // namespace + +void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) { + ReadNGramHeader(f, order); + const size_t count = counts[order - 1]; + // Size of weights. Does it include backoff? + const size_t words_size = sizeof(WordIndex) * order; + const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float)); + const size_t entry_size = words_size + weights_size; + const size_t batch_size = std::min(count, mem_size / entry_size); + uint8_t *const begin = reinterpret_cast(mem); + + std::deque files, contexts; + Closer files_closer(files), contexts_closer(contexts); + + for (std::size_t batch = 0, done = 0; done < count; ++batch) { + uint8_t *out = begin; + uint8_t *out_end = out + std::min(count - done, batch_size) * entry_size; + if (order == counts.size()) { + for (; out != out_end; out += entry_size) { + ReadNGram(f, order, vocab, reinterpret_cast(out), *reinterpret_cast(out + words_size), warn); + } + } else { + for (; out != out_end; out += entry_size) { + ReadNGram(f, order, vocab, reinterpret_cast(out), *reinterpret_cast(out + words_size), warn); + } + } + // Sort full records by full n-gram. + util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size); + // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies. +#if defined(_WIN32) || defined(_WIN64) + std::stable_sort +#else + std::sort +#endif + (NGramIter(proxy_begin), NGramIter(proxy_end), util::SizedCompare(EntryCompare(order))); + files.push_back(DiskFlush(begin, out_end, maker)); + contexts.push_back(WriteContextFile(begin, out_end, maker, entry_size, order)); + + done += (out_end - begin) / entry_size; + } + + // All individual files created. Merge them. 
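The loop that follows merges the sorted batch files pairwise until a single fully sorted file remains, always taking the two oldest files and pushing the result on the back of the deque. A simplified, self-contained sketch of that strategy over files of sorted uint64_t keys (the real MergeSortedFiles merges fixed-size n-gram records and resolves duplicates via ThrowCombine/FirstCombine):

#include <cstdio>
#include <deque>
#include <stdint.h>

// Two-way merge of files holding sorted uint64_t keys into a new unnamed
// temporary file.  tmpfile() failure handling is omitted for brevity.
std::FILE *MergePair(std::FILE *a, std::FILE *b) {
  std::FILE *out = std::tmpfile();
  std::rewind(a);
  std::rewind(b);
  uint64_t x, y;
  bool have_x = std::fread(&x, sizeof(x), 1, a) == 1;
  bool have_y = std::fread(&y, sizeof(y), 1, b) == 1;
  while (have_x || have_y) {
    if (have_x && (!have_y || x <= y)) {
      std::fwrite(&x, sizeof(x), 1, out);
      have_x = std::fread(&x, sizeof(x), 1, a) == 1;
    } else {
      std::fwrite(&y, sizeof(y), 1, out);
      have_y = std::fread(&y, sizeof(y), 1, b) == 1;
    }
  }
  return out;
}

// Mirror of the loop in the diff: merge the two oldest files, close them, and
// queue the result; the last survivor holds every record in sorted order.
std::FILE *MergeAll(std::deque<std::FILE *> &files) {
  while (files.size() > 1) {
    std::FILE *merged = MergePair(files[0], files[1]);
    std::fclose(files[0]);
    std::fclose(files[1]);
    files.pop_front();
    files.pop_front();
    files.push_back(merged);
  }
  return files.empty() ? NULL : files.front();
}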
+ + while (files.size() > 1) { + files.push_back(MergeSortedFiles(files[0], files[1], maker, weights_size, order, ThrowCombine())); + files_closer.PopFront(); + files_closer.PopFront(); + contexts.push_back(MergeSortedFiles(contexts[0], contexts[1], maker, 0, order - 1, FirstCombine())); + contexts_closer.PopFront(); + contexts_closer.PopFront(); + } + + if (!files.empty()) { + // Steal from closers. + full_[order - 2].reset(files.front()); + files.pop_front(); + context_[order - 2].reset(contexts.front()); + contexts.pop_front(); + } +} + } // namespace trie } // namespace ngram } // namespace lm diff --git a/klm/lm/trie_sort.hh b/klm/lm/trie_sort.hh index a6916483..3036319d 100644 --- a/klm/lm/trie_sort.hh +++ b/klm/lm/trie_sort.hh @@ -1,6 +1,9 @@ +// Step of trie builder: create sorted files. + #ifndef LM_TRIE_SORT__ #define LM_TRIE_SORT__ +#include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/file.hh" @@ -11,20 +14,21 @@ #include #include -#include +#include -namespace util { class FilePiece; } +namespace util { +class FilePiece; +class TempMaker; +} // namespace util -// Step of trie builder: create sorted files. namespace lm { +class PositiveProbWarn; namespace ngram { class SortedVocabulary; class Config; namespace trie { -extern const char *kContextSuffix; -FILE *OpenOrThrow(const char *name, const char *mode); void WriteOrThrow(FILE *to, const void *data, size_t size); class EntryCompare : public std::binary_function { @@ -49,15 +53,15 @@ class RecordReader { public: RecordReader() : remains_(true) {} - void Init(const std::string &name, std::size_t entry_size); + void Init(FILE *file, std::size_t entry_size); void *Data() { return data_.get(); } const void *Data() const { return data_.get(); } RecordReader &operator++() { - std::size_t ret = fread(data_.get(), entry_size_, 1, file_.get()); + std::size_t ret = fread(data_.get(), entry_size_, 1, file_); if (!ret) { - UTIL_THROW_IF(!feof(file_.get()), util::ErrnoException, "Error reading temporary file"); + UTIL_THROW_IF(!feof(file_), util::ErrnoException, "Error reading temporary file"); remains_ = false; } return *this; @@ -65,27 +69,46 @@ class RecordReader { operator bool() const { return remains_; } - void Rewind() { - rewind(file_.get()); - remains_ = true; - ++*this; - } + void Rewind(); std::size_t EntrySize() const { return entry_size_; } void Overwrite(const void *start, std::size_t amount); private: + FILE *file_; + util::scoped_malloc data_; bool remains_; std::size_t entry_size_; - - util::scoped_FILE file_; }; -void ARPAToSortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); +class SortedFiles { + public: + // Build from ARPA + SortedFiles(const Config &config, util::FilePiece &f, std::vector &counts, std::size_t buffer, const std::string &file_prefix, SortedVocabulary &vocab); + + int StealUnigram() { + return unigram_.release(); + } + + FILE *Full(unsigned char order) { + return full_[order - 2].get(); + } + + FILE *Context(unsigned char of_order) { + return context_[of_order - 2].get(); + } + + private: + void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector &counts, const util::TempMaker &maker, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size); + + util::scoped_fd unigram_; + + util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1]; +}; } // namespace trie } // namespace ngram diff --git a/klm/lm/vocab.cc b/klm/lm/vocab.cc index 
ffec41ca..9fd698bb 100644 --- a/klm/lm/vocab.cc +++ b/klm/lm/vocab.cc @@ -6,12 +6,15 @@ #include "lm/config.hh" #include "lm/weights.hh" #include "util/exception.hh" +#include "util/file.hh" #include "util/joint_sort.hh" #include "util/murmur_hash.hh" #include "util/probing_hash_table.hh" #include +#include + namespace lm { namespace ngram { @@ -29,23 +32,30 @@ const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5); // Sadly some LMs have <UNK>. const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5); -WordIndex ReadWords(int fd, EnumerateVocab *enumerate) { - if (!enumerate) return std::numeric_limits<WordIndex>::max(); +void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count) { + // Check that we're at the right place by reading <unk> which is always first. + char check_unk[6]; + util::ReadOrThrow(fd, check_unk, 6); + UTIL_THROW_IF( + memcmp(check_unk, "<unk>", 6), + FormatLoadException, + "Vocabulary words are in the wrong place. This could be because the binary file was built with stale gcc and old kenlm. Stale gcc, including the gcc distributed with RedHat and OS X, has a bug that ignores pragma pack for template-dependent types. New kenlm works around this, so you'll save memory but have to rebuild any binary files using the probing data structure."); + if (!enumerate) return; + enumerate->Add(0, "<unk>"); + + // Read all the words after unk. const std::size_t kInitialRead = 16384; std::string buf; buf.reserve(kInitialRead + 100); buf.resize(kInitialRead); - WordIndex index = 0; + WordIndex index = 1; // Read <unk> already. while (true) { - ssize_t got = read(fd, &buf[0], kInitialRead); - UTIL_THROW_IF(got == -1, util::ErrnoException, "Reading vocabulary words"); - if (got == 0) return index; + std::size_t got = util::ReadOrEOF(fd, &buf[0], kInitialRead); + if (got == 0) break; buf.resize(got); while (buf[buf.size() - 1]) { char next_char; - ssize_t ret = read(fd, &next_char, 1); - UTIL_THROW_IF(ret == -1, util::ErrnoException, "Reading vocabulary words"); - UTIL_THROW_IF(ret == 0, FormatLoadException, "Missing null terminator on a vocab word."); + util::ReadOrThrow(fd, &next_char, 1); buf.push_back(next_char); } // Ok now we have null terminated strings. @@ -55,6 +65,8 @@ WordIndex ReadWords(int fd, EnumerateVocab *enumerate) { i += length + 1 /* null byte */; } } + + UTIL_THROW_IF(expected_count != index, FormatLoadException, "The binary file has the wrong number of words at the end. This could be caused by a truncated binary file."); } } // namespace @@ -69,8 +81,7 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) { } void WriteWordsWrapper::Write(int fd) { - if ((off_t)-1 == lseek(fd, 0, SEEK_END)) - UTIL_THROW(util::ErrnoException, "Failed to seek in binary to vocab words"); + util::SeekEnd(fd); util::WriteOrThrow(fd, buffer_.data(), buffer_.size()); } @@ -114,8 +125,10 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) { void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { if (enumerate_) { - util::PairedIterator values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); - util::JointSort(begin_, end_, values); + if (!strings_to_enumerate_.empty()) { + util::PairedIterator values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); + util::JointSort(begin_, end_, values); + } for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) { // <unk> strikes again: +1 here.
enumerate_->Add(i + 1, strings_to_enumerate_[i]); @@ -131,11 +144,11 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { bound_ = end_ - begin_ + 1; } -void SortedVocabulary::LoadedBinary(int fd, EnumerateVocab *to) { +void SortedVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) { end_ = begin_ + *(reinterpret_cast(begin_) - 1); - ReadWords(fd, to); SetSpecial(Index(""), Index(""), 0); bound_ = end_ - begin_ + 1; + if (have_words) ReadWords(fd, to, bound_); } namespace { @@ -153,12 +166,12 @@ struct ProbingVocabularyHeader { ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {} std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) { - return Align8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier); + return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier); } void ProbingVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t /*entries*/, const Config &/*config*/) { header_ = static_cast(start); - lookup_ = Lookup(static_cast(start) + Align8(sizeof(detail::ProbingVocabularyHeader)), allocated); + lookup_ = Lookup(static_cast(start) + ALIGN8(sizeof(detail::ProbingVocabularyHeader)), allocated); bound_ = 1; saw_unk_ = false; } @@ -178,7 +191,7 @@ WordIndex ProbingVocabulary::Insert(const StringPiece &str) { return 0; } else { if (enumerate_) enumerate_->Add(bound_, str); - lookup_.Insert(Lookup::Packing::Make(hashed, bound_)); + lookup_.Insert(ProbingVocabuaryEntry::Make(hashed, bound_)); return bound_++; } } @@ -190,12 +203,12 @@ void ProbingVocabulary::FinishedLoading(ProbBackoff * /*reorder_vocab*/) { SetSpecial(Index(""), Index(""), 0); } -void ProbingVocabulary::LoadedBinary(int fd, EnumerateVocab *to) { +void ProbingVocabulary::LoadedBinary(bool have_words, int fd, EnumerateVocab *to) { UTIL_THROW_IF(header_->version != kProbingVocabularyVersion, FormatLoadException, "The binary file has probing version " << header_->version << " but the code expects version " << kProbingVocabularyVersion << ". 
Please rerun build_binary using the same version of the code."); lookup_.LoadedBinary(); - ReadWords(fd, to); bound_ = header_->bound; SetSpecial(Index(""), Index(""), 0); + if (have_words) ReadWords(fd, to, bound_); } void MissingUnknown(const Config &config) throw(SpecialWordMissingException) { diff --git a/klm/lm/vocab.hh b/klm/lm/vocab.hh index 3c3414fb..06fdefe4 100644 --- a/klm/lm/vocab.hh +++ b/klm/lm/vocab.hh @@ -4,7 +4,6 @@ #include "lm/enumerate_vocab.hh" #include "lm/lm_exception.hh" #include "lm/virtual_interface.hh" -#include "util/key_value_packing.hh" #include "util/probing_hash_table.hh" #include "util/sorted_uniform.hh" #include "util/string_piece.hh" @@ -83,7 +82,7 @@ class SortedVocabulary : public base::Vocabulary { bool SawUnk() const { return saw_unk_; } - void LoadedBinary(int fd, EnumerateVocab *to); + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to); private: uint64_t *begin_, *end_; @@ -100,6 +99,26 @@ class SortedVocabulary : public base::Vocabulary { std::vector strings_to_enumerate_; }; +#pragma pack(push) +#pragma pack(4) +struct ProbingVocabuaryEntry { + uint64_t key; + WordIndex value; + + typedef uint64_t Key; + uint64_t GetKey() const { + return key; + } + + static ProbingVocabuaryEntry Make(uint64_t key, WordIndex value) { + ProbingVocabuaryEntry ret; + ret.key = key; + ret.value = value; + return ret; + } +}; +#pragma pack(pop) + // Vocabulary storing a map from uint64_t to WordIndex. class ProbingVocabulary : public base::Vocabulary { public: @@ -107,7 +126,7 @@ class ProbingVocabulary : public base::Vocabulary { WordIndex Index(const StringPiece &str) const { Lookup::ConstIterator i; - return lookup_.Find(detail::HashForVocab(str), i) ? i->GetValue() : 0; + return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; } static size_t Size(std::size_t entries, const Config &config); @@ -124,17 +143,14 @@ class ProbingVocabulary : public base::Vocabulary { void FinishedLoading(ProbBackoff *reorder_vocab); + std::size_t UnkCountChangePadding() const { return 0; } + bool SawUnk() const { return saw_unk_; } - void LoadedBinary(int fd, EnumerateVocab *to); + void LoadedBinary(bool have_words, int fd, EnumerateVocab *to); private: - // std::identity is an SGI extension :-( - struct IdentityHash : public std::unary_function { - std::size_t operator()(uint64_t arg) const { return static_cast(arg); } - }; - - typedef util::ProbingHashTable, IdentityHash> Lookup; + typedef util::ProbingHashTable Lookup; Lookup lookup_; diff --git a/klm/util/bit_packing.hh b/klm/util/bit_packing.hh index 33266b94..73a5cb22 100644 --- a/klm/util/bit_packing.hh +++ b/klm/util/bit_packing.hh @@ -1,33 +1,37 @@ #ifndef UTIL_BIT_PACKING__ #define UTIL_BIT_PACKING__ -/* Bit-level packing routines */ +/* Bit-level packing routines + * + * WARNING WARNING WARNING: + * The write functions assume that memory is zero initially. This makes them + * faster and is the appropriate case for mmapped language model construction. + * These routines assume that unaligned access to uint64_t is fast. This is + * the case on x86_64. I'm not sure how fast unaligned 64-bit access is on + * x86 but my target audience is large language models for which 64-bit is + * necessary. + * + * Call the BitPackingSanity function to sanity check. Calling once suffices, + * but it may be called multiple times when that's inconvenient. + * + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. 
+ */ #include #ifdef __APPLE__ #include #elif __linux__ #include -#else +#elif !defined(_WIN32) && !defined(_WIN64) #include #endif -#include - -namespace util { +#include -/* WARNING WARNING WARNING: - * The write functions assume that memory is zero initially. This makes them - * faster and is the appropriate case for mmapped language model construction. - * These routines assume that unaligned access to uint64_t is fast and that - * storage is little endian. This is the case on x86_64. I'm not sure how - * fast unaligned 64-bit access is on x86 but my target audience is large - * language models for which 64-bit is necessary. - * - * Call the BitPackingSanity function to sanity check. Calling once suffices, - * but it may be called multiple times when that's inconvenient. - */ +#include +namespace util { // Fun fact: __BYTE_ORDER is wrong on Solaris Sparc, but the version without __ is correct. #if BYTE_ORDER == LITTLE_ENDIAN @@ -43,7 +47,14 @@ inline uint8_t BitPackShift(uint8_t bit, uint8_t length) { #endif inline uint64_t ReadOff(const void *base, uint64_t bit_off) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + return value64; +#else return *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)); +#endif } /* Pack integers up to 57 bits using their least significant digits. @@ -57,18 +68,41 @@ inline uint64_t ReadInt57(const void *base, uint64_t bit_off, uint8_t length, ui * Assumes the memory is zero initially. */ inline void WriteInt57(void *base, uint64_t bit_off, uint8_t length, uint64_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint64_t value64; + memcpy(&value64, base_off, sizeof(value64)); + value64 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value64, sizeof(value64)); +#else *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); +#endif } /* Same caveats as above, but for a 25 bit limit. 
*/ inline uint32_t ReadInt25(const void *base, uint64_t bit_off, uint8_t length, uint32_t mask) { +#if defined(__arm) || defined(__arm__) + const uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + return (value32 >> BitPackShift(bit_off & 7, length)) & mask; +#else return (*reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) >> BitPackShift(bit_off & 7, length)) & mask; +#endif } inline void WriteInt25(void *base, uint64_t bit_off, uint8_t length, uint32_t value) { +#if defined(__arm) || defined(__arm__) + uint8_t *base_off = reinterpret_cast(base) + (bit_off >> 3); + uint32_t value32; + memcpy(&value32, base_off, sizeof(value32)); + value32 |= (value << BitPackShift(bit_off & 7, length)); + memcpy(base_off, &value32, sizeof(value32)); +#else *reinterpret_cast(reinterpret_cast(base) + (bit_off >> 3)) |= (value << BitPackShift(bit_off & 7, length)); +#endif } typedef union { float f; uint32_t i; } FloatEnc; diff --git a/klm/util/exception.cc b/klm/util/exception.cc index 96951495..c4f8c04c 100644 --- a/klm/util/exception.cc +++ b/klm/util/exception.cc @@ -66,7 +66,7 @@ const char *HandleStrerror(const char *ret, const char * /*buf*/) { ErrnoException::ErrnoException() throw() : errno_(errno) { char buf[200]; buf[0] = 0; -#ifdef sun +#if defined(sun) || defined(_WIN32) || defined(_WIN64) const char *add = strerror(errno); #else const char *add = HandleStrerror(strerror_r(errno, buf, 200), buf); diff --git a/klm/util/file.cc b/klm/util/file.cc index d707568e..aee7c77a 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -9,8 +9,12 @@ #include #include #include -#include -#include +#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#endif namespace util { @@ -30,33 +34,61 @@ scoped_FILE::~scoped_FILE() { int OpenReadOrThrow(const char *name) { int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_BINARY | _O_RDONLY)), ErrnoException, "while opening " << name); +#else UTIL_THROW_IF(-1 == (ret = open(name, O_RDONLY)), ErrnoException, "while opening " << name); +#endif return ret; } -int CreateOrThrow(const char *name) { - int ret; - UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR)), ErrnoException, "while creating " << name); - return ret; -} - -off_t SizeFile(int fd) { +uint64_t SizeFile(int fd) { +#if defined(_WIN32) || defined(_WIN64) + __int64 ret = _filelengthi64(fd); + return (ret == -1) ? 
kBadSize : ret; +#else struct stat sb; if (fstat(fd, &sb) == -1 || (!sb.st_size && !S_ISREG(sb.st_mode))) return kBadSize; return sb.st_size; +#endif +} + +void ResizeOrThrow(int fd, uint64_t to) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(_chsize_s(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#else + UTIL_THROW_IF(ftruncate(fd, to), ErrnoException, "Resizing to " << to << " bytes failed"); +#endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void ReadOrThrow(int fd, void *to_void, std::size_t amount) { uint8_t *to = static_cast(to_void); while (amount) { ssize_t ret = read(fd, to, amount); - if (ret == -1) UTIL_THROW(ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); - if (ret == 0) UTIL_THROW(Exception, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << amount << " from fd " << fd << " failed."); + UTIL_THROW_IF(ret == 0, EndOfFileException, "Hit EOF in fd " << fd << " but there should be " << amount << " more bytes to read."); amount -= ret; to += ret; } } +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount) { + uint8_t *to = static_cast(to_void); + std::size_t remaining = amount; + while (remaining) { + ssize_t ret = read(fd, to, remaining); + UTIL_THROW_IF(ret == -1, ErrnoException, "Reading " << remaining << " from fd " << fd << " failed."); + if (!ret) return amount - remaining; + remaining -= ret; + to += ret; + } + return amount; +} + void WriteOrThrow(int fd, const void *data_void, std::size_t size) { const uint8_t *data = static_cast(data_void); while (size) { @@ -67,8 +99,172 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) { } } -void RemoveOrThrow(const char *name) { - UTIL_THROW_IF(std::remove(name), util::ErrnoException, "Could not remove " << name); +void FSyncOrThrow(int fd) { +// Apparently windows doesn't have fsync? +#if !defined(_WIN32) && !defined(_WIN64) + UTIL_THROW_IF(-1 == fsync(fd), ErrnoException, "Sync of " << fd << " failed."); +#endif +} + +namespace { +void InternalSeek(int fd, off_t off, int whence) { + UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed"); +} +} // namespace + +void SeekOrThrow(int fd, uint64_t off) { + InternalSeek(fd, off, SEEK_SET); +} + +void AdvanceOrThrow(int fd, int64_t off) { + InternalSeek(fd, off, SEEK_CUR); +} + +void SeekEnd(int fd) { + InternalSeek(fd, 0, SEEK_END); +} + +std::FILE *FDOpenOrThrow(scoped_fd &file) { + std::FILE *ret = fdopen(file.get(), "r+b"); + if (!ret) UTIL_THROW(util::ErrnoException, "Could not fdopen"); + file.release(); + return ret; +} + +TempMaker::TempMaker(const std::string &prefix) : base_(prefix) { + base_ += "XXXXXX"; +} + +// Sigh. Windows temporary file creation is full of race conditions. +#if defined(_WIN32) || defined(_WIN64) +/* mkstemp extracted from libc/sysdeps/posix/tempname.c. Copyright + (C) 1991-1999, 2000, 2001, 2006 Free Software Foundation, Inc. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. */ + +/* This has been modified from the original version to rename the function and + * set the Windows temporary flag. 
*/ + +static const char letters[] = +"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"; + +/* Generate a temporary file name based on TMPL. TMPL must match the + rules for mk[s]temp (i.e. end in "XXXXXX"). The name constructed + does not exist at the time of the call to mkstemp. TMPL is + overwritten with the result. */ +int +mkstemp_and_unlink(char *tmpl) +{ + int len; + char *XXXXXX; + static unsigned long long value; + unsigned long long random_time_bits; + unsigned int count; + int fd = -1; + int save_errno = errno; + + /* A lower bound on the number of temporary files to attempt to + generate. The maximum total number of temporary file names that + can exist for a given template is 62**6. It should never be + necessary to try all these combinations. Instead if a reasonable + number of names is tried (we define reasonable as 62**3) fail to + give the system administrator the chance to remove the problems. */ +#define ATTEMPTS_MIN (62 * 62 * 62) + + /* The number of times to attempt to generate a temporary file. To + conform to POSIX, this must be no smaller than TMP_MAX. */ +#if ATTEMPTS_MIN < TMP_MAX + unsigned int attempts = TMP_MAX; +#else + unsigned int attempts = ATTEMPTS_MIN; +#endif + + len = strlen (tmpl); + if (len < 6 || strcmp (&tmpl[len - 6], "XXXXXX")) + { + errno = EINVAL; + return -1; + } + +/* This is where the Xs start. */ + XXXXXX = &tmpl[len - 6]; + + /* Get some more or less random data. */ + { + SYSTEMTIME stNow; + FILETIME ftNow; + + // get system time + GetSystemTime(&stNow); + stNow.wMilliseconds = 500; + if (!SystemTimeToFileTime(&stNow, &ftNow)) + { + errno = -1; + return -1; + } + + random_time_bits = (((unsigned long long)ftNow.dwHighDateTime << 32) + | (unsigned long long)ftNow.dwLowDateTime); + } + value += random_time_bits ^ (unsigned long long)GetCurrentThreadId (); + + for (count = 0; count < attempts; value += 7777, ++count) + { + unsigned long long v = value; + + /* Fill in the random bits. */ + XXXXXX[0] = letters[v % 62]; + v /= 62; + XXXXXX[1] = letters[v % 62]; + v /= 62; + XXXXXX[2] = letters[v % 62]; + v /= 62; + XXXXXX[3] = letters[v % 62]; + v /= 62; + XXXXXX[4] = letters[v % 62]; + v /= 62; + XXXXXX[5] = letters[v % 62]; + + /* Modified for windows and to unlink */ + // fd = open (tmpl, O_RDWR | O_CREAT | O_EXCL, _S_IREAD | _S_IWRITE); + fd = _open (tmpl, _O_RDWR | _O_CREAT | _O_TEMPORARY | _O_EXCL | _O_BINARY, _S_IREAD | _S_IWRITE); + if (fd >= 0) + { + errno = save_errno; + return fd; + } + else if (errno != EEXIST) + return -1; + } + + /* We got out of the loop because we ran out of combinations to try. 
*/ + errno = EEXIST; + return -1; +} +#else +int +mkstemp_and_unlink(char *tmpl) { + int ret = mkstemp(tmpl); + if (ret == -1) return -1; + UTIL_THROW_IF(unlink(tmpl), util::ErrnoException, "Failed to delete " << tmpl); + return ret; +} +#endif + +int TempMaker::Make() const { + std::string copy(base_); + copy.push_back(0); + int ret; + UTIL_THROW_IF(-1 == (ret = mkstemp_and_unlink(©[0])), util::ErrnoException, "Failed to make a temporary based on " << base_); + return ret; +} + +std::FILE *TempMaker::MakeFile() const { + util::scoped_fd file(Make()); + return FDOpenOrThrow(file); } } // namespace util diff --git a/klm/util/file.hh b/klm/util/file.hh index d6cca41d..5c57e2a9 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -1,8 +1,11 @@ #ifndef UTIL_FILE__ #define UTIL_FILE__ +#include #include -#include +#include + +#include namespace util { @@ -52,22 +55,49 @@ class scoped_FILE { file_ = to; } + std::FILE *release() { + std::FILE *ret = file_; + file_ = NULL; + return ret; + } + private: std::FILE *file_; }; int OpenReadOrThrow(const char *name); -int CreateOrThrow(const char *name); - // Return value for SizeFile when it can't size properly. -const off_t kBadSize = -1; -off_t SizeFile(int fd); +const uint64_t kBadSize = (uint64_t)-1; +uint64_t SizeFile(int fd); + +void ResizeOrThrow(int fd, uint64_t to); void ReadOrThrow(int fd, void *to, std::size_t size); +std::size_t ReadOrEOF(int fd, void *to_void, std::size_t amount); + void WriteOrThrow(int fd, const void *data_void, std::size_t size); -void RemoveOrThrow(const char *name); +void FSyncOrThrow(int fd); + +// Seeking +void SeekOrThrow(int fd, uint64_t off); +void AdvanceOrThrow(int fd, int64_t off); +void SeekEnd(int fd); + +std::FILE *FDOpenOrThrow(scoped_fd &file); + +class TempMaker { + public: + explicit TempMaker(const std::string &prefix); + + int Make() const; + + std::FILE *MakeFile() const; + + private: + std::string base_; +}; } // namespace util diff --git a/klm/util/file_piece.cc b/klm/util/file_piece.cc index b57582a0..081e662b 100644 --- a/klm/util/file_piece.cc +++ b/klm/util/file_piece.cc @@ -2,6 +2,10 @@ #include "util/exception.hh" #include "util/file.hh" +#include "util/mmap.hh" +#ifdef WIN32 +#include +#endif // WIN32 #include #include @@ -11,14 +15,8 @@ #include #include #include -#include #include #include -#include - -#ifdef HAVE_ZLIB -#include -#endif namespace util { @@ -26,24 +24,24 @@ ParseNumberException::ParseNumberException(StringPiece value) throw() { *this << "Could not parse \"" << value << "\" into a number"; } -GZException::GZException(void *file) { #ifdef HAVE_ZLIB +GZException::GZException(gzFile file) { int num; - *this << gzerror(file, &num) << " from zlib"; -#endif // HAVE_ZLIB + *this << gzerror( file, &num) << " from zlib"; } +#endif // HAVE_ZLIB // Sigh this is the only way I could come up with to do a _const_ bool. It has ' ', '\f', '\n', '\r', '\t', and '\v' (same as isspace on C locale). 
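TempMaker and mkstemp_and_unlink above create scratch files that are unlinked the moment they are opened, so they disappear automatically when the descriptor or FILE* is closed, even if the process dies. A POSIX-only usage sketch, assuming nothing beyond mkstemp and unlink (the hypothetical MakeTemp mirrors TempMaker::Make without the Windows branch):

#include <stdexcept>
#include <string>
#include <stdlib.h>
#include <unistd.h>

// Hypothetical MakeTemp: the POSIX half of TempMaker::Make.  Once unlink()
// succeeds there is no directory entry, so closing the descriptor (or simply
// exiting) reclaims the file with no cleanup code.
int MakeTemp(const std::string &prefix) {
  std::string name(prefix + "XXXXXX");
  name.push_back('\0');                 // mkstemp edits the buffer in place
  int fd = mkstemp(&name[0]);
  if (fd == -1) throw std::runtime_error("mkstemp failed for " + prefix);
  if (unlink(name.c_str())) {
    close(fd);
    throw std::runtime_error("unlink failed for temporary file");
  }
  return fd;
}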
const bool kSpaces[256] = {0,0,0,0,0,0,0,0,0,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}; -FilePiece::FilePiece(const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(OpenReadOrThrow(name)), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } -FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, off_t min_buffer) : - file_(fd), total_size_(SizeFile(file_.get())), page_(sysconf(_SC_PAGE_SIZE)), +FilePiece::FilePiece(int fd, const char *name, std::ostream *show_progress, std::size_t min_buffer) : + file_(fd), total_size_(SizeFile(file_.get())), page_(SizePage()), progress_(total_size_ == kBadSize ? NULL : show_progress, std::string("Reading ") + name, total_size_) { Initialize(name, show_progress, min_buffer); } @@ -63,7 +61,7 @@ FilePiece::~FilePiece() { } StringPiece FilePiece::ReadLine(char delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (*i == delim) { @@ -94,13 +92,13 @@ unsigned long int FilePiece::ReadULong() { return ReadNumber(); } -void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t min_buffer) { +void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer) { #ifdef HAVE_ZLIB gz_file_ = NULL; #endif file_name_ = name; - default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); + default_map_size_ = page_ * std::max((min_buffer / page_ + 1), 2); position_ = NULL; position_end_ = NULL; mapped_offset_ = 0; @@ -130,7 +128,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, off_t namespace { void ParseNumber(const char *begin, char *&end, float &out) { -#ifdef sun +#if defined(sun) || defined(WIN32) out = static_cast(strtod(begin, &end)); #else out = strtof(begin, &end); @@ -171,7 +169,7 @@ template T FilePiece::ReadNumber() { } const char *FilePiece::FindDelimiterOrEOF(const bool *delim) { - size_t skip = 0; + std::size_t skip = 0; while (true) { for (const char *i = position_ + skip; i < position_end_; ++i) { if (delim[static_cast(*i)]) return i; @@ -190,7 +188,7 @@ void FilePiece::Shift() { progress_.Finished(); throw EndOfFileException(); } - off_t desired_begin = position_ - data_.begin() + mapped_offset_; + uint64_t desired_begin = position_ - data_.begin() + mapped_offset_; if (!fallback_to_read_) MMapShift(desired_begin); // Notice an mmap failure might set the fallback. @@ -201,18 +199,18 @@ void FilePiece::Shift() { } } -void FilePiece::MMapShift(off_t desired_begin) { +void FilePiece::MMapShift(uint64_t desired_begin) { // Use mmap. 
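MMapShift, whose body follows, must hand mmap a page-aligned file offset, so it rounds the requested position down to a page boundary and remembers how many bytes of the mapping to skip; it also doubles the window when the same position is requested twice in a row. A tiny sketch of just the alignment arithmetic, with the page size hard-coded to 4096 for illustration:

#include <cstdio>
#include <stdint.h>

// Round a requested file offset down to a page boundary (mmap requirement)
// and report how far into the mapping the caller's data begins.
void AlignToPage(uint64_t desired_begin, uint64_t page_size,
                 uint64_t &map_offset, uint64_t &skip_within_map) {
  skip_within_map = desired_begin % page_size;   // "ignore" in the diff
  map_offset = desired_begin - skip_within_map;  // page-aligned mmap offset
}

int main() {
  uint64_t off, skip;
  AlignToPage(10000, 4096, off, skip);
  std::printf("map offset %llu, skip %llu\n",
              (unsigned long long)off, (unsigned long long)skip);  // 8192, 1808
  return 0;
}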
- off_t ignore = desired_begin % page_; + uint64_t ignore = desired_begin % page_; // Duplicate request for Shift means give more data. if (position_ == data_.begin() + ignore) { default_map_size_ *= 2; } // Local version so that in case of failure it doesn't overwrite the class variable. - off_t mapped_offset = desired_begin - ignore; + uint64_t mapped_offset = desired_begin - ignore; - off_t mapped_size; - if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { + uint64_t mapped_size; + if (default_map_size_ >= static_cast(total_size_ - mapped_offset)) { at_end_ = true; mapped_size = total_size_ - mapped_offset; } else { @@ -221,15 +219,11 @@ void FilePiece::MMapShift(off_t desired_begin) { // Forcibly clear the existing mmap first. data_.reset(); - data_.reset(mmap(NULL, mapped_size, PROT_READ, MAP_SHARED - // Populate where available on linux -#ifdef MAP_POPULATE - | MAP_POPULATE -#endif - , *file_, mapped_offset), mapped_size, scoped_memory::MMAP_ALLOCATED); - if (data_.get() == MAP_FAILED) { + try { + MapRead(POPULATE_OR_LAZY, *file_, mapped_offset, mapped_size, data_); + } catch (const util::ErrnoException &e) { if (desired_begin) { - if (((off_t)-1) == lseek(*file_, desired_begin, SEEK_SET)) UTIL_THROW(ErrnoException, "mmap failed even though it worked before. lseek failed too, so using read isn't an option either."); + SeekOrThrow(*file_, desired_begin); } // The mmap was scheduled to end the file, but now we're going to read it. at_end_ = false; @@ -259,6 +253,10 @@ void FilePiece::TransitionToRead() { #endif } +#ifdef WIN32 +typedef int ssize_t; +#endif + void FilePiece::ReadShift() { assert(fallback_to_read_); // Bytes [data_.begin(), position_) have been consumed. diff --git a/klm/util/file_piece.hh b/klm/util/file_piece.hh index a627f38c..af93d8aa 100644 --- a/klm/util/file_piece.hh +++ b/klm/util/file_piece.hh @@ -8,9 +8,14 @@ #include "util/mmap.hh" #include "util/string_piece.hh" +#include #include -#include +#include + +#ifdef HAVE_ZLIB +#include +#endif namespace util { @@ -22,7 +27,9 @@ class ParseNumberException : public Exception { class GZException : public Exception { public: - explicit GZException(void *file); +#ifdef HAVE_ZLIB + explicit GZException(gzFile file); +#endif GZException() throw() {} ~GZException() throw() {} }; @@ -33,9 +40,9 @@ extern const bool kSpaces[256]; class FilePiece { public: // 32 MB default. - explicit FilePiece(const char *file, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(const char *file, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); // Takes ownership of fd. name is used for messages. - explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, off_t min_buffer = 33554432); + explicit FilePiece(int fd, const char *name, std::ostream *show_progress = NULL, std::size_t min_buffer = 33554432); ~FilePiece(); @@ -70,14 +77,14 @@ class FilePiece { } } - off_t Offset() const { + uint64_t Offset() const { return position_ - data_.begin() + mapped_offset_; } const std::string &FileName() const { return file_name_; } private: - void Initialize(const char *name, std::ostream *show_progress, off_t min_buffer); + void Initialize(const char *name, std::ostream *show_progress, std::size_t min_buffer); template T ReadNumber(); @@ -91,7 +98,7 @@ class FilePiece { void Shift(); // Backends to Shift(). 
- void MMapShift(off_t desired_begin); + void MMapShift(uint64_t desired_begin); void TransitionToRead(); void ReadShift(); @@ -99,11 +106,11 @@ class FilePiece { const char *position_, *last_space_, *position_end_; scoped_fd file_; - const off_t total_size_; - const off_t page_; + const uint64_t total_size_; + const uint64_t page_; - size_t default_map_size_; - off_t mapped_offset_; + std::size_t default_map_size_; + uint64_t mapped_offset_; // Order matters: file_ should always be destroyed after this. scoped_memory data_; @@ -116,7 +123,7 @@ class FilePiece { std::string file_name_; #ifdef HAVE_ZLIB - void *gz_file_; + gzFile gz_file_; #endif // HAVE_ZLIB }; diff --git a/klm/util/file_piece_test.cc b/klm/util/file_piece_test.cc index dc9ec7e7..f912e18a 100644 --- a/klm/util/file_piece_test.cc +++ b/klm/util/file_piece_test.cc @@ -1,3 +1,4 @@ +// Tests might fail if you have creative characters in your path. Sue me. #include "util/file_piece.hh" #include "util/scoped.hh" @@ -14,10 +15,18 @@ namespace util { namespace { +std::string FileLocation() { + if (boost::unit_test::framework::master_test_suite().argc < 2) { + return "file_piece.cc"; + } + std::string ret(boost::unit_test::framework::master_test_suite().argv[1]); + return ret; +} + /* mmap implementation */ BOOST_AUTO_TEST_CASE(MMapReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); - FilePiece test("file_piece.cc", NULL, 1); + std::fstream ref(FileLocation().c_str(), std::ios::in); + FilePiece test(FileLocation().c_str(), NULL, 1); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -35,9 +44,13 @@ BOOST_AUTO_TEST_CASE(MMapReadLine) { */ /* read() implementation */ BOOST_AUTO_TEST_CASE(StreamReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string popen_args = "cat \""; + popen_args += FileLocation(); + popen_args += '"'; - FILE *catter = popen("cat file_piece.cc", "r"); + FILE *catter = popen(popen_args.c_str(), "r"); BOOST_REQUIRE(catter); FilePiece test(dup(fileno(catter)), "file_piece.cc", NULL, 1); @@ -58,10 +71,15 @@ BOOST_AUTO_TEST_CASE(StreamReadLine) { // gzip file BOOST_AUTO_TEST_CASE(PlainZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::string location(FileLocation()); + std::fstream ref(location.c_str(), std::ios::in); - BOOST_REQUIRE_EQUAL(0, system("gzip file_piece.cc.gz")); - FilePiece test("file_piece.cc.gz", NULL, 1); + std::string command("gzip <\""); + command += location + "\" >\"" + location + "\".gz"; + + BOOST_REQUIRE_EQUAL(0, system(command.c_str())); + FilePiece test((location + ".gz").c_str(), NULL, 1); + unlink((location + ".gz").c_str()); std::string ref_line; while (getline(ref, ref_line)) { StringPiece test_line(test.ReadLine()); @@ -77,12 +95,15 @@ BOOST_AUTO_TEST_CASE(PlainZipReadLine) { // the test. 
#ifndef __APPLE__ BOOST_AUTO_TEST_CASE(StreamZipReadLine) { - std::fstream ref("file_piece.cc", std::ios::in); + std::fstream ref(FileLocation().c_str(), std::ios::in); + + std::string command("gzip <\""); + command += FileLocation() + "\""; - FILE * catter = popen("gzip +#include + +#define NULL 0 +#define EOF (-1) +#define ERR(s, c) if(opterr){\ + char errbuf[2];\ + errbuf[0] = c; errbuf[1] = '\n';\ + fputs(argv[0], stderr);\ + fputs(s, stderr);\ + fputc(c, stderr);} + //(void) write(2, argv[0], (unsigned)strlen(argv[0]));\ + //(void) write(2, s, (unsigned)strlen(s));\ + //(void) write(2, errbuf, 2);} + +int opterr = 1; +int optind = 1; +int optopt; +char *optarg; + +int +getopt(argc, argv, opts) +int argc; +char **argv, *opts; +{ + static int sp = 1; + register int c; + register char *cp; + + if(sp == 1) + if(optind >= argc || + argv[optind][0] != '-' || argv[optind][1] == '\0') + return(EOF); + else if(strcmp(argv[optind], "--") == NULL) { + optind++; + return(EOF); + } + optopt = c = argv[optind][sp]; + if(c == ':' || (cp=strchr(opts, c)) == NULL) { + ERR(": illegal option -- ", c); + if(argv[optind][++sp] == '\0') { + optind++; + sp = 1; + } + return('?'); + } + if(*++cp == ':') { + if(argv[optind][sp+1] != '\0') + optarg = &argv[optind++][sp+1]; + else if(++optind >= argc) { + ERR(": option requires an argument -- ", c); + sp = 1; + return('?'); + } else + optarg = argv[optind++]; + sp = 1; + } else { + if(argv[optind][++sp] == '\0') { + sp = 1; + optind++; + } + optarg = NULL; + } + return(c); +} + +#endif /* __GNUC__ */ diff --git a/klm/util/getopt.hh b/klm/util/getopt.hh new file mode 100644 index 00000000..6ad97732 --- /dev/null +++ b/klm/util/getopt.hh @@ -0,0 +1,33 @@ +/* +POSIX getopt for Windows + +AT&T Public License + +Code given out at the 1985 UNIFORUM conference in Dallas. +*/ + +#ifdef __GNUC__ +#include +#endif +#ifndef __GNUC__ + +#ifndef _WINGETOPT_H_ +#define _WINGETOPT_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +extern int opterr; +extern int optind; +extern int optopt; +extern char *optarg; +extern int getopt(int argc, char **argv, char *opts); + +#ifdef __cplusplus +} +#endif + +#endif /* _GETOPT_H_ */ +#endif /* __GNUC__ */ + diff --git a/klm/util/key_value_packing.hh b/klm/util/key_value_packing.hh deleted file mode 100644 index b84a5aad..00000000 --- a/klm/util/key_value_packing.hh +++ /dev/null @@ -1,126 +0,0 @@ -#ifndef UTIL_KEY_VALUE_PACKING__ -#define UTIL_KEY_VALUE_PACKING__ - -/* Why such a general interface? I'm planning on doing bit-level packing. */ - -#include -#include -#include - -#include - -namespace util { - -template struct Entry { - Key key; - Value value; - - const Key &GetKey() const { return key; } - const Value &GetValue() const { return value; } - - Value &MutableValue() { return value; } - - void Set(const Key &key_in, const Value &value_in) { - SetKey(key_in); - SetValue(value_in); - } - void SetKey(const Key &key_in) { key = key_in; } - void SetValue(const Value &value_in) { value = value_in; } - - bool operator<(const Entry &other) const { return GetKey() < other.GetKey(); } -}; - -// And now for a brief interlude to specialize std::swap. 
-} // namespace util -namespace std { -template void swap(util::Entry &first, util::Entry &second) { - swap(first.key, second.key); - swap(first.value, second.value); -} -}// namespace std -namespace util { - -template class AlignedPacking { - public: - typedef KeyT Key; - typedef ValueT Value; - - public: - static const std::size_t kBytes = sizeof(Entry); - static const std::size_t kBits = kBytes * 8; - - typedef Entry * MutableIterator; - typedef const Entry * ConstIterator; - typedef const Entry & ConstReference; - - static MutableIterator FromVoid(void *start) { - return reinterpret_cast(start); - } - - static Entry Make(const Key &key, const Value &value) { - Entry ret; - ret.Set(key, value); - return ret; - } -}; - -template class ByteAlignedPacking { - public: - typedef KeyT Key; - typedef ValueT Value; - - private: -#pragma pack(push) -#pragma pack(1) - struct RawEntry { - Key key; - Value value; - - const Key &GetKey() const { return key; } - const Value &GetValue() const { return value; } - - Value &MutableValue() { return value; } - - void Set(const Key &key_in, const Value &value_in) { - SetKey(key_in); - SetValue(value_in); - } - void SetKey(const Key &key_in) { key = key_in; } - void SetValue(const Value &value_in) { value = value_in; } - - bool operator<(const RawEntry &other) const { return GetKey() < other.GetKey(); } - }; -#pragma pack(pop) - - friend void std::swap<>(RawEntry&, RawEntry&); - - public: - typedef RawEntry *MutableIterator; - typedef const RawEntry *ConstIterator; - typedef RawEntry &ConstReference; - - static const std::size_t kBytes = sizeof(RawEntry); - static const std::size_t kBits = kBytes * 8; - - static MutableIterator FromVoid(void *start) { - return MutableIterator(reinterpret_cast(start)); - } - - static RawEntry Make(const Key &key, const Value &value) { - RawEntry ret; - ret.Set(key, value); - return ret; - } -}; - -} // namespace util -namespace std { -template void swap( - typename util::ByteAlignedPacking::RawEntry &first, - typename util::ByteAlignedPacking::RawEntry &second) { - swap(first.key, second.key); - swap(first.value, second.value); -} -}// namespace std - -#endif // UTIL_KEY_VALUE_PACKING__ diff --git a/klm/util/key_value_packing_test.cc b/klm/util/key_value_packing_test.cc deleted file mode 100644 index a0d33fd7..00000000 --- a/klm/util/key_value_packing_test.cc +++ /dev/null @@ -1,75 +0,0 @@ -#include "util/key_value_packing.hh" - -#include -#include -#include -#include -#define BOOST_TEST_MODULE KeyValueStoreTest -#include - -#include -#include - -namespace util { -namespace { - -BOOST_AUTO_TEST_CASE(basic_in_out) { - typedef ByteAlignedPacking Packing; - void *backing = malloc(Packing::kBytes * 2); - Packing::MutableIterator i(Packing::FromVoid(backing)); - i->SetKey(10); - BOOST_CHECK_EQUAL(10, i->GetKey()); - i->SetValue(3); - BOOST_CHECK_EQUAL(3, i->GetValue()); - ++i; - i->SetKey(5); - BOOST_CHECK_EQUAL(5, i->GetKey()); - i->SetValue(42); - BOOST_CHECK_EQUAL(42, i->GetValue()); - - Packing::ConstIterator c(i); - BOOST_CHECK_EQUAL(5, c->GetKey()); - --c; - BOOST_CHECK_EQUAL(10, c->GetKey()); - BOOST_CHECK_EQUAL(42, i->GetValue()); - - BOOST_CHECK_EQUAL(5, i->GetKey()); - free(backing); -} - -BOOST_AUTO_TEST_CASE(simple_sort) { - typedef ByteAlignedPacking Packing; - char foo[Packing::kBytes * 4]; - Packing::MutableIterator begin(Packing::FromVoid(foo)); - Packing::MutableIterator i = begin; - i->SetKey(0); ++i; - i->SetKey(2); ++i; - i->SetKey(3); ++i; - i->SetKey(1); ++i; - std::sort(begin, i); - BOOST_CHECK_EQUAL(0, 
begin[0].GetKey()); - BOOST_CHECK_EQUAL(1, begin[1].GetKey()); - BOOST_CHECK_EQUAL(2, begin[2].GetKey()); - BOOST_CHECK_EQUAL(3, begin[3].GetKey()); -} - -BOOST_AUTO_TEST_CASE(big_sort) { - typedef ByteAlignedPacking Packing; - boost::scoped_array memory(new char[Packing::kBytes * 1000]); - Packing::MutableIterator begin(Packing::FromVoid(memory.get())); - - boost::mt19937 rng; - boost::uniform_int range(0, std::numeric_limits::max()); - boost::variate_generator > gen(rng, range); - - for (size_t i = 0; i < 1000; ++i) { - (begin + i)->SetKey(gen()); - } - std::sort(begin, begin + 1000); - for (size_t i = 0; i < 999; ++i) { - BOOST_CHECK(begin[i] < begin[i+1]); - } -} - -} // namespace -} // namespace util diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index 279bafa8..a329ce4e 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -1,23 +1,63 @@ +/* Memory mapping wrappers. + * ARM and MinGW ports contributed by Hideo Okuma and Tomoyuki Yoshimura at + * NICT. + */ +#include "util/mmap.hh" + #include "util/exception.hh" #include "util/file.hh" -#include "util/mmap.hh" #include #include #include #include -#include +#include #include -#include + +#if defined(_WIN32) || defined(_WIN64) +#include +#include +#else +#include +#endif namespace util { +long SizePage() { +#if defined(_WIN32) || defined(_WIN64) + SYSTEM_INFO si; + GetSystemInfo(&si); + return si.dwAllocationGranularity; +#else + return sysconf(_SC_PAGE_SIZE); +#endif +} + +void SyncOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::FlushViewOfFile(start, length), ErrnoException, "Failed to sync mmap"); +#else + UTIL_THROW_IF(msync(start, length, MS_SYNC), ErrnoException, "Failed to sync mmap"); +#endif +} + +void UnmapOrThrow(void *start, size_t length) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(!::UnmapViewOfFile(start), ErrnoException, "Failed to unmap a file"); +#else + UTIL_THROW_IF(munmap(start, length), ErrnoException, "munmap failed"); +#endif +} + scoped_mmap::~scoped_mmap() { if (data_ != (void*)-1) { - // Thanks Denis Filimonov for pointing out NFS likes msync first. - if (msync(data_, size_, MS_SYNC) || munmap(data_, size_)) { - std::cerr << "msync or mmap failed for " << size_ << " bytes." << std::endl; + try { + // Thanks Denis Filimonov for pointing out NFS likes msync first. + SyncOrThrow(data_, size_); + UnmapOrThrow(data_, size_); + } catch (const util::ErrnoException &e) { + std::cerr << e.what(); abort(); } } @@ -52,29 +92,40 @@ void scoped_memory::call_realloc(std::size_t size) { } } -void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset) { +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset) { #ifdef MAP_POPULATE // Linux specific if (prefault) { flags |= MAP_POPULATE; } #endif +#if defined(_WIN32) || defined(_WIN64) + int protectC = for_write ? PAGE_READWRITE : PAGE_READONLY; + int protectM = for_write ? FILE_MAP_WRITE : FILE_MAP_READ; + uint64_t total_size = size + offset; + HANDLE hMapping = CreateFileMapping((HANDLE)_get_osfhandle(fd), NULL, protectC, total_size >> 32, static_cast(total_size), NULL); + UTIL_THROW_IF(!hMapping, ErrnoException, "CreateFileMapping failed"); + LPVOID ret = MapViewOfFile(hMapping, protectM, offset >> 32, offset, size); + CloseHandle(hMapping); + UTIL_THROW_IF(!ret, ErrnoException, "MapViewOfFile failed"); +#else int protect = for_write ? 
(PROT_READ | PROT_WRITE) : PROT_READ; void *ret = mmap(NULL, size, protect, flags, fd, offset); - if (ret == MAP_FAILED) { - UTIL_THROW(ErrnoException, "mmap failed for size " << size << " at offset " << offset); - } + UTIL_THROW_IF(ret == MAP_FAILED, ErrnoException, "mmap failed for size " << size << " at offset " << offset); +#endif return ret; } const int kFileFlags = -#ifdef MAP_FILE +#if defined(_WIN32) || defined(_WIN64) + 0 // MapOrThrow ignores flags on windows +#elif defined(MAP_FILE) MAP_FILE | MAP_SHARED #else MAP_SHARED #endif ; -void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out) { +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out) { switch (method) { case LAZY: out.reset(MapOrThrow(size, false, kFileFlags, false, fd, offset), size, scoped_memory::MMAP_ALLOCATED); @@ -91,30 +142,52 @@ void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_m case READ: out.reset(malloc(size), size, scoped_memory::MALLOC_ALLOCATED); if (!out.get()) UTIL_THROW(util::ErrnoException, "Allocating " << size << " bytes with malloc"); - if (-1 == lseek(fd, offset, SEEK_SET)) UTIL_THROW(ErrnoException, "lseek to " << offset << " in fd " << fd << " failed."); + SeekOrThrow(fd, offset); ReadOrThrow(fd, out.get(), size); break; } } -void *MapAnonymous(std::size_t size) { - return MapOrThrow(size, true, -#ifdef MAP_ANONYMOUS - MAP_ANONYMOUS // Linux +// Allocates zeroed memory in to. +void MapAnonymous(std::size_t size, util::scoped_memory &to) { + to.reset(); +#if defined(_WIN32) || defined(_WIN64) + to.reset(calloc(1, size), size, scoped_memory::MALLOC_ALLOCATED); +#else + to.reset(MapOrThrow(size, true, +# if defined(MAP_ANONYMOUS) + MAP_ANONYMOUS | MAP_PRIVATE // Linux +# else + MAP_ANON | MAP_PRIVATE // BSD +# endif + , false, -1, 0), size, scoped_memory::MMAP_ALLOCATED); +#endif +} + +void *MapZeroedWrite(int fd, std::size_t size) { + ResizeOrThrow(fd, 0); + ResizeOrThrow(fd, size); + return MapOrThrow(size, true, kFileFlags, false, fd, 0); +} + +namespace { + +int CreateOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); #else - MAP_ANON // BSD + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); #endif - | MAP_PRIVATE, false, -1, 0); + return ret; } +} // namespace + void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { - file.reset(open(name, O_CREAT | O_RDWR | O_TRUNC, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); - if (-1 == file.get()) - UTIL_THROW(ErrnoException, "Failed to open " << name << " for writing"); - if (-1 == ftruncate(file.get(), size)) - UTIL_THROW(ErrnoException, "ftruncate on " << name << " to " << size << " failed"); + file.reset(CreateOrThrow(name)); try { - return MapOrThrow(size, true, kFileFlags, false, file.get(), 0); + return MapZeroedWrite(file.get(), size); } catch (ErrnoException &e) { e << " in file " << name; throw; diff --git a/klm/util/mmap.hh b/klm/util/mmap.hh index b0eb6672..b218c4d1 100644 --- a/klm/util/mmap.hh +++ b/klm/util/mmap.hh @@ -4,13 +4,15 @@ #include -#include +#include #include namespace util { class scoped_fd; +long SizePage(); + // (void*)-1 is MAP_FAILED; this is done to avoid including the mmap header here. 
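A rough sketch of how the reworked wrappers fit together after this change. The file name and sizes are invented, and OpenReadOrThrow is assumed to come from util/file.hh as used by file_piece.cc above; MapAnonymous now fills a scoped_memory out-parameter instead of returning a raw pointer.

    #include "util/file.hh"
    #include "util/mmap.hh"

    void Example(const char *name) {
      util::scoped_fd fd(util::OpenReadOrThrow(name));
      util::scoped_memory mem;
      // Map (or read, on fallback platforms) the first 4096 bytes of the file.
      util::MapRead(util::POPULATE_OR_LAZY, fd.get(), 0 /* offset */, 4096, mem);

      // Zeroed anonymous memory: mmap(MAP_ANONYMOUS) on POSIX, calloc on Windows.
      util::scoped_memory scratch;
      util::MapAnonymous(1 << 20, scratch);
    }
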
class scoped_mmap { public: @@ -94,15 +96,19 @@ typedef enum { extern const int kFileFlags; // Wrapper around mmap to check it worked and hide some platform macros. -void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, off_t offset = 0); +void *MapOrThrow(std::size_t size, bool for_write, int flags, bool prefault, int fd, uint64_t offset = 0); -void MapRead(LoadMethod method, int fd, off_t offset, std::size_t size, scoped_memory &out); +void MapRead(LoadMethod method, int fd, uint64_t offset, std::size_t size, scoped_memory &out); -void *MapAnonymous(std::size_t size); +void MapAnonymous(std::size_t size, scoped_memory &to); // Open file name with mmap of size bytes, all of which are initially zero. +void *MapZeroedWrite(int fd, std::size_t size); void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file); +// msync wrapper +void SyncOrThrow(void *start, size_t length); + } // namespace util #endif // UTIL_MMAP__ diff --git a/klm/util/murmur_hash.cc b/klm/util/murmur_hash.cc index ef5783fe..6accc21a 100644 --- a/klm/util/murmur_hash.cc +++ b/klm/util/murmur_hash.cc @@ -7,9 +7,11 @@ * placed in namespace util * add MurmurHashNative * default option = 0 for seed + * ARM port from NICT */ #include "util/murmur_hash.hh" +#include namespace util { @@ -28,12 +30,24 @@ uint64_t MurmurHash64A ( const void * key, std::size_t len, unsigned int seed ) uint64_t h = seed ^ (len * m); +#if defined(__arm) || defined(__arm__) + const size_t ksize = sizeof(uint64_t); + const unsigned char * data = (const unsigned char *)key; + const unsigned char * end = data + (std::size_t)(len/8) * ksize; +#else const uint64_t * data = (const uint64_t *)key; const uint64_t * end = data + (len/8); +#endif while(data != end) { +#if defined(__arm) || defined(__arm__) + uint64_t k; + memcpy(&k, data, ksize); + data += ksize; +#else uint64_t k = *data++; +#endif k *= m; k ^= k >> r; @@ -75,16 +89,30 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) unsigned int h1 = seed ^ len; unsigned int h2 = 0; +#if defined(__arm) || defined(__arm__) + size_t ksize = sizeof(unsigned int); + const unsigned char * data = (const unsigned char *)key; +#else const unsigned int * data = (const unsigned int *)key; +#endif + unsigned int k1, k2; while(len >= 8) { - unsigned int k1 = *data++; +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; + memcpy(&k2, data, ksize); + data += ksize; +#else + k1 = *data++; + k2 = *data++; +#endif + k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; - unsigned int k2 = *data++; k2 *= m; k2 ^= k2 >> r; k2 *= m; h2 *= m; h2 ^= k2; len -= 4; @@ -92,7 +120,12 @@ uint64_t MurmurHash64B ( const void * key, std::size_t len, unsigned int seed ) if(len >= 4) { - unsigned int k1 = *data++; +#if defined(__arm) || defined(__arm__) + memcpy(&k1, data, ksize); + data += ksize; +#else + k1 = *data++; +#endif k1 *= m; k1 ^= k1 >> r; k1 *= m; h1 *= m; h1 ^= k1; len -= 4; diff --git a/klm/util/murmur_hash.hh b/klm/util/murmur_hash.hh index 78fe583f..638aaeb2 100644 --- a/klm/util/murmur_hash.hh +++ b/klm/util/murmur_hash.hh @@ -1,7 +1,7 @@ #ifndef UTIL_MURMUR_HASH__ #define UTIL_MURMUR_HASH__ #include -#include +#include namespace util { diff --git a/klm/util/probing_hash_table.hh b/klm/util/probing_hash_table.hh index 8122d69c..f466cebc 100644 --- a/klm/util/probing_hash_table.hh +++ b/klm/util/probing_hash_table.hh @@ -18,27 +18,33 @@ class ProbingSizeException : public Exception { ~ProbingSizeException() 
throw() {} }; +// std::identity is an SGI extension :-( +struct IdentityHash { + template T operator()(T arg) const { return arg; } +}; + /* Non-standard hash table * Buckets must be set at the beginning and must be greater than maximum number - * of elements, else an infinite loop happens. + * of elements, else it throws ProbingSizeException. * Memory management and initialization is externalized to make it easier to * serialize these to disk and load them quickly. * Uses linear probing to find value. * Only insert and lookup operations. */ -template > class ProbingHashTable { +template > class ProbingHashTable { public: - typedef PackingT Packing; - typedef typename Packing::Key Key; - typedef typename Packing::MutableIterator MutableIterator; - typedef typename Packing::ConstIterator ConstIterator; - + typedef EntryT Entry; + typedef typename Entry::Key Key; + typedef const Entry *ConstIterator; + typedef Entry *MutableIterator; typedef HashT Hash; typedef EqualT Equal; + public: static std::size_t Size(std::size_t entries, float multiplier) { - return std::max(entries + 1, static_cast(multiplier * static_cast(entries))) * Packing::kBytes; + std::size_t buckets = std::max(entries + 1, static_cast(multiplier * static_cast(entries))); + return buckets * sizeof(Entry); } // Must be assigned to later. @@ -49,9 +55,9 @@ template (start)), + buckets_(allocated / sizeof(Entry)), + end_(begin_ + buckets_), invalid_(invalid), hash_(hash_func), equal_(equal_func), @@ -62,11 +68,10 @@ template MutableIterator Insert(const T &t) { - if (++entries_ >= buckets_) - UTIL_THROW(ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); #ifdef DEBUG assert(initialized_); #endif + UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } if (++i == end_) { i = begin_; } @@ -84,7 +89,7 @@ template bool Find(const Key key, ConstIterator &out) const { diff --git a/klm/util/probing_hash_table_test.cc b/klm/util/probing_hash_table_test.cc index ff2f5af3..ef68e5f2 100644 --- a/klm/util/probing_hash_table_test.cc +++ b/klm/util/probing_hash_table_test.cc @@ -1,6 +1,6 @@ #include "util/probing_hash_table.hh" -#include "util/key_value_packing.hh" +#include #define BOOST_TEST_MODULE ProbingHashTableTest #include @@ -9,17 +9,34 @@ namespace util { namespace { -typedef AlignedPacking Packing; -typedef ProbingHashTable > Table; +struct Entry { + unsigned char key; + typedef unsigned char Key; + + unsigned char GetKey() const { + return key; + } + + uint64_t GetValue() const { + return value; + } + + uint64_t value; +}; + +typedef ProbingHashTable > Table; BOOST_AUTO_TEST_CASE(simple) { char mem[Table::Size(10, 1.2)]; memset(mem, 0, sizeof(mem)); Table table(mem, sizeof(mem)); - Packing::ConstIterator i = Packing::ConstIterator(); + const Entry *i = NULL; BOOST_CHECK(!table.Find(2, i)); - table.Insert(Packing::Make(3, 328920)); + Entry to_ins; + to_ins.key = 3; + to_ins.value = 328920; + table.Insert(to_ins); BOOST_REQUIRE(table.Find(3, i)); BOOST_CHECK_EQUAL(3, i->GetKey()); BOOST_CHECK_EQUAL(static_cast(328920), i->GetValue()); diff --git a/klm/util/sized_iterator.hh b/klm/util/sized_iterator.hh index 47dfc245..aabcc531 100644 --- a/klm/util/sized_iterator.hh +++ b/klm/util/sized_iterator.hh @@ -6,7 +6,7 @@ #include #include -#include +#include #include namespace util { diff --git a/klm/util/sorted_uniform.hh 
b/klm/util/sorted_uniform.hh index 0d6ecbbd..7700d9e6 100644 --- a/klm/util/sorted_uniform.hh +++ b/klm/util/sorted_uniform.hh @@ -5,7 +5,7 @@ #include #include -#include +#include namespace util { @@ -122,99 +122,6 @@ template Iterator BinaryBelow( return begin - 1; } -// To use this template, you need to define a Pivot function to match Key. -template class SortedUniformMap { - public: - typedef PackingT Packing; - typedef typename Packing::ConstIterator ConstIterator; - typedef typename Packing::MutableIterator MutableIterator; - - struct Accessor { - public: - typedef typename Packing::Key Key; - const Key &operator()(const ConstIterator &i) const { return i->GetKey(); } - Key &operator()(const MutableIterator &i) const { return i->GetKey(); } - }; - - // Offer consistent API with probing hash. - static std::size_t Size(std::size_t entries, float /*ignore*/ = 0.0) { - return sizeof(uint64_t) + entries * Packing::kBytes; - } - - SortedUniformMap() -#ifdef DEBUG - : initialized_(false), loaded_(false) -#endif - {} - - SortedUniformMap(void *start, std::size_t /*allocated*/) : - begin_(Packing::FromVoid(reinterpret_cast(start) + 1)), - end_(begin_), size_ptr_(reinterpret_cast(start)) -#ifdef DEBUG - , initialized_(true), loaded_(false) -#endif - {} - - void LoadedBinary() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - // Restore the size. - end_ = begin_ + *size_ptr_; - } - - // Caller responsible for not exceeding specified size. Do not call after FinishedInserting. - template void Insert(const T &t) { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); -#endif - *end_ = t; - ++end_; - } - - void FinishedInserting() { -#ifdef DEBUG - assert(initialized_); - assert(!loaded_); - loaded_ = true; -#endif - std::sort(begin_, end_); - *size_ptr_ = (end_ - begin_); - } - - // Don't use this to change the key. - template bool UnsafeMutableFind(const Key key, MutableIterator &out) { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind(begin_, end_, key, out); - } - - // Do not call before FinishedInserting. 
- template bool Find(const Key key, ConstIterator &out) const { -#ifdef DEBUG - assert(initialized_); - assert(loaded_); -#endif - return SortedUniformFind(Accessor(), ConstIterator(begin_), ConstIterator(end_), key, out); - } - - ConstIterator begin() const { return begin_; } - ConstIterator end() const { return end_; } - - private: - typename Packing::MutableIterator begin_, end_; - uint64_t *size_ptr_; -#ifdef DEBUG - bool initialized_; - bool loaded_; -#endif -}; - } // namespace util #endif // UTIL_SORTED_UNIFORM__ diff --git a/klm/util/sorted_uniform_test.cc b/klm/util/sorted_uniform_test.cc index 4aa4c8aa..d9f6fad1 100644 --- a/klm/util/sorted_uniform_test.cc +++ b/klm/util/sorted_uniform_test.cc @@ -1,12 +1,11 @@ #include "util/sorted_uniform.hh" -#include "util/key_value_packing.hh" - #include #include #include #include #include + #define BOOST_TEST_MODULE SortedUniformTest #include @@ -17,74 +16,86 @@ namespace util { namespace { -template void Check(const Map &map, const boost::unordered_map &reference, const Key key) { +template struct Entry { + typedef KeyT Key; + typedef ValueT Value; + + Key key; + Value value; + + Key GetKey() const { + return key; + } + + Value GetValue() const { + return value; + } + + bool operator<(const Entry &other) const { + return key < other.key; + } +}; + +template struct Accessor { + typedef KeyT Key; + template Key operator()(const Entry *entry) const { + return entry->GetKey(); + } +}; + +template void Check(const Entry *begin, const Entry *end, const boost::unordered_map &reference, const Key key) { typename boost::unordered_map::const_iterator ref = reference.find(key); - typename Map::ConstIterator i = typename Map::ConstIterator(); + typedef const Entry *It; + // g++ can't tell that require will crash and burn. + It i = NULL; + bool ret = SortedUniformFind, Pivot64>(Accessor(), begin, end, key, i); if (ref == reference.end()) { - BOOST_CHECK(!map.Find(key, i)); + BOOST_CHECK(!ret); } else { - // g++ can't tell that require will crash and burn. 
- BOOST_REQUIRE(map.Find(key, i)); + BOOST_REQUIRE(ret); BOOST_CHECK_EQUAL(ref->second, i->GetValue()); } } -typedef SortedUniformMap > TestMap; - BOOST_AUTO_TEST_CASE(empty) { - char buf[TestMap::Size(0)]; - TestMap map(buf, TestMap::Size(0)); - map.FinishedInserting(); - TestMap::ConstIterator i; - BOOST_CHECK(!map.Find(42, i)); -} - -BOOST_AUTO_TEST_CASE(one) { - char buf[TestMap::Size(1)]; - TestMap map(buf, sizeof(buf)); - Entry e; - e.Set(42,2); - map.Insert(e); - map.FinishedInserting(); - TestMap::ConstIterator i = TestMap::ConstIterator(); - BOOST_REQUIRE(map.Find(42, i)); - BOOST_CHECK(i == map.begin()); - BOOST_CHECK(!map.Find(43, i)); - BOOST_CHECK(!map.Find(41, i)); + typedef const Entry T; + const T *i; + bool ret = SortedUniformFind, Pivot64>(Accessor(), (const T*)NULL, (const T*)NULL, (uint64_t)10, i); + BOOST_CHECK(!ret); } template void RandomTest(Key upper, size_t entries, size_t queries) { typedef unsigned char Value; - typedef SortedUniformMap > Map; - boost::scoped_array buffer(new char[Map::Size(entries)]); - Map map(buffer.get(), entries); boost::mt19937 rng; boost::uniform_int range_key(0, upper); boost::uniform_int range_value(0, 255); boost::variate_generator > gen_key(rng, range_key); boost::variate_generator > gen_value(rng, range_value); + typedef Entry Ent; + std::vector backing; boost::unordered_map reference; - Entry ent; + Ent ent; for (size_t i = 0; i < entries; ++i) { Key key = gen_key(); unsigned char value = gen_value(); if (reference.insert(std::make_pair(key, value)).second) { - ent.Set(key, value); - map.Insert(Entry(ent)); + ent.key = key; + ent.value = value; + backing.push_back(ent); } } - map.FinishedInserting(); + std::sort(backing.begin(), backing.end()); // Random queries. for (size_t i = 0; i < queries; ++i) { const Key key = gen_key(); - Check(map, reference, key); + Check(&*backing.begin(), &*backing.end(), reference, key); } typename boost::unordered_map::const_iterator it = reference.begin(); for (size_t i = 0; (i < queries) && (it != reference.end()); ++i, ++it) { - Check(map, reference, it->second); + Check(&*backing.begin(), &*backing.end(), reference, it->second); } } diff --git a/klm/util/tokenize_piece.hh b/klm/util/tokenize_piece.hh index 413bda0b..c7e1c863 100644 --- a/klm/util/tokenize_piece.hh +++ b/klm/util/tokenize_piece.hh @@ -1,6 +1,7 @@ #ifndef UTIL_TOKENIZE_PIECE__ #define UTIL_TOKENIZE_PIECE__ +#include "util/exception.hh" #include "util/string_piece.hh" #include @@ -8,63 +9,25 @@ #include #include -/* Usage: - * - * for (PieceIterator<' '> i(" foo \r\n bar "); i; ++i) { - * std::cout << *i << "\n"; - * } - * - */ - namespace util { -// Tokenize a StringPiece using an iterator interface. boost::tokenizer doesn't work with StringPiece. 
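For context on the interface that replaces the PieceIterator removed below: SingleCharacter and MultiCharacter are now the delimiter finders, and TokenIter drives them. The template arguments of TokenIter are garbled in this dump, so the single finder-type parameter used in this sketch is an assumption:

    #include "util/tokenize_piece.hh"
    #include <iostream>
    #include <string>

    void PrintFields(const StringPiece &line) {
      // Split on the literal "|||"; iteration stops when the iterator tests false,
      // and dereferencing an exhausted iterator now throws OutOfTokens.
      for (util::TokenIter<util::MultiCharacter> it(line, util::MultiCharacter("|||")); it; ++it)
        std::cout << std::string(it->data(), it->size()) << "\n";
    }
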
-template class PieceIterator : public boost::iterator_facade, const StringPiece, boost::forward_traversal_tag> { +// Thrown on dereference when out of tokens to parse +class OutOfTokens : public Exception { public: - // Default construct is end, which is also accessed by kEndPieceIterator; - PieceIterator() {} - - explicit PieceIterator(const StringPiece &str) - : after_(str) { - increment(); - } + OutOfTokens() throw() {} + ~OutOfTokens() throw() {} +}; - bool operator!() const { - return after_.data() == 0; - } - operator bool() const { - return after_.data() != 0; - } +class SingleCharacter { + public: + explicit SingleCharacter(char delim) : delim_(delim) {} - static PieceIterator end() { - return PieceIterator(); + StringPiece Find(const StringPiece &in) const { + return StringPiece(std::find(in.data(), in.data() + in.size(), delim_), 1); } private: - friend class boost::iterator_core_access; - - void increment() { - const char *start = after_.data(); - for (; (start != after_.data() + after_.size()) && (d == *start); ++start) {} - if (start == after_.data() + after_.size()) { - // End condition. - after_.clear(); - return; - } - const char *finish = start; - for (; (finish != after_.data() + after_.size()) && (d != *finish); ++finish) {} - current_ = StringPiece(start, finish - start); - after_ = StringPiece(finish, after_.data() + after_.size() - finish); - } - - bool equal(const PieceIterator &other) const { - return after_.data() == other.after_.data(); - } - - const StringPiece &dereference() const { return current_; } - - StringPiece current_; - StringPiece after_; + char delim_; }; class MultiCharacter { @@ -95,7 +58,7 @@ template class TokenIter : public boost::it public: TokenIter() {} - TokenIter(const StringPiece &str, const Find &finder) : after_(str), finder_(finder) { + template TokenIter(const StringPiece &str, const Construct &construct) : after_(str), finder_(construct) { increment(); } @@ -130,6 +93,7 @@ template class TokenIter : public boost::it } const StringPiece &dereference() const { + UTIL_THROW_IF(!current_.data(), OutOfTokens, "Ran out of tokens"); return current_; } diff --git a/klm/util/tokenize_piece_test.cc b/klm/util/tokenize_piece_test.cc index e07ebcf5..d856018f 100644 --- a/klm/util/tokenize_piece_test.cc +++ b/klm/util/tokenize_piece_test.cc @@ -9,53 +9,7 @@ namespace util { namespace { -BOOST_AUTO_TEST_CASE(simple) { - PieceIterator<' '> it("single spaced words."); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("single"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("spaced"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("words."), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_delimiter) { - const char str[] = "\0first\0\0second\0\0\0third\0fourth\0\0\0"; - PieceIterator<'\0'> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("first"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("second"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("third"), *it); - ++it; - BOOST_REQUIRE(it); - BOOST_CHECK_EQUAL(StringPiece("fourth"), *it); - ++it; - BOOST_CHECK(!it); -} - -BOOST_AUTO_TEST_CASE(null_entries) { - const char str[] = "\0split\0\0 \0me\0 "; - PieceIterator<' '> it(StringPiece(str, sizeof(str) - 1)); - BOOST_REQUIRE(it); - const char first[] = "\0split\0\0"; - BOOST_CHECK_EQUAL(StringPiece(first, sizeof(first) - 1), *it); - ++it; - BOOST_REQUIRE(it); - const char second[] = "\0me\0"; - 
BOOST_CHECK_EQUAL(StringPiece(second, sizeof(second) - 1), *it); - ++it; - BOOST_CHECK(!it); -} - -/*BOOST_AUTO_TEST_CASE(pipe_pipe_none) { +BOOST_AUTO_TEST_CASE(pipe_pipe_none) { const char str[] = "nodelimit at all"; TokenIter it(str, MultiCharacter("|||")); BOOST_REQUIRE(it); @@ -79,7 +33,7 @@ BOOST_AUTO_TEST_CASE(remove_empty) { const char str[] = "|||"; TokenIter it(str, MultiCharacter("|||")); BOOST_CHECK(!it); -}*/ +} BOOST_AUTO_TEST_CASE(remove_empty_keep) { const char str[] = " |||"; -- cgit v1.2.3 From 54bcfb835232d190a5ab6f0bd825de8a50dae126 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 29 Feb 2012 01:12:40 -0500 Subject: cleanup, mpi-ify lblmodel --- training/lbl_model.cc | 179 +++++++------- utils/agenda.h | 140 ----------- utils/best.h | 32 --- utils/corpus_tools.cc | 62 +++++ utils/corpus_tools.h | 19 ++ utils/d_ary_heap.h | 568 --------------------------------------------- utils/ftoa.h | 403 -------------------------------- utils/int_or_pointer.h | 70 ------ utils/intern_pool.h | 158 ------------- utils/lvalue_pmap.h | 31 --- utils/max_plus.h | 201 ---------------- utils/maybe_update_bound.h | 17 -- utils/nan.h | 42 ---- utils/string_to.h | 314 ------------------------- 14 files changed, 178 insertions(+), 2058 deletions(-) delete mode 100644 utils/agenda.h delete mode 100644 utils/best.h create mode 100644 utils/corpus_tools.cc create mode 100644 utils/corpus_tools.h delete mode 100644 utils/d_ary_heap.h delete mode 100644 utils/ftoa.h delete mode 100644 utils/int_or_pointer.h delete mode 100644 utils/intern_pool.h delete mode 100644 utils/lvalue_pmap.h delete mode 100644 utils/max_plus.h delete mode 100644 utils/maybe_update_bound.h delete mode 100644 utils/nan.h delete mode 100644 utils/string_to.h diff --git a/training/lbl_model.cc b/training/lbl_model.cc index 2af848b5..def5075a 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -12,11 +12,17 @@ #include // memset #include +#ifdef HAVE_MPI +#include +#include +namespace mpi = boost::mpi; +#endif #include #include #include #include +#include "corpus_tools.h" #include "optimize.h" #include "array2d.h" #include "m.h" @@ -29,9 +35,9 @@ namespace po = boost::program_options; using namespace std; #define kDIMENSIONS 100 -typedef Eigen::Matrix RVector; -typedef Eigen::Matrix RTVector; -typedef Eigen::Matrix TMatrix; +typedef Eigen::Matrix RVector; +typedef Eigen::Matrix RTVector; +typedef Eigen::Matrix TMatrix; vector r_src, r_trg; bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -39,8 +45,8 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("input,i",po::value(),"Input file") ("iterations,I",po::value()->default_value(1000),"Number of iterations of training") - ("regularization_strength,C",po::value()->default_value(0.1),"L2 regularization strength (0 for no regularization)") - ("eta", po::value()->default_value(0.1f), "Eta for SGD") + ("regularization_strength,C",po::value()->default_value(0.1),"L2 regularization strength (0 for no regularization)") + ("eta", po::value()->default_value(0.1f), "Eta for SGD") ("source_embeddings,f", po::value(), "File containing source embeddings (if unset, random vectors will be used)") ("target_embeddings,e", po::value(), "File containing target embeddings (if unset, random vectors will be used)") ("random_seed,s", po::value(), "Random seed") @@ -70,7 +76,7 @@ bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { } void Normalize(RVector* v) { - float norm = v->norm(); + double norm = 
v->norm(); assert(norm > 0.0f); *v /= norm; } @@ -80,7 +86,7 @@ void Flatten(const TMatrix& m, vector* v) { v->resize(kDIMENSIONS * kDIMENSIONS); for (unsigned i = 0; i < kDIMENSIONS; ++i) for (unsigned j = 0; j < kDIMENSIONS; ++j) { - assert(boost::math::isnormal(m(i, j))); + assert(boost::math::isfinite(m(i, j))); (*v)[c++] = m(i,j); } } @@ -89,7 +95,7 @@ void Unflatten(const vector& v, TMatrix* m) { unsigned c = 0; for (unsigned i = 0; i < kDIMENSIONS; ++i) for (unsigned j = 0; j < kDIMENSIONS; ++j) { - assert(boost::math::isnormal(v[c])); + assert(boost::math::isfinite(v[c])); (*m)(i, j) = v[c++]; } } @@ -162,14 +168,25 @@ void LoadEmbeddings(const string& filename, vector* pv) { } int main(int argc, char** argv) { +#ifdef HAVE_MPI + std::cerr << "**MPI enabled.\n"; + mpi::environment env(argc, argv); + mpi::communicator world; + const int size = world.size(); + const int rank = world.rank(); +#else + std::cerr << "**MPI disabled.\n"; + const int rank = 0; + const int size = 1; +#endif po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) return 1; const string fname = conf["input"].as(); - const float reg_strength = conf["regularization_strength"].as(); + const double reg_strength = conf["regularization_strength"].as(); const bool has_l2 = reg_strength; assert(reg_strength >= 0.0f); const int ITERATIONS = conf["iterations"].as(); - const float eta = conf["eta"].as(); + const double eta = conf["eta"].as(); const double diagonal_tension = conf["diagonal_tension"].as(); bool SGD = false; if (diagonal_tension < 0.0) { @@ -181,61 +198,44 @@ int main(int argc, char** argv) { unsigned lc = 0; vector unnormed_a_i; - string line; - string ssrc, strg; bool flag = false; - Lattice src, trg; + vector > srcs, trgs; vector vocab_e; - { // read through corpus, initialize int map, check lines are good - set svocab_e; - cerr << "INITIAL READ OF " << fname << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); - while(getline(in, line)) { - ++lc; - if (lc % 1000 == 0) { cerr << '.'; flag = true; } - if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - ParseTranslatorInput(line, &ssrc, &strg); - LatticeTools::ConvertTextToLattice(ssrc, &src); - LatticeTools::ConvertTextToLattice(strg, &trg); - if (src.size() == 0 || trg.size() == 0) { - cerr << "Error: " << lc << "\n" << line << endl; - assert(src.size() > 0); - assert(trg.size() > 0); - } - if (src.size() > unnormed_a_i.size()) - unnormed_a_i.resize(src.size()); - for (unsigned i = 0; i < trg.size(); ++i) { - assert(trg[i].size() == 1); - svocab_e.insert(trg[i][0].label); - } - } + { + set svocab_e, svocab_f; + CorpusTools::ReadFromFile(fname, &srcs, NULL, &trgs, &svocab_e, rank, size); copy(svocab_e.begin(), svocab_e.end(), back_inserter(vocab_e)); } - if (flag) cerr << endl; cerr << "Number of target word types: " << vocab_e.size() << endl; - const float num_examples = lc; + const double num_examples = lc; - LBFGSOptimizer lbfgs(kDIMENSIONS * kDIMENSIONS, 100); + boost::shared_ptr lbfgs; + if (rank == 0) + lbfgs.reset(new LBFGSOptimizer(kDIMENSIONS * kDIMENSIONS, 100)); r_trg.resize(TD::NumWords() + 1); r_src.resize(TD::NumWords() + 1); + vector > trg_pos(TD::NumWords() + 1); + if (conf.count("random_seed")) { srand(conf["random_seed"].as()); } else { - unsigned seed = time(NULL); + unsigned seed = time(NULL) + rank * 100; cerr << "Random seed: " << seed << endl; srand(seed); } - TMatrix t = TMatrix::Random() / 50.0; - for (unsigned i = 1; i < r_trg.size(); ++i) { - r_trg[i] = RVector::Random(); - r_src[i] = 
RVector::Random(); + + TMatrix t; + if (rank == 0) { + t = TMatrix::Random() / 50.0; + for (unsigned i = 1; i < r_trg.size(); ++i) { + r_trg[i] = RVector::Random(); + r_src[i] = RVector::Random(); + } + if (conf.count("source_embeddings")) + LoadEmbeddings(conf["source_embeddings"].as(), &r_src); + if (conf.count("target_embeddings")) + LoadEmbeddings(conf["target_embeddings"].as(), &r_trg); } - if (conf.count("source_embeddings")) - LoadEmbeddings(conf["source_embeddings"].as(), &r_src); - if (conf.count("target_embeddings")) - LoadEmbeddings(conf["target_embeddings"].as(), &r_trg); - vector > trg_pos(TD::NumWords() + 1); // do optimization TMatrix g = TMatrix::Zero(); @@ -243,22 +243,25 @@ int main(int argc, char** argv) { vector z_src; vector flat_g, flat_t; Flatten(t, &flat_t); - for (int iter = 0; iter < ITERATIONS; ++iter) { + bool converged = false; + // TODO broadcast embeddings + for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { +#ifdef HAVE_MPI + mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); +#endif + Unflatten(flat_t, &t); cerr << "ITERATION " << (iter + 1) << endl; - ReadFile rf(fname); - istream& in = *rf.stream(); double likelihood = 0; double denom = 0.0; lc = 0; flag = false; g *= 0; - while(getline(in, line)) { + for (unsigned i = 0; i < srcs.size(); ++i) { + const vector& src = srcs[i]; + const vector& trg = trgs[i]; ++lc; - if (lc % 1000 == 0) { cerr << '.'; flag = true; } - if (lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } - ParseTranslatorInput(line, &ssrc, &strg); - LatticeTools::ConvertTextToLattice(ssrc, &src); - LatticeTools::ConvertTextToLattice(strg, &trg); + if (rank == 0 && lc % 1000 == 0) { cerr << '.'; flag = true; } + if (rank == 0 && lc %50000 == 0) { cerr << " [" << lc << "]\n" << flush; flag = false; } denom += trg.size(); exp_src.clear(); exp_src.resize(src.size(), TMatrix::Zero()); @@ -266,10 +269,10 @@ int main(int argc, char** argv) { Array2D exp_refs(src.size(), trg.size(), TMatrix::Zero()); Array2D z_refs(src.size(), trg.size(), 0.0); for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j][0].label].insert(j); + trg_pos[trg[j]].insert(j); for (unsigned i = 0; i < src.size(); ++i) { - const RVector& r_s = r_src[src[i][0].label]; + const RVector& r_s = r_src[src[i]]; const RTVector pred = r_s.transpose() * t; TMatrix& exp_m = exp_src[i]; double& z = z_src[i]; @@ -293,7 +296,7 @@ int main(int argc, char** argv) { } } for (unsigned j = 0; j < trg.size(); ++j) - trg_pos[trg[j][0].label].clear(); + trg_pos[trg[j]].clear(); // model expectations for a single target generation with // uniform alignment prior @@ -323,8 +326,8 @@ int main(int argc, char** argv) { // TODO handle alignment prob } if (ref_z <= 0) { - cerr << "TRG=" << TD::Convert(trg[j][0].label) << endl; - cerr << " LINE=" << line << endl; + cerr << "TRG=" << TD::Convert(trg[j]) << endl; + cerr << " LINE=" << lc << " (RANK=" << rank << "/" << size << ")" << endl; cerr << " REF_EXP=\n" << ref_exp << endl; cerr << " M_EXP=\n" << m_exp << endl; abort(); @@ -339,30 +342,42 @@ int main(int argc, char** argv) { } } - if (iter == (ITERATIONS - 1) || lc == 28) { cerr << al << endl; } + if (rank == 0 && (iter == (ITERATIONS - 1) || lc < 12)) { cerr << al << endl; } } - if (flag) { cerr << endl; } + if (flag && rank == 0) { cerr << endl; } - const double base2_likelihood = likelihood / log(2); - cerr << " log_e likelihood: " << likelihood << endl; - cerr << " log_2 likelihood: " << base2_likelihood << endl; - cerr << " cross entropy: " << (-base2_likelihood 
/ denom) << endl; - cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + double obj = 0; if (!SGD) { Flatten(g, &flat_g); - double obj = -likelihood; - if (has_l2) { - const double r = ApplyRegularization(reg_strength, - flat_t, - &flat_g); - obj += r; - cerr << " regularization: " << r << endl; + obj = -likelihood; + // TODO - reduce gradient + } + + if (rank == 0) { + double gn = 0; + for (unsigned i = 0; i < flat_g.size(); ++i) + gn += flat_g[i]*flat_g[i]; + const double base2_likelihood = likelihood / log(2); + cerr << " log_e likelihood: " << likelihood << endl; + cerr << " log_2 likelihood: " << base2_likelihood << endl; + cerr << " cross entropy: " << (-base2_likelihood / denom) << endl; + cerr << " perplexity: " << pow(2.0, -base2_likelihood / denom) << endl; + cerr << " gradient norm: " << sqrt(gn) << endl; + if (!SGD) { + if (has_l2) { + const double r = ApplyRegularization(reg_strength, + flat_t, + &flat_g); + obj += r; + cerr << " regularization: " << r << endl; + } + lbfgs->Optimize(obj, flat_g, &flat_t); + converged = (lbfgs->HasConverged()); } - lbfgs.Optimize(obj, flat_g, &flat_t); - Unflatten(flat_t, &t); - if (lbfgs.HasConverged()) break; } - cerr << t << endl; +#ifdef HAVE_MPI + mpi::broadcast(world, converged, 0); +#endif } cerr << "TRANSLATION MATRIX:" << endl << t << endl; return 0; diff --git a/utils/agenda.h b/utils/agenda.h deleted file mode 100644 index d4f13696..00000000 --- a/utils/agenda.h +++ /dev/null @@ -1,140 +0,0 @@ -#ifndef AGENDA_H -#define AGENDA_H - -#define DBG_AGENDA(x) x -/* - a priority queue where you expect to queue the same item at different - priorities several times before finally popping it. higher priority = better. - so in best first you'd be using negative cost or e^-cost (probabilities, in - other words). - - this means you have a way to look up a key and see its location in the queue, - so its priority can be adjusted (or, simpler implementation: so when you pop, - you see if you've already popped before at a lower cost, and skip the - subsequent pops). - - it's assumed that you'll never queue an item @ a better priority after it has - already been popped. that is, the agenda will track already completed items. - maybe in the future i will let you recompute a cheaper way to reach things - after first-pop also, it's assumed that we're always improving prios of - existing items, never making them worse (even though technically this is - possible and sensible if it hasn't been popped yet). - - simple binary max heap for now. there are better practical options w/ - superior cache locaility. movements in the heap need to update a record for - that key of where the key went. i do this by creating canonical key pointers - out of boost object pools (if the key were lightweight e.g. an int, then it - would make sense to use the hash lookup too - - since i'm doing key hashing to start with, i also allow you to attach some - arbitrary data (value) payload beyond key+priority. - - hash map from key to done (has been popped) -> set where doneness is marked in key item? 
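Stepping back to the lbl_model.cc changes that end just above: the control flow follows a conventional data-parallel recipe, sketched here in stripped-down form. The corpus name is invented, the gradient reduction is still a TODO in the patch itself, and the broadcast calls mirror the ones the patch adds.

    #ifdef HAVE_MPI
    #include <boost/mpi/environment.hpp>
    #include <boost/mpi/communicator.hpp>
    #include <boost/mpi/collectives.hpp>
    namespace mpi = boost::mpi;
    #endif
    #include <set>
    #include <vector>
    #include "corpus_tools.h"

    int Train(int argc, char** argv) {
    #ifdef HAVE_MPI
      mpi::environment env(argc, argv);
      mpi::communicator world;
      const int rank = world.rank(), size = world.size();
    #else
      const int rank = 0, size = 1;
    #endif
      std::vector<std::vector<WordID> > srcs, trgs;
      std::set<WordID> trg_vocab;
      // Each rank keeps only every size-th sentence pair, starting at its rank.
      CorpusTools::ReadFromFile("corpus.fr-en", &srcs, NULL, &trgs, &trg_vocab, rank, size);

      std::vector<double> params(100 * 100), grad(params.size());
      bool converged = false;
      while (!converged) {
    #ifdef HAVE_MPI
        mpi::broadcast(world, &params[0], params.size(), 0);  // rank 0 owns the parameters
    #endif
        // ... accumulate the local gradient over this rank's shard into grad ...
        // ... reduce grad to rank 0 here (left as a TODO in the patch) ...
        if (rank == 0) {
          // ... take an L-BFGS step on the summed gradient, set converged ...
        }
    #ifdef HAVE_MPI
        mpi::broadcast(world, converged, 0);
    #endif
      }
      return 0;
    }
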
- - a slightly different way to make an adjustable heap would be to use - tree-structured parent/children links intrusively (or mapped by key) in the - key, rather than indices in a compact binary-tree heap - - */ - -#include "best.h" -#include "intern_pool.h" -#include "d_ary_heap.h" -#include "lvalue_pmap.h" -#include -#include - -/* -template -struct priority_traits { - typedef typename P::priority_type priority_type; -}; -*/ - -typedef best_t agenda_best_t; -typedef unsigned agenda_location_t; - -PMAP_MEMBER_INDIRECT(LocationMap,agenda_location_t,location) -PMAP_MEMBER_INDIRECT(PriorityMap,agenda_best_t,priority) - -struct Less { - typedef bool result_type; - template - bool operator()(A const& a,B const& b) const { return a,class HashKey=boost::hash,class EqKey=std::equal_to, class Pool=boost::object_pool > -struct Agenda : intern_pool { - typedef intern_pool Intern; // inherited because I want to use construct() - /* this is less generic than it could be, because I want to use a single hash mapping to intern to canonical mutable object pointers, where the property maps are just lvalue accessors */ - typedef typename KeyF::result_type Key; - typedef Item * Handle; - typedef LocationMap LocMap; - typedef PriorityMap PrioMap; - LocMap locmap; - PrioMap priomap; // note: priomap[item] is set by caller before giving us the item; then tracks best (for canonicalized item) thereafter - - Better better; - //NOT NEEDED: initialize function object state (there is none) - - typedef Item *ItemC; //canonicalized pointer - typedef Item *ItemP; - static const std::size_t heap_arity=4; // might be fastest possible (depends on key size probably - cache locality is bad w/ arity=2) - typedef std::vector HeapStorage; - typedef d_ary_heap_indirect Heap; - Heap q; - - // please don't call q.push etc. directly. - void add(ItemP i) { - bool fresh=interneq(i); - DBG_AGENDA(assert(fresh && !q.contains(i))); - q.push(i); - } - bool improve(ItemP i) { - ItemP c=i; - bool fresh=interneq(c); - if (fresh) { - add(c); - return true; - } - DBG_AGENDA(assert(q.contains(c))); - return q.maybe_improve(priomap[i]); - } - inline bool empty() { - return q.empty(); - } - // no need to destroy the canon. item because we want to remember the best cost and reject more expensive ways of using it). - ItemC pop() { - ItemC r=q.top(); - q.pop(); - return r; - } - void pop_discard() { - q.pop(); - } - - ItemC top() { - DBG_AGENDA(assert(!empty())); - return q.top(); - } - - agenda_best_t best() const { - return q.best(); //TODO: cache/track the global best? - } - - agenda_best_t second_best() const { - return q.second_best(); - } - - // add only if worse than queue current best, otherwise evaluate immediately (e.g. for early stopping w/ expensive to compute additional cost). return true if postponed (added) - bool postpone(ItemP i) { - if (better(priomap[i],best())) return false; - return improve(i); - } - - Agenda(unsigned reserve=1000000,LocMap const& lm=LocMap(),PrioMap const& pm=PrioMap(),EqKey const& eq=EqKey(),Better const& better=Better()) : locmap(lm), priomap(pm), better(better), q(priomap,locmap,better,reserve) { } -}; - -#endif diff --git a/utils/best.h b/utils/best.h deleted file mode 100644 index ed15e0be..00000000 --- a/utils/best.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef UTILS__BEST_H -#define UTILS__BEST_H - -#include "max_plus.h" - -typedef MaxPlus best_t; - -inline bool better(best_t const& a,best_t const& b) { - return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first. 
-} - -inline bool operator <(best_t const& a,best_t const& b) { - return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first. -} -struct BetterP { - inline bool operator ()(best_t const& a,best_t const& b) const { - return a.v_>b.v_; // intentionally reversed, so default min-heap, sort, etc. put best first. - } -}; - -inline void maybe_improve(best_t &a,best_t const& b) { - if (a.v_>b.v_) - a.v_=b.v_; -} - -template -inline void maybe_improve(best_t &a,O const& b) { - if (a.v_>b.v_) - a.v_=b.v_; -} - -#endif diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc new file mode 100644 index 00000000..a0542b6e --- /dev/null +++ b/utils/corpus_tools.cc @@ -0,0 +1,62 @@ +#include "corpus_tools.h" + +#include + +#include "tdict.h" +#include "filelib.h" +#include "verbose.h" + +using namespace std; + +void CorpusTools::ReadFromFile(const string& filename, + vector >* src, + set* src_vocab, + vector >* trg, + set* trg_vocab, + int rank, + int size) { + assert(rank >= 0); + assert(size > 0); + assert(rank < size); + if (src) src->clear(); + if (src_vocab) src_vocab->clear(); + if (trg) trg->clear(); + if (trg_vocab) trg_vocab->clear(); + const int expected_fields = 1 + (trg == NULL ? 0 : 1); + if (!SILENT) cerr << "Reading from " << filename << " ...\n"; + ReadFile rf(filename); + istream& in = *rf.stream(); + string line; + int lc = 0; + static const WordID kDIV = TD::Convert("|||"); + vector tmp; + while(getline(in, line)) { + const bool skip = (lc % size != rank); + ++lc; + if (skip) continue; + TD::ConvertSentence(line, &tmp); + src->push_back(vector()); + vector* d = &src->back(); + set* v = src_vocab; + int s = 0; + for (unsigned i = 0; i < tmp.size(); ++i) { + if (tmp[i] == kDIV) { + ++s; + if (s > 1) { cerr << "Unexpected format in line " << lc << ": " << line << endl; abort(); } + assert(trg); + trg->push_back(vector()); + d = &trg->back(); + v = trg_vocab; + } else { + d->push_back(tmp[i]); + if (v) v->insert(tmp[i]); + } + } + ++s; + if (expected_fields != s) { + cerr << "Wrong number of fields in line " << lc << ": " << line << endl; abort(); + } + } +} + + diff --git a/utils/corpus_tools.h b/utils/corpus_tools.h new file mode 100644 index 00000000..97bdaa94 --- /dev/null +++ b/utils/corpus_tools.h @@ -0,0 +1,19 @@ +#ifndef _CORPUS_TOOLS_H_ +#define _CORPUS_TOOLS_H_ + +#include +#include +#include +#include "wordid.h" + +struct CorpusTools { + static void ReadFromFile(const std::string& filename, + std::vector >* src, + std::set* src_vocab = NULL, + std::vector >* trg = NULL, + std::set* trg_vocab = NULL, + int rank = 0, + int size = 1); +}; + +#endif diff --git a/utils/d_ary_heap.h b/utils/d_ary_heap.h deleted file mode 100644 index 1270638a..00000000 --- a/utils/d_ary_heap.h +++ /dev/null @@ -1,568 +0,0 @@ -#ifndef D_ARY_HEAP_H -#define D_ARY_HEAP_H - -#include "show.h" -#define DDARY(x) - -#define D_ARY_PUSH_GRAEHL 0 // untested -#define D_ARY_POP_GRAEHL 0 // untested -#define D_ARY_DOWN_GRAEHL 0 // untested -#define D_ARY_UP_GRAEHL 0 // untested -#define D_ARY_APPEND_ALWAYS_PUSH 1 // heapify (0) is untested. otherwise switch between push and heapify depending on size (cache effects, existing items vs. # appended ones) - -#define D_ARY_TRACK_OUT_OF_HEAP 0 // shouldn't need to track, because in contains() false positives looking up stale or random loc map values are impossible - we just check key. 
note: if you enable this, you must init location to D_ARY_HEAP_NULL_INDEX yourself until it's been added or popped -#define D_ARY_VERIFY_HEAP 1 -// This is a very expensive test so it should be disabled even when NDEBUG is not defined - -# undef D_ARY_HEAP_NULL_INDEX -# define D_ARY_HEAP_NULL_INDEX (-1) // you may init location to this. - -/* adapted from boost/graph/detail/d_ary_heap.hpp - - local modifications: - - clear, heapify, append range/container, Size type template arg, reserve constructor arg - - hole+move rather than swap. note: swap would be more efficient for heavyweight keys, until move ctors exist - - don't set locmap to -1 when removing from heap (waste of time) - - // unlike arity=2 case, you don't gain anything by having indices start at 1, with 0-based child indices - // root @1, A=2, children indices m={0,1}: parent(i)=i/2, child(i,m)=2*i+m - // root @0: parent(i)=(i-1)/A child(i,n)=i*A+n+1 - can't improve on this except child(i,m)=i*A+m - (integer division, a/b=floor(a/b), so (i-1)/A = ceil(i/A)-1, or greatest int less than (i/A)) - - actually, no need to adjust child index, since child is called only once and inline - - e.g. for A=3 gorn address in tree -> index - - () = root -> 0 - (1) -> 1 - (2) -> 2 - (3) (A) -> 3 - (1,1) -> (1*A+1) = 4 - (1,2) -> (1*A+2) = 5 - (1,3) -> (1*A+3) = 6 - (2,1) -> (2*A+1) = 7 - etc. - -//TODO: block-align siblings! assume data[0] is 16 or 32-byte aligned ... then we want root @ index (blocksize-1). see http://www.lamarca.org/anthony/pubs/heaps.pdf pg8. for pow2(e.g. 4)-ary heap, it may be reasonable to use root @index A-1. however, suppose the key size is not padded to a power of 2 (e.g. 12 bytes), then we would need internal gaps at times. would want to use compile const template based inlineable alignment math for this? possibly use a container like vector that lets you specify padding relative to some address multiple for v[0]. - - optimal D: see http://www.lamarca.org/anthony/pubs/heaps.pdf pg 9. depedns on relative cost of swap,compare, but in all cases except swap=free, 2 is worse than 3-4. for expensive swap (3x compare), 4 still as good as 5. so just use 4. boost benchmarking djikstra agrees; 4 is best. - - cache-aligned 4-heap speedup over regular 2-heap is 10-80% (for huge heaps, the speedup is more) - - splay/skew heaps are worse than 2heap or aligned 4heap in practice. - - //TODO: switch from heapify (Floyd's method) to repeated push past some size limit (in bytes) due to cache effect - - #define D_ARY_BYTES_OUT_OF_CACHE 0x1000000 - - //TODO: assuming locmap is an lvalue pmap, we can be more efficient. on the other hand, if it's an intrusive property map to an interned mutable object, there's no difference in performance, and that's what i'm going to do in my first uses. plus, if keys are indices and the map is a vector, it's barely any overhead. - - */ - -// -//======================================================================= -// Copyright 2009 Trustees of Indiana University -// Authors: Jeremiah J. Willcock, Andrew Lumsdaine -// -// Distributed under the Boost Software License, Version 1.0. (See -// accompanying file LICENSE_1_0.txt or copy at -// http://www.boost.org/LICENSE_1_0.txt) -//======================================================================= -// - -#include -#include -#include -#include -#include -#include -#include -#include - - - // D-ary heap using an indirect compare operator (use identity_property_map - // as DistanceMap to get a direct compare operator). 
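The index arithmetic described in the comment above, written out as standalone helpers (an illustration, not code from the heap itself); the asserts reproduce the A=3 example:

    #include <cassert>
    #include <cstddef>

    // Root at index 0: parent(i) = (i-1)/A, child(i,m) = i*A + m for m in [1, A].
    template <std::size_t A> std::size_t Parent(std::size_t i) { return (i - 1) / A; }
    template <std::size_t A> std::size_t Child(std::size_t i, std::size_t m) { return i * A + m; }

    int main() {
      assert(Child<3>(0, 1) == 1);  // (1)
      assert(Child<3>(1, 1) == 4);  // (1,1)
      assert(Child<3>(1, 3) == 6);  // (1,3)
      assert(Parent<3>(7) == 2);    // (2,1) is a child of (2)
      return 0;
    }
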
This heap appears to be - // commonly used for Dijkstra's algorithm for its good practical performance - // on some platforms; asymptotically, it's not optimal; it has an O(lg N) decrease-key - // operation, which is (amortized) constant time on a relaxed heap or fibonacci heap. The - // implementation is mostly based on the binary heap page on Wikipedia and - // online sources that state that the operations are the same for d-ary - // heaps. This code is not based on the old Boost d-ary heap code. - // - // - d_ary_heap_indirect is a model of UpdatableQueue as is needed for - // dijkstra_shortest_paths. - // - // - Value must model Assignable. - // - Arity must be at least 2 (optimal value appears to be 4, both in my and - // third-party experiments). - // - IndexInHeapMap must be a ReadWritePropertyMap from Value to - // Container::size_type (to store the index of each stored value within the - // heap for decrease-key aka update). - // - DistanceMap must be a ReadablePropertyMap from Value to something - // (typedef'ed as distance_type). - // - Compare must be a BinaryPredicate used as a less-than operator on - // distance_type. - // - Container must be a random-access, contiguous container (in practice, - // the operations used probably require that it is std::vector). - // - template , - typename Container = std::vector, - typename Size = typename Container::size_type, - typename Equal = std::equal_to > - class d_ary_heap_indirect { - BOOST_STATIC_ASSERT (Arity >= 2); - public: - typedef Container container_type; - typedef Size size_type; - typedef Value value_type; - typedef typename Container::const_iterator const_iterator; - typedef const_iterator iterator; - // The distances being compared using better and that are stored in the - // distance map - typedef typename boost::property_traits::value_type distance_type; - d_ary_heap_indirect(DistanceMap const& distance, - IndexInHeapPropertyMap const& index_in_heap, - const Better& better = Better(), - size_type container_reserve = 100000, - Equal const& equal = Equal() - ) - : better(better), data(), distance(distance), - index_in_heap(index_in_heap),equal(equal) { - data.reserve(container_reserve); - } - /* Implicit copy constructor */ - /* Implicit assignment operator */ - - template - void append_heapify(C const& c) { - data.reserve(data.size()+c.size()); - append_heapify(c.begin(),c.end()); - } - - template - void append_heapify(I begin,I end) { - data.insert(data.end(),begin,end); - heapify(); - } - - template - void append_push(C const& c) { - data.reserve(data.size()+c.size()); - append_push(c.begin(),c.end()); - } - - // past some threshold, this should be faster than append_heapify. also, if there are many existing elements it will be faster. - template - void append_push(I begin,I end) { - for (;begin!=end;++begin) - push(*begin); - } - - template - void append(C const& c) { - if (D_ARY_APPEND_ALWAYS_PUSH || data.size()>=c.size()/2) - append_push(c); - else - append_heapify(c); - } - - // past some threshold, this should be faster than append_heapify. also, if there are many existing elements it will be faster. - template - void append(I begin,I end) { - if (D_ARY_APPEND_ALWAYS_PUSH || data.size()>=0x10000) - append_push(begin,end); - else - append_heapify(begin,end); - } - - // could allow mutation of data directly, e.g. 
push_back 1 at a time - but then they could forget to heapify() - - //from bottom of heap tree up, turn that subtree into a heap by adjusting the root down - // for n=size, array elements indexed by floor(n/2) + 1, floor(n/2) + 2, ... , n are all leaves for the tree, thus each is an one-element heap already - // warning: this is many fewer instructions but, at some point (when heap doesn't fit in Lx cache) it will become slower than repeated push(). - void heapify() { - for (size_type i=parent(data.size()-1);i>0;--i) // starting from parent of last node, ending at first child of root (i==1) - preserve_heap_property_down(i); - } - - void reserve(size_type s) { - data.reserve(s); - } - - size_type size() const { - return data.size(); - } - - bool empty() const { - return data.empty(); - } - - const_iterator begin() const { - return data.begin(); - } - - const_iterator end() const { - return data.end(); - } - - void clear() { -#if D_ARY_TRACK_OUT_OF_HEAP - using boost::put; - for (typename Container::iterator i=data.begin(),e=data.end();i!=e;++i) - put(index_in_heap,*i,(size_type)D_ARY_HEAP_NULL_INDEX); -#endif - data.clear(); - } - - void push(const Value& v) { - if (D_ARY_PUSH_GRAEHL) { - size_type i = data.size(); - data.push_back(Value()); // (hoping default construct is cheap, construct-copy inline) - preserve_heap_property_up(v,i); // we don't have to recopy v, or init index_in_heap - } else { - size_type index = data.size(); - data.push_back(v); - using boost::put; - put(index_in_heap, v, index); - preserve_heap_property_up(index); - } - verify_heap(); - } - - Value& top() { - return data[0]; - } - - const Value& top() const { - return data[0]; - } - - void pop() { - using boost::put; - if(D_ARY_TRACK_OUT_OF_HEAP) - put(index_in_heap, data[0], (size_type)D_ARY_HEAP_NULL_INDEX); - if (data.size() != 1) { - if (D_ARY_POP_GRAEHL) { - preserve_heap_property_down(data.back(),0,data.size()-1); - data.pop_back(); - } else { - data[0] = data.back(); - put(index_in_heap, data[0], 0); - data.pop_back(); - preserve_heap_property_down(); - } - verify_heap(); - } else { - data.pop_back(); - } - } - - // This function assumes the key has been improved - // (distance has become smaller, so it may need to rise toward top(). - // i.e. decrease-key in a min-heap - void update(const Value& v) { - using boost::get; - size_type index = get(index_in_heap, v); - preserve_heap_property_up(v,index); - verify_heap(); - } - - // return true if improved. - bool maybe_improve(const Value& v,distance_type dbetter) { - using boost::get; - if (better(dbetter,get(distance,v))) { - preserve_heap_property_up_dist(v,dbetter); - return true; - } - return false; - } - - distance_type best(distance_type null=0) const { - return empty() ? null : get(distance,data[0]); - } - distance_type second_best(distance_type null=0) const { - if (data.size()<2) return null; - int m=std::min(data.size(),Arity+1); -// if (m>=Arity) m=Arity+1; - distance_type b=get(distance,data[1]); - for (int i=2;i=0 && i=0 check to catch uninit. 
data - } -#include "warning_pop.h" - - inline bool contains(const Value& v) const { - using boost::get; - return contains(v,get(index_in_heap, v)); - } - - void push_or_update(const Value& v) { /* insert if not present, else update */ - using boost::get; - size_type index = get(index_in_heap, v); - if (D_ARY_PUSH_GRAEHL) { - if (contains(v,index)) - preserve_heap_property_up(v,index); - else - push(v); - } else { - if (!contains(v,index)) { - index = data.size(); - data.push_back(v); - using boost::put; - put(index_in_heap, v, index); - } - preserve_heap_property_up(index); - } - verify_heap(); - } - - private: - Better better; - Container data; - DistanceMap distance; - IndexInHeapPropertyMap index_in_heap; - Equal equal; - - // Get the parent of a given node in the heap - static inline size_type parent(size_type index) { - return (index - 1) / Arity; - } - - // Get the child_idx'th child of a given node; 0 <= child_idx < Arity - static inline size_type child(size_type index, std::size_t child_idx) { - return index * Arity + child_idx + 1; - } - - // Swap two elements in the heap by index, updating index_in_heap - inline void swap_heap_elements(size_type index_a, size_type index_b) { - using std::swap; - Value value_a = data[index_a]; - Value value_b = data[index_b]; - data[index_a] = value_b; - data[index_b] = value_a; - using boost::put; - put(index_in_heap, value_a, index_b); - put(index_in_heap, value_b, index_a); - } - - inline void move_heap_element(Value const& v,size_type ito) { - using boost::put; - put(index_in_heap,v,ito); - data[ito]=v; //todo: move assign? - } - - // Verify that the array forms a heap; commented out by default - void verify_heap() const { - // This is a very expensive test so it should be disabled even when - // NDEBUG is not defined -#if D_ARY_VERIFY_HEAP - using boost::get; - for (size_t i = 1; i < data.size(); ++i) { - if (better(get(distance,data[i]), get(distance,data[parent(i)]))) { - assert (!"Element is smaller than its parent"); - } - } -#endif - } - - // we have a copy of the key, so we don't need to do that stupid find # of levels to move then move. 
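The hole-based sift-up mentioned in the "local modifications" note (hole+move rather than swap) can be sketched on a plain binary min-heap; this is illustrative only and not part of the patch, using direct int keys instead of property maps:

#include <vector>

// Keep one copy of the moving value, shift worse parents down into the "hole",
// and write the value back exactly once at the end (one move per level instead
// of one swap per level).
void sift_up_with_hole(std::vector<int>& data, size_t index) {
  const int moving = data[index];      // data[index] is now conceptually a hole
  while (index > 0) {
    const size_t parent = (index - 1) / 2;
    if (moving < data[parent]) {       // min-heap: the moving value is better
      data[index] = data[parent];      // move the parent down into the hole
      index = parent;                  // the hole rises one level
    } else {
      break;
    }
  }
  data[index] = moving;                // fill the hole
}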
we act as though data[index]=currently_being_moved, but in fact it's an uninitialized "hole", which we fill at the very end - inline void preserve_heap_property_up(Value const& currently_being_moved,size_type index) { - using boost::get; - preserve_heap_property_up(currently_being_moved,index,get(distance,currently_being_moved)); - } - - inline void preserve_heap_property_up_set_dist(Value const& currently_being_moved,distance_type dbetter) { - using boost::get; - using boost::put; - put(distance,currently_being_moved,dbetter); - preserve_heap_property_up(currently_being_moved,get(index_in_heap,currently_being_moved),dbetter); - verify_heap(); - } - - void preserve_heap_property_up(Value const& currently_being_moved,size_type index,distance_type currently_being_moved_dist) { - using boost::put; - using boost::get; - if (D_ARY_UP_GRAEHL) { - for (;;) { - if (index == 0) break; // Stop at root - size_type parent_index = parent(index); - Value const& parent_value = data[parent_index]; - if (better(currently_being_moved_dist, get(distance, parent_value))) { - move_heap_element(parent_value,index); - index = parent_index; - } else { - break; // Heap property satisfied - } - } - //finish "swap chain" by filling hole w/ currently_being_moved - move_heap_element(currently_being_moved,index); // note: it's ok not to return early on index==0 at start, even if self-assignment isn't supported by Value - because currently_being_moved is a copy. - } else { - put(index_in_heap,currently_being_moved,index); - put(distance,currently_being_moved,currently_being_moved_dist); - preserve_heap_property_up(index); - } - } - - // Starting at a node, move up the tree swapping elements to preserve the - // heap property. doesn't actually use swap; uses hole - void preserve_heap_property_up(size_type index) { - using boost::get; - if (index == 0) return; // Do nothing on root - if (D_ARY_UP_GRAEHL) { - Value copyi=data[index]; - preserve_heap_property_up(copyi,index); - return; - } - size_type orig_index = index; - size_type num_levels_moved = 0; - // The first loop just saves swaps that need to be done in order to avoid - // aliasing issues in its search; there is a second loop that does the - // necessary swap operations - Value currently_being_moved = data[index]; - distance_type currently_being_moved_dist = - get(distance, currently_being_moved); - for (;;) { - if (index == 0) break; // Stop at root - size_type parent_index = parent(index); - Value parent_value = data[parent_index]; - if (better(currently_being_moved_dist, get(distance, parent_value))) { - ++num_levels_moved; - index = parent_index; - continue; - } else { - break; // Heap property satisfied - } - } - // Actually do the moves -- move num_levels_moved elements down in the - // tree, then put currently_being_moved at the top - index = orig_index; - using boost::put; - for (size_type i = 0; i < num_levels_moved; ++i) { - size_type parent_index = parent(index); - Value parent_value = data[parent_index]; - put(index_in_heap, parent_value, index); - data[index] = parent_value; - index = parent_index; - } - data[index] = currently_being_moved; - put(index_in_heap, currently_being_moved, index); - verify_heap(); - } - - - // From the root, swap elements (each one with its smallest child) if there - // are any parent-child pairs that violate the heap property. v is placed at data[i], but then pushed down (note: data[i] won't be read explicitly; it will instead be overwritten by percolation). 
this also means that v must be a copy of data[i] if it was already at i. - // e.g. v=data.back(), i=0, sz=data.size()-1 for pop(), implicitly swapping data[i], data.back(), and doing data.pop_back(), then adjusting from 0 down w/ swaps. updates index_in_heap for v. - inline void preserve_heap_property_down(Value const& currently_being_moved,size_type i,size_type heap_size) { - using boost::get; - distance_type currently_being_moved_dist=get(distance,currently_being_moved); - Value* data_ptr = &data[0]; - size_type index = 0; // hole at index - currently_being_moved to be put here when we find the final hole spot - for (;;) { - size_type first_child_index = child(index, 0); - if (first_child_index >= heap_size) break; /* No children */ - Value* child_base_ptr = data_ptr + first_child_index; // using index of first_child_index+smallest_child_index because we hope optimizer will be smart enough to const-unroll a loop below if we do this. i think the optimizer would have gotten it even without our help (i.e. store root-relative index) - - // begin find best child index/distance - size_type smallest_child_index = 0; // don't add to base first_child_index every time we update which is smallest. - distance_type smallest_child_dist = get(distance, child_base_ptr[smallest_child_index]); -#undef D_ARY_MAYBE_IMPROVE_CHILD_I -#define D_ARY_MAYBE_IMPROVE_CHILD_I \ - distance_type i_dist = get(distance, child_base_ptr[i]); \ - if (better(i_dist, smallest_child_dist)) { \ - smallest_child_index = i; \ - smallest_child_dist = i_dist; \ - } - if (first_child_index + Arity <= heap_size) { - // avoid repeated heap_size boundcheck (should test if this is really a speedup - instruction cache tradeoff - could use upperbound = min(Arity,heap_size-first_child_index) instead. but this optimizes to a fixed number of iterations (compile time known) so probably worth it - for (size_t i = 1; i < Arity; ++i) { - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } else { - for (size_t i = 1,e=heap_size - first_child_index; i < e; ++i) { - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } - //end: know best child - - if (better(smallest_child_dist, currently_being_moved_dist)) { - // instead of swapping, move. - move_heap_element(child_base_ptr[smallest_child_index],index); // move up - index=first_child_index+smallest_child_index; // descend - hole is now here - } else { - move_heap_element(currently_being_moved,index); // finish "swap chain" by filling hole - break; - } - } - verify_heap(); - } - - inline void preserve_heap_property_down(size_type i) { - preserve_heap_property_down(data[i],i,data.size()); - } - - void preserve_heap_property_down() { - using boost::get; - if (data.empty()) return; - if (D_ARY_DOWN_GRAEHL) { // this *should* be more efficient because i avoid swaps. - Value copy0=data[0]; - preserve_heap_property_down(copy0,0,data.size()); - return; - } - size_type index = 0; - Value currently_being_moved = data[0]; - distance_type currently_being_moved_dist = - get(distance, currently_being_moved); - size_type heap_size = data.size(); - Value* data_ptr = &data[0]; - for (;;) { - size_type first_child_index = child(index, 0); - if (first_child_index >= heap_size) break; /* No children */ - Value* child_base_ptr = data_ptr + first_child_index; - size_type smallest_child_index = 0; - distance_type smallest_child_dist = get(distance, child_base_ptr[smallest_child_index]); - if (first_child_index + Arity <= heap_size) { - for (size_t i = 1; i < Arity; ++i) { // can be unrolled completely. 
- - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } else { - for (size_t i = 1,e=heap_size - first_child_index; i < e; ++i) { - D_ARY_MAYBE_IMPROVE_CHILD_I - } - } - if (better(smallest_child_dist, currently_being_moved_dist)) { - swap_heap_elements(smallest_child_index + first_child_index, index); - index = smallest_child_index + first_child_index; - continue; - } else { - break; // Heap property satisfied - } - } - verify_heap(); - } - - }; - -#endif diff --git a/utils/ftoa.h b/utils/ftoa.h deleted file mode 100644 index 3dba528d..00000000 --- a/utils/ftoa.h +++ /dev/null @@ -1,403 +0,0 @@ -#ifndef FTOA_H -#define FTOA_H - - -//TODO: for fractional digits/non-sci, determine the right amount of left padding (more if the whole number is indeed <1, to keep the significant digits), less if sci notation and/or mantissa has sig. digits (don't want N before . and N after!) - -#ifndef FTOA_ROUNDTRIP -# define FTOA_ROUNDTRIP 1 -#endif - -#ifndef FTOA_DEBUG -# define FTOA_DEBUG 0 -#endif - -#ifndef FTOA_USE_SPRINTF -#define FTOA_USE_SPRINTF 0 -#endif - -#if FTOA_DEBUG -# define FTOAassert(x) assert(x) -# define DBFTOA(x) std::cerr<<"\nFTOA " <<__func__<<"("<<__LINE__<<"): " #x "="< -#include -#include -#include -#include -#include -#include "utoa.h" -#include "nan.h" - -template -struct ftoa_traits { -}; - -//eP10, -// sigd decimal places normally printed, roundtripd needed so that round-trip float->string->float is identity - -#define DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,used,P10) \ -template <> \ -struct ftoa_traits { \ - typedef INTT int_t; \ - typedef u ## INTT uint_t; \ - typedef FLOATT float_t; \ - enum { digits10=std::numeric_limits::digits10, chars_block=P10, usedig=used, sigdig=sigd, roundtripdig=roundtripd, bufsize=roundtripdig+7 }; \ - static const double pow10_block = 1e ## P10; \ - static const float_t small_f = small; \ - static const float_t large_f = large; \ - static inline int sprintf(char *buf,double f) { return std::sprintf(buf,"%." #used "g",f); } \ - static inline int sprintf_sci(char *buf,double f) { return std::sprintf(buf,"%." #used "e",f); } \ - static inline int sprintf_nonsci(char *buf,double f) { return std::sprintf(buf,"%." #used "f",f); } \ - static inline uint_t fracblock(double frac) { FTOAassert(frac>=0 && frac<1); double f=frac*pow10_block;uint_t i=(uint_t)f;FTOAassert(i=0 && frac<1); double f=frac*pow10_block;uint_t i=(uint_t)(f+.5);FTOAassert(ilarge; } \ - static inline bool use_sci(float_t f) { return use_sci_abs(std::fabs(f)); } \ -}; -//TODO: decide on computations in double (would hurt long double) or in native float type - any advantage? more precision is usually better. - -//10^22 = 0x1.0f0cf064dd592p73 is the largest exactly representable power of 10 in the binary64 format. but round down to 18 so int64_t can hold it. - -#if FTOA_ROUNDTRIP -#define DEFINE_FTOA_TRAITS_ROUNDTRIP(FLOATT,INTT,sigd,roundtripd,small,large) DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,roundtripd,roundtripd) -#else -#define DEFINE_FTOA_TRAITS_ROUNDTRIP(FLOATT,INTT,sigd,roundtripd,small,large) DEFINE_FTOA_TRAITS(FLOATT,INTT,sigd,roundtripd,small,large,sigd,sigd) -#endif - -DEFINE_FTOA_TRAITS_ROUNDTRIP(double,int64_t,15,17,1e-5,1e8) -//i've heard that 1e10 is fine for float. but we only have 1e9 (9 decimal places) in int32. 
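The 17 and 9 round-trip digit counts used by these DEFINE_FTOA_TRAITS_ROUNDTRIP lines can be sanity-checked with plain sprintf/strtod; a minimal sketch, assuming an IEEE-754 platform with a correctly rounding strtod (not part of the patch):

#include <cassert>
#include <cstdio>
#include <cstdlib>

int main() {
  char buf[64];
  const float f = 1.0f / 3.0f;
  std::sprintf(buf, "%.9g", static_cast<double>(f));  // 9 significant digits for float
  assert(static_cast<float>(std::strtod(buf, NULL)) == f);
  const double d = 1.0 / 3.0;
  std::sprintf(buf, "%.17g", d);                       // 17 significant digits for double
  assert(std::strtod(buf, NULL) == d);
  return 0;
}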
-DEFINE_FTOA_TRAITS_ROUNDTRIP(float,int32_t,6,9,1e-3,1e8) - - -template -inline void ftoa_error(F f,char const* msg="") { - using namespace std; - cerr<<"ftoa error: "< -char *prepend_pos_frac_digits(char *p,F f) { - FTOAassert(f<1 && f >0); - typedef ftoa_traits FT; - //repeat if very small??? nah, require sci notation to take care of it. - typename FT::uint_t i=FT::rounded_fracblock(f); - DBFTOA2(f,i); - if (i>0) { - unsigned n_skipped; - char *d=utoa_drop_trailing_0(p,i,n_skipped); - char *b=p-FT::chars_block+n_skipped; - FTOAassert(b<=d); - left_pad(b,d,'0'); - return b; - } else { - return p; - } -} - -template -char *append_pos_frac_digits(char *p,F f) { // '0' right-padded, nul terminated, return position of nul. [p,ret) are the digits - if (f==0) { - *p++='0'; - return p; - } - FTOAassert(f<1 && f >0); - typedef ftoa_traits FT; - //repeat if very small??? nah, require sci notation to take care of it. - typename FT::uint_t i=FT::rounded_fracblock(f); - DBFTOA2(f,i); - if (i>0) { - char *e=p+FT::chars_block; - utoa_left_pad(p,e,i,'0'); - *e=0; - return e; - } else { - *p=0; - return p; - } -} - -template -inline char *prepend_pos_frac(char *p,F f) { - FTOAassert(f<1 && f>=0); - if (f==0) { - *--p='0'; - return p; - } - p=prepend_pos_frac_digits(p,f); - *--p='.'; - if (DECIMAL_FOR_WHOLE>0) - *--p='0'; - return p; -} - -template -inline char *append_pos_frac(char *p,F f) { - DBFTOA(f); - if (DECIMAL_FOR_WHOLE>0) - *p++='0'; - *p++='.'; - return append_pos_frac_digits(p,f); -} - -template -inline char *prepend_frac(char *p,F f,bool positive_sign=false) { - FTOAassert(f<1 && f>-1); - if (f==0) - *--p='0'; - else if (f<0) { - p=prepend_pos_frac(p,-f); - *--p='-'; - } else { - p=prepend_pos_frac(p,f); - if (positive_sign) - *--p='+'; - } - return p; -} - - -template -inline char *append_sign(char *p,F f,bool positive_sign=false) { - if (f<0) { - *p++='-'; - } else if (positive_sign) - *p++='+'; - return p; -} - -template -inline char *append_frac(char *p,F f,bool positive_sign=false) { - FTOAassert(f<1 && f>-1); - if (f==0) { - *p++='0'; - return p; - } else if (f<0) { - *p++='-'; - return append_pos_frac(p,-f); - } - if (positive_sign) { - *p++='+'; - return append_pos_frac(p,f); - } - -} - - -//append_frac, append_pos_sci, append_sci. notice these are all composed according to a pattern (but reversing order of composition in pre vs app). or can implement with copy through buffer - -/* will switch to sci notation if integer part is too big for the int type. but for very small values, will simply display 0 (i.e. //TODO: find out log10 and leftpad 0s then convert rest) */ -template -char *prepend_pos_nonsci(char *p,F f) { - typedef ftoa_traits FT; - typedef typename FT::uint_t uint_t; - DBFTOA(f); - FTOAassert(f>0); - if (f>std::numeric_limits::max()) - return prepend_pos_sci(p,f); - //which is faster - modf is weird and returns negative frac part if f is negative. while we could deal with this using fabs, we instead only handle positive here (put - sign in front and negate, then call us) - ? 
-#if 0 - F intpart; - F frac=std::modf(f,&intpart); - uint_t u=intpart; -#else - uint_t u=f; - F frac=f-u; -#endif - DBFTOA2(u,frac); - if (frac == 0) { - if (DECIMAL_FOR_WHOLE>1) - *--p='.'; - } else { - p=prepend_pos_frac_digits(p,frac); - *--p='.'; - } - if (u==0) { - if (DECIMAL_FOR_WHOLE>0) - *--p='0'; - } else - p=utoa(p,u); - return p; -} - -// modify p; return true if handled -template -inline bool prepend_0_etc(char *&p,F f,bool positive_sign=false) { - if (f==0) { - *--p='0'; - return true; - } - if (is_nan(f)) { - p-=3; - p[0]='N';p[1]='A';p[2]='N'; - return true; - } - if (is_pos_inf(f)) { - p-=3; - p[0]='I';p[1]='N';p[2]='F'; - if (positive_sign) - *--p='+'; - return true; - } - if (is_neg_inf(f)) { - p-=4; - p[0]='-';p[1]='I';p[2]='N';p[3]='F'; - return true; - } - return false; -} - -template -inline char *prepend_nonsci(char *p,F f,bool positive_sign=false) { - if (prepend_0_etc(p,f,positive_sign)) return p; - if (f<0) { - p=prepend_pos_nonsci(p,-f); - *--p='-'; - } else { - p=prepend_pos_nonsci(p,f); - if (positive_sign) - *--p='+'; - } - return p; -} - -template -inline char *prepend_pos_sci(char *p,F f,bool positive_sign_exp=false) { - FTOAassert(f>0); - typedef ftoa_traits FT; - int e10; - F mant=FT::mantexp10(f,e10); - DBFTOA(f); - DBFTOA2(mant,e10); - FTOAassert(mant<10.00001); - if (mant>=10.) { - ++e10; - mant*=.1; - } else if (mant < 1.) { - --e10; - mant*=10; - } - p=itoa(p,e10,positive_sign_exp); - *--p='e'; - return prepend_pos_nonsci(p,mant); -} - -template -inline char *prepend_sci(char *p,F f,bool positive_sign_mant=false,bool positive_sign_exp=false) { - if (prepend_0_etc(p,f,positive_sign_mant)) return p; - if (f==0) - *--p='0'; - else if (f<0) { - p=prepend_pos_sci(p,-f,positive_sign_exp); - *--p='-'; - } else { - p=prepend_pos_sci(p,f,positive_sign_exp); - if (positive_sign_mant) - *--p='+'; - } - return p; -} - -template -inline char *append_nonsci(char *p,F f,bool positive_sign=false) { - if (positive_sign&&f>=0) *p++='+'; - return p+ftoa_traits::sprintf_nonsci(p,f); -} - -template -inline char *append_sci(char *p,F f,bool positive_sign=false) { - if (positive_sign&&f>=0) *p++='+'; - return p+ftoa_traits::sprintf_sci(p,f); -} - -template -inline char *append_ftoa(char *p,F f,bool positive_sign=false) { - if (positive_sign&&f>=0) *p++='+'; - return p+ftoa_traits::sprintf(p,f); -} - -template -inline char *prepend_ftoa(char *p,F f) -{ - typedef ftoa_traits FT; - return FT::use_sci(f) ? prepend_sci(p,f) : prepend_nonsci(p,f); -} - -template -inline std::string ftos_append(F f) { - typedef ftoa_traits FT; - char buf[FT::bufsize]; - return std::string(buf,append_ftoa(buf,f)); -} - -template -inline std::string ftos_prepend(F f) { - typedef ftoa_traits FT; - char buf[FT::bufsize]; - char *end=buf+FT::bufsize; - return std::string(prepend_ftoa(end,f),end); -} - - -template -inline std::string ftos(F f) { -#if 0 - // trust RVO? no extra copies? - return FTOA_USE_SPRINTF ? ftos_append(f) : ftos_prepend(f); -#else - typedef ftoa_traits FT; - char buf[FT::bufsize]; - if (FTOA_USE_SPRINTF) { - return std::string(buf,append_ftoa(buf,f)); - } else { - char *end=buf+FT::bufsize; - return std::string(prepend_ftoa(end,f),end); - } -#endif -} - -namespace { - const int ftoa_bufsize=30; - char ftoa_outbuf[ftoa_bufsize]; -} - -// not even THREADLOCAL - don't use. 
-inline char *static_ftoa(float f) -{ - if (FTOA_USE_SPRINTF) { - append_ftoa(ftoa_outbuf,f); - return ftoa_outbuf; - } else { - char *end=ftoa_outbuf+ftoa_bufsize; - return prepend_ftoa(end,f); - } -} - - -#endif diff --git a/utils/int_or_pointer.h b/utils/int_or_pointer.h deleted file mode 100644 index 4b6a9e4a..00000000 --- a/utils/int_or_pointer.h +++ /dev/null @@ -1,70 +0,0 @@ -#ifndef INT_OR_POINTER_H -#define INT_OR_POINTER_H - -// if you ever wanted to store a discriminated union of pointer/integer without an extra boolean flag, this will do it, assuming your pointers are never odd. - -// check lsb for expected tag? -#ifndef IOP_CHECK_LSB -# define IOP_CHECK_LSB 1 -#endif -#if IOP_CHECK_LSB -# define iop_assert(x) assert(x) -#else -# define iop_assert(x) -#endif - -#include -#include - -template -struct IntOrPointer { - typedef Pointed pointed_type; - typedef Int integer_type; - typedef Pointed *value_type; - typedef IntOrPointer self_type; - IntOrPointer(int j) { *this=j; } - IntOrPointer(size_t j) { *this=j; } - IntOrPointer(value_type v) { *this=v; } - bool is_integer() const { return i&1; } - bool is_pointer() const { return !(i&1); } - value_type & pointer() { return p; } - const value_type & pointer() const { iop_assert(is_pointer()); return p; } - integer_type integer() const { iop_assert(is_integer()); return i >> 1; } - void set_integer(Int j) { i=2*j+1; } - void set_pointer(value_type p_) { p=p_;iop_assert(is_pointer()); } - void operator=(unsigned j) { i = 2*(integer_type)j+1; } - void operator=(int j) { i = 2*(integer_type)j+1; } - template - void operator=(C j) { i = 2*(integer_type)j+1; } - void operator=(value_type v) { p=v; } - IntOrPointer() {} - IntOrPointer(const self_type &s) : p(s.p) {} - void operator=(const self_type &s) { p=s.p; } - template - bool operator ==(C* v) const { return p==v; } - template - bool operator ==(const C* v) const { return p==v; } - template - bool operator ==(C j) const { return integer() == j; } - bool operator ==(self_type s) const { return p==s.p; } - bool operator !=(self_type s) const { return p!=s.p; } - template void print(O&o) const - { - if (is_integer()) - o << integer(); - else { - o << "0x" << std::hex << (size_t)pointer() << std::dec; - } - } - friend inline std::ostream& operator<<(std::ostream &o,self_type const& s) { - s.print(o); return o; - } -protected: - union { - value_type p; // must be even (guaranteed unless you're pointing at packed chars) - integer_type i; // stored as 2*data+1, so only has half the range (one less bit) of a normal integer_type - }; -}; - - -#endif diff --git a/utils/intern_pool.h b/utils/intern_pool.h deleted file mode 100644 index 7c739add..00000000 --- a/utils/intern_pool.h +++ /dev/null @@ -1,158 +0,0 @@ -#ifndef INTERN_POOL_H -#define INTERN_POOL_H - -#define DEBUG_INTERN_POOL(x) x - -/* to "intern" a string in lisp is to make a symbol from it (a pointer to a canonical copy whose pointer can be equality-compared/hashed directly with other interned things). 
we take an Item that has a key part and some mutable parts (that aren't in its identity), and we hash-by-value the key part to map to a canonical on-heap Item - and we use a boost object pool to allocate them */ - -//FIXME: actually store function object state (assumed stateless so far) - -#include -#include "hash.h" -//#include "null_traits.h" -#include - -template -struct get_key { // default accessor for I = like pair - typedef typename I::first_type const& result_type; - typedef I const& argument_type; - result_type operator()(I const& i) const { - return i.first; - } -}; - -// Arg type should be the non-pointer version. this saves me from using boost type traits to remove_pointer. f may be binary or unary -template -struct compose_indirect { - typedef Arg *argument_type; // we also accept Arg & - KeyF kf; - F f; - typedef typename F::result_type result_type; - result_type operator()(Arg const& p) const { - return f(kf(p)); - } - result_type operator()(Arg & p) const { - return f(kf(p)); - } - result_type operator()(Arg * p) const { - return f(kf(*p)); - } - template - result_type operator()(V const& v) const { - return f(kf(*v)); - } - - result_type operator()(Arg const& a1,Arg const& a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg & a1,Arg & a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg * a1,Arg * a2) const { - return f(kf(*a1),kf(*a2)); - } - template - result_type operator()(V const& v,W const&w) const { - return f(kf(*v),kf(*w)); - } - - -}; - -template -struct equal_indirect { - typedef Arg *argument_type; // we also accept Arg & - KeyF kf; - F f; - typedef bool result_type; - - result_type operator()(Arg const& a1,Arg const& a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg & a1,Arg & a2) const { - return f(kf(a1),kf(a2)); - } - result_type operator()(Arg * a1,Arg * a2) const { - return a1==a2||(a1&&a2&&f(kf(*a1),kf(*a2))); - } - template - result_type operator()(V const& v,W const&w) const { - return v==w||(v&&w&&f(kf(*v),kf(*w))); - } - - -}; - -/* - -template -struct indirect_function { - F f; - explicit indirect_function(F const& f=F()) : f(f) {} - typedef typename F::result_type result_type; - template - result_type operator()(V *p) const { - return f(*p); - } -}; -*/ - -template ,class HashKey=boost::hash,class EqKey=std::equal_to, class Pool=boost::object_pool > -struct intern_pool : Pool { - KeyF key; - typedef typename KeyF::result_type Key; - typedef Item *Handle; - typedef compose_indirect HashDeep; - typedef equal_indirect EqDeep; - typedef HASH_SET Canonical; - typedef typename Canonical::iterator CFind; - typedef std::pair CInsert; - Canonical canonical; - bool interneq(Handle &i) { // returns true if i is newly interned, false if it already existed - CInsert i_new=canonical.insert(i); - i=*i_new.first; - return i_new.second; - } -// inherited: Handle construct(...) - Handle construct_fresh() { return Pool::construct(); } - Handle intern(Handle i) { // (maybe invalidating i, returning a valid canonical handle (pointer) - CInsert i_new=canonical.insert(i); - if (i_new.second) - return i; - else { - free(i); - return *i_new->first; - } - } - void destroy_interned(Handle i) { - DEBUG_INTERN_POOL(assert(canonical.find(i)!=canonical.end())); - canonical.erase(i); - destroy(i); - } - bool destroy_fresh(Handle i) { - DEBUG_INTERN_POOL(assert(canonical.find(i)!=canonical.end()||*canonical.find(i)!=i)); // i is a constructed item not yet interned. 
- destroy(i); - } - void destroy_both(Handle i) { // i must have come from this pool. may be interned, or not. destroy both the noninterned and interned. - if (!destroy_if_interned(i)) destroy(i); - } - // destroy intern(i) if it exists. return true if it existed AND its address was i. otherwise return false (whether or not a value-equal item existed and was destroyed) - bool destroy_if_interned(Handle i) { - CFind f=canonical.find(i); - if (f!=canonical.end()) { - Handle interned=*f; - canonical.erase(f); - destroy(f); - if (f==i) return true; - } - return false; - } - - intern_pool() { - HASH_MAP_EMPTY(canonical,(Handle)0); - } -}; - - - -#endif diff --git a/utils/lvalue_pmap.h b/utils/lvalue_pmap.h deleted file mode 100644 index 5b9403c0..00000000 --- a/utils/lvalue_pmap.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef LVALUE_PMAP_H -#define LVALUE_PMAP_H - -#include - -// i checked: boost provides get and put given [] - but it's not being found by ADL so instead i define them myself - -// lvalue property map pmapname

that is: P p; valtype &v=p->name; -#define PMAP_MEMBER_INDIRECT(pmapname,valtype,name) template struct pmapname { \ - typedef P key_type; \ - typedef valtype value_type; \ - typedef value_type & reference; \ - typedef boost::lvalue_property_map_tag category; \ - reference operator[](key_type p) const { return p->name; } \ - typedef pmapname

self_type; \ - friend inline value_type const& get(self_type const&,key_type p) { return p->name; } \ - friend inline void put(self_type &,key_type p,value_type const& v) { p->name = v; } \ -}; - -#define PMAP_MEMBER_INDIRECT_2(pmapname,name) template struct pmapname { \ - typedef P key_type; \ - typedef R value_type; \ - typedef value_type & reference; \ - typedef boost::lvalue_property_map_tag category; \ - reference operator[](key_type p) const { return p->name; } \ - typedef pmapname self_type; \ - friend inline value_type const& get(self_type const&,key_type p) { return p->name; } \ - friend inline void put(self_type &,key_type p,value_type const& v) { p->name = v; } \ -}; - -#endif diff --git a/utils/max_plus.h b/utils/max_plus.h deleted file mode 100644 index 2e56f85e..00000000 --- a/utils/max_plus.h +++ /dev/null @@ -1,201 +0,0 @@ -#ifndef MAX_PLUS_H_ -#define MAX_PLUS_H_ - -#define MAX_PLUS_ORDER 0 -#define MAX_PLUS_DEBUG(x) - -// max-plus algebra. ordering a > b really means that (i.e. default a > around -// x+y := max{x,y} -// x*y := x+y -// 0 := -inf -// 1 := 0 -// additive inverse does not, but mult. does. (inverse()) and x/y := x-y = x+y.inverse() -//WARNING: default order is reversed, on purpose, i.e. alog(p_b). sorry. defaults in libs are to order ascending, but we want best first. - -#include -#include -#include -#include -#include -#include -#include "semiring.h" -#include "show.h" -//#include "logval.h" - -template -class MaxPlus { - public: - void print(std::ostream &o) const { - o<) - template - void operator=(O const& o) { - v_=o.v_; - } - template - MaxPlus(O const& o) : v_(o.v_) { } - - typedef MaxPlus Self; - MaxPlus() : v_(LOGVAL_LOG0) {} - explicit MaxPlus(double x) : v_(std::log(x)) {} - MaxPlus(init_1) : v_(0) { } - MaxPlus(init_0) : v_(LOGVAL_LOG0) { } - MaxPlus(int x) : v_(std::log(x)) {} - MaxPlus(unsigned x) : v_(std::log(x)) { } - MaxPlus(double lnx,bool sign) : v_(lnx) { MAX_PLUS_DEBUG(assert(!sign)); } - MaxPlus(double lnx,init_lnx) : v_(lnx) {} - static Self exp(T lnx) { return MaxPlus(lnx,false); } - - // maybe the below are faster than == 1 and == 0. i don't know. - bool is_1() const { return v_==0; } - bool is_0() const { return v_==LOGVAL_LOG0; } - - static Self One() { return Self(init_1()); } - static Self Zero() { return Self(init_0()); } - static Self e() { return Self(1,false); } - void logeq(const T& v) { v_ = v; } - bool signbit() const { return false; } - - Self& logpluseq(const Self& a) { - if (a.is_0()) return *this; - if (a.v_ < v_) { - v_ = v_ + log1p(std::exp(a.v_ - v_)); - } else { - v_ = a.v_ + log1p(std::exp(v_ - a.v_)); - } - return *this; - } - - Self& besteq(const Self& a) { - if (a.v_ < v_) - v_=a.v_; - return *this; - } - - Self& operator+=(const Self& a) { - if (a.v_ < v_) - v_=a.v_; - return *this; - } - - Self& operator*=(const Self& a) { - v_ += a.v_; - return *this; - } - - Self& operator/=(const Self& a) { - v_ -= a.v_; - return *this; - } - - // Self(fabs(log(x)),x.s_) - friend Self abslog(Self x) { - if (x.v_<0) x.v_=-x.v_; - return x; - } - - Self& poweq(const T& power) { - v_ *= power; - return *this; - } - - Self inverse() const { - return Self(-v_,false); - } - - Self pow(const T& power) const { - Self res = *this; - res.poweq(power); - return res; - } - - Self root(const T& root) const { - return pow(1/root); - } - -// copy elision - as opposed to explicit copy of Self const& o1, we should be able to construct Logval r=a+(b+c) as a single result in place in r. 
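The identities stated in the max-plus comment (x+y := max{x,y}, x*y := x+y, 0 := -inf, 1 := 0) can be spelled out on raw log-domain scores; a minimal sketch, independent of the MaxPlus class and its deliberately reversed ordering (not part of the patch):

#include <algorithm>
#include <cassert>
#include <limits>

int main() {
  const double zero = -std::numeric_limits<double>::infinity();  // additive identity
  const double one  = 0.0;                                       // multiplicative identity
  const double a = -1.0, b = -4.0;   // two hypothetical log-domain scores
  assert(std::max(a, b) == a);       // a (+) b = max{a,b}: the better score wins
  assert(a + b == -5.0);             // a (*) b = a + b: multiplies the underlying probs
  assert(std::max(a, zero) == a);    // x (+) 0 = x
  assert(a + one == a);              // x (*) 1 = x
  return 0;
}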
todo: return std::move(o1) - C++0x - friend inline Self operator+(Self a,Self const& b) { - a+=b; - return a; - } - friend inline Self operator*(Self a,Self const& b) { - a*=b; - return a; - } - friend inline Self operator/(Self a,Self const& b) { - a/=b; - return a; - } - friend inline T log(Self const& a) { - return a.v_; - } - friend inline T pow(Self const& a,T const& e) { - return a.pow(e); - } - - // intentionally not defining an operator < or operator > - because you may want to default (for library convenience) a v_; - } - friend inline bool operator==(Self const& lhs, Self const& rhs) { - return lhs.v_ == rhs.v_; - } - friend inline bool operator!=(Self const& lhs, Self const& rhs) { - return lhs.v_ != rhs.v_; - } - std::size_t hash() const { - using namespace boost; - return hash_value(v_); - } - friend inline std::size_t hash_value(Self const& x) { - return x.hash(); - } - -/* - operator T() const { - return std::exp(v_); - } -*/ - T as_float() const { - return std::exp(v_); - } - - T v_; -}; - -template -struct semiring_traits > : default_semiring_traits > { - static const bool has_logplus=true; - static const bool has_besteq=true; -#if MAX_PLUS_ORDER - static const bool have_order=true; -#endif -}; - -#if MAX_PLUS_ORDER -template -bool operator<(const MaxPlus& lhs, const MaxPlus& rhs) { - return (lhs.v_ < rhs.v_); -} - -template -bool operator<=(const MaxPlus& lhs, const MaxPlus& rhs) { - return (lhs.v_ <= rhs.v_); -} - -template -bool operator>(const MaxPlus& lhs, const MaxPlus& rhs) { - return (lhs.v_ > rhs.v_); -} - -template -bool operator>=(const MaxPlus& lhs, const MaxPlus& rhs) { - return (lhs.v_ >= rhs.v_); -} -#endif - -#endif diff --git a/utils/maybe_update_bound.h b/utils/maybe_update_bound.h deleted file mode 100644 index d57215d0..00000000 --- a/utils/maybe_update_bound.h +++ /dev/null @@ -1,17 +0,0 @@ -#ifndef MAYBE_UPDATE_BOUND_H -#define MAYBE_UPDATE_BOUND_H - -template -inline void maybe_increase_max(To &to,const From &from) { - if (to -inline void maybe_decrease_min(To &to,const From &from) { - if (from - -template struct nan_static_assert; -template <> struct nan_static_assert { }; - -// is_iec559 i.e. only IEEE 754 float has x != x <=> x is nan -template -inline bool is_nan(T x) { -// static_cast(sizeof(nan_static_assert::has_quiet_NaN>)); - return std::numeric_limits::has_quiet_NaN && (x != x); -} - -template -inline bool is_inf(T x) { -// static_cast(sizeof(nan_static_assert::has_infinity>)); - return x == std::numeric_limits::infinity() || x == -std::numeric_limits::infinity(); -} - -template -inline bool is_pos_inf(T x) { -// static_cast(sizeof(nan_static_assert::has_infinity>)); - return x == std::numeric_limits::infinity(); -} - -template -inline bool is_neg_inf(T x) { -// static_cast(sizeof(nan_static_assert::has_infinity>)); - return x == -std::numeric_limits::infinity(); -} - -//c99 isfinite macro shoudl be much faster -template -inline bool is_finite(T x) { - return !is_nan(x) && !is_inf(x); -} - - -#endif diff --git a/utils/string_to.h b/utils/string_to.h deleted file mode 100644 index c78a5394..00000000 --- a/utils/string_to.h +++ /dev/null @@ -1,314 +0,0 @@ -#ifndef STRING_TO_H -#define STRING_TO_H - -/* - may not be any faster than boost::lexical_cast in later incarnations (see http://accu.org/index.php/journals/1375) - but is slightly simpler. no wide char or locale. 
- - X string_to(string); - string to_string(X); - X& string_into(string,X &); // note: returns the same ref you passed in, for convenience of use - - default implementation via stringstreams (quite slow, I'm sure) - - fast implementation for string, int<->string, unsigned<->string, float<->string, double<->string - -*/ - -#ifndef USE_FTOA -#define USE_FTOA 1 -#endif -#ifndef HAVE_STRTOUL -# define HAVE_STRTOUL 1 -#endif - -#include -#include -#include -#include - -#include "have_64_bits.h" -#include "utoa.h" -#if USE_FTOA -# include "ftoa.h" -#endif - -namespace { -// for faster numeric to/from string. TODO: separate into optional header -#include -#include -#include // access to evil (fast) C isspace etc. -#include //strtoul -} - -inline void throw_string_to(std::string const& msg,char const* prefix="string_to: ") { - throw std::runtime_error(prefix+msg); -} - -template -bool try_stream_into(I & i,To &to,bool complete=true) -{ - i >> to; - if (i.fail()) return false; - if (complete) { - char c; - return !(i >> c); - } - return true; -} - -template -bool try_string_into(Str const& str,To &to,bool complete=true) -{ - std::istringstream i(str); - return try_stream_into(i,to,complete); -} - -template inline -Data & string_into(const Str &str,Data &data) -{ - if (!try_string_into(str,data)) - throw std::runtime_error(std::string("Couldn't convert (string_into): ")+str); - return data; -} - - -template inline -Data string_to(const Str &str) -{ - Data ret; - string_into(str,ret); - return ret; -} - -template inline -std::string to_string(D const &d) -{ - std::ostringstream o; - o << d; - return o.str(); -} - -inline std::string to_string(unsigned x) { - return utos(x); -} - -inline std::string to_string(int x) { - return itos(x); -} - -inline long strtol_complete(char const* s,int base=10) { - char *e; - if (*s) { - long r=strtol(s,&e,base); - char c=*e; - if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos. TODO: loop consuming spaces until end? - return r; - } - throw_string_to(s,"Couldn't convert to integer: "); -} - -// returns -INT_MAX or INT_MAX if number is too large/small -inline int strtoi_complete_bounded(char const* s,int base=10) { - long l=strtol_complete(s,base); - if (l::min()) - return std::numeric_limits::min(); - if (l>std::numeric_limits::max()) - return std::numeric_limits::max(); - return l; -} -#define RANGE_STR(x) #x -#ifdef INT_MIN -# define INTRANGE_STR "[" RANGE_STR(INT_MIN) "," RANGE_STR(INT_MAX) "]" -#else -# define INTRANGE_STR "[-2137483648,2147483647]" -#endif - - // throw if out of int range -inline int strtoi_complete_exact(char const* s,int base=10) { - long l=strtol_complete(s,base); - if (l::min() || l>std::numeric_limits::max()) - throw_string_to(s,"Out of range for int " INTRANGE_STR ": "); - return l; -} - -#if HAVE_LONGER_LONG -inline int& string_into(std::string const& s,int &x) { - x=strtoi_complete_exact(s.c_str()); - return x; -} -inline int& string_into(char const* s,int &x) { - x=strtoi_complete_exact(s); - return x; -} -#endif - -inline long& string_into(std::string const& s,long &x) { - x=strtol_complete(s.c_str()); - return x; -} -inline long& string_into(char const* s,long &x) { - x=strtol_complete(s); - return x; -} - - -//FIXME: preprocessor separation for tokens int<->unsigned int, long<->unsigned long, strtol<->strtoul ? 
massive code duplication -inline unsigned long strtoul_complete(char const* s,int base=10) { - char *e; - if (*s) { -#if HAVE_STRTOUL - unsigned long r=strtoul(s,&e,base); -#else -// unsigned long r=strtol(s,&e,base); //FIXME: not usually safe - unsigned long r; - sscanf(s,"%ul",&r); -#endif - char c=*e; - if (!c || isspace(c)) //simplifying assumption: we're happy if there's other stuff in the string, so long as the number ends in a space or eos. TODO: loop consuming spaces until end? - return r; - } - throw_string_to(s,"Couldn't convert to integer: "); -} - -inline unsigned strtou_complete_bounded(char const* s,int base=10) { - unsigned long l=strtoul_complete(s,base); - if (l::min()) - return std::numeric_limits::min(); - if (l>std::numeric_limits::max()) - return std::numeric_limits::max(); - return l; -} - -#ifdef UINT_MIN -# define UINTRANGE_STR "[" RANGE_STR(UINT_MIN) "," RANGE_STR(UINT_MAX) "]" -#else -# define UINTRANGE_STR "[0,4,294,967,295]" -#endif - - // throw if out of int range -inline unsigned strtou_complete_exact(char const* s,int base=10) { - unsigned long l=strtoul_complete(s,base); - if (l::min() || l>std::numeric_limits::max()) - throw_string_to(s,"Out of range for uint " UINTRANGE_STR ": "); - return l; -} - -#if HAVE_LONGER_LONG -inline unsigned& string_into(std::string const& s,unsigned &x) { - x=strtou_complete_exact(s.c_str()); - return x; -} -inline unsigned& string_into(char const* s,unsigned &x) { - x=strtou_complete_exact(s); - return x; -} -#endif - -inline unsigned long& string_into(std::string const& s,unsigned long &x) { - x=strtoul_complete(s.c_str()); - return x; -} -inline unsigned long& string_into(char const* s,unsigned long &x) { - x=strtoul_complete(s); - return x; -} - -//FIXME: end code duplication - - -/* 9 decimal places needed to avoid rounding error in float->string->float. 17 for double->string->double - in terms of usable decimal places, there are 6 for float and 15 for double - */ -inline std::string to_string_roundtrip(float x) { - char buf[17]; - return std::string(buf,buf+sprintf(buf,"%.9g",x)); -} -inline std::string to_string(float x) { -#if USE_FTOA - return ftos(x); -#else - char buf[15]; - return std::string(buf,buf+sprintf(buf,"%.7g",x)); -#endif -} -inline std::string to_string_roundtrip(double x) { - char buf[32]; - return std::string(buf,buf+sprintf(buf,"%.17g",x)); -} -inline std::string to_string(double x) { -#if USE_FTOA - return ftos(x); -#else - char buf[30]; - return std::string(buf,buf+sprintf(buf,"%.15g",x)); -#endif -} - -inline double& string_into(char const* s,double &x) { - x=std::atof(s); - return x; -} -inline float& string_into(char const* s,float &x) { - x=std::atof(s); - return x; -} - -inline double& string_into(std::string const& s,double &x) { - x=std::atof(s.c_str()); - return x; -} -inline float& string_into(std::string const& s,float &x) { - x=std::atof(s.c_str()); - return x; -} - - -template -bool try_string_into(Str const& str,Str &to,bool complete=true) -{ - str=to; - return true; -} - -inline std::string const& to_string(std::string const& d) -{ - return d; -} - -template -Str const& string_to(Str const &s) -{ - return s; -} - -template -Str & string_into(Str const &s,Str &d) -{ - return d=s; -} - -/* - -template inline -void substring_into(const Str &str,size_type pos,size_type n,Data &data) -{ -// std::istringstream i(str,pos,n); // doesn't exist! 
- std::istringstream i(str.substr(pos,n)); - if (!(i>>*data)) - throw std::runtime_error("Couldn't convert (string_into): "+str); -} - -template inline -Data string_to(const Str &str,size_type pos,size_type n) -{ - Data ret; - substring_into(str,pos,n,ret); - return ret; -} - -*/ - - - -#endif -- cgit v1.2.3 From cc01f3b7c9b87928be91e8a89f233a07a183ac2e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 29 Feb 2012 01:16:34 -0500 Subject: corpus tools --- utils/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/Makefile.am b/utils/Makefile.am index 6e0678de..bb067ed9 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -24,6 +24,7 @@ noinst_LIBRARIES = libutils.a libutils_a_SOURCES = \ alignment_pharaoh.cc \ b64tools.cc \ + corpus_tools.cc \ dict.cc \ tdict.cc \ fdict.cc \ -- cgit v1.2.3 From a872f46ce1212703b8bed562c894ea1a932c0746 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 29 Feb 2012 07:00:49 +0000 Subject: mpi fixes --- training/lbl_model.cc | 54 +++++++++++++++++++++++++++++++++++++++++---------- utils/corpus_tools.cc | 16 +++++++++------ 2 files changed, 54 insertions(+), 16 deletions(-) diff --git a/training/lbl_model.cc b/training/lbl_model.cc index def5075a..a46ce33c 100644 --- a/training/lbl_model.cc +++ b/training/lbl_model.cc @@ -15,6 +15,7 @@ #ifdef HAVE_MPI #include #include +#include namespace mpi = boost::mpi; #endif #include @@ -34,12 +35,26 @@ namespace mpi = boost::mpi; namespace po = boost::program_options; using namespace std; -#define kDIMENSIONS 100 +#define kDIMENSIONS 10 typedef Eigen::Matrix RVector; typedef Eigen::Matrix RTVector; typedef Eigen::Matrix TMatrix; vector r_src, r_trg; +#if HAVE_MPI +namespace boost { +namespace serialization { + +template +void serialize(Archive & ar, RVector & v, const unsigned int version) { + for (unsigned i = 0; i < kDIMENSIONS; ++i) + ar & v[i]; +} + +} // namespace serialization +} // namespace boost +#endif + bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -224,7 +239,7 @@ int main(int argc, char** argv) { srand(seed); } - TMatrix t; + TMatrix t = TMatrix::Zero(); if (rank == 0) { t = TMatrix::Random() / 50.0; for (unsigned i = 1; i < r_trg.size(); ++i) { @@ -241,16 +256,18 @@ int main(int argc, char** argv) { TMatrix g = TMatrix::Zero(); vector exp_src; vector z_src; - vector flat_g, flat_t; + vector flat_g, flat_t, rcv_grad; Flatten(t, &flat_t); bool converged = false; - // TODO broadcast embeddings - for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { -#ifdef HAVE_MPI - mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); +#if HAVE_MPI + mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); + mpi::broadcast(world, r_trg, 0); + mpi::broadcast(world, r_src, 0); #endif + cerr << "rank=" << rank << ": " << r_trg[0][4] << endl; + for (int iter = 0; !converged && iter < ITERATIONS; ++iter) { + if (rank == 0) cerr << "ITERATION " << (iter + 1) << endl; Unflatten(flat_t, &t); - cerr << "ITERATION " << (iter + 1) << endl; double likelihood = 0; double denom = 0.0; lc = 0; @@ -350,7 +367,22 @@ int main(int argc, char** argv) { if (!SGD) { Flatten(g, &flat_g); obj = -likelihood; - // TODO - reduce gradient +#if HAVE_MPI + rcv_grad.resize(flat_g.size(), 0.0); + mpi::reduce(world, &flat_g[0], flat_g.size(), &rcv_grad[0], plus(), 0); + swap(flat_g, rcv_grad); + rcv_grad.clear(); + + double to = 0; + mpi::reduce(world, obj, to, plus(), 0); + obj = to; + double tlh = 0; + mpi::reduce(world, 
likelihood, tlh, plus(), 0); + likelihood = tlh; + double td = 0; + mpi::reduce(world, denom, td, plus(), 0); + denom = td; +#endif } if (rank == 0) { @@ -376,10 +408,12 @@ int main(int argc, char** argv) { } } #ifdef HAVE_MPI + mpi::broadcast(world, &flat_t[0], flat_t.size(), 0); mpi::broadcast(world, converged, 0); #endif } - cerr << "TRANSLATION MATRIX:" << endl << t << endl; + if (rank == 0) + cerr << "TRANSLATION MATRIX:" << endl << t << endl; return 0; } diff --git a/utils/corpus_tools.cc b/utils/corpus_tools.cc index a0542b6e..d17785af 100644 --- a/utils/corpus_tools.cc +++ b/utils/corpus_tools.cc @@ -33,10 +33,12 @@ void CorpusTools::ReadFromFile(const string& filename, while(getline(in, line)) { const bool skip = (lc % size != rank); ++lc; - if (skip) continue; TD::ConvertSentence(line, &tmp); - src->push_back(vector()); - vector* d = &src->back(); + vector* d = NULL; + if (!skip) { + src->push_back(vector()); + d = &src->back(); + } set* v = src_vocab; int s = 0; for (unsigned i = 0; i < tmp.size(); ++i) { @@ -44,11 +46,13 @@ void CorpusTools::ReadFromFile(const string& filename, ++s; if (s > 1) { cerr << "Unexpected format in line " << lc << ": " << line << endl; abort(); } assert(trg); - trg->push_back(vector()); - d = &trg->back(); + if (!skip) { + trg->push_back(vector()); + d = &trg->back(); + } v = trg_vocab; } else { - d->push_back(tmp[i]); + if (d) d->push_back(tmp[i]); if (v) v->insert(tmp[i]); } } -- cgit v1.2.3 From e1a0c140e9f31461ab45ec7f9533ad98d2b9caa9 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 29 Feb 2012 12:58:53 -0800 Subject: Dump the forest before the language model rescoring --- decoder/decoder.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 3394e0b8..69fbaf85 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -812,6 +812,9 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { abort(); } + if (conf.count("show_target_graph")) + HypergraphIO::WriteTarget(forest); + for (int pass = 0; pass < rescoring_passes.size(); ++pass) { const RescoringPass& rp = rescoring_passes[pass]; const vector& cur_weights = *rp.weight_vector; @@ -1018,8 +1021,6 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) { } if (conf.count("show_cfg_search_space")) HypergraphIO::WriteAsCFG(forest); - if (conf.count("show_target_graph")) - HypergraphIO::WriteTarget(forest); if (has_ref) { if (HG::Intersect(ref, &forest)) { // if (crf_uniform_empirical) { -- cgit v1.2.3 From 378ba2373374015c8de4b360a30227f072616c6c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 1 Mar 2012 23:09:32 -0500 Subject: compile fix on old versions of gcc with MPI enabled --- utils/fast_sparse_vector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h index 17fa47bf..d11be48f 100644 --- a/utils/fast_sparse_vector.h +++ b/utils/fast_sparse_vector.h @@ -363,7 +363,7 @@ class FastSparseVector { } ar & eff_size; while (it != this->end()) { - const std::pair wire_pair(FD::Convert(it->first), it->second); + const std::pair wire_pair(FD::Convert(it->first), it->second); ar & wire_pair; ++it; } -- cgit v1.2.3 From 064e53669428404269c9015af2dee135bf91226d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 01:21:05 -0500 Subject: use assert properly --- decoder/apply_models.cc | 9 ++++++--- decoder/earley_composer.cc | 5 ++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git 
a/decoder/apply_models.cc b/decoder/apply_models.cc index 40fd27e4..9ba59d1b 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -270,7 +270,8 @@ public: const Hypergraph::Edge& edge = in.edges_[in_edges[i]]; const JVector j(edge.tail_nodes_.size(), 0); cand.push_back(new Candidate(edge, j, out, D, node_states_, smeta, models, is_goal)); - assert(unique_cands.insert(cand.back()).second); // these should all be unique! + bool is_new = unique_cands.insert(cand.back()).second; + assert(is_new); // these should all be unique! } // cerr << " making heap of " << cand.size() << " candidates\n"; make_heap(cand.begin(), cand.end(), HeapCandCompare()); @@ -378,7 +379,8 @@ public: pop_heap(cand.begin(), cand.end(), HeapCandCompare()); Candidate* item = cand.back(); cand.pop_back(); - assert(unique_accepted.insert(item).second); // these should all be unique! + bool is_new = unique_accepted.insert(item).second; + assert(is_new); // these should all be unique! // cerr << "POPPED: " << *item << endl; PushSuccFast2(*item, is_goal, &cand, &unique_accepted); @@ -419,7 +421,8 @@ public: Candidate* new_cand = new Candidate(*item.in_edge_, j, out, D, node_states_, smeta, models, is_goal); cand.push_back(new_cand); push_heap(cand.begin(), cand.end(), HeapCandCompare()); - assert(cs->insert(new_cand).second); // insert into uniqueness set, sanity check + bool is_new = cs->insert(new_cand).second; + assert(is_new); // insert into uniqueness set, sanity check } } } diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc index 48e94a31..b7af801a 100644 --- a/decoder/earley_composer.cc +++ b/decoder/earley_composer.cc @@ -329,7 +329,10 @@ class EarleyComposerImpl { forest->ReserveNodes(kMAX_NODES); assert(sit != g.end()); Edge* init = new Edge(start_cat_, &sit->second, q_0_); - assert(IncorporateNewEdge(init)); + if (!IncorporateNewEdge(init)) { + cerr << "Failed to create initial edge!\n"; + abort(); + } while (exp_agenda.HasWork() || agenda.HasWork()) { while(exp_agenda.HasWork()) { const Edge* edge = exp_agenda.Next(); -- cgit v1.2.3 From e0507d1aa96c6b1348e6a202beb95f63d8662258 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 03:24:53 -0500 Subject: PYP language model (Teh 2006) --- decoder/fst_translator.cc | 5 +- gi/pf/Makefile.am | 4 +- gi/pf/pyp_lm.cc | 150 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 157 insertions(+), 2 deletions(-) create mode 100644 gi/pf/pyp_lm.cc diff --git a/decoder/fst_translator.cc b/decoder/fst_translator.cc index 38dbd717..074de4c9 100644 --- a/decoder/fst_translator.cc +++ b/decoder/fst_translator.cc @@ -30,7 +30,10 @@ struct FSTTranslatorImpl { if (input.find("{\"rules\"") == 0) { istringstream is(input); Hypergraph src_cfg_hg; - assert(HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)); + if (!HypergraphIO::ReadFromJSON(&is, &src_cfg_hg)) { + cerr << "Failed to read HG from JSON.\n"; + abort(); + } if (add_pass_through_rules) { SparseVector feats; feats.set_value(FD::Convert("PassThrough"), 1); diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 0cf0bc63..7cf9c14d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc @@ -9,6 +9,8 @@ 
align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc itg_SOURCES = itg.cc +pyp_lm_SOURCES = pyp_lm.cc + learn_cfg_SOURCES = learn_cfg.cc condnaive_SOURCES = condnaive.cc diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc new file mode 100644 index 00000000..2837e33c --- /dev/null +++ b/gi/pf/pyp_lm.cc @@ -0,0 +1,150 @@ +#include +#include +#include + +#include +#include +#include + +#include "corpus_tools.h" +#include "m.h" +#include "tdict.h" +#include "sampler.h" +#include "ccrp.h" +#include "ccrp_onetable.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +shared_ptr prng; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +template struct PYPLM; + +// uniform base distribution +template<> struct PYPLM<0> { + PYPLM(unsigned vs) : p0(1.0 / vs) {} + void increment(WordID w, const vector& context, MT19937* rng) const {} + void decrement(WordID w, const vector& context, MT19937* rng) const {} + double prob(WordID w, const vector& context) const { return p0; } + const double p0; +}; + +// represents an N-gram LM +template struct PYPLM { + PYPLM(unsigned vs) : backoff(vs) {} + void increment(WordID w, const vector& context, MT19937* rng) { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + if (it == p.end()) + it = p.insert(make_pair(lookup, CCRP(1,1,1,1))).first; + if (it->second.increment(w, bo, rng)) + backoff.increment(w, context, rng); + } + void decrement(WordID w, const vector& context, MT19937* rng) { + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); + assert(it != p.end()); + if (it->second.decrement(w, rng)) + backoff.decrement(w, context, rng); + } + double prob(WordID w, const vector& context) const { + const double bo = backoff.prob(w, context); + static vector lookup(N-1); + for (unsigned i = 0; i < N-1; ++i) + lookup[i] = context[context.size() - 1 - i]; + typename unordered_map, CCRP, boost::hash > >::const_iterator it = p.find(lookup); + if (it == p.end()) return bo; + return it->second.prob(w, bo); + } + PYPLM backoff; + unordered_map, CCRP, boost::hash > > p; +}; + +int main(int argc, char** argv) { + po::variables_map conf; + + InitCommandLine(argc, argv, &conf); + const unsigned samples = conf["samples"].as(); + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + 
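// A minimal sketch of the model the PYPLM structs above implement, in the usual
// Teh (2006) notation (c_uw / t_uw: customer and table counts for word w in
// context u; c_u / t_u: their totals; d: discount; alpha: strength):
//
//   p(w | u) = ( c_uw - d * t_uw + (alpha + d * t_u) * p(w | pi(u)) ) / ( c_u + alpha )
//
// where pi(u) is the context with its oldest word dropped. This is the value
// CCRP::prob(w, bo) returns when bo is the backoff probability; an unseen
// context falls straight through to the backoff (the p.end() branch in prob()
// above), and the recursion bottoms out in the uniform 1/|V| base PYPLM<0>.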
prng.reset(new MT19937); + MT19937& rng = *prng; + vector > corpuse; + set vocabe; + const WordID kEOS = TD::Convert(""); + cerr << "Reading corpus...\n"; + CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); + cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; +#define kORDER 5 + PYPLM lm(vocabe.size()); + vector ctx(kORDER - 1, TD::Convert("")); + int mci = corpuse.size() * 99 / 100; + for (int SS=0; SS < samples; ++SS) { + for (int ci = 0; ci < mci; ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + if (SS > 0) lm.decrement(w, ctx, &rng); + lm.increment(w, ctx, &rng); + ctx.push_back(w); + } + if (SS > 0) lm.decrement(kEOS, ctx, &rng); + lm.increment(kEOS, ctx, &rng); + } + } + double llh = 0; + unsigned cnt = 0; + for (int ci = mci; ci < corpuse.size(); ++ci) { + ctx.resize(kORDER - 1); + const vector& s = corpuse[ci]; + for (int i = 0; i <= s.size(); ++i) { + WordID w = (i < s.size() ? s[i] : kEOS); + double lp = log(lm.prob(w, ctx)) / log(2); + cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl; + ctx.push_back(w); + llh -= lp; + cnt++; + } + } + cerr << " Log_10 prob: " << (llh * log(2) / log(10)) << endl; + cerr << " Count: " << (cnt) << endl; + cerr << "Cross-entropy: " << (llh / cnt) << endl; + cerr << " Perplexity: " << pow(2, llh / cnt) << endl; + return 0; +} + -- cgit v1.2.3 From 2579dd24d3833823527e688196276c2fab381b37 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 17:16:58 -0500 Subject: pyp lm, fixed hyperparameters inference --- gi/pf/align-lexonly-pyp.cc | 2 +- gi/pf/align-lexonly.cc | 2 +- gi/pf/brat.cc | 2 +- gi/pf/conditional_pseg.h | 4 +- gi/pf/learn_cfg.cc | 4 +- gi/pf/pfbrat.cc | 2 +- gi/pf/pyp_lm.cc | 70 ++++++++++++++++++++++++++++--- phrasinator/gibbs_train_plm.cc | 2 +- utils/ccrp.h | 95 ++++++++++++++++++------------------------ utils/ccrp_nt.h | 52 +++++++++++------------ utils/ccrp_onetable.h | 70 +++++++++++++++---------------- utils/mfcr.h | 58 +++++++++++++------------- 12 files changed, 203 insertions(+), 160 deletions(-) diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index e24cb457..4ce7cf62 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -104,7 +104,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",\\alpha=" << r.alpha() << ')' << endl; for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; } diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc index 8c1d689f..dbc9dc07 100644 --- a/gi/pf/align-lexonly.cc +++ b/gi/pf/align-lexonly.cc @@ -105,7 +105,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.concentration() << ')' << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl; for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; } diff --git a/gi/pf/brat.cc b/gi/pf/brat.cc index 7b60ef23..c2c52760 
100644 --- a/gi/pf/brat.cc +++ b/gi/pf/brat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable > phrases_; diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 2e9e38fc..f9841cbf 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -22,7 +22,7 @@ struct MConditionalTranslationModel { void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << -1 << '\t' << i2->first << std::endl; } @@ -95,7 +95,7 @@ struct ConditionalTranslationModel { void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.concentration() << ") --------------------------" << std::endl; + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << i2->second << '\t' << i2->first << std::endl; } diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index b2ca029a..5b748311 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -183,9 +183,9 @@ struct HieroLMModel { nts[i].resample_hyperparameters(rng); if (kHIERARCHICAL_PRIOR) { q0.resample_hyperparameters(rng); - cerr << "[base d=" << q0.discount() << ", alpha=" << q0.discount() << "]"; + cerr << "[base d=" << q0.discount() << ", alpha=" << q0.alpha() << "]"; } - cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].concentration() << endl; + cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].alpha() << endl; } const BaseRuleModel base; diff --git a/gi/pf/pfbrat.cc b/gi/pf/pfbrat.cc index 7b60ef23..c2c52760 100644 --- a/gi/pf/pfbrat.cc +++ b/gi/pf/pfbrat.cc @@ -191,7 +191,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " " << phrases_.concentration(); + cerr << " " << phrases_.alpha(); } CCRP_NoTable > phrases_; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 2837e33c..0d85536c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -50,16 +50,19 @@ template struct PYPLM; // uniform base distribution template<> struct PYPLM<0> { - PYPLM(unsigned vs) : p0(1.0 / vs) {} - void increment(WordID w, const vector& context, MT19937* rng) const {} - void decrement(WordID w, const vector& context, MT19937* rng) const {} + PYPLM(unsigned vs) : p0(1.0 / vs), draws() {} + void increment(WordID w, const vector& context, MT19937* rng) { ++draws; } + void decrement(WordID w, const vector& context, MT19937* rng) { --draws; assert(draws >= 0); } double prob(WordID w, const vector& context) const { return p0; } + void 
resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {} + double log_likelihood() const { return draws * log(p0); } const double p0; + int draws; }; // represents an N-gram LM template struct PYPLM { - PYPLM(unsigned vs) : backoff(vs) {} + PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); static vector lookup(N-1); @@ -67,7 +70,7 @@ template struct PYPLM { lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); if (it == p.end()) - it = p.insert(make_pair(lookup, CCRP(1,1,1,1))).first; + it = p.insert(make_pair(lookup, CCRP(d,alpha))).first; if (it->second.increment(w, bo, rng)) backoff.increment(w, context, rng); } @@ -89,7 +92,58 @@ template struct PYPLM { if (it == p.end()) return bo; return it->second.prob(w, bo); } + + double log_likelihood() const { + return log_likelihood(d, alpha) + backoff.log_likelihood(); + } + + double log_likelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits::infinity(); + double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1); + typename unordered_map, CCRP, boost::hash > >::const_iterator it; + for (it = p.begin(); it != p.end(); ++it) + llh += it->second.log_crp_prob(dd, aa); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const PYPLM& m) : m_(m) {} + const PYPLM& m_; + double operator()(const double& proposed_discount) const { + return m_.log_likelihood(proposed_discount, m_.alpha); + } + }; + + struct AlphaResampler { + AlphaResampler(const PYPLM& m) : m_(m) {} + const PYPLM& m_; + double operator()(const double& proposed_alpha) const { + return m_.log_likelihood(m_.d, proposed_alpha); + } + }; + + void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + DiscountResampler dr(*this); + AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + d = slice_sampler1d(dr, d, *rng, std::numeric_limits::min(), + 1.0, 0.0, niterations, 100*niterations); + } + alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + typename unordered_map, CCRP, boost::hash > >::iterator it; + cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl; + for (it = p.begin(); it != p.end(); ++it) { + it->second.set_discount(d); + it->second.set_alpha(alpha); + } + backoff.resample_hyperparameters(rng, nloop, niterations); + } + PYPLM backoff; + double d, alpha; unordered_map, CCRP, boost::hash > > p; }; @@ -109,7 +163,7 @@ int main(int argc, char** argv) { cerr << "Reading corpus...\n"; CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -#define kORDER 5 +#define kORDER 3 PYPLM lm(vocabe.size()); vector ctx(kORDER - 1, TD::Convert("")); int mci = corpuse.size() * 99 / 100; @@ -126,6 +180,10 @@ int main(int argc, char** argv) { if (SS > 0) lm.decrement(kEOS, ctx, &rng); lm.increment(kEOS, ctx, &rng); } + if (SS % 10 == 9) { + cerr << " [LLH=" << lm.log_likelihood() << "]" << endl; + if (SS % 20 == 19) lm.resample_hyperparameters(&rng); + } else { cerr << '.' 
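// A sketch of the hyperparameter inference added here: log_likelihood(dd, aa)
// is, up to a constant, the log posterior of this order's discount and strength
// under the hard-coded Beta(1,1) and Gamma(1,1) priors,
//
//   log p(d, alpha | seatings) = log Beta(d; 1, 1) + log Gamma(alpha; 1, 1)
//                                + sum over contexts of log_crp_prob(d, alpha) + const,
//
// and resample_hyperparameters() slice-samples alpha on (0, inf) and d on (0, 1)
// from it in alternation (nloop univariate sweeps), pushes the new values into
// every per-context CCRP, and then recurses into the backoff model so each
// n-gram order keeps its own (d, alpha) pair.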
<< flush; } } double llh = 0; unsigned cnt = 0; diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 66b46011..54861dcb 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -252,7 +252,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " d=" << phrases_.discount() << ",c=" << phrases_.concentration(); + cerr << " d=" << phrases_.discount() << ",a=" << phrases_.alpha(); } CCRP > phrases_; diff --git a/utils/ccrp.h b/utils/ccrp.h index 1a9e3ed5..d9a38089 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -17,35 +17,37 @@ template > class CCRP { public: - CCRP(double disc, double conc) : + CCRP(double disc, double alpha) : num_tables_(), num_customers_(), discount_(disc), - concentration_(conc), + alpha_(alpha), discount_prior_alpha_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : num_tables_(), num_customers_(), discount_(d), - concentration_(c), + alpha_(c), discount_prior_alpha_(d_alpha), discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} double discount() const { return discount_; } - double concentration() const { return concentration_; } + double alpha() const { return alpha_; } + void set_discount(double d) { discount_ = d; } + void set_alpha(double a) { alpha_ = a; } bool has_discount_prior() const { return !std::isnan(discount_prior_alpha_); } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -79,7 +81,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (concentration_ + num_tables_ * discount_) * p0; + const double p_empty = (alpha_ + num_tables_ * discount_) * p0; const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -113,7 +115,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const T p_empty = T(concentration_ + num_tables_ * discount_) * p0; + const T p_empty = T(alpha_ + num_tables_ * discount_) * p0; const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -180,63 +182,46 @@ class CCRP { double prob(const Dish& dish, const double& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + concentration_; + const double r = num_tables_ * discount_ + alpha_; if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + concentration_); + return r * p0 / (num_customers_ + alpha_); } else { return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + concentration_); + (num_customers_ + alpha_); } } template T probT(const Dish& dish, const 
T& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const T r = T(num_tables_ * discount_ + concentration_); + const T r = T(num_tables_ * discount_ + alpha_); if (it == dish_locs_.end()) { - return r * p0 / T(num_customers_ + concentration_); + return r * p0 / T(num_customers_ + alpha_); } else { return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / - T(num_customers_ + concentration_); + T(num_customers_ + alpha_); } } double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); - } - - static double log_beta_density(const double& x, const double& alpha, const double& beta) { - assert(x > 0.0); - assert(x < 1.0); - assert(alpha > 0.0); - assert(beta > 0.0); - const double lp = (alpha-1)*log(x)+(beta-1)*log(1-x)+lgamma(alpha+beta)-lgamma(alpha)-lgamma(beta); - return lp; - } - - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; + return log_crp_prob(discount_, alpha_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { + double log_crp_prob(const double& discount, const double& alpha) const { double lp = 0.0; if (has_discount_prior()) - lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + lp = Md::log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); + if (has_alpha_prior()) + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_) + - lgamma(alpha / discount); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { @@ -254,12 +239,12 @@ class CCRP { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); + assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + if (has_alpha_prior()) { + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { @@ -267,7 +252,7 @@ class CCRP { 1.0, 0.0, niterations, 100*niterations); } } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -275,15 +260,15 @@ class CCRP { DiscountResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; double operator()(const double& proposed_discount) const { - return 
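// For reference, the quantity log_crp_prob(d, alpha) computes above is the
// standard Pitman-Yor seating-arrangement probability (for d > 0), plus the
// optional prior terms:
//
//   log P(seating | d, alpha) = lgamma(alpha) - lgamma(alpha + c)
//                             + T * log(d) + lgamma(alpha/d + T) - lgamma(alpha/d)
//                             + sum over tables [ lgamma(c_t - d) - lgamma(1 - d) ]
//
// with c total customers, T total tables and c_t customers at table t; this is
// the target density the DiscountResampler / ConcentrationResampler below hand
// to the slice sampler.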
crp_.log_crp_prob(proposed_discount, crp_.concentration_); + return crp_.log_crp_prob(proposed_discount, crp_.alpha_); } }; struct ConcentrationResampler { ConcentrationResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(crp_.discount_, proposed_alpha); } }; @@ -295,7 +280,7 @@ class CCRP { }; void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + std::cerr << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -320,15 +305,15 @@ class CCRP { std::tr1::unordered_map dish_locs_; double discount_; - double concentration_; + double alpha_; // optional beta prior on discount_ (NaN if no prior) double discount_prior_alpha_; double discount_prior_beta_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h index 63b6f4c2..79321493 100644 --- a/utils/ccrp_nt.h +++ b/utils/ccrp_nt.h @@ -18,20 +18,20 @@ class CCRP_NoTable { public: explicit CCRP_NoTable(double conc) : num_customers_(), - concentration_(conc), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + alpha_(conc), + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} CCRP_NoTable(double c_shape, double c_rate, double c = 10.0) : num_customers_(), - concentration_(c), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_(c), + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} - double concentration() const { return concentration_; } + double alpha() const { return alpha_; } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -73,16 +73,16 @@ class CCRP_NoTable { double prob(const Dish& dish, const double& p0) const { const unsigned at_table = num_customers(dish); - return (at_table + p0 * concentration_) / (num_customers_ + concentration_); + return (at_table + p0 * alpha_) / (num_customers_ + alpha_); } double logprob(const Dish& dish, const double& logp0) const { const unsigned at_table = num_customers(dish); - return log(at_table + exp(logp0 + log(concentration_))) - log(num_customers_ + concentration_); + return log(at_table + exp(logp0 + log(alpha_))) - log(num_customers_ + alpha_); } double log_crp_prob() const { - return log_crp_prob(concentration_); + return log_crp_prob(alpha_); } static double log_gamma_density(const double& x, const double& shape, const double& rate) { @@ -95,14 +95,14 @@ class CCRP_NoTable { // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& concentration) const { + double 
log_crp_prob(const double& alpha) const { double lp = 0.0; - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + if (has_alpha_prior()) + lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { - lp += lgamma(concentration) - lgamma(concentration + num_customers_) + - custs_.size() * log(concentration); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + custs_.size() * log(alpha); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = custs_.begin(); it != custs_.end(); ++it) { @@ -114,10 +114,10 @@ class CCRP_NoTable { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_concentration_prior()); + assert(has_alpha_prior()); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } } @@ -125,13 +125,13 @@ class CCRP_NoTable { struct ConcentrationResampler { ConcentrationResampler(const CCRP_NoTable& crp) : crp_(crp) {} const CCRP_NoTable& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(proposed_alpha); } }; void Print(std::ostream* out) const { - (*out) << "DP(alpha=" << concentration_ << ") customers=" << num_customers_ << std::endl; + (*out) << "DP(alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; int cc = 0; for (typename std::tr1::unordered_map::const_iterator it = custs_.begin(); it != custs_.end(); ++it) { @@ -153,11 +153,11 @@ class CCRP_NoTable { return custs_.end(); } - double concentration_; + double alpha_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template diff --git a/utils/ccrp_onetable.h b/utils/ccrp_onetable.h index b63737d1..1fe01b0e 100644 --- a/utils/ccrp_onetable.h +++ b/utils/ccrp_onetable.h @@ -21,33 +21,33 @@ class CCRP_OneTable { num_tables_(), num_customers_(), discount_(disc), - concentration_(conc), + alpha_(conc), discount_prior_alpha_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - concentration_prior_shape_(std::numeric_limits::quiet_NaN()), - concentration_prior_rate_(std::numeric_limits::quiet_NaN()) {} + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} CCRP_OneTable(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : num_tables_(), num_customers_(), discount_(d), - concentration_(c), + alpha_(c), discount_prior_alpha_(d_alpha), discount_prior_beta_(d_beta), - concentration_prior_shape_(c_shape), - concentration_prior_rate_(c_rate) {} + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) {} double discount() const { return discount_; } - double concentration() const { return concentration_; } - void set_concentration(double c) { concentration_ = c; } + double alpha() const { return alpha_; } + void set_alpha(double c) { alpha_ = c; } void set_discount(double d) { discount_ = d; } bool 
has_discount_prior() const { return !std::isnan(discount_prior_alpha_); } - bool has_concentration_prior() const { - return !std::isnan(concentration_prior_shape_); + bool has_alpha_prior() const { + return !std::isnan(alpha_prior_shape_); } void clear() { @@ -108,29 +108,29 @@ class CCRP_OneTable { double prob(const Dish& dish, const double& p0) const { const typename DishMapType::const_iterator it = dish_counts_.find(dish); - const double r = num_tables_ * discount_ + concentration_; + const double r = num_tables_ * discount_ + alpha_; if (it == dish_counts_.end()) { - return r * p0 / (num_customers_ + concentration_); + return r * p0 / (num_customers_ + alpha_); } else { return (it->second - discount_ + r * p0) / - (num_customers_ + concentration_); + (num_customers_ + alpha_); } } template T probT(const Dish& dish, const T& p0) const { const typename DishMapType::const_iterator it = dish_counts_.find(dish); - const T r(num_tables_ * discount_ + concentration_); + const T r(num_tables_ * discount_ + alpha_); if (it == dish_counts_.end()) { - return r * p0 / T(num_customers_ + concentration_); + return r * p0 / T(num_customers_ + alpha_); } else { return (T(it->second - discount_) + r * p0) / - T(num_customers_ + concentration_); + T(num_customers_ + alpha_); } } double log_crp_prob() const { - return log_crp_prob(discount_, concentration_); + return log_crp_prob(discount_, alpha_); } static double log_beta_density(const double& x, const double& alpha, const double& beta) { @@ -152,19 +152,19 @@ class CCRP_OneTable { // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& concentration) const { + double log_crp_prob(const double& discount, const double& alpha) const { double lp = 0.0; if (has_discount_prior()) lp = log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_concentration_prior()) - lp += log_gamma_density(concentration, concentration_prior_shape_, concentration_prior_rate_); + if (has_alpha_prior()) + lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - lp += lgamma(concentration) - lgamma(concentration + num_customers_) - + num_tables_ * log(discount) + lgamma(concentration / discount + num_tables_) - - lgamma(concentration / discount); + lp += lgamma(alpha) - lgamma(alpha + num_customers_) + + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_) + - lgamma(alpha / discount); assert(std::isfinite(lp)); for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) { @@ -180,12 +180,12 @@ class CCRP_OneTable { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_concentration_prior()); + assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_concentration_prior()) { - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + if (has_alpha_prior()) { + alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { @@ -193,7 +193,7 @@ class CCRP_OneTable { 1.0, 0.0, niterations, 100*niterations); } } - concentration_ = slice_sampler1d(cr, concentration_, *rng, 0.0, + alpha_ = 
slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -201,20 +201,20 @@ class CCRP_OneTable { DiscountResampler(const CCRP_OneTable& crp) : crp_(crp) {} const CCRP_OneTable& crp_; double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.concentration_); + return crp_.log_crp_prob(proposed_discount, crp_.alpha_); } }; struct ConcentrationResampler { ConcentrationResampler(const CCRP_OneTable& crp) : crp_(crp) {} const CCRP_OneTable& crp_; - double operator()(const double& proposed_concentration) const { - return crp_.log_crp_prob(crp_.discount_, proposed_concentration); + double operator()(const double& proposed_alpha) const { + return crp_.log_crp_prob(crp_.discount_, proposed_alpha); } }; void Print(std::ostream* out) const { - (*out) << "PYP(d=" << discount_ << ",c=" << concentration_ << ") customers=" << num_customers_ << std::endl; + (*out) << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; for (typename DishMapType::const_iterator it = dish_counts_.begin(); it != dish_counts_.end(); ++it) { (*out) << " " << it->first << " = " << it->second << std::endl; } @@ -233,15 +233,15 @@ class CCRP_OneTable { DishMapType dish_counts_; double discount_; - double concentration_; + double alpha_; // optional beta prior on discount_ (NaN if no prior) double discount_prior_alpha_; double discount_prior_beta_; - // optional gamma prior on concentration_ (NaN if no prior) - double concentration_prior_shape_; - double concentration_prior_rate_; + // optional gamma prior on alpha_ (NaN if no prior) + double alpha_prior_shape_; + double alpha_prior_rate_; }; template diff --git a/utils/mfcr.h b/utils/mfcr.h index 396d0205..df988f51 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -43,29 +43,29 @@ class MFCR { num_floors_(num_floors), num_tables_(), num_customers_(), - d_(d), + discount_(d), alpha_(alpha), - d_prior_alpha_(std::numeric_limits::quiet_NaN()), - d_prior_beta_(std::numeric_limits::quiet_NaN()), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), alpha_prior_shape_(std::numeric_limits::quiet_NaN()), alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} - MFCR(unsigned num_floors, double d_alpha, double d_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) : + MFCR(unsigned num_floors, double discount_alpha, double discount_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) : num_floors_(num_floors), num_tables_(), num_customers_(), - d_(d), + discount_(d), alpha_(alpha), - d_prior_alpha_(d_alpha), - d_prior_beta_(d_beta), + discount_prior_alpha_(discount_alpha), + discount_prior_beta_(discount_beta), alpha_prior_shape_(alpha_shape), alpha_prior_rate_(alpha_rate) {} - double d() const { return d_; } + double discount() const { return discount_; } double alpha() const { return alpha_; } - bool has_d_prior() const { - return !std::isnan(d_prior_alpha_); + bool has_discount_prior() const { + return !std::isnan(discount_prior_alpha_); } bool has_alpha_prior() const { @@ -122,15 +122,15 @@ class MFCR { int floor = -1; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (alpha_ + num_tables_ * d_) * marg_p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * d_); + const double p_empty = (alpha_ + num_tables_ * discount_) * marg_p0; + const double p_share = 
(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * d_); + double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); for (typename std::list::iterator ti = loc.table_counts_.begin(); ti != loc.table_counts_.end(); ++ti) { - r -= ti->count - d_; + r -= ti->count - discount_; if (r <= 0.0) { ++ti->count; floor = ti->floor; @@ -206,25 +206,25 @@ class MFCR { const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); assert(marg_p0 <= 1.0); const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * d_ + alpha_; + const double r = num_tables_ * discount_ + alpha_; if (it == dish_locs_.end()) { return r * marg_p0 / (num_customers_ + alpha_); } else { - return (it->second.total_dish_count_ - d_ * it->second.table_counts_.size() + r * marg_p0) / + return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) / (num_customers_ + alpha_); } } double log_crp_prob() const { - return log_crp_prob(d_, alpha_); + return log_crp_prob(discount_, alpha_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include draws from G_w's double log_crp_prob(const double& d, const double& alpha) const { double lp = 0.0; - if (has_d_prior()) - lp = Md::log_beta_density(d, d_prior_alpha_, d_prior_beta_); + if (has_discount_prior()) + lp = Md::log_beta_density(d, discount_prior_alpha_, discount_prior_beta_); if (has_alpha_prior()) lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); @@ -251,7 +251,7 @@ class MFCR { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_d_prior() || has_alpha_prior()); + assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); ConcentrationResampler cr(*this); for (int iter = 0; iter < nloop; ++iter) { @@ -259,8 +259,8 @@ class MFCR { alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } - if (has_d_prior()) { - d_ = slice_sampler1d(dr, d_, *rng, std::numeric_limits::min(), + if (has_discount_prior()) { + discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), 1.0, 0.0, niterations, 100*niterations); } } @@ -279,8 +279,8 @@ class MFCR { struct ConcentrationResampler { ConcentrationResampler(const MFCR& crp) : crp_(crp) {} const MFCR& crp_; - double operator()(const double& proposed_alpha) const { - return crp_.log_crp_prob(crp_.d_, proposed_alpha); + double operator()(const double& proposediscount_alpha) const { + return crp_.log_crp_prob(crp_.discount_, proposediscount_alpha); } }; @@ -292,7 +292,7 @@ class MFCR { }; void Print(std::ostream* out) const { - (*out) << "MFCR(d=" << d_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; + (*out) << "MFCR(d=" << discount_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -317,12 +317,12 @@ class MFCR { unsigned num_customers_; std::tr1::unordered_map dish_locs_; - double d_; + double discount_; double alpha_; - 
// optional beta prior on d_ (NaN if no prior) - double d_prior_alpha_; - double d_prior_beta_; + // optional beta prior on discount_ (NaN if no prior) + double discount_prior_alpha_; + double discount_prior_beta_; // optional gamma prior on alpha_ (NaN if no prior) double alpha_prior_shape_; -- cgit v1.2.3 From 29ae46010c3610dda877f2d1a07fe942f79bfc31 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 3 Mar 2012 22:18:58 +0000 Subject: fix include --- utils/ccrp.h | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/ccrp.h b/utils/ccrp.h index d9a38089..61ab5576 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -11,6 +11,7 @@ #include #include "sampler.h" #include "slice_sampler.h" +#include "m.h" // Chinese restaurant process (Pitman-Yor parameters) with table tracking. -- cgit v1.2.3 From 3c918889d86fe1deaa5d26162bf85865f1aa33bd Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 4 Mar 2012 14:33:11 -0500 Subject: clean up pyp lm code --- gi/pf/pyp_lm.cc | 85 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 60 insertions(+), 25 deletions(-) diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 0d85536c..88dfcc7c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -11,7 +11,14 @@ #include "tdict.h" #include "sampler.h" #include "ccrp.h" -#include "ccrp_onetable.h" + +// A not very memory-efficient implementation of an N-gram LM based on PYPs +// as described in Y.-W. Teh. (2006) A Hierarchical Bayesian Language Model +// based on Pitman-Yor Processes. In Proc. ACL. + +// I use templates to handle the recursive formalation of the prior, so +// the order of the model has to be specified here, at compile time: +#define kORDER 3 using namespace std; using namespace tr1; @@ -22,8 +29,13 @@ shared_ptr prng; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read data from") + ("samples,s",po::value()->default_value(300),"Number of samples") + ("train,i",po::value(),"Training data file") + ("test,T",po::value(),"Test data file") + ("discount_prior_a,a",po::value()->default_value(1.0), "discount ~ Beta(a,b): a=this") + ("discount_prior_b,b",po::value()->default_value(1.0), "discount ~ Beta(a,b): b=this") + ("strength_prior_s,s",po::value()->default_value(1.0), "strength ~ Gamma(s,r): s=this") + ("strength_prior_r,r",po::value()->default_value(1.0), "strength ~ Gamma(s,r): r=this") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -40,7 +52,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } po::notify(*conf); - if (conf->count("help") || (conf->count("input") == 0)) { + if (conf->count("help") || (conf->count("train") == 0)) { cerr << dcmdline_options << endl; exit(1); } @@ -48,13 +60,13 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { template struct PYPLM; -// uniform base distribution +// uniform base distribution (0-gram model) template<> struct PYPLM<0> { - PYPLM(unsigned vs) : p0(1.0 / vs), draws() {} - void increment(WordID w, const vector& context, MT19937* rng) { ++draws; } - void decrement(WordID w, const vector& context, MT19937* rng) { --draws; assert(draws >= 0); } - double prob(WordID w, const vector& context) const { return p0; } - void resample_hyperparameters(MT19937* rng, const unsigned nloop, const unsigned niterations) {} + PYPLM(unsigned vs, 
double, double, double, double) : p0(1.0 / vs), draws() {} + void increment(WordID, const vector&, MT19937*) { ++draws; } + void decrement(WordID, const vector&, MT19937*) { --draws; assert(draws >= 0); } + double prob(WordID, const vector&) const { return p0; } + void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {} double log_likelihood() const { return draws * log(p0); } const double p0; int draws; @@ -62,10 +74,13 @@ template<> struct PYPLM<0> { // represents an N-gram LM template struct PYPLM { - PYPLM(unsigned vs) : backoff(vs), d(0.8), alpha(1.0) {} + PYPLM(unsigned vs, double da, double db, double ss, double sr) : + backoff(vs, da, db, ss, sr), + discount_a(da), discount_b(db), + strength_s(ss), strength_r(sr), + d(0.8), alpha(1.0), lookup(N-1) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); - static vector lookup(N-1); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); @@ -75,7 +90,6 @@ template struct PYPLM { backoff.increment(w, context, rng); } void decrement(WordID w, const vector& context, MT19937* rng) { - static vector lookup(N-1); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); @@ -85,7 +99,6 @@ template struct PYPLM { } double prob(WordID w, const vector& context) const { const double bo = backoff.prob(w, context); - static vector lookup(N-1); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::const_iterator it = p.find(lookup); @@ -99,7 +112,9 @@ template struct PYPLM { double log_likelihood(const double& dd, const double& aa) const { if (aa <= -dd) return -std::numeric_limits::infinity(); - double llh = Md::log_beta_density(dd, 1, 1) + Md::log_gamma_density(aa, 1, 1); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = Md::log_beta_density(dd, discount_a, discount_b) + + Md::log_gamma_density(aa, strength_s, strength_r); typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) llh += it->second.log_crp_prob(dd, aa); @@ -143,7 +158,9 @@ template struct PYPLM { } PYPLM backoff; + double discount_a, discount_b, strength_s, strength_r; double d, alpha; + mutable vector lookup; // thread-local unordered_map, CCRP, boost::hash > > p; }; @@ -161,14 +178,21 @@ int main(int argc, char** argv) { set vocabe; const WordID kEOS = TD::Convert(""); cerr << "Reading corpus...\n"; - CorpusTools::ReadFromFile(conf["input"].as(), &corpuse, &vocabe); + CorpusTools::ReadFromFile(conf["train"].as(), &corpuse, &vocabe); cerr << "E-corpus size: " << corpuse.size() << " sentences\t (" << vocabe.size() << " word types)\n"; -#define kORDER 3 - PYPLM lm(vocabe.size()); + vector > test; + if (conf.count("test")) + CorpusTools::ReadFromFile(conf["test"].as(), &test); + else + test = corpuse; + PYPLM lm(vocabe.size(), + conf["discount_prior_a"].as(), + conf["discount_prior_b"].as(), + conf["strength_prior_s"].as(), + conf["strength_prior_r"].as()); vector ctx(kORDER - 1, TD::Convert("")); - int mci = corpuse.size() * 99 / 100; for (int SS=0; SS < samples; ++SS) { - for (int ci = 0; ci < mci; ++ci) { + for (int ci = 0; ci < corpuse.size(); ++ci) { ctx.resize(kORDER - 1); const vector& s = corpuse[ci]; for (int i = 0; i <= s.size(); ++i) 
{ @@ -187,22 +211,33 @@ int main(int argc, char** argv) { } double llh = 0; unsigned cnt = 0; - for (int ci = mci; ci < corpuse.size(); ++ci) { + unsigned oovs = 0; + for (int ci = 0; ci < test.size(); ++ci) { ctx.resize(kORDER - 1); - const vector& s = corpuse[ci]; + const vector& s = test[ci]; for (int i = 0; i <= s.size(); ++i) { WordID w = (i < s.size() ? s[i] : kEOS); double lp = log(lm.prob(w, ctx)) / log(2); - cerr << "p(" << TD::Convert(w) << " | " << TD::GetString(ctx) << ") = " << lp << endl; + if (i < s.size() && vocabe.count(w) == 0) { + cerr << "**OOV "; + ++oovs; + lp = 0; + } + cerr << "p(" << TD::Convert(w) << " |"; + for (int j = ctx.size() + 1 - kORDER; j < ctx.size(); ++j) + cerr << ' ' << TD::Convert(ctx[j]); + cerr << ") = " << lp << endl; ctx.push_back(w); llh -= lp; cnt++; } } - cerr << " Log_10 prob: " << (llh * log(2) / log(10)) << endl; - cerr << " Count: " << (cnt) << endl; + cerr << " Log_10 prob: " << (-llh * log(2) / log(10)) << endl; + cerr << " Count: " << cnt << endl; + cerr << " OOVs: " << oovs << endl; cerr << "Cross-entropy: " << (llh / cnt) << endl; cerr << " Perplexity: " << pow(2, llh / cnt) << endl; return 0; } + -- cgit v1.2.3 From 1fce37b69630269b93cfeca237675f7b2fc66ca4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 4 Mar 2012 23:26:17 +0000 Subject: fix parameter name clash --- gi/pf/pyp_lm.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 88dfcc7c..e5c44c8b 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -29,7 +29,7 @@ shared_ptr prng; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("samples,s",po::value()->default_value(300),"Number of samples") + ("samples,n",po::value()->default_value(300),"Number of samples") ("train,i",po::value(),"Training data file") ("test,T",po::value(),"Test data file") ("discount_prior_a,a",po::value()->default_value(1.0), "discount ~ Beta(a,b): a=this") -- cgit v1.2.3 From 5b2daa43c608d648a077d37ed8ab0217f8ce8104 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 4 Mar 2012 21:35:50 -0500 Subject: move crp stuff around --- gi/clda/src/Makefile.am | 11 ------ gi/clda/src/crp_test.cc | 102 ------------------------------------------------ utils/Makefile.am | 5 ++- utils/crp_test.cc | 102 ++++++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 106 insertions(+), 114 deletions(-) delete mode 100644 gi/clda/src/crp_test.cc create mode 100644 utils/crp_test.cc diff --git a/gi/clda/src/Makefile.am b/gi/clda/src/Makefile.am index 3aab17da..cdca1f97 100644 --- a/gi/clda/src/Makefile.am +++ b/gi/clda/src/Makefile.am @@ -1,14 +1,3 @@ -if HAVE_GTEST -noinst_PROGRAMS = \ - crp_test - -TESTS = crp_test - -crp_test_SOURCES = crp_test.cc -crp_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) - -endif - bin_PROGRAMS = clda clda_SOURCES = clda.cc diff --git a/gi/clda/src/crp_test.cc b/gi/clda/src/crp_test.cc deleted file mode 100644 index 561cd4dd..00000000 --- a/gi/clda/src/crp_test.cc +++ /dev/null @@ -1,102 +0,0 @@ -#include -#include -#include - -#include - -#include "ccrp.h" -#include "sampler.h" - -const size_t MAX_DOC_LEN_CHARS = 10000000; - -using namespace std; - -class CRPTest : public testing::Test { - public: - CRPTest() {} - protected: - virtual void SetUp() { } - virtual void TearDown() { } - MT19937 rng; -}; - -TEST_F(CRPTest, Dist) { - CCRP crp(0.1, 5); - double un = 0.25; - int tt = 0; - tt += crp.increment("hi", un, &rng); - tt += 
crp.increment("foo", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - tt += crp.increment("bar", un, &rng); - cout << "tt=" << tt << endl; - cout << crp << endl; - cout << " P(bar)=" << crp.prob("bar", un) << endl; - cout << " P(hi)=" << crp.prob("hi", un) << endl; - cout << " P(baz)=" << crp.prob("baz", un) << endl; - cout << " P(foo)=" << crp.prob("foo", un) << endl; - double x = crp.prob("bar", un) + crp.prob("hi", un) + crp.prob("baz", un) + crp.prob("foo", un); - cout << " tot=" << x << endl; - EXPECT_FLOAT_EQ(1.0, x); - tt += crp.decrement("hi", &rng); - tt += crp.decrement("bar", &rng); - cout << crp << endl; - tt += crp.decrement("bar", &rng); - cout << crp << endl; - cout << "tt=" << tt << endl; -} - -TEST_F(CRPTest, Exchangability) { - double tot = 0; - double xt = 0; - CCRP crp(0.5, 1.0); - int cust = 10; - vector hist(cust + 1, 0); - for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); } - const int samples = 100000; - const bool simulate = true; - for (int k = 0; k < samples; ++k) { - if (!simulate) { - crp.clear(); - for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); } - } else { - int da = rng.next() * cust; - bool a = rng.next() < 0.5; - if (a) { - for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); } - for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); } - xt += 1.0; - } else { - for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); } - for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); } - } - } - int c = crp.num_tables(1); - ++hist[c]; - tot += c; - } - EXPECT_EQ(cust, crp.num_customers()); - cerr << "P(a) = " << (xt / samples) << endl; - cerr << "E[num tables] = " << (tot / samples) << endl; - double error = fabs((tot / samples) - 5.4); - cerr << " error = " << error << endl; - EXPECT_LT(error, 0.1); // it's possible for this to fail, but - // very, very unlikely - for (int i = 1; i <= cust; ++i) - cerr << i << ' ' << (hist[i]) << endl; -} - -TEST_F(CRPTest, LP) { - CCRP crp(1,1,1,1,0.1,50.0); - crp.increment("foo", 1.0, &rng); - cerr << crp.log_crp_prob() << endl; -} - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/utils/Makefile.am b/utils/Makefile.am index bb067ed9..5153ae20 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -6,13 +6,14 @@ TESTS = ts phmt mfcr_test if HAVE_GTEST noinst_PROGRAMS += \ + crp_test \ dict_test \ m_test \ weights_test \ logval_test \ small_vector_test -TESTS += small_vector_test logval_test weights_test dict_test m_test +TESTS += crp_test small_vector_test logval_test weights_test dict_test m_test endif reconstruct_weights_SOURCES = reconstruct_weights.cc @@ -50,6 +51,8 @@ mfcr_test_SOURCES = mfcr_test.cc mfcr_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) weights_test_SOURCES = weights_test.cc weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) +crp_test_SOURCES = weights_test.cc +crp_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) logval_test_SOURCES = logval_test.cc logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) small_vector_test_SOURCES = small_vector_test.cc diff --git a/utils/crp_test.cc b/utils/crp_test.cc new file mode 100644 index 00000000..561cd4dd --- /dev/null +++ b/utils/crp_test.cc @@ -0,0 +1,102 @@ +#include +#include +#include + +#include + +#include "ccrp.h" +#include "sampler.h" + +const size_t 
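// Where the 5.4 in the exchangeability test comes from: with base probability
// p0 = 1, as used in the test, the table count follows the standard Pitman-Yor
// law, whose expected number of tables after n customers is
//
//   E[T_n] = Gamma(alpha + d + n) * Gamma(alpha + 1)
//            / ( d * Gamma(alpha + n) * Gamma(alpha + d) )  -  alpha / d.
//
// For d = 0.5, alpha = 1.0 and n = 10 customers this evaluates to about 5.40,
// the value the Monte Carlo estimate is checked against (tolerance 0.1).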
MAX_DOC_LEN_CHARS = 10000000; + +using namespace std; + +class CRPTest : public testing::Test { + public: + CRPTest() {} + protected: + virtual void SetUp() { } + virtual void TearDown() { } + MT19937 rng; +}; + +TEST_F(CRPTest, Dist) { + CCRP crp(0.1, 5); + double un = 0.25; + int tt = 0; + tt += crp.increment("hi", un, &rng); + tt += crp.increment("foo", un, &rng); + tt += crp.increment("bar", un, &rng); + tt += crp.increment("bar", un, &rng); + tt += crp.increment("bar", un, &rng); + tt += crp.increment("bar", un, &rng); + tt += crp.increment("bar", un, &rng); + tt += crp.increment("bar", un, &rng); + tt += crp.increment("bar", un, &rng); + cout << "tt=" << tt << endl; + cout << crp << endl; + cout << " P(bar)=" << crp.prob("bar", un) << endl; + cout << " P(hi)=" << crp.prob("hi", un) << endl; + cout << " P(baz)=" << crp.prob("baz", un) << endl; + cout << " P(foo)=" << crp.prob("foo", un) << endl; + double x = crp.prob("bar", un) + crp.prob("hi", un) + crp.prob("baz", un) + crp.prob("foo", un); + cout << " tot=" << x << endl; + EXPECT_FLOAT_EQ(1.0, x); + tt += crp.decrement("hi", &rng); + tt += crp.decrement("bar", &rng); + cout << crp << endl; + tt += crp.decrement("bar", &rng); + cout << crp << endl; + cout << "tt=" << tt << endl; +} + +TEST_F(CRPTest, Exchangability) { + double tot = 0; + double xt = 0; + CCRP crp(0.5, 1.0); + int cust = 10; + vector hist(cust + 1, 0); + for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); } + const int samples = 100000; + const bool simulate = true; + for (int k = 0; k < samples; ++k) { + if (!simulate) { + crp.clear(); + for (int i = 0; i < cust; ++i) { crp.increment(1, 1.0, &rng); } + } else { + int da = rng.next() * cust; + bool a = rng.next() < 0.5; + if (a) { + for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); } + for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); } + xt += 1.0; + } else { + for (int i = 0; i < da; ++i) { crp.decrement(1, &rng); } + for (int i = 0; i < da; ++i) { crp.increment(1, 1.0, &rng); } + } + } + int c = crp.num_tables(1); + ++hist[c]; + tot += c; + } + EXPECT_EQ(cust, crp.num_customers()); + cerr << "P(a) = " << (xt / samples) << endl; + cerr << "E[num tables] = " << (tot / samples) << endl; + double error = fabs((tot / samples) - 5.4); + cerr << " error = " << error << endl; + EXPECT_LT(error, 0.1); // it's possible for this to fail, but + // very, very unlikely + for (int i = 1; i <= cust; ++i) + cerr << i << ' ' << (hist[i]) << endl; +} + +TEST_F(CRPTest, LP) { + CCRP crp(1,1,1,1,0.1,50.0); + crp.increment("foo", 1.0, &rng); + cerr << crp.log_crp_prob() << endl; +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} -- cgit v1.2.3 From 15170746be2fc718e8fb026b4468e33cf0c63170 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 4 Mar 2012 21:41:07 -0500 Subject: clean up crp --- utils/Makefile.am | 2 +- utils/ccrp.h | 10 +++++----- utils/fdict.h | 6 ++++-- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/utils/Makefile.am b/utils/Makefile.am index 5153ae20..3ea21835 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -51,7 +51,7 @@ mfcr_test_SOURCES = mfcr_test.cc mfcr_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) weights_test_SOURCES = weights_test.cc weights_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) -crp_test_SOURCES = weights_test.cc +crp_test_SOURCES = crp_test.cc crp_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) logval_test_SOURCES = logval_test.cc logval_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) diff --git 
a/utils/ccrp.h b/utils/ccrp.h index 61ab5576..68769635 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -242,10 +242,10 @@ class CCRP { void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { assert(has_discount_prior() || has_alpha_prior()); DiscountResampler dr(*this); - ConcentrationResampler cr(*this); + StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { if (has_alpha_prior()) { - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + alpha_ = slice_sampler1d(sr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { @@ -253,7 +253,7 @@ class CCRP { 1.0, 0.0, niterations, 100*niterations); } } - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + alpha_ = slice_sampler1d(sr, alpha_, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -265,8 +265,8 @@ class CCRP { } }; - struct ConcentrationResampler { - ConcentrationResampler(const CCRP& crp) : crp_(crp) {} + struct StrengthResampler { + StrengthResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; double operator()(const double& proposed_alpha) const { return crp_.log_crp_prob(crp_.discount_, proposed_alpha); diff --git a/utils/fdict.h b/utils/fdict.h index f0871b9a..0a2a9456 100644 --- a/utils/fdict.h +++ b/utils/fdict.h @@ -10,7 +10,7 @@ #ifdef HAVE_CMPH #include "perfect_hash.h" -#include "string_to.h" +#include #endif struct FD { @@ -49,7 +49,9 @@ struct FD { #ifdef HAVE_CMPH if (hash_) { static std::string tls; - tls = to_string(w); + std::ostringstream os; + os << w; + tls = os.str(); return tls; } #endif -- cgit v1.2.3 From 0c4ffecf6ccad06b426463d8edc5e0c50935b9c9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 4 Mar 2012 23:15:51 -0500 Subject: support full range of hyperparameter values for PYP (including strength <= 0) --- utils/ccrp.h | 68 ++++++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 43 insertions(+), 25 deletions(-) diff --git a/utils/ccrp.h b/utils/ccrp.h index 68769635..c883c027 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -19,29 +19,44 @@ template > class CCRP { public: CCRP(double disc, double alpha) : - num_tables_(), - num_customers_(), - discount_(disc), - alpha_(alpha), - discount_prior_alpha_(std::numeric_limits::quiet_NaN()), - discount_prior_beta_(std::numeric_limits::quiet_NaN()), - alpha_prior_shape_(std::numeric_limits::quiet_NaN()), - alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} + num_tables_(), + num_customers_(), + discount_(disc), + alpha_(alpha), + discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + discount_prior_beta_(std::numeric_limits::quiet_NaN()), + alpha_prior_shape_(std::numeric_limits::quiet_NaN()), + alpha_prior_rate_(std::numeric_limits::quiet_NaN()) { + check_hyperparameters(); + } CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : - num_tables_(), - num_customers_(), - discount_(d), - alpha_(c), - discount_prior_alpha_(d_alpha), - discount_prior_beta_(d_beta), - alpha_prior_shape_(c_shape), - alpha_prior_rate_(c_rate) {} + num_tables_(), + num_customers_(), + discount_(d), + alpha_(c), + discount_prior_alpha_(d_alpha), + discount_prior_beta_(d_beta), + alpha_prior_shape_(c_shape), + alpha_prior_rate_(c_rate) { + check_hyperparameters(); + } + + void check_hyperparameters() { + if (discount_ < 0.0 || discount_ >= 1.0) { + std::cerr << "Bad discount: " << discount_ << std::endl; + abort(); + } + if (alpha_ <= -discount_) { + 
std::cerr << "Bad strength: " << alpha_ << " (discount=" << discount_ << ")" << std::endl; + abort(); + } + } double discount() const { return discount_; } double alpha() const { return alpha_; } - void set_discount(double d) { discount_ = d; } - void set_alpha(double a) { alpha_ = a; } + void set_discount(double d) { discount_ = d; check_hyperparameters(); } + void set_alpha(double a) { alpha_ = a; check_hyperparameters(); } bool has_discount_prior() const { return !std::isnan(discount_prior_alpha_); @@ -215,14 +230,15 @@ class CCRP { if (has_discount_prior()) lp = Md::log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); if (has_alpha_prior()) - lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp += Md::log_gamma_density(alpha + discount, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - lp += lgamma(alpha) - lgamma(alpha + num_customers_) - + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_) - - lgamma(alpha / discount); + if (alpha) + lp += lgamma(alpha) - lgamma(alpha / discount); + lp += - lgamma(alpha + num_customers_) + + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { @@ -245,15 +261,17 @@ class CCRP { StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { if (has_alpha_prior()) { - alpha_ = slice_sampler1d(sr, alpha_, *rng, 0.0, + alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + double min_discount = std::numeric_limits::min(); + if (alpha_ < 0.0) min_discount = -alpha_; + discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } } - alpha_ = slice_sampler1d(sr, alpha_, *rng, 0.0, + alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } -- cgit v1.2.3 From ce58cb44771a5194b71682d1602abe2fef9e6f13 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Mar 2012 14:51:04 -0500 Subject: support strength=0 PYPs, final notation clean-up --- gi/pf/align-lexonly-pyp.cc | 2 +- gi/pf/conditional_pseg.h | 2 +- gi/pf/learn_cfg.cc | 4 +- gi/pf/pyp_lm.cc | 22 ++++----- phrasinator/gibbs_train_plm.cc | 2 +- utils/ccrp.h | 106 ++++++++++++++++++++++------------------- utils/mfcr.h | 105 ++++++++++++++++++++++------------------ 7 files changed, 131 insertions(+), 112 deletions(-) diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 4ce7cf62..87f7f6b5 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -104,7 +104,7 @@ struct HierarchicalWordBase { } void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",\\alpha=" << r.alpha() << ')' << endl; + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; } diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index f9841cbf..86403d8d 100644 --- 
a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -22,7 +22,7 @@ struct MConditionalTranslationModel { void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << -1 << '\t' << i2->first << std::endl; } diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index 5b748311..bf157828 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -183,9 +183,9 @@ struct HieroLMModel { nts[i].resample_hyperparameters(rng); if (kHIERARCHICAL_PRIOR) { q0.resample_hyperparameters(rng); - cerr << "[base d=" << q0.discount() << ", alpha=" << q0.alpha() << "]"; + cerr << "[base d=" << q0.discount() << ", s=" << q0.strength() << "]"; } - cerr << " d=" << nts[0].discount() << ", alpha=" << nts[0].alpha() << endl; + cerr << " d=" << nts[0].discount() << ", s=" << nts[0].strength() << endl; } const BaseRuleModel base; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index e5c44c8b..7ebada13 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -78,14 +78,14 @@ template struct PYPLM { backoff(vs, da, db, ss, sr), discount_a(da), discount_b(db), strength_s(ss), strength_r(sr), - d(0.8), alpha(1.0), lookup(N-1) {} + d(0.8), strength(1.0), lookup(N-1) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); if (it == p.end()) - it = p.insert(make_pair(lookup, CCRP(d,alpha))).first; + it = p.insert(make_pair(lookup, CCRP(d,strength))).first; if (it->second.increment(w, bo, rng)) backoff.increment(w, context, rng); } @@ -107,7 +107,7 @@ template struct PYPLM { } double log_likelihood() const { - return log_likelihood(d, alpha) + backoff.log_likelihood(); + return log_likelihood(d, strength) + backoff.log_likelihood(); } double log_likelihood(const double& dd, const double& aa) const { @@ -125,15 +125,15 @@ template struct PYPLM { DiscountResampler(const PYPLM& m) : m_(m) {} const PYPLM& m_; double operator()(const double& proposed_discount) const { - return m_.log_likelihood(proposed_discount, m_.alpha); + return m_.log_likelihood(proposed_discount, m_.strength); } }; struct AlphaResampler { AlphaResampler(const PYPLM& m) : m_(m) {} const PYPLM& m_; - double operator()(const double& proposed_alpha) const { - return m_.log_likelihood(m_.d, proposed_alpha); + double operator()(const double& proposed_strength) const { + return m_.log_likelihood(m_.d, proposed_strength); } }; @@ -141,25 +141,25 @@ template struct PYPLM { DiscountResampler dr(*this); AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { - alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + strength = slice_sampler1d(ar, strength, *rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); d = slice_sampler1d(dr, d, *rng, std::numeric_limits::min(), 1.0, 0.0, niterations, 100*niterations); } - alpha = slice_sampler1d(ar, alpha, *rng, 0.0, + strength = slice_sampler1d(ar, strength, 
*rng, 0.0, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); typename unordered_map, CCRP, boost::hash > >::iterator it; - cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << alpha << ") = " << log_likelihood(d, alpha) << endl; + cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl; for (it = p.begin(); it != p.end(); ++it) { it->second.set_discount(d); - it->second.set_alpha(alpha); + it->second.set_strength(strength); } backoff.resample_hyperparameters(rng, nloop, niterations); } PYPLM backoff; double discount_a, discount_b, strength_s, strength_r; - double d, alpha; + double d, strength; mutable vector lookup; // thread-local unordered_map, CCRP, boost::hash > > p; }; diff --git a/phrasinator/gibbs_train_plm.cc b/phrasinator/gibbs_train_plm.cc index 54861dcb..3b99e1b6 100644 --- a/phrasinator/gibbs_train_plm.cc +++ b/phrasinator/gibbs_train_plm.cc @@ -252,7 +252,7 @@ struct UniphraseLM { void ResampleHyperparameters(MT19937* rng) { phrases_.resample_hyperparameters(rng); gen_.resample_hyperparameters(rng); - cerr << " d=" << phrases_.discount() << ",a=" << phrases_.alpha(); + cerr << " d=" << phrases_.discount() << ",s=" << phrases_.strength(); } CCRP > phrases_; diff --git a/utils/ccrp.h b/utils/ccrp.h index c883c027..5f9db7a6 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -18,27 +18,27 @@ template > class CCRP { public: - CCRP(double disc, double alpha) : + CCRP(double disc, double strength) : num_tables_(), num_customers_(), discount_(disc), - alpha_(alpha), - discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + strength_(strength), + discount_prior_strength_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - alpha_prior_shape_(std::numeric_limits::quiet_NaN()), - alpha_prior_rate_(std::numeric_limits::quiet_NaN()) { + strength_prior_shape_(std::numeric_limits::quiet_NaN()), + strength_prior_rate_(std::numeric_limits::quiet_NaN()) { check_hyperparameters(); } - CCRP(double d_alpha, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : + CCRP(double d_strength, double d_beta, double c_shape, double c_rate, double d = 0.9, double c = 1.0) : num_tables_(), num_customers_(), discount_(d), - alpha_(c), - discount_prior_alpha_(d_alpha), + strength_(c), + discount_prior_strength_(d_strength), discount_prior_beta_(d_beta), - alpha_prior_shape_(c_shape), - alpha_prior_rate_(c_rate) { + strength_prior_shape_(c_shape), + strength_prior_rate_(c_rate) { check_hyperparameters(); } @@ -47,23 +47,23 @@ class CCRP { std::cerr << "Bad discount: " << discount_ << std::endl; abort(); } - if (alpha_ <= -discount_) { - std::cerr << "Bad strength: " << alpha_ << " (discount=" << discount_ << ")" << std::endl; + if (strength_ <= -discount_) { + std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; abort(); } } double discount() const { return discount_; } - double alpha() const { return alpha_; } + double strength() const { return strength_; } void set_discount(double d) { discount_ = d; check_hyperparameters(); } - void set_alpha(double a) { alpha_ = a; check_hyperparameters(); } + void set_strength(double a) { strength_ = a; check_hyperparameters(); } bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); + return !std::isnan(discount_prior_strength_); } - bool has_alpha_prior() const { - return !std::isnan(alpha_prior_shape_); + bool has_strength_prior() const { + return 
!std::isnan(strength_prior_shape_); } void clear() { @@ -97,7 +97,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (alpha_ + num_tables_ * discount_) * p0; + const double p_empty = (strength_ + num_tables_ * discount_) * p0; const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -131,7 +131,7 @@ class CCRP { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { - const T p_empty = T(alpha_ + num_tables_ * discount_) * p0; + const T p_empty = T(strength_ + num_tables_ * discount_) * p0; const T p_share = T(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -198,47 +198,47 @@ class CCRP { double prob(const Dish& dish, const double& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + alpha_; + const double r = num_tables_ * discount_ + strength_; if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + alpha_); + return r * p0 / (num_customers_ + strength_); } else { return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + alpha_); + (num_customers_ + strength_); } } template T probT(const Dish& dish, const T& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const T r = T(num_tables_ * discount_ + alpha_); + const T r = T(num_tables_ * discount_ + strength_); if (it == dish_locs_.end()) { - return r * p0 / T(num_customers_ + alpha_); + return r * p0 / T(num_customers_ + strength_); } else { return (T(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + r * p0) / - T(num_customers_ + alpha_); + T(num_customers_ + strength_); } } double log_crp_prob() const { - return log_crp_prob(discount_, alpha_); + return log_crp_prob(discount_, strength_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's - double log_crp_prob(const double& discount, const double& alpha) const { + double log_crp_prob(const double& discount, const double& strength) const { double lp = 0.0; if (has_discount_prior()) - lp = Md::log_beta_density(discount, discount_prior_alpha_, discount_prior_beta_); - if (has_alpha_prior()) - lp += Md::log_gamma_density(alpha + discount, alpha_prior_shape_, alpha_prior_rate_); + lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); + if (has_strength_prior()) + lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); assert(lp <= 0.0); if (num_customers_) { if (discount > 0.0) { const double r = lgamma(1.0 - discount); - if (alpha) - lp += lgamma(alpha) - lgamma(alpha / discount); - lp += - lgamma(alpha + num_customers_) - + num_tables_ * log(discount) + lgamma(alpha / discount + num_tables_); + if (strength) + lp += lgamma(strength) - lgamma(strength / discount); + lp += - lgamma(strength + num_customers_) + + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { @@ -247,8 +247,16 @@ class CCRP { lp += lgamma(*ti - discount) - r; } } + } else if (!discount) { // discount == 0.0 + lp += lgamma(strength) + num_tables_ * 
log(strength) - lgamma(strength + num_tables_); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + lp += lgamma(cur.table_counts_.size()); + } } else { - assert(!"not implemented yet"); + assert(!"discount less than 0 detected!"); } } assert(std::isfinite(lp)); @@ -256,22 +264,22 @@ class CCRP { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_alpha_prior()); + assert(has_discount_prior() || has_strength_prior()); DiscountResampler dr(*this); StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_alpha_prior()) { - alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_, + if (has_strength_prior()) { + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { double min_discount = std::numeric_limits::min(); - if (alpha_ < 0.0) min_discount = -alpha_; + if (strength_ < 0.0) min_discount = -strength_; discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } } - alpha_ = slice_sampler1d(sr, alpha_, *rng, -discount_, + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -279,15 +287,15 @@ class CCRP { DiscountResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; double operator()(const double& proposed_discount) const { - return crp_.log_crp_prob(proposed_discount, crp_.alpha_); + return crp_.log_crp_prob(proposed_discount, crp_.strength_); } }; struct StrengthResampler { StrengthResampler(const CCRP& crp) : crp_(crp) {} const CCRP& crp_; - double operator()(const double& proposed_alpha) const { - return crp_.log_crp_prob(crp_.discount_, proposed_alpha); + double operator()(const double& proposed_strength) const { + return crp_.log_crp_prob(crp_.discount_, proposed_strength); } }; @@ -299,7 +307,7 @@ class CCRP { }; void Print(std::ostream* out) const { - std::cerr << "PYP(d=" << discount_ << ",c=" << alpha_ << ") customers=" << num_customers_ << std::endl; + std::cerr << "PYP(d=" << discount_ << ",c=" << strength_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -324,15 +332,15 @@ class CCRP { std::tr1::unordered_map dish_locs_; double discount_; - double alpha_; + double strength_; // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; + double discount_prior_strength_; double discount_prior_beta_; - // optional gamma prior on alpha_ (NaN if no prior) - double alpha_prior_shape_; - double alpha_prior_rate_; + // optional gamma prior on strength_ (NaN if no prior) + double strength_prior_shape_; + double strength_prior_rate_; }; template diff --git a/utils/mfcr.h b/utils/mfcr.h index df988f51..aeaf599d 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -39,37 +39,37 @@ template > class MFCR { public: - MFCR(unsigned num_floors, double d, double alpha) : + MFCR(unsigned num_floors, double d, double strength) : num_floors_(num_floors), num_tables_(), num_customers_(), discount_(d), - alpha_(alpha), - 
discount_prior_alpha_(std::numeric_limits::quiet_NaN()), + strength_(strength), + discount_prior_strength_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), - alpha_prior_shape_(std::numeric_limits::quiet_NaN()), - alpha_prior_rate_(std::numeric_limits::quiet_NaN()) {} + strength_prior_shape_(std::numeric_limits::quiet_NaN()), + strength_prior_rate_(std::numeric_limits::quiet_NaN()) {} - MFCR(unsigned num_floors, double discount_alpha, double discount_beta, double alpha_shape, double alpha_rate, double d = 0.9, double alpha = 10.0) : + MFCR(unsigned num_floors, double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : num_floors_(num_floors), num_tables_(), num_customers_(), discount_(d), - alpha_(alpha), - discount_prior_alpha_(discount_alpha), + strength_(strength), + discount_prior_strength_(discount_strength), discount_prior_beta_(discount_beta), - alpha_prior_shape_(alpha_shape), - alpha_prior_rate_(alpha_rate) {} + strength_prior_shape_(strength_shape), + strength_prior_rate_(strength_rate) {} double discount() const { return discount_; } - double alpha() const { return alpha_; } + double strength() const { return strength_; } bool has_discount_prior() const { - return !std::isnan(discount_prior_alpha_); + return !std::isnan(discount_prior_strength_); } - bool has_alpha_prior() const { - return !std::isnan(alpha_prior_shape_); + bool has_strength_prior() const { + return !std::isnan(strength_prior_shape_); } void clear() { @@ -122,7 +122,7 @@ class MFCR { int floor = -1; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (alpha_ + num_tables_ * discount_) * marg_p0; + const double p_empty = (strength_ + num_tables_ * discount_) * marg_p0; const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } @@ -206,44 +206,53 @@ class MFCR { const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); assert(marg_p0 <= 1.0); const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + alpha_; + const double r = num_tables_ * discount_ + strength_; if (it == dish_locs_.end()) { - return r * marg_p0 / (num_customers_ + alpha_); + return r * marg_p0 / (num_customers_ + strength_); } else { return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) / - (num_customers_ + alpha_); + (num_customers_ + strength_); } } double log_crp_prob() const { - return log_crp_prob(discount_, alpha_); + return log_crp_prob(discount_, strength_); } // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include draws from G_w's - double log_crp_prob(const double& d, const double& alpha) const { + double log_crp_prob(const double& discount, const double& strength) const { double lp = 0.0; if (has_discount_prior()) - lp = Md::log_beta_density(d, discount_prior_alpha_, discount_prior_beta_); - if (has_alpha_prior()) - lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp = Md::log_beta_density(discount, discount_prior_strength_, discount_prior_beta_); + if (has_strength_prior()) + lp += Md::log_gamma_density(strength + discount, strength_prior_shape_, strength_prior_rate_); assert(lp <= 0.0); if (num_customers_) { - if (d > 0.0) { - const double r = lgamma(1.0 - d); - lp += lgamma(alpha) - lgamma(alpha + 
num_customers_) - + num_tables_ * log(d) + lgamma(alpha / d + num_tables_) - - lgamma(alpha / d); + if (discount > 0.0) { + const double r = lgamma(1.0 - discount); + if (strength) + lp += lgamma(strength) - lgamma(strength / discount); + lp += - lgamma(strength + num_customers_) + + num_tables_ * log(discount) + lgamma(strength / discount + num_tables_); assert(std::isfinite(lp)); for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { const DishLocations& cur = it->second; for (std::list::const_iterator ti = cur.table_counts_.begin(); ti != cur.table_counts_.end(); ++ti) { - lp += lgamma(ti->count - d) - r; + lp += lgamma(ti->count - discount) - r; } } + } else if (!discount) { // discount == 0.0 + lp += lgamma(strength) + num_tables_ * log(strength) - lgamma(strength + num_tables_); + assert(std::isfinite(lp)); + for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); + it != dish_locs_.end(); ++it) { + const DishLocations& cur = it->second; + lp += lgamma(cur.table_counts_.size()); + } } else { - assert(!"not implemented yet"); + assert(!"discount less than 0 detected!"); } } assert(std::isfinite(lp)); @@ -251,20 +260,22 @@ class MFCR { } void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - assert(has_discount_prior() || has_alpha_prior()); + assert(has_discount_prior() || has_strength_prior()); DiscountResampler dr(*this); - ConcentrationResampler cr(*this); + StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { - if (has_alpha_prior()) { - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + if (has_strength_prior()) { + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { - discount_ = slice_sampler1d(dr, discount_, *rng, std::numeric_limits::min(), + double min_discount = std::numeric_limits::min(); + if (strength_ < 0.0) min_discount = -strength_; + discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } } - alpha_ = slice_sampler1d(cr, alpha_, *rng, 0.0, + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } @@ -272,15 +283,15 @@ class MFCR { DiscountResampler(const MFCR& crp) : crp_(crp) {} const MFCR& crp_; double operator()(const double& proposed_d) const { - return crp_.log_crp_prob(proposed_d, crp_.alpha_); + return crp_.log_crp_prob(proposed_d, crp_.strength_); } }; - struct ConcentrationResampler { - ConcentrationResampler(const MFCR& crp) : crp_(crp) {} + struct StrengthResampler { + StrengthResampler(const MFCR& crp) : crp_(crp) {} const MFCR& crp_; - double operator()(const double& proposediscount_alpha) const { - return crp_.log_crp_prob(crp_.discount_, proposediscount_alpha); + double operator()(const double& proposediscount_strength) const { + return crp_.log_crp_prob(crp_.discount_, proposediscount_strength); } }; @@ -292,7 +303,7 @@ class MFCR { }; void Print(std::ostream* out) const { - (*out) << "MFCR(d=" << discount_ << ",alpha=" << alpha_ << ") customers=" << num_customers_ << std::endl; + (*out) << "MFCR(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << 
it->second.table_counts_.size() << " tables): "; @@ -318,15 +329,15 @@ class MFCR { std::tr1::unordered_map dish_locs_; double discount_; - double alpha_; + double strength_; // optional beta prior on discount_ (NaN if no prior) - double discount_prior_alpha_; + double discount_prior_strength_; double discount_prior_beta_; - // optional gamma prior on alpha_ (NaN if no prior) - double alpha_prior_shape_; - double alpha_prior_rate_; + // optional gamma prior on strength_ (NaN if no prior) + double strength_prior_shape_; + double strength_prior_rate_; }; template -- cgit v1.2.3 From 2048ac9943e2695a75b5f0303ca869e66ee32202 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Mar 2012 16:06:45 -0500 Subject: use template parameter inference to figure out what type to use for probability computations, templatatize number of floors in MFCR rather than compile-time set --- gi/pf/align-lexonly-pyp.cc | 20 +++++++------- gi/pf/conditional_pseg.h | 22 +++++++-------- gi/pf/learn_cfg.cc | 8 +++--- utils/ccrp.h | 48 ++------------------------------ utils/mfcr.h | 68 ++++++++++++++++++++++++---------------------- utils/mfcr_test.cc | 10 +++---- 6 files changed, 68 insertions(+), 108 deletions(-) diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 87f7f6b5..ac0590e0 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -68,7 +68,7 @@ struct AlignedSentencePair { struct HierarchicalWordBase { explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {} + base(prob_t::One()), r(1,1,1,1), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} void ResampleHyperparameters(MT19937* rng) { r.resample_hyperparameters(rng); @@ -80,14 +80,14 @@ struct HierarchicalWordBase { // return p0 of rule.e_ prob_t operator()(const TRule& rule) const { - v[0] = exp(logp0(rule.e_)); - return prob_t(r.prob(rule.e_, v, l)); + v[0].logeq(logp0(rule.e_)); + return r.prob(rule.e_, v.begin(), l.begin()); } void Increment(const TRule& rule) { - v[0] = exp(logp0(rule.e_)); - if (r.increment(rule.e_, v, l, &*prng).count) { - base *= prob_t(v[0] * l[0]); + v[0].logeq(logp0(rule.e_)); + if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { + base *= v[0] * l[0]; } } @@ -105,15 +105,15 @@ struct HierarchicalWordBase { void Summary() const { cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; - for (MFCR >::const_iterator it = r.begin(); it != r.end(); ++it) + for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; } prob_t base; - MFCR > r; + MFCR<1,vector > r; const double u0; - const vector l; - mutable vector v; + const vector l; + mutable vector v; }; struct BasicLexicalAlignment { diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 86403d8d..ef73e332 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -17,13 +17,13 @@ template struct MConditionalTranslationModel { explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : - rp0(rcp0), lambdas(1, 1.0), p0s(1) {} + rp0(rcp0), lambdas(1, prob_t::One()), p0s(1) {} void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { std::cerr << 
TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; - for (MFCR::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) std::cerr << " " << -1 << '\t' << i2->first << std::endl; } } @@ -46,10 +46,10 @@ struct MConditionalTranslationModel { int IncrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); if (it == r.end()) { - it = r.insert(make_pair(rule.f_, MFCR(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; } - p0s[0] = rp0(rule).as_float(); - TableCount delta = it->second.increment(rule, p0s, lambdas, rng); + p0s[0] = rp0(rule); + TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); return delta.count; } @@ -57,10 +57,10 @@ struct MConditionalTranslationModel { prob_t p; RuleModelHash::const_iterator it = r.find(rule.f_); if (it == r.end()) { - p.logeq(log(rp0(rule))); + p = rp0(rule); } else { - p0s[0] = rp0(rule).as_float(); - p = prob_t(it->second.prob(rule, p0s, lambdas)); + p0s[0] = rp0(rule); + p = it->second.prob(rule, p0s.begin(), lambdas.begin()); } return p; } @@ -80,11 +80,11 @@ struct MConditionalTranslationModel { const ConditionalBaseMeasure& rp0; typedef std::tr1::unordered_map, - MFCR, + MFCR<1, TRule>, boost::hash > > RuleModelHash; RuleModelHash r; - std::vector lambdas; - mutable std::vector p0s; + std::vector lambdas; + mutable std::vector p0s; }; template diff --git a/gi/pf/learn_cfg.cc b/gi/pf/learn_cfg.cc index bf157828..ed1772bf 100644 --- a/gi/pf/learn_cfg.cc +++ b/gi/pf/learn_cfg.cc @@ -127,20 +127,20 @@ struct HieroLMModel { nts(num_nts, CCRP(1,1,1,1)) {} prob_t Prob(const TRule& r) const { - return nts[nt_id_to_index[-r.lhs_]].probT(r, p0(r)); + return nts[nt_id_to_index[-r.lhs_]].prob(r, p0(r)); } inline prob_t p0(const TRule& r) const { if (kHIERARCHICAL_PRIOR) - return q0.probT(r, base(r)); + return q0.prob(r, base(r)); else return base(r); } int Increment(const TRule& r, MT19937* rng) { - const int delta = nts[nt_id_to_index[-r.lhs_]].incrementT(r, p0(r), rng); + const int delta = nts[nt_id_to_index[-r.lhs_]].increment(r, p0(r), rng); if (kHIERARCHICAL_PRIOR && delta) - q0.incrementT(r, base(r), rng); + q0.increment(r, base(r), rng); return delta; // return x.increment(r); } diff --git a/utils/ccrp.h b/utils/ccrp.h index 5f9db7a6..e24130ac 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -92,42 +92,9 @@ class CCRP { return it->total_dish_count_; } - // returns +1 or 0 indicating whether a new table was opened - int increment(const Dish& dish, const double& p0, MT19937* rng) { - DishLocations& loc = dish_locs_[dish]; - bool share_table = false; - if (loc.total_dish_count_) { - const double p_empty = (strength_ + num_tables_ * discount_) * p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - share_table = rng->SelectSample(p_empty, p_share); - } - if (share_table) { - double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); - for (typename std::list::iterator ti = loc.table_counts_.begin(); - ti != loc.table_counts_.end(); ++ti) { - r -= (*ti - discount_); - if (r <= 0.0) { - ++(*ti); - break; - } - } - if (r > 0.0) { - std::cerr << "Serious error: r=" << r << std::endl; - Print(&std::cerr); - assert(r <= 0.0); - } - } else { - 
loc.table_counts_.push_back(1u); - ++num_tables_; - } - ++loc.total_dish_count_; - ++num_customers_; - return (share_table ? 0 : 1); - } - // returns +1 or 0 indicating whether a new table was opened template - int incrementT(const Dish& dish, const T& p0, MT19937* rng) { + int increment(const Dish& dish, const T& p0, MT19937* rng) { DishLocations& loc = dish_locs_[dish]; bool share_table = false; if (loc.total_dish_count_) { @@ -196,19 +163,8 @@ class CCRP { } } - double prob(const Dish& dish, const double& p0) const { - const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + strength_; - if (it == dish_locs_.end()) { - return r * p0 / (num_customers_ + strength_); - } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * p0) / - (num_customers_ + strength_); - } - } - template - T probT(const Dish& dish, const T& p0) const { + T prob(const Dish& dish, const T& p0) const { const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); const T r = T(num_tables_ * discount_ + strength_); if (it == dish_locs_.end()) { diff --git a/utils/mfcr.h b/utils/mfcr.h index aeaf599d..6cc0ebf1 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include "sampler.h" @@ -35,12 +36,11 @@ std::ostream& operator<<(std::ostream& o, const TableCount& tc) { // referenced therein. // http://www.aclweb.org/anthology/P/P09/P09-2085.pdf // -template > +template > class MFCR { public: - MFCR(unsigned num_floors, double d, double strength) : - num_floors_(num_floors), + MFCR(double d, double strength) : num_tables_(), num_customers_(), discount_(d), @@ -50,8 +50,7 @@ class MFCR { strength_prior_shape_(std::numeric_limits::quiet_NaN()), strength_prior_rate_(std::numeric_limits::quiet_NaN()) {} - MFCR(unsigned num_floors, double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : - num_floors_(num_floors), + MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : num_tables_(), num_customers_(), discount_(d), @@ -111,22 +110,22 @@ class MFCR { } // returns (delta, floor) indicating whether a new table (delta) was opened and on which floor - TableCount increment(const Dish& dish, const std::vector& p0s, const std::vector& lambdas, MT19937* rng) { - assert(p0s.size() == num_floors_); - assert(lambdas.size() == num_floors_); - + template + TableCount increment(const Dish& dish, InputIterator p0s, InputIterator2 lambdas, MT19937* rng) { DishLocations& loc = dish_locs_[dish]; // marg_p0 = marginal probability of opening a new table on any floor with label dish - const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); - assert(marg_p0 <= 1.0); + typedef typename std::iterator_traits::value_type F; + const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); + assert(marg_p0 <= F(1.0001)); int floor = -1; bool share_table = false; if (loc.total_dish_count_) { - const double p_empty = (strength_ + num_tables_ * discount_) * marg_p0; - const double p_share = (loc.total_dish_count_ - loc.table_counts_.size() * discount_); + const F p_empty = F(strength_ + num_tables_ * discount_) * marg_p0; + const F p_share = F(loc.total_dish_count_ - loc.table_counts_.size() * discount_); share_table = rng->SelectSample(p_empty, p_share); } 
if (share_table) { + // this can be done with doubles since P0 (which may be tiny) is not involved double r = rng->next() * (loc.total_dish_count_ - loc.table_counts_.size() * discount_); for (typename std::list::iterator ti = loc.table_counts_.begin(); ti != loc.table_counts_.end(); ++ti) { @@ -143,12 +142,18 @@ class MFCR { assert(r <= 0.0); } } else { // sit at currently empty table -- must sample what floor - double r = rng->next() * marg_p0; - for (unsigned i = 0; i < p0s.size(); ++i) { - r -= p0s[i] * lambdas[i]; - if (r <= 0.0) { - floor = i; - break; + if (Floors == 1) { + floor = 0; + } else { + F r = F(rng->next()) * marg_p0; + for (unsigned i = 0; i < Floors; ++i) { + r -= (*p0s) * (*lambdas); + ++p0s; + ++lambdas; + if (r <= F(0.0)) { + floor = i; + break; + } } } assert(floor >= 0); @@ -200,18 +205,18 @@ class MFCR { return TableCount(delta, floor); } - double prob(const Dish& dish, const std::vector& p0s, const std::vector& lambdas) const { - assert(p0s.size() == num_floors_); - assert(lambdas.size() == num_floors_); - const double marg_p0 = std::inner_product(p0s.begin(), p0s.end(), lambdas.begin(), 0.0); - assert(marg_p0 <= 1.0); + template + typename std::iterator_traits::value_type prob(const Dish& dish, InputIterator p0s, InputIterator2 lambdas) const { + typedef typename std::iterator_traits::value_type F; + const F marg_p0 = std::inner_product(p0s, p0s + Floors, lambdas, F(0.0)); + assert(marg_p0 <= F(1.0001)); const typename std::tr1::unordered_map::const_iterator it = dish_locs_.find(dish); - const double r = num_tables_ * discount_ + strength_; + const F r = F(num_tables_ * discount_ + strength_); if (it == dish_locs_.end()) { - return r * marg_p0 / (num_customers_ + strength_); + return r * marg_p0 / F(num_customers_ + strength_); } else { - return (it->second.total_dish_count_ - discount_ * it->second.table_counts_.size() + r * marg_p0) / - (num_customers_ + strength_); + return (F(it->second.total_dish_count_ - discount_ * it->second.table_counts_.size()) + F(r * marg_p0)) / + F(num_customers_ + strength_); } } @@ -303,7 +308,7 @@ class MFCR { }; void Print(std::ostream* out) const { - (*out) << "MFCR(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl; + (*out) << "MFCR<" << Floors << ">(d=" << discount_ << ",strength=" << strength_ << ") customers=" << num_customers_ << std::endl; for (typename std::tr1::unordered_map::const_iterator it = dish_locs_.begin(); it != dish_locs_.end(); ++it) { (*out) << it->first << " (" << it->second.total_dish_count_ << " on " << it->second.table_counts_.size() << " tables): "; @@ -323,7 +328,6 @@ class MFCR { return dish_locs_.end(); } - unsigned num_floors_; unsigned num_tables_; unsigned num_customers_; std::tr1::unordered_map dish_locs_; @@ -340,8 +344,8 @@ class MFCR { double strength_prior_rate_; }; -template -std::ostream& operator<<(std::ostream& o, const MFCR& c) { +template +std::ostream& operator<<(std::ostream& o, const MFCR& c) { c.Print(&o); return o; } diff --git a/utils/mfcr_test.cc b/utils/mfcr_test.cc index 7c45a37c..cc886335 100644 --- a/utils/mfcr_test.cc +++ b/utils/mfcr_test.cc @@ -9,7 +9,7 @@ using namespace std; void test_exch(MT19937* rng) { - MFCR crp(2, 0.5, 3.0); + MFCR<2, int> crp(0.5, 3.0); vector lambdas(2); vector p0s(2); lambdas[0] = 0.2; @@ -22,23 +22,23 @@ void test_exch(MT19937* rng) { double xt = 0; int cust = 10; vector hist(cust + 1, 0), hist2(cust + 1, 0); - for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i 
< cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } const int samples = 100000; const bool simulate = true; for (int k = 0; k < samples; ++k) { if (!simulate) { crp.clear(); - for (int i = 0; i < cust; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < cust; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } } else { int da = rng->next() * cust; bool a = rng->next() < 0.45; if (a) { - for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } xt += 1.0; } else { for (int i = 0; i < da; ++i) { crp.decrement(1, rng); } - for (int i = 0; i < da; ++i) { crp.increment(1, p0s, lambdas, rng); } + for (int i = 0; i < da; ++i) { crp.increment(1, p0s.begin(), lambdas.begin(), rng); } } } int c = crp.num_tables(1); -- cgit v1.2.3 From de34b1493df93169c991a1828f951ca5abc00cae Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 5 Mar 2012 21:36:07 -0500 Subject: tie hyperparameters for translation distributions; support theta < 0 for PYPLM --- gi/pf/align-lexonly-pyp.cc | 13 ++++----- gi/pf/conditional_pseg.h | 68 ++++++++++++++++++++++++++++++++++++---------- gi/pf/pyp_lm.cc | 12 ++++---- utils/ccrp.h | 4 +-- utils/mfcr.h | 19 +++++++++++-- 5 files changed, 84 insertions(+), 32 deletions(-) diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index ac0590e0..13a3a487 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -68,14 +68,14 @@ struct AlignedSentencePair { struct HierarchicalWordBase { explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,1), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} void ResampleHyperparameters(MT19937* rng) { r.resample_hyperparameters(rng); } inline double logp0(const vector& s) const { - return s.size() * u0; + return Md::log_poisson(s.size(), 7.5) + s.size() * u0; } // return p0 of rule.e_ @@ -106,7 +106,7 @@ struct HierarchicalWordBase { void Summary() const { cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables)" << TD::GetString(it->first) << endl; + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; } prob_t base; @@ -167,10 +167,9 @@ struct BasicLexicalAlignment { } void ResampleHyperparemeters() { - cerr << " LLH_prev = " << Likelihood() << flush; tmodel.ResampleHyperparameters(&*prng); up0.ResampleHyperparameters(&*prng); - cerr << "\tLLH_post = " << Likelihood() << endl; + cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; } void ResampleCorpus(); @@ -218,7 +217,7 @@ void BasicLexicalAlignment::ResampleCorpus() { up0.Increment(r); } } - cerr << " LLH = " << tmodel.Likelihood() << endl; + cerr << " LLH = " << Likelihood() << endl; } void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { @@ -311,7 +310,7 @@ int main(int argc, char** argv) { for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 10 == 
0) x.ResampleHyperparemeters(); + if (i % 7 == 6) x.ResampleHyperparemeters(); x.ResampleCorpus(); if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index ef73e332..8202778b 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -17,21 +17,66 @@ template struct MConditionalTranslationModel { explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) : - rp0(rcp0), lambdas(1, prob_t::One()), p0s(1) {} + rp0(rcp0), d(0.5), strength(1.0), lambdas(1, prob_t::One()), p0s(1) {} void Summary() const { std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { std::cerr << TD::GetString(it->first) << " \t(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << std::endl; for (MFCR<1,TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) - std::cerr << " " << -1 << '\t' << i2->first << std::endl; + std::cerr << " " << i2->second.total_dish_count_ << '\t' << i2->first << std::endl; } } + double log_likelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(dd, aa); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_discount) const { + return m_.log_likelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const MConditionalTranslationModel& m) : m_(m) {} + const MConditionalTranslationModel& m_; + double operator()(const double& proposed_strength) const { + return m_.log_likelihood(m_.d, proposed_strength); + } + }; + void ResampleHyperparameters(MT19937* rng) { - for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) - it->second.resample_hyperparameters(rng); - } + const unsigned nloop = 5; + const unsigned niterations = 10; + DiscountResampler dr(*this); + AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; + std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { + it->second.set_discount(d); + it->second.set_strength(strength); + } + } int DecrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); @@ -46,7 +91,7 @@ struct MConditionalTranslationModel { int IncrementRule(const TRule& rule, MT19937* rng) 
{ RuleModelHash::iterator it = r.find(rule.f_); if (it == r.end()) { - it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; } p0s[0] = rp0(rule); TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); @@ -66,15 +111,7 @@ struct MConditionalTranslationModel { } prob_t Likelihood() const { - prob_t p = prob_t::One(); -#if 0 - for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { - prob_t q; q.logeq(it->second.log_crp_prob()); - p *= q; - for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) - p *= rp0(i2->first); - } -#endif + prob_t p; p.logeq(log_likelihood(d, strength)); return p; } @@ -83,6 +120,7 @@ struct MConditionalTranslationModel { MFCR<1, TRule>, boost::hash > > RuleModelHash; RuleModelHash r; + double d, strength; std::vector lambdas; mutable std::vector p0s; }; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 7ebada13..104f356b 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -18,7 +18,7 @@ // I use templates to handle the recursive formalation of the prior, so // the order of the model has to be specified here, at compile time: -#define kORDER 3 +#define kORDER 4 using namespace std; using namespace tr1; @@ -114,7 +114,7 @@ template struct PYPLM { if (aa <= -dd) return -std::numeric_limits::infinity(); //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); double llh = Md::log_beta_density(dd, discount_a, discount_b) + - Md::log_gamma_density(aa, strength_s, strength_r); + Md::log_gamma_density(aa + dd, strength_s, strength_r); typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) llh += it->second.log_crp_prob(dd, aa); @@ -141,12 +141,14 @@ template struct PYPLM { DiscountResampler dr(*this); AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { - strength = slice_sampler1d(ar, strength, *rng, 0.0, + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - d = slice_sampler1d(dr, d, *rng, std::numeric_limits::min(), + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } - strength = slice_sampler1d(ar, strength, *rng, 0.0, + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); typename unordered_map, CCRP, boost::hash > >::iterator it; cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl; diff --git a/utils/ccrp.h b/utils/ccrp.h index e24130ac..439d7e1e 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -225,12 +225,12 @@ class CCRP { StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { if (has_strength_prior()) { - strength_ = slice_sampler1d(sr, strength_, *rng, -discount_, + strength_ = slice_sampler1d(sr, strength_, *rng, -discount_ + std::numeric_limits::min(), std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); } if (has_discount_prior()) { double min_discount = std::numeric_limits::min(); - if (strength_ < 0.0) min_discount = -strength_; + if (strength_ < 0.0) min_discount -= strength_; discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 
100*niterations); } diff --git a/utils/mfcr.h b/utils/mfcr.h index 6cc0ebf1..886f01ef 100644 --- a/utils/mfcr.h +++ b/utils/mfcr.h @@ -48,7 +48,7 @@ class MFCR { discount_prior_strength_(std::numeric_limits::quiet_NaN()), discount_prior_beta_(std::numeric_limits::quiet_NaN()), strength_prior_shape_(std::numeric_limits::quiet_NaN()), - strength_prior_rate_(std::numeric_limits::quiet_NaN()) {} + strength_prior_rate_(std::numeric_limits::quiet_NaN()) { check_hyperparameters(); } MFCR(double discount_strength, double discount_beta, double strength_shape, double strength_rate, double d = 0.9, double strength = 10.0) : num_tables_(), @@ -58,10 +58,23 @@ class MFCR { discount_prior_strength_(discount_strength), discount_prior_beta_(discount_beta), strength_prior_shape_(strength_shape), - strength_prior_rate_(strength_rate) {} + strength_prior_rate_(strength_rate) { check_hyperparameters(); } + + void check_hyperparameters() { + if (discount_ < 0.0 || discount_ >= 1.0) { + std::cerr << "Bad discount: " << discount_ << std::endl; + abort(); + } + if (strength_ <= -discount_) { + std::cerr << "Bad strength: " << strength_ << " (discount=" << discount_ << ")" << std::endl; + abort(); + } + } double discount() const { return discount_; } double strength() const { return strength_; } + void set_discount(double d) { discount_ = d; check_hyperparameters(); } + void set_strength(double a) { strength_ = a; check_hyperparameters(); } bool has_discount_prior() const { return !std::isnan(discount_prior_strength_); @@ -275,7 +288,7 @@ class MFCR { } if (has_discount_prior()) { double min_discount = std::numeric_limits::min(); - if (strength_ < 0.0) min_discount = -strength_; + if (strength_ < 0.0) min_discount -= strength_; discount_ = slice_sampler1d(dr, discount_, *rng, min_discount, 1.0, 0.0, niterations, 100*niterations); } -- cgit v1.2.3 From 27e0de58bf49a4fc74bbf58718d1b89525a154a6 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 6 Mar 2012 23:20:16 -0500 Subject: a few statistical helpers i'm using to figure some algorithms out --- utils/m.h | 45 +++++++++++++++++++++++++++++++++++++++++++++ utils/m_test.cc | 16 ++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/utils/m.h b/utils/m.h index 5e45efee..dc881b36 100644 --- a/utils/m.h +++ b/utils/m.h @@ -4,6 +4,10 @@ #include #include #include +#include + +// TODO right now I sometimes assert that x is in the support of the distributions +// should be configurable to return -inf instead template struct M { @@ -59,6 +63,47 @@ struct M { return (alpha-1)*std::log(x)+(beta-1)*std::log(1-x) - log_beta_fn(alpha, beta); } + // support x \in R + static inline F log_laplace_density(const F& x, const F& mu, const F& b) { + assert(b > 0.0); + return -std::log(2*b) - std::fabs(x - mu) / b; + } + + // support x \in R + // this is NOT the "log normal" density, it is the log of the "normal density at x" + static inline F log_gaussian_density(const F& x, const F& mu, const F& var) { + assert(var > 0.0); + return -0.5 * std::log(var * 2 * boost::math::constants::pi()) - (x - mu)*(x - mu) / (2 * var); + } + + // (x1,x2) \in R^2 + // parameterized in terms of two means, a two "variances", a correlation < 1 + static inline F log_bivariate_gaussian_density(const F& x1, const F& x2, + const F& mu1, const F& mu2, + const F& var1, const F& var2, + const F& cor) { + assert(var1 > 0); + assert(var2 > 0); + assert(std::fabs(cor) < 1.0); + const F cor2 = cor*cor; + const F var1var22 = var1 * var2; + const F Z = 0.5 * std::log(var1var22 * (1 - cor2)) + std::log(2 * 
boost::math::constants::pi()); + return -Z -1.0 / (2 * (1 - cor2)) * ((x1 - mu1)*(x1-mu1) / var1 + (x2-mu2)*(x2-mu2) / var2 - 2*cor*(x1 - mu1)*(x2-mu2) / std::sqrt(var1var22)); + } + + // support x \in [a,b] + static inline F log_triangle_density(const F& x, const F& a, const F& b, const F& c) { + assert(a < b); + assert(a <= c); + assert(c <= b); + assert(x >= a); + assert(x <= b); + if (x <= c) + return std::log(2) + std::log(x - a) - std::log(b - a) - std::log(c - a); + else + return std::log(2) + std::log(b - x) - std::log(b - a) - std::log(b - c); + } + // note: this has been adapted so that 0 is in the support of the distribution // support [0, 1, 2 ...) static inline F log_yule_simon(unsigned x, const F& rho) { diff --git a/utils/m_test.cc b/utils/m_test.cc index fca8f895..c4d6a166 100644 --- a/utils/m_test.cc +++ b/utils/m_test.cc @@ -14,6 +14,22 @@ class MTest : public testing::Test { virtual void TearDown() { } }; +TEST_F(MTest, Densities) { + double px1 = Md::log_gaussian_density(1.0, 0.0, 1.0); + double px2 = Md::log_gaussian_density(-1.0, 0.0, 1.0); + double py1 = Md::log_laplace_density(1.0, 0.0, 1.0); + double py2 = Md::log_laplace_density(1.0, 0.0, 1.0); + double pz1 = Md::log_triangle_density(1.0, -2.0, 2.0, 0.0); + double pz2 = Md::log_triangle_density(1.0, -2.0, 2.0, 0.0); + cerr << px1 << " " << py1 << " " << pz2 << endl; + EXPECT_FLOAT_EQ(px1, px2); + EXPECT_FLOAT_EQ(py1, py2); + EXPECT_FLOAT_EQ(pz1, pz2); + double b1 = Md::log_bivariate_gaussian_density(1.0, -1.0, 0.0, 0.0, 1.0, 1.0, -0.8); + double b2 = Md::log_bivariate_gaussian_density(-1.0, 1.0, 0.0, 0.0, 1.0, 1.0, -0.8); + cerr << b1 << " " << b2 << endl; +} + TEST_F(MTest, Poisson) { double prev = 1.0; double tot = 0; -- cgit v1.2.3 From 4f19cd0c9a729cfd59d186492b3035c168f5e58f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 7 Mar 2012 09:46:54 -0500 Subject: configure order of n-gram features --- decoder/ff_ngrams.cc | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) diff --git a/decoder/ff_ngrams.cc b/decoder/ff_ngrams.cc index 04dd1906..d6d79f5e 100644 --- a/decoder/ff_ngrams.cc +++ b/decoder/ff_ngrams.cc @@ -57,6 +57,39 @@ namespace { } } +static bool ParseArgs(string const& in, bool* explicit_markers, unsigned* order) { + vector const& argv=SplitOnWhitespace(in); + *explicit_markers = false; + *order = 3; +#define LMSPEC_NEXTARG if (i==argv.end()) { \ + cerr << "Missing argument for "<<*last<<". 
"; goto usage; \ + } else { ++i; } + + for (vector::const_iterator last,i=argv.begin(),e=argv.end();i!=e;++i) { + string const& s=*i; + if (s[0]=='-') { + if (s.size()>2) goto fail; + switch (s[1]) { + case 'x': + *explicit_markers = true; + break; + case 'o': + LMSPEC_NEXTARG; *order=atoi((*i).c_str()); + break; +#undef LMSPEC_NEXTARG + default: + fail: + cerr<<"Unknown option on NgramFeatures "<")) , add_sos_eos_(!explicit_markers) { - order_ = 3; + order_ = order; state_size_ = (order_ - 1) * sizeof(WordID) + 2 + (order_ - 1) * sizeof(WordID); unscored_size_offset_ = (order_ - 1) * sizeof(WordID); is_complete_offset_ = unscored_size_offset_ + 1; @@ -316,8 +349,10 @@ class NgramDetectorImpl { NgramDetector::NgramDetector(const string& param) { string filename, mapfile, featname; - bool explicit_markers = (param == "-x"); - pimpl_ = new NgramDetectorImpl(explicit_markers); + bool explicit_markers = false; + unsigned order = 3; + ParseArgs(param, &explicit_markers, &order); + pimpl_ = new NgramDetectorImpl(explicit_markers, order); SetStateSize(pimpl_->ReserveStateSize()); } -- cgit v1.2.3 From 63ea78b71bf913be248b064219734cac5ce41be2 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 7 Mar 2012 18:54:11 +0000 Subject: better logging with MPI --- training/mpi_flex_optimize.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc index 00746532..d98ea4dc 100644 --- a/training/mpi_flex_optimize.cc +++ b/training/mpi_flex_optimize.cc @@ -205,7 +205,7 @@ int main(int argc, char** argv) { const int size = 1; const int rank = 0; #endif - if (size > 0) SetSilent(true); // turn off verbose decoder output + if (size > 1) SetSilent(true); // turn off verbose decoder output register_feature_functions(); MT19937* rng = NULL; @@ -343,7 +343,7 @@ int main(int argc, char** argv) { double obj = 0; #ifdef HAVE_MPI - // TODO obj + reduce(world, local_obj, obj, std::plus(), 0); reduce(world, local_grad, g, std::plus >(), 0); #else obj = local_obj; -- cgit v1.2.3 From fdeb2267eb843ef80b5f2b95234a72c9c3333bbe Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 7 Mar 2012 20:13:58 +0000 Subject: more mpi fixes --- training/mpi_flex_optimize.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/training/mpi_flex_optimize.cc b/training/mpi_flex_optimize.cc index d98ea4dc..a9197208 100644 --- a/training/mpi_flex_optimize.cc +++ b/training/mpi_flex_optimize.cc @@ -272,6 +272,7 @@ int main(int argc, char** argv) { int iter = -1; bool converged = false; + vector gg; while (!converged) { #ifdef HAVE_MPI mpi::timer timer; @@ -354,13 +355,14 @@ int main(int argc, char** argv) { // g /= (size_per_proc * size); if (!o) o.reset(new LBFGSOptimizer(FD::NumFeats(), lbfgs_memory_buffers)); - vector gg(FD::NumFeats()); + gg.clear(); + gg.resize(FD::NumFeats()); if (gg.size() != cur_weights.size()) { cur_weights.resize(gg.size()); } for (SparseVector::const_iterator it = g.begin(); it != g.end(); ++it) if (it->first) { gg[it->first] = it->second; } g.clear(); double r = ApplyRegularizationTerms(regularization_strength, - time_series_strength * (iter == 0 ? 0.0 : 1.0), + time_series_strength, // * (iter == 0 ? 
0.0 : 1.0), cur_weights, prev_weights, &gg); @@ -375,10 +377,9 @@ int main(int argc, char** argv) { o->Optimize(obj, gg, &cur_weights); } #ifdef HAVE_MPI - // broadcast(world, x, 0); + broadcast(world, cur_weights, 0); broadcast(world, converged, 0); world.barrier(); - if (rank == 0) { cerr << " ELAPSED TIME THIS ITERATION=" << timer.elapsed() << endl; } #endif } prev_weights = cur_weights; -- cgit v1.2.3 From 7fd9fe26f00cf31a7b407364399d37b4eaf04eba Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 7 Mar 2012 20:25:53 -0500 Subject: lattice builder --- gi/pf/Makefile.am | 7 +- gi/pf/align-tl.cc | 334 ++++++++++++++++++++++++++++++++++++++++++++++ gi/pf/conditional_pseg.h | 11 +- gi/pf/nuisance_test.cc | 161 ++++++++++++++++++++++ gi/pf/transliterations.cc | 193 +++++++++++++++++++++++++++ gi/pf/transliterations.h | 20 +++ 6 files changed, 723 insertions(+), 3 deletions(-) create mode 100644 gi/pf/align-tl.cc create mode 100644 gi/pf/nuisance_test.cc create mode 100644 gi/pf/transliterations.cc create mode 100644 gi/pf/transliterations.h diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 7cf9c14d..5e89f02a 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,12 +1,17 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a + libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +nuisance_test_SOURCES = nuisance_test.cc transliterations.cc + align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_tl_SOURCES = align-tl.cc transliterations.cc + itg_SOURCES = itg.cc pyp_lm_SOURCES = pyp_lm.cc diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc new file mode 100644 index 00000000..0e0454e5 --- /dev/null +++ b/gi/pf/align-tl.cc @@ -0,0 +1,334 @@ +#include +#include +#include + +#include +#include +#include + +#include "array2d.h" +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "trule.h" +#include "tdict.h" +#include "stringlib.h" +#include "filelib.h" +#include "dict.h" +#include "sampler.h" +#include "mfcr.h" +#include "corpus.h" +#include "ngram_base.h" +#include "transliterations.h" + +using namespace std; +using namespace tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("samples,s",po::value()->default_value(1000),"Number of samples") + ("input,i",po::value(),"Read parallel data from") + ("random_seed,S",po::value(), "Random seed"); + po::options_description clo("Command line options"); + clo.add_options() + ("config", po::value(), "Configuration file") + ("help,h", "Print this help message and exit"); + po::options_description dconfig_options, dcmdline_options; + dconfig_options.add(opts); + dcmdline_options.add(opts).add(clo); + + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("config")) { + ifstream config((*conf)["config"].as().c_str()); + po::store(po::parse_config_file(config, dconfig_options), *conf); + } + po::notify(*conf); + + if (conf->count("help") || (conf->count("input") == 0)) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +shared_ptr prng; + +struct LexicalAlignment { + unsigned 
char src_index; + bool is_transliteration; + vector > derivation; +}; + +struct AlignedSentencePair { + vector src; + vector trg; + vector a; + Array2D posterior; +}; + +struct HierarchicalWordBase { + explicit HierarchicalWordBase(const unsigned vocab_e_size) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} + + void ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + } + + inline double logp0(const vector& s) const { + return Md::log_poisson(s.size(), 7.5) + s.size() * u0; + } + + // return p0 of rule.e_ + prob_t operator()(const TRule& rule) const { + v[0].logeq(logp0(rule.e_)); + return r.prob(rule.e_, v.begin(), l.begin()); + } + + void Increment(const TRule& rule) { + v[0].logeq(logp0(rule.e_)); + if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { + base *= v[0] * l[0]; + } + } + + void Decrement(const TRule& rule) { + if (r.decrement(rule.e_, &*prng).count) { + base /= prob_t(exp(logp0(rule.e_))); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const { + cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; + } + + prob_t base; + MFCR<1,vector > r; + const double u0; + const vector l; + mutable vector v; +}; + +struct BasicLexicalAlignment { + explicit BasicLexicalAlignment(const vector >& lets, + const unsigned words_e, + const unsigned letters_e, + vector* corp) : + letters(lets), + corpus(*corp), + //up0(words_e), + //up0("en.chars.1gram", letters_e), + //up0("en.words.1gram"), + up0(letters_e), + //up0("en.chars.2gram"), + tmodel(up0) { + } + + void InstantiateRule(const WordID src, + const WordID trg, + TRule* rule) const { + static const WordID kX = TD::Convert("X") * -1; + rule->lhs_ = kX; + rule->e_ = letters[trg]; + rule->f_ = letters[src]; + } + + void InitializeRandom() { + const WordID kNULL = TD::Convert("NULL"); + cerr << "Initializing with random alignments ...\n"; + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + asp.a.resize(asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + const unsigned char a_j = prng->next() * (1 + asp.src.size()); + const WordID f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); + TRule r; + InstantiateRule(f_a_j, asp.trg[j], &r); + asp.a[j].is_transliteration = false; + asp.a[j].src_index = a_j; + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; + } + + prob_t Likelihood() const { + prob_t p = tmodel.Likelihood(); + p *= up0.Likelihood(); + return p; + } + + void ResampleHyperparemeters() { + tmodel.ResampleHyperparameters(&*prng); + up0.ResampleHyperparameters(&*prng); + cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + } + + void ResampleCorpus(); + + const vector >& letters; // spelling dictionary + vector& corpus; + //PhraseConditionalUninformativeBase up0; + //PhraseConditionalUninformativeUnigramBase up0; + //UnigramWordBase up0; + //HierarchicalUnigramBase up0; + HierarchicalWordBase up0; + //CompletelyUniformBase up0; + //FixedNgramBase up0; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + MConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; + //ConditionalTranslationModel tmodel; +}; + +void BasicLexicalAlignment::ResampleCorpus() { + static const WordID kNULL = TD::Convert("NULL"); + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + TRule r; + unsigned char& a_j = asp.a[j].src_index; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.DecrementRule(r, &*prng)) + up0.Decrement(r); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + InstantiateRule(prop_f, asp.trg[j], &r); + ss[prop_a_j] = tmodel.RuleProbability(r); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); + InstantiateRule(f_a_j, asp.trg[j], &r); + if (tmodel.IncrementRule(r, &*prng)) + up0.Increment(r); + } + } + cerr << " LLH = " << Likelihood() << endl; +} + +void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { + for (set::const_iterator it = v.begin(); it != v.end(); ++it) { + vector& letters = (*l)[*it]; + if (letters.size()) continue; // if e and f have the same word + + const string& w = TD::Convert(*it); + + size_t cur = 0; + while (cur < w.size()) { + const size_t len = UTF8Len(w[cur]); + letters.push_back(TD::Convert(w.substr(cur, len))); + if (letset) letset->insert(letters.back()); + cur += len; + } + } +} + +void Debug(const AlignedSentencePair& asp) { + cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; + Array2D a(asp.src.size(), asp.trg.size()); + for (unsigned j = 0; j < asp.trg.size(); ++j) + if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + cerr << a << endl; +} + +void AddSample(AlignedSentencePair* asp) { + for (unsigned j = 0; j < asp->trg.size(); ++j) + asp->posterior(asp->a[j].src_index, j)++; +} + +void WriteAlignments(const AlignedSentencePair& asp) { + bool first = true; + for (unsigned j = 0; j < asp.trg.size(); ++j) { + int src_index = -1; + int mc = -1; + for (unsigned i = 0; i <= asp.src.size(); ++i) { + if (asp.posterior(i, j) > mc) { + mc = asp.posterior(i, j); + src_index = i; + } + } + + if (src_index) { + if (first) first = false; else cout << ' '; + cout << (src_index - 1) << '-' << j; + } + } + cout << endl; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + + if (conf.count("random_seed")) + prng.reset(new MT19937(conf["random_seed"].as())); + else + prng.reset(new MT19937); +// MT19937& rng = *prng; + + vector > corpuse, corpusf; + set vocabe, vocabf; + corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); + cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; + cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; + cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; + assert(corpusf.size() == corpuse.size()); + + vector corpus(corpuse.size()); + for (unsigned i = 0; i < corpuse.size(); ++i) { + corpus[i].src.swap(corpusf[i]); + corpus[i].trg.swap(corpuse[i]); + corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); + } + corpusf.clear(); corpuse.clear(); + + vocabf.insert(TD::Convert("NULL")); + vector > letters(TD::NumWords()); + set letset; + ExtractLetters(vocabe, &letters, &letset); + ExtractLetters(vocabf, &letters, NULL); + letters[TD::Convert("NULL")].clear(); + + Transliterations tl; + + // TODO CONFIGURE THIS + int min_trans_src = 4; + + cerr << "Initializing transliteration DPs ...\n"; + for (int i = 0; i < corpus.size(); ++i) { + const vector& src = corpus[i].src; + const vector& trg = corpus[i].trg; + cerr << '.' 
<< flush; + if (i % 80 == 79) cerr << endl; + for (int j = 0; j < src.size(); ++j) { + const vector& src_let = letters[src[j]]; + for (int k = 0; k < trg.size(); ++k) { + const vector& trg_let = letters[trg[k]]; + if (src_let.size() < min_trans_src) + tl.Forbid(src[j], trg[k]); + else + tl.Initialize(src[j], src_let, trg[k], trg_let); + } + } + } + cerr << endl; + tl.GraphSummary(); + + return 0; +} diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h index 8202778b..81ddb206 100644 --- a/gi/pf/conditional_pseg.h +++ b/gi/pf/conditional_pseg.h @@ -56,6 +56,12 @@ struct MConditionalTranslationModel { }; void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; +#if 1 + for (it = r.begin(); it != r.end(); ++it) { + it->second.resample_hyperparameters(rng); + } +#else const unsigned nloop = 5; const unsigned niterations = 10; DiscountResampler dr(*this); @@ -70,12 +76,12 @@ struct MConditionalTranslationModel { } strength = slice_sampler1d(ar, strength, *rng, -d, std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - typename std::tr1::unordered_map, MFCR<1,TRule>, boost::hash > >::iterator it; std::cerr << "MConditionalTranslationModel(d=" << d << ",s=" << strength << ") = " << log_likelihood(d, strength) << std::endl; for (it = r.begin(); it != r.end(); ++it) { it->second.set_discount(d); it->second.set_strength(strength); } +#endif } int DecrementRule(const TRule& rule, MT19937* rng) { @@ -91,7 +97,8 @@ struct MConditionalTranslationModel { int IncrementRule(const TRule& rule, MT19937* rng) { RuleModelHash::iterator it = r.find(rule.f_); if (it == r.end()) { - it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + //it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(d, strength))).first; + it = r.insert(make_pair(rule.f_, MFCR<1,TRule>(1,1,1,1,0.6, -0.12))).first; } p0s[0] = rp0(rule); TableCount delta = it->second.increment(rule, p0s.begin(), lambdas.begin(), rng); diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc new file mode 100644 index 00000000..0f44fe95 --- /dev/null +++ b/gi/pf/nuisance_test.cc @@ -0,0 +1,161 @@ +#include "ccrp.h" + +#include +#include + +#include "tdict.h" +#include "transliterations.h" + +using namespace std; + +MT19937 rng; + +ostream& operator<<(ostream&os, const vector& v) { + os << '[' << v[0]; + if (v.size() == 2) os << ' ' << v[1]; + return os << ']'; +} + +struct Base { + Base() : llh(), v(2), v1(1), v2(1), crp(0.25, 0.5) {} + inline double p0(const vector& x) const { + double p = 0.75; + if (x.size() == 2) p = 0.25; + p *= 1.0 / 3.0; + if (x.size() == 2) p *= 1.0 / 3.0; + return p; + } + double est_deriv_prob(int a, int b, int seg) const { + assert(a > 0 && a < 4); // a \in {1,2,3} + assert(b > 0 && b < 4); // b \in {1,2,3} + assert(seg == 0 || seg == 1); // seg \in {0,1} + if (seg == 0) { + v[0] = a; + v[1] = b; + return crp.prob(v, p0(v)); + } else { + v1[0] = a; + v2[0] = b; + return crp.prob(v1, p0(v1)) * crp.prob(v2, p0(v2)); + } + } + double est_marginal_prob(int a, int b) const { + return est_deriv_prob(a,b,0) + est_deriv_prob(a,b,1); + } + int increment(int a, int b, double* pw = NULL) { + double p1 = est_deriv_prob(a, b, 0); + double p2 = est_deriv_prob(a, b, 1); + //p1 = 0.5; p2 = 0.5; + int seg = rng.SelectSample(p1,p2); + double tmp = 0; + if (!pw) pw = &tmp; + double& w = *pw; + if (seg == 0) { + v[0] = a; + v[1] = b; + w = crp.prob(v, p0(v)) / p1; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + 
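// seg == 1: instead of a single length-2 customer, a and b are added as two separate
// length-1 customers of the same CRP; the importance weight w is built up as
// crp.prob(v1)/p2 and then multiplied by crp.prob(v2), the latter evaluated after v1
// has already been incremented.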
v1[0] = a; + w = crp.prob(v1, p0(v1)) / p2; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + w *= crp.prob(v2, p0(v2)); + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + return seg; + } + void increment(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.increment(v, p0(v), &rng)) { + llh += log(p0(v)); + } + } else { + v1[0] = a; + if (crp.increment(v1, p0(v1), &rng)) { + llh += log(p0(v1)); + } + v2[0] = b; + if (crp.increment(v2, p0(v2), &rng)) { + llh += log(p0(v2)); + } + } + } + void decrement(int a, int b, int seg) { + if (seg == 0) { + v[0] = a; + v[1] = b; + if (crp.decrement(v, &rng)) { + llh -= log(p0(v)); + } + } else { + v1[0] = a; + if (crp.decrement(v1, &rng)) { + llh -= log(p0(v1)); + } + v2[0] = b; + if (crp.decrement(v2, &rng)) { + llh -= log(p0(v2)); + } + } + } + double log_likelihood() const { + return llh + crp.log_crp_prob(); + } + double llh; + mutable vector v, v1, v2; + CCRP > crp; +}; + +int main(int argc, char** argv) { + double tl = 0; + const int ITERS = 1000; + const int PARTICLES = 20; + const int DATAPOINTS = 50; + WordID x = TD::Convert("souvenons"); + WordID y = TD::Convert("remember"); + vector src; TD::ConvertSentence("s o u v e n o n s", &src); + vector trg; TD::ConvertSentence("r e m e m b e r", &trg); + Transliterations xx; + xx.Initialize(x, src, y, trg); + return 1; + + for (int j = 0; j < ITERS; ++j) { + Base b; + vector segs(DATAPOINTS); + SampleSet ss; + vector sss; + for (int i = 0; i < DATAPOINTS; i++) { + ss.clear(); + sss.clear(); + int x = ((i / 10) % 3) + 1; + int y = (i % 3) + 1; + //double ep = b.est_marginal_prob(x,y); + //cerr << "est p(" << x << "," << y << ") = " << ep << endl; + for (int n = 0; n < PARTICLES; ++n) { + double w; + int seg = b.increment(x,y,&w); + //cerr << seg << " w=" << w << endl; + ss.add(w); + sss.push_back(seg); + b.decrement(x,y,seg); + } + int seg = sss[rng.SelectSample(ss)]; + b.increment(x, y, seg); + //cerr << "Selected: " << seg << endl; + //return 1; + segs[i] = seg; + } + tl += b.log_likelihood(); + } + cerr << "LLH=" << tl / ITERS << endl; +} + diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc new file mode 100644 index 00000000..6e0c2e93 --- /dev/null +++ b/gi/pf/transliterations.cc @@ -0,0 +1,193 @@ +#include "transliterations.h" + +#include +#include +#include + +#include "grammar.h" +#include "bottom_up_parser.h" +#include "hg.h" +#include "hg_intersect.h" +#include "filelib.h" +#include "ccrp.h" +#include "m.h" +#include "lattice.h" +#include "verbose.h" + +using namespace std; +using namespace std::tr1; + +static WordID kX; +static int kMAX_SRC_SIZE = 0; +static vector > cur_trg_chunks; + +vector tlttofreelist; + +static void InitTargetChunks(int max_size, const vector& trg) { + cur_trg_chunks.clear(); + vector tmp; + unordered_set, boost::hash > > u; + for (int len = 1; len <= max_size; ++len) { + int end = trg.size() + 1; + end -= len; + for (int i = 0; i < end; ++i) { + tmp.clear(); + for (int j = 0; j < len; ++j) + tmp.push_back(trg[i + j]); + if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp); + } + } +} + +struct TransliterationGrammarIter : public GrammarIter, public RuleBin { + TransliterationGrammarIter() { tlttofreelist.push_back(this); } + TransliterationGrammarIter(const TRulePtr& inr, int symbol) { + if (inr) { + r.reset(new TRule(*inr)); + } else { + r.reset(new TRule); + } + TRule& rr = *r; + rr.lhs_ = kX; + rr.f_.push_back(symbol); + tlttofreelist.push_back(this); + } + virtual 
int GetNumRules() const { + if (!r) return 0; + return cur_trg_chunks.size(); + } + virtual TRulePtr GetIthRule(int i) const { + TRulePtr nr(new TRule(*r)); + nr->e_ = cur_trg_chunks[i]; + //cerr << nr->AsString() << endl; + return nr; + } + virtual int Arity() const { + return 0; + } + virtual const RuleBin* GetRules() const { + if (!r) return NULL; else return this; + } + virtual const GrammarIter* Extend(int symbol) const { + if (symbol <= 0) return NULL; + if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE) + return new TransliterationGrammarIter(r, symbol); + else + return NULL; + } + TRulePtr r; +}; + +struct TransliterationGrammar : public Grammar { + virtual const GrammarIter* GetRoot() const { + return new TransliterationGrammarIter; + } + virtual bool HasRuleForSpan(int, int, int distance) const { + return (distance < kMAX_SRC_SIZE); + } +}; + +struct TInfo { + TInfo() : initialized(false) {} + bool initialized; + Hypergraph lattice; // may be empty if transliteration is not possible + prob_t est_prob; // will be zero if not possible +}; + +struct TransliterationsImpl { + TransliterationsImpl() { + kX = TD::Convert("X")*-1; + kMAX_SRC_SIZE = 4; + grammars.push_back(GrammarPtr(new TransliterationGrammar)); + grammars.push_back(GrammarPtr(new GlueGrammar("S", "X"))); + SetSilent(true); + } + + void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + if (src >= graphs.size()) graphs.resize(src + 1); + if (graphs[src][trg].initialized) return; + int kMAX_TRG_SIZE = 4; + InitTargetChunks(kMAX_TRG_SIZE, trg_lets); + ExhaustiveBottomUpParser parser("S", grammars); + Lattice lat(src_lets.size()), tlat(trg_lets.size()); + for (unsigned i = 0; i < src_lets.size(); ++i) + lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1)); + for (unsigned i = 0; i < trg_lets.size(); ++i) + tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1)); + //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl; + //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl; + if (!parser.Parse(lat, &graphs[src][trg].lattice)) { + //cerr << "Failed to parse " << TD::GetString(src_lets) << endl; + abort(); + } + if (HG::Intersect(tlat, &graphs[src][trg].lattice)) { + graphs[src][trg].est_prob = prob_t(1e-4); + } else { + graphs[src][trg].lattice.clear(); + //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl; + graphs[src][trg].est_prob = prob_t::Zero(); + } + for (unsigned i = 0; i < tlttofreelist.size(); ++i) + delete tlttofreelist[i]; + tlttofreelist.clear(); + //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; + graphs[src][trg].initialized = true; + } + + const prob_t& EstimateProbability(WordID src, WordID trg) const { + assert(src < graphs.size()); + const unordered_map& um = graphs[src]; + const unordered_map::const_iterator it = um.find(trg); + assert(it != um.end()); + assert(it->second.initialized); + return it->second.est_prob; + } + + void Forbid(WordID src, WordID trg) { + if (src >= graphs.size()) graphs.resize(src + 1); + graphs[src][trg].est_prob = prob_t::Zero(); + graphs[src][trg].initialized = true; + } + + void GraphSummary() const { + double tlp = 0; + int tt = 0; + for (int i = 0; i < graphs.size(); ++i) { + const unordered_map& um = graphs[i]; + unordered_map::const_iterator it; + for (it = um.begin(); it != um.end(); ++it) { + if (it->second.lattice.empty()) continue; + //cerr << TD::Convert(i) << " --> " << 
TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl; + tlp += log(it->second.lattice.NumberOfPaths()); + tt++; + } + } + tlp /= tt; + cerr << "E[log paths] = " << tlp << endl; + cerr << "exp(E[log paths]) = " << exp(tlp) << endl; + } + + vector > graphs; + vector grammars; +}; + +Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} +Transliterations::~Transliterations() { delete pimpl_; } + +void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + pimpl_->Initialize(src, src_lets, trg, trg_lets); +} + +prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const { + return pimpl_->EstimateProbability(src,trg); +} + +void Transliterations::Forbid(WordID src, WordID trg) { + pimpl_->Forbid(src, trg); +} + +void Transliterations::GraphSummary() const { + pimpl_->GraphSummary(); +} + + diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h new file mode 100644 index 00000000..a548aacf --- /dev/null +++ b/gi/pf/transliterations.h @@ -0,0 +1,20 @@ +#ifndef _TRANSLITERATIONS_H_ +#define _TRANSLITERATIONS_H_ + +#include +#include "wordid.h" +#include "prob.h" + +struct TransliterationsImpl; +struct Transliterations { + explicit Transliterations(); + ~Transliterations(); + void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); + void Forbid(WordID src, WordID trg); + void GraphSummary() const; + prob_t EstimateProbability(WordID src, WordID trg) const; + TransliterationsImpl* pimpl_; +}; + +#endif + -- cgit v1.2.3 From e2998fc79c9dd549b1c1bad537fdf1052276f82c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 01:46:32 -0500 Subject: simple context feature for tagger --- decoder/Makefile.am | 1 + decoder/cdec_ff.cc | 2 + decoder/ff_context.cc | 99 +++++++++++++++++++++++ decoder/ff_context.h | 23 ++++++ gi/pf/align-tl.cc | 6 +- gi/pf/reachability.cc | 2 + gi/pf/reachability.h | 6 +- gi/pf/transliterations.cc | 198 ++++++++++++++-------------------------------- gi/pf/transliterations.h | 5 +- 9 files changed, 194 insertions(+), 148 deletions(-) create mode 100644 decoder/ff_context.cc create mode 100644 decoder/ff_context.h diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 30eaf04d..a00b18af 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -63,6 +63,7 @@ libcdec_a_SOURCES = \ ff.cc \ ff_rules.cc \ ff_wordset.cc \ + ff_context.cc \ ff_charset.cc \ ff_lm.cc \ ff_klm.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 4ce5749e..b516c386 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -1,6 +1,7 @@ #include #include "ff.h" +#include "ff_context.h" #include "ff_spans.h" #include "ff_lm.h" #include "ff_klm.h" @@ -42,6 +43,7 @@ void register_feature_functions() { #endif ff_registry.Register("SpanFeatures", new FFFactory()); ff_registry.Register("NgramFeatures", new FFFactory()); + ff_registry.Register("RuleContextFeatures", new FFFactory()); ff_registry.Register("RuleIdentityFeatures", new FFFactory()); ff_registry.Register("SourceSyntaxFeatures", new FFFactory); ff_registry.Register("SourceSpanSizeFeatures", new FFFactory); diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc new file mode 100644 index 00000000..19f9a413 --- /dev/null +++ b/decoder/ff_context.cc @@ -0,0 +1,99 @@ +#include "ff_context.h" + +#include +#include +#include + +#include "filelib.h" +#include "stringlib.h" +#include "sentence_metadata.h" +#include "lattice.h" +#include "fdict.h" +#include "verbose.h" 
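// This file implements the "RuleContextFeatures" feature function registered in
// cdec_ff.cc above.  A minimal sketch of how it might be enabled in a decoder
// configuration file (illustrative only; the param string is not interpreted yet):
//   feature_function=RuleContextFeatures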
+ +using namespace std; + +namespace { + string Escape(const string& x) { + string y = x; + for (int i = 0; i < y.size(); ++i) { + if (y[i] == '=') y[i]='_'; + if (y[i] == ';') y[i]='_'; + } + return y; + } +} + +RuleContextFeatures::RuleContextFeatures(const std::string& param) { + kSOS = TD::Convert(""); + kEOS = TD::Convert(""); + + // TODO param lets you pass in a string from the cdec.ini file +} + +void RuleContextFeatures::PrepareForInput(const SentenceMetadata& smeta) { + const Lattice& sl = smeta.GetSourceLattice(); + current_input.resize(sl.size()); + for (unsigned i = 0; i < sl.size(); ++i) { + if (sl[i].size() != 1) { + cerr << "Context features not supported with lattice inputs!\nid=" << smeta.GetSentenceId() << endl; + abort(); + } + current_input[i] = sl[i][0].label; + } +} + +void RuleContextFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + const TRule& rule = *edge.rule_; + + if (rule.Arity() != 0 || // arity = 0, no nonterminals + rule.e_.size() != 1) return; // size = 1, predicted label is a single token + + + // you can see the current label "for free" + const WordID cur_label = rule.e_[0]; + // (if you want to see more labels, you have to be very careful, and muck + // about with contexts and ant_contexts) + + // but... you can look at as much of the source as you want! + const int from_src_index = edge.i_; // start of the span in the input being labeled + const int to_src_index = edge.j_; // end of the span in the input + // (note: in the case of tagging the size of the spans being labeled will + // always be 1, but in other formalisms, you can have bigger spans.) + + // this is the current token being labeled: + const WordID cur_input = current_input[from_src_index]; + + // let's get the previous token in the input (may be to the left of the start + // of the sentence!) + WordID prev_input = kSOS; + if (from_src_index > 0) { prev_input = current_input[from_src_index - 1]; } + // let's get the next token (may be to the left of the start of the sentence!) + WordID next_input = kEOS; + if (to_src_index < current_input.size()) { next_input = current_input[to_src_index]; } + + // now, build a feature string + ostringstream os; + // TD::Convert converts from the internal integer representation of a token + // to the actual token + os << "C1:" << TD::Convert(prev_input) << '_' + << TD::Convert(cur_input) << '|' << TD::Convert(cur_label); + // C1 is just to prevent a name clash + + // pick a value + double fval = 1.0; // can be any real value + + // add it to the feature vector FD::Convert converts the feature string to a + // feature int, Escape makes sure the feature string doesn't have any bad + // symbols that could confuse a parser somewhere + features->add_value(FD::Convert(Escape(os.str())), fval); + // that's it! + + // create more features if you like... 
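// As a concrete illustration of the pattern above, a second feature over the following
// context token could be emitted the same way ("C2" is a hypothetical name, not a
// feature this function currently produces):
//   ostringstream os2;
//   os2 << "C2:" << TD::Convert(cur_input) << '_'
//       << TD::Convert(next_input) << '|' << TD::Convert(cur_label);
//   features->add_value(FD::Convert(Escape(os2.str())), 1.0);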
+} + diff --git a/decoder/ff_context.h b/decoder/ff_context.h new file mode 100644 index 00000000..0d22b027 --- /dev/null +++ b/decoder/ff_context.h @@ -0,0 +1,23 @@ +#ifndef _FF_CONTEXT_H_ +#define _FF_CONTEXT_H_ + +#include +#include "ff.h" + +class RuleContextFeatures : public FeatureFunction { + public: + RuleContextFeatures(const std::string& param); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + std::vector current_input; + WordID kSOS, kEOS; +}; + +#endif diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index 0e0454e5..6bb8c886 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -310,18 +310,16 @@ int main(int argc, char** argv) { // TODO CONFIGURE THIS int min_trans_src = 4; - cerr << "Initializing transliteration DPs ...\n"; + cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { const vector& src = corpus[i].src; const vector& trg = corpus[i].trg; - cerr << '.' << flush; - if (i % 80 == 79) cerr << endl; for (int j = 0; j < src.size(); ++j) { const vector& src_let = letters[src[j]]; for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; if (src_let.size() < min_trans_src) - tl.Forbid(src[j], trg[k]); + tl.Forbid(src[j], src_let, trg[k], trg_let); else tl.Initialize(src[j], src_let, trg[k], trg_let); } diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 73dd8d39..70fb76da 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -47,6 +47,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras r[prevs[k].prev_src_covered][prevs[k].prev_trg_covered] = true; int src_delta = i - prevs[k].prev_src_covered; edges[prevs[k].prev_src_covered][prevs[k].prev_trg_covered][src_delta][j - prevs[k].prev_trg_covered] = true; + valid_deltas[prevs[k].prev_src_covered][prevs[k].prev_trg_covered].push_back(make_pair(src_delta,j - prevs[k].prev_trg_covered)); short &msd = max_src_delta[prevs[k].prev_src_covered][prevs[k].prev_trg_covered]; if (src_delta > msd) msd = src_delta; } @@ -56,6 +57,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); + cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n"; //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; //for (int i = 0; i < b[0][0].size(); ++i) { // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 98450ec1..fb2f4965 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,12 +12,14 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { - boost::multi_array edges; // edges[src_covered][trg_covered][x][trg_delta] is this edge worth exploring? + boost::multi_array edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? 
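// Illustrative use of these tables (r, i, j, sd, td are hypothetical names for a
// Reachability instance and its coordinates, not code from this patch):
//   if (r.edges[i][j][sd][td]) {
//     // a phrase pair covering sd source and td target symbols may leave state (i,j)
//   }
// valid_deltas, added just below, lists the same out-edges of each node explicitly as
// (src_delta, trg_delta) pairs, so callers can enumerate them without scanning the
// 4-dimensional edges table.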
boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), - max_src_delta(boost::extents[srclen][trglen]) { + max_src_delta(boost::extents[srclen][trglen]), + valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 6e0c2e93..e29334fd 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -2,173 +2,92 @@ #include #include -#include -#include "grammar.h" -#include "bottom_up_parser.h" -#include "hg.h" -#include "hg_intersect.h" +#include "boost/shared_ptr.hpp" + #include "filelib.h" #include "ccrp.h" #include "m.h" -#include "lattice.h" -#include "verbose.h" +#include "reachability.h" using namespace std; using namespace std::tr1; -static WordID kX; -static int kMAX_SRC_SIZE = 0; -static vector > cur_trg_chunks; - -vector tlttofreelist; - -static void InitTargetChunks(int max_size, const vector& trg) { - cur_trg_chunks.clear(); - vector tmp; - unordered_set, boost::hash > > u; - for (int len = 1; len <= max_size; ++len) { - int end = trg.size() + 1; - end -= len; - for (int i = 0; i < end; ++i) { - tmp.clear(); - for (int j = 0; j < len; ++j) - tmp.push_back(trg[i + j]); - if (u.insert(tmp).second) cur_trg_chunks.push_back(tmp); - } - } -} - -struct TransliterationGrammarIter : public GrammarIter, public RuleBin { - TransliterationGrammarIter() { tlttofreelist.push_back(this); } - TransliterationGrammarIter(const TRulePtr& inr, int symbol) { - if (inr) { - r.reset(new TRule(*inr)); - } else { - r.reset(new TRule); - } - TRule& rr = *r; - rr.lhs_ = kX; - rr.f_.push_back(symbol); - tlttofreelist.push_back(this); - } - virtual int GetNumRules() const { - if (!r) return 0; - return cur_trg_chunks.size(); - } - virtual TRulePtr GetIthRule(int i) const { - TRulePtr nr(new TRule(*r)); - nr->e_ = cur_trg_chunks[i]; - //cerr << nr->AsString() << endl; - return nr; - } - virtual int Arity() const { - return 0; - } - virtual const RuleBin* GetRules() const { - if (!r) return NULL; else return this; - } - virtual const GrammarIter* Extend(int symbol) const { - if (symbol <= 0) return NULL; - if (!r || !kMAX_SRC_SIZE || r->f_.size() < kMAX_SRC_SIZE) - return new TransliterationGrammarIter(r, symbol); - else - return NULL; - } - TRulePtr r; -}; - -struct TransliterationGrammar : public Grammar { - virtual const GrammarIter* GetRoot() const { - return new TransliterationGrammarIter; - } - virtual bool HasRuleForSpan(int, int, int distance) const { - return (distance < kMAX_SRC_SIZE); - } -}; - -struct TInfo { - TInfo() : initialized(false) {} +struct GraphStructure { + GraphStructure() : initialized(false) {} + boost::shared_ptr r; bool initialized; - Hypergraph lattice; // may be empty if transliteration is not possible - prob_t est_prob; // will be zero if not possible }; struct TransliterationsImpl { TransliterationsImpl() { - kX = TD::Convert("X")*-1; - kMAX_SRC_SIZE = 4; - grammars.push_back(GrammarPtr(new TransliterationGrammar)); - grammars.push_back(GrammarPtr(new GlueGrammar("S", "X"))); - SetSilent(true); } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& 
trg_lets) { - if (src >= graphs.size()) graphs.resize(src + 1); - if (graphs[src][trg].initialized) return; - int kMAX_TRG_SIZE = 4; - InitTargetChunks(kMAX_TRG_SIZE, trg_lets); - ExhaustiveBottomUpParser parser("S", grammars); - Lattice lat(src_lets.size()), tlat(trg_lets.size()); - for (unsigned i = 0; i < src_lets.size(); ++i) - lat[i].push_back(LatticeArc(src_lets[i], 0.0, 1)); - for (unsigned i = 0; i < trg_lets.size(); ++i) - tlat[i].push_back(LatticeArc(trg_lets[i], 0.0, 1)); - //cerr << "Creating lattice for: " << TD::Convert(src) << " --> " << TD::Convert(trg) << endl; - //cerr << "'" << TD::GetString(src_lets) << "' --> " << TD::GetString(trg_lets) << endl; - if (!parser.Parse(lat, &graphs[src][trg].lattice)) { - //cerr << "Failed to parse " << TD::GetString(src_lets) << endl; - abort(); - } - if (HG::Intersect(tlat, &graphs[src][trg].lattice)) { - graphs[src][trg].est_prob = prob_t(1e-4); + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + if (graphs[src_len][trg_len].initialized) return; + graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4)); + +#if 0 + if (HG::Intersect(tlat, &hg)) { + // TODO } else { - graphs[src][trg].lattice.clear(); - //cerr << "Failed to intersect " << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << endl; - graphs[src][trg].est_prob = prob_t::Zero(); + cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl; + hg.clear(); } - for (unsigned i = 0; i < tlttofreelist.size(); ++i) - delete tlttofreelist[i]; - tlttofreelist.clear(); //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; - graphs[src][trg].initialized = true; +#endif + graphs[src_len][trg_len].initialized = true; } - const prob_t& EstimateProbability(WordID src, WordID trg) const { - assert(src < graphs.size()); - const unordered_map& um = graphs[src]; - const unordered_map::const_iterator it = um.find(trg); - assert(it != um.end()); - assert(it->second.initialized); - return it->second.est_prob; + void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + const size_t src_len = src_lets.size(); + const size_t trg_len = trg_lets.size(); + if (src_len >= graphs.size()) graphs.resize(src_len + 1); + if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); + graphs[src_len][trg_len].r.reset(); + graphs[src_len][trg_len].initialized = true; } - void Forbid(WordID src, WordID trg) { - if (src >= graphs.size()) graphs.resize(src + 1); - graphs[src][trg].est_prob = prob_t::Zero(); - graphs[src][trg].initialized = true; + prob_t EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { + assert(src.size() < graphs.size()); + const vector& tv = graphs[src.size()]; + assert(trg.size() < tv.size()); + const GraphStructure& gs = tv[trg.size()]; + // TODO: do prob + return prob_t::Zero(); } void GraphSummary() const { - double tlp = 0; - int tt = 0; + double to = 0; + double tn = 0; + double tt = 0; for (int i = 0; i < graphs.size(); ++i) { - const unordered_map& um = graphs[i]; - unordered_map::const_iterator it; - for (it = um.begin(); it != um.end(); ++it) { - if (it->second.lattice.empty()) continue; - //cerr << TD::Convert(i) << " --> " << TD::Convert(it->first) << ": " << it->second.lattice.NumberOfPaths() << endl; - tlp += 
log(it->second.lattice.NumberOfPaths()); + const vector& vt = graphs[i]; + for (int j = 0; j < vt.size(); ++j) { + const GraphStructure& gs = vt[j]; + if (!gs.r) continue; tt++; + for (int k = 0; k < i; ++k) { + for (int l = 0; l < j; ++l) { + size_t c = gs.r->valid_deltas[k][l].size(); + if (c) { + tn += 1; + to += c; + } + } + } } } - tlp /= tt; - cerr << "E[log paths] = " << tlp << endl; - cerr << "exp(E[log paths]) = " << exp(tlp) << endl; + cerr << " Average nodes = " << (tn / tt) << endl; + cerr << "Average out-degree = " << (to / tn) << endl; + cerr << " Unique structures = " << tt << endl; } - vector > graphs; - vector grammars; + vector > graphs; // graphs[src_len][trg_len] }; Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} @@ -178,16 +97,15 @@ void Transliterations::Initialize(WordID src, const vector& src_lets, Wo pimpl_->Initialize(src, src_lets, trg, trg_lets); } -prob_t Transliterations::EstimateProbability(WordID src, WordID trg) const { - return pimpl_->EstimateProbability(src,trg); +prob_t Transliterations::EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { + return pimpl_->EstimateProbability(s, src,t, trg); } -void Transliterations::Forbid(WordID src, WordID trg) { - pimpl_->Forbid(src, trg); +void Transliterations::Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { + pimpl_->Forbid(src, src_lets, trg, trg_lets); } void Transliterations::GraphSummary() const { pimpl_->GraphSummary(); } - diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index a548aacf..76eb2a05 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -10,9 +10,10 @@ struct Transliterations { explicit Transliterations(); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); - void Forbid(WordID src, WordID trg); + void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void GraphSummary() const; - prob_t EstimateProbability(WordID src, WordID trg) const; + prob_t EstimateProbability(WordID s, const std::vector& src, WordID t, const std::vector& trg) const; + private: TransliterationsImpl* pimpl_; }; -- cgit v1.2.3 From 9a8256604686a9283e7afce04e6feaab4922dd45 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 13:32:41 -0500 Subject: tl stuff --- gi/pf/Makefile.am | 8 +++-- gi/pf/align-tl.cc | 8 +++-- gi/pf/reachability.cc | 17 +++++++--- gi/pf/reachability.h | 4 +++ gi/pf/transliterations.cc | 82 ++++++++++++++++++++++++++++++++++------------- gi/pf/transliterations.h | 3 +- 6 files changed, 88 insertions(+), 34 deletions(-) diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 5e89f02a..9888a70b 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -2,15 +2,17 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc -nuisance_test_SOURCES = nuisance_test.cc transliterations.cc +nuisance_test_SOURCES = nuisance_test.cc +nuisance_test_LDADD = libpf.a align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc -align_tl_SOURCES = align-tl.cc transliterations.cc +align_tl_SOURCES = align-tl.cc +align_tl_LDADD = libpf.a itg_SOURCES = itg.cc 
diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index 6bb8c886..fe8950b5 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -305,7 +305,10 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Transliterations tl; + // TODO configure this + int max_src_chunk = 4; + int max_trg_chunk = 4; + Transliterations tl(max_src_chunk, max_trg_chunk); // TODO CONFIGURE THIS int min_trans_src = 4; @@ -318,10 +321,9 @@ int main(int argc, char** argv) { const vector& src_let = letters[src[j]]; for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; + tl.Initialize(src[j], src_let, trg[k], trg_let); if (src_let.size() < min_trans_src) tl.Forbid(src[j], src_let, trg[k], trg_let); - else - tl.Initialize(src[j], src_let, trg[k], trg_let); } } } diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 70fb76da..59bc6ace 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -39,6 +39,7 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras typedef boost::multi_array rarray_type; rarray_type r(boost::extents[srclen + 1][trglen + 1]); r[srclen][trglen] = true; + nodes = 0; for (int i = srclen; i >= 0; --i) { for (int j = trglen; j >= 0; --j) { vector& prevs = a[i][j]; @@ -57,10 +58,16 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras assert(!edges[0][0][0][1]); assert(!edges[0][0][0][0]); assert(max_src_delta[0][0] > 0); - cerr << "Sentence with length (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node\n"; - //cerr << "First cell contains " << b[0][0].size() << " forward pointers\n"; - //for (int i = 0; i < b[0][0].size(); ++i) { - // cerr << " -> (" << b[0][0][i].next_src_covered << "," << b[0][0][i].next_trg_covered << ")\n"; - //} + nodes = 0; + for (int i = 0; i < srclen; ++i) { + for (int j = 0; j < trglen; ++j) { + if (valid_deltas[i][j].size() > 0) { + node_addresses[i][j] = nodes++; + } else { + node_addresses[i][j] = -1; + } + } + } + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") has " << valid_deltas[0][0].size() << " out edges in its root node, " << nodes << " nodes in total, and outside estimate matrix will require " << sizeof(float)*nodes << " bytes\n"; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index fb2f4965..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -12,13 +12,17 @@ // currently forbids 0 -> n and n -> 0 alignments struct Reachability { + unsigned nodes; boost::multi_array edges; // edges[src_covered][trg_covered][src_delta][trg_delta] is this edge worth exploring? 
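// node_addresses, added below, gives every reachable (src_covered, trg_covered) state a
// dense index in [0, nodes) and -1 to unreachable states, so per-node quantities such as
// the outside-estimate matrix mentioned in reachability.cc can live in a flat array of
// "nodes" floats.  Sketch of the intended lookup (names illustrative):
//   int idx = r.node_addresses[src_cov][trg_cov];
//   if (idx >= 0) { float est = backward[idx]; /* outside estimate for this state */ }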
boost::multi_array max_src_delta; // msd[src_covered][trg_covered] -- the largest src delta that's valid + boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), + node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index e29334fd..61e95b82 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -14,42 +14,75 @@ using namespace std; using namespace std::tr1; struct GraphStructure { - GraphStructure() : initialized(false) {} - boost::shared_ptr r; - bool initialized; + GraphStructure() : r() {} + // leak memory - these are basically static + const Reachability* r; + bool IsReachable() const { return r->nodes > 0; } +}; + +struct BackwardEstimates { + BackwardEstimates() : gs(), backward() {} + explicit BackwardEstimates(const GraphStructure& g) : + gs(&g), backward() { + if (g.r->nodes > 0) + backward = new float[g.r->nodes]; + } + // leak memory, these are static + + // returns an estimate of the marginal probability + double MarginalEstimate() const { + if (!backward) return 0; + return backward[0]; + } + + // returns an backward estimate + double operator()(int src_covered, int trg_covered) const { + if (!backward) return 0; + int ind = gs->r->node_addresses[src_covered][trg_covered]; + if (ind < 0) return 0; + return backward[ind]; + } + private: + const GraphStructure* gs; + float* backward; }; struct TransliterationsImpl { - TransliterationsImpl() { + TransliterationsImpl(int max_src, int max_trg) : + kMAX_SRC_CHUNK(max_src), + kMAX_TRG_CHUNK(max_trg), + tot_pairs() { } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); const size_t trg_len = trg_lets.size(); + + // init graph structure if (src_len >= graphs.size()) graphs.resize(src_len + 1); if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); - if (graphs[src_len][trg_len].initialized) return; - graphs[src_len][trg_len].r.reset(new Reachability(src_len, trg_len, 4, 4)); - -#if 0 - if (HG::Intersect(tlat, &hg)) { - // TODO - } else { - cerr << "No transliteration lattice possible for src_len=" << src_len << " trg_len=" << trg_len << endl; - hg.clear(); - } - //cerr << "Number of paths: " << graphs[src][trg].lattice.NumberOfPaths() << endl; -#endif - graphs[src_len][trg_len].initialized = true; + GraphStructure& gs = graphs[src_len][trg_len]; + if (!gs.r) + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + const Reachability& r = *gs.r; + + // init backward estimates + if (src >= bes.size()) bes.resize(src + 1); + unordered_map::iterator it = bes[src].find(trg); + if (it != bes[src].end()) return; // already initialized + + it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first; + BackwardEstimates& b = it->second; + if (!gs.r->nodes) return; // not derivable subject to length constraints + + // TODO + tot_pairs++; } void Forbid(WordID src, const vector& 
src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); const size_t trg_len = trg_lets.size(); - if (src_len >= graphs.size()) graphs.resize(src_len + 1); - if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); - graphs[src_len][trg_len].r.reset(); - graphs[src_len][trg_len].initialized = true; + // TODO } prob_t EstimateProbability(WordID s, const vector& src, WordID t, const vector& trg) const { @@ -85,12 +118,17 @@ struct TransliterationsImpl { cerr << " Average nodes = " << (tn / tt) << endl; cerr << "Average out-degree = " << (to / tn) << endl; cerr << " Unique structures = " << tt << endl; + cerr << " Unique pairs = " << tot_pairs << endl; } + const int kMAX_SRC_CHUNK; + const int kMAX_TRG_CHUNK; + unsigned tot_pairs; vector > graphs; // graphs[src_len][trg_len] + vector > bes; // bes[src][trg] }; -Transliterations::Transliterations() : pimpl_(new TransliterationsImpl) {} +Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index 76eb2a05..e025547e 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -7,7 +7,8 @@ struct TransliterationsImpl; struct Transliterations { - explicit Transliterations(); + // max_src and max_trg indicate how big the transliteration phrases can be + explicit Transliterations(int max_src, int max_trg); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); -- cgit v1.2.3 From 301106af9c13285ff252c618848eaa54b9b0e490 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 18:47:23 +0000 Subject: fix link error on linux --- gi/pf/Makefile.am | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 9888a70b..94364c3d 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -5,14 +5,14 @@ noinst_LIBRARIES = libpf.a libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc nuisance_test_SOURCES = nuisance_test.cc -nuisance_test_LDADD = libpf.a +nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc align_tl_SOURCES = align-tl.cc -align_tl_LDADD = libpf.a +align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz itg_SOURCES = itg.cc -- cgit v1.2.3 From 78bf1457f606dd3880c2bc912201c4945d3f85b4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 8 Mar 2012 14:29:42 -0500 Subject: moar --- gi/pf/align-tl.cc | 15 +++++++++------ gi/pf/reachability.cc | 9 +++++---- gi/pf/reachability.h | 8 +++++--- gi/pf/transliterations.cc | 14 ++++++++++---- gi/pf/transliterations.h | 3 ++- 5 files changed, 31 insertions(+), 18 deletions(-) diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index fe8950b5..fc9b7ca5 100644 --- a/gi/pf/align-tl.cc +++ 
b/gi/pf/align-tl.cc @@ -30,6 +30,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") ("input,i",po::value(),"Read parallel data from") + ("max_src_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in source") + ("max_trg_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in target") + ("min_transliterated_src_length", po::value()->default_value(3), "Minimum length of source words considered for transliteration") + ("filter_ratio", po::value()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -306,12 +310,11 @@ int main(int argc, char** argv) { letters[TD::Convert("NULL")].clear(); // TODO configure this - int max_src_chunk = 4; - int max_trg_chunk = 4; - Transliterations tl(max_src_chunk, max_trg_chunk); - - // TODO CONFIGURE THIS - int min_trans_src = 4; + const int max_src_chunk = conf["max_src_chunk"].as(); + const int max_trg_chunk = conf["max_trg_chunk"].as(); + const double filter_rat = conf["filter_ratio"].as(); + const int min_trans_src = conf["min_transliterated_src_length"].as(); + Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat); cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index 59bc6ace..c10000f2 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -12,7 +12,7 @@ struct SState { int prev_trg_covered; }; -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) { typedef boost::multi_array, 2> array_type; array_type a(boost::extents[srclen + 1][trglen + 1]); a[0][0].push_back(SState()); @@ -30,9 +30,10 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } } a[0][0].clear(); - //cerr << "Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - if (a[srclen][trglen].size() == 0) { - cerr << "Sentence with length (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; + size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio); + if (a[srclen][trglen].size() < min_allowed) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n"; return; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 1e22c76a..03967d44 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -18,17 +18,19 @@ struct Reachability { boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : + // filter_ratio says if the number of outgoing edges 
from the first cell is less than + // src_max * trg_max * filter_rat^2 then mark as non reachable + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) : nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), node_addresses(boost::extents[srclen][trglen]), valid_deltas(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio); } private: - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio); }; #endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 61e95b82..8ea4ebd2 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -48,10 +48,11 @@ struct BackwardEstimates { }; struct TransliterationsImpl { - TransliterationsImpl(int max_src, int max_trg) : + TransliterationsImpl(int max_src, int max_trg, double fr) : kMAX_SRC_CHUNK(max_src), kMAX_TRG_CHUNK(max_trg), - tot_pairs() { + kFILTER_RATIO(fr), + tot_pairs(), tot_mem() { } void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { @@ -63,7 +64,7 @@ struct TransliterationsImpl { if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); GraphStructure& gs = graphs[src_len][trg_len]; if (!gs.r) - gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO); const Reachability& r = *gs.r; // init backward estimates @@ -77,6 +78,7 @@ struct TransliterationsImpl { // TODO tot_pairs++; + tot_mem += sizeof(float) * gs.r->nodes; } void Forbid(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { @@ -119,16 +121,20 @@ struct TransliterationsImpl { cerr << "Average out-degree = " << (to / tn) << endl; cerr << " Unique structures = " << tt << endl; cerr << " Unique pairs = " << tot_pairs << endl; + cerr << " BEs size = " << (tot_mem / (1024.0*1024.0)) << " MB" << endl; } const int kMAX_SRC_CHUNK; const int kMAX_TRG_CHUNK; + const double kFILTER_RATIO; unsigned tot_pairs; + size_t tot_mem; vector > graphs; // graphs[src_len][trg_len] vector > bes; // bes[src][trg] }; -Transliterations::Transliterations(int max_src, int max_trg) : pimpl_(new TransliterationsImpl(max_src, max_trg)) {} +Transliterations::Transliterations(int max_src, int max_trg, double fr) : + pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index e025547e..ea9f9d3f 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -8,7 +8,8 @@ struct TransliterationsImpl; struct Transliterations { // max_src and max_trg indicate how big the transliteration phrases can be - explicit Transliterations(int max_src, int max_trg); + // see reachability.h for information about filter_ratio + explicit Transliterations(int max_src, int max_trg, double filter_ratio); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void 
Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); -- cgit v1.2.3 From 249301376865578b7f9678cc97c0f8b6f78623f6 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 9 Mar 2012 16:35:10 -0500 Subject: KenLM d45e2be including CreateOrThrow for Jon --- klm/lm/left.hh | 2 +- klm/util/file.cc | 10 ++++++++++ klm/util/file.hh | 3 +++ klm/util/mmap.cc | 14 -------------- 4 files changed, 14 insertions(+), 15 deletions(-) diff --git a/klm/lm/left.hh b/klm/lm/left.hh index 41f71f84..a07f9803 100644 --- a/klm/lm/left.hh +++ b/klm/lm/left.hh @@ -112,7 +112,7 @@ inline size_t hash_value(const ChartState &state) { size_t hashes[2]; hashes[0] = hash_value(state.left); hashes[1] = hash_value(state.right); - return util::MurmurHashNative(hashes, sizeof(size_t), state.full); + return util::MurmurHashNative(hashes, sizeof(size_t) * 2, state.full); } template class RuleScore { diff --git a/klm/util/file.cc b/klm/util/file.cc index aee7c77a..176737fa 100644 --- a/klm/util/file.cc +++ b/klm/util/file.cc @@ -42,6 +42,16 @@ int OpenReadOrThrow(const char *name) { return ret; } +int CreateOrThrow(const char *name) { + int ret; +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); +#else + UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); +#endif + return ret; +} + uint64_t SizeFile(int fd) { #if defined(_WIN32) || defined(_WIN64) __int64 ret = _filelengthi64(fd); diff --git a/klm/util/file.hh b/klm/util/file.hh index 5c57e2a9..72c8ea76 100644 --- a/klm/util/file.hh +++ b/klm/util/file.hh @@ -65,7 +65,10 @@ class scoped_FILE { std::FILE *file_; }; +// Open for read only. int OpenReadOrThrow(const char *name); +// Create file if it doesn't exist, truncate if it does. Opened for write. +int CreateOrThrow(const char *name); // Return value for SizeFile when it can't size properly. 
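// Illustrative usage sketch for the new helper (not taken from this patch; it
// assumes these declarations sit in namespace util like the rest of klm/util,
// and "counts.bin" is a placeholder file name):
//   int fd = util::CreateOrThrow("counts.bin");   // create, or truncate if it exists
//   uint64_t sz = util::SizeFile(fd);
//   if (sz == util::kBadSize) { /* unsizable target, e.g. a pipe */ }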
const uint64_t kBadSize = (uint64_t)-1; diff --git a/klm/util/mmap.cc b/klm/util/mmap.cc index a329ce4e..3b1c58b8 100644 --- a/klm/util/mmap.cc +++ b/klm/util/mmap.cc @@ -170,20 +170,6 @@ void *MapZeroedWrite(int fd, std::size_t size) { return MapOrThrow(size, true, kFileFlags, false, fd, 0); } -namespace { - -int CreateOrThrow(const char *name) { - int ret; -#if defined(_WIN32) || defined(_WIN64) - UTIL_THROW_IF(-1 == (ret = _open(name, _O_CREAT | _O_TRUNC | _O_RDWR, _S_IREAD | _S_IWRITE)), ErrnoException, "while creating " << name); -#else - UTIL_THROW_IF(-1 == (ret = open(name, O_CREAT | O_TRUNC | O_RDWR, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)), ErrnoException, "while creating " << name); -#endif - return ret; -} - -} // namespace - void *MapZeroedWrite(const char *name, std::size_t size, scoped_fd &file) { file.reset(CreateOrThrow(name)); try { -- cgit v1.2.3 From 113317266853abff2e1c0c3e889017d0eee55c93 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 9 Mar 2012 22:23:50 -0500 Subject: moar --- gi/pf/Makefile.am | 3 +- gi/pf/align-lexonly-pyp.cc | 207 ++++++++++------------------------------- gi/pf/align-tl.cc | 18 ++-- gi/pf/backward.cc | 89 ++++++++++++++++++ gi/pf/backward.h | 33 +++++++ gi/pf/base_distributions.h | 8 +- gi/pf/guess-translits.pl | 2 +- gi/pf/nuisance_test.cc | 6 +- gi/pf/pyp_lm.cc | 2 +- gi/pf/pyp_tm.cc | 113 +++++++++++++++++++++++ gi/pf/pyp_tm.h | 34 +++++++ gi/pf/pyp_word_model.cc | 20 ++++ gi/pf/pyp_word_model.h | 58 ++++++++++++ gi/pf/reachability.cc | 8 +- gi/pf/reachability.h | 8 +- gi/pf/transliterations.cc | 223 ++++++++++++++++++++++++++++++++++++++++----- gi/pf/transliterations.h | 3 +- utils/ccrp_nt.h | 17 ++-- 18 files changed, 628 insertions(+), 224 deletions(-) create mode 100644 gi/pf/backward.cc create mode 100644 gi/pf/backward.h create mode 100644 gi/pf/pyp_tm.cc create mode 100644 gi/pf/pyp_tm.h create mode 100644 gi/pf/pyp_word_model.cc create mode 100644 gi/pf/pyp_word_model.h diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 94364c3d..4ce72ba1 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -2,7 +2,7 @@ bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexon noinst_LIBRARIES = libpf.a -libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc +libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc transliterations.cc backward.cc pyp_word_model.cc pyp_tm.cc nuisance_test_SOURCES = nuisance_test.cc nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz @@ -10,6 +10,7 @@ nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mtev align_lexonly_SOURCES = align-lexonly.cc align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc +align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz align_tl_SOURCES = align-tl.cc align_tl_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 13a3a487..d68a4b8f 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -1,27 
+1,18 @@ #include -#include #include -#include #include #include -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" #include "tdict.h" #include "stringlib.h" #include "filelib.h" -#include "dict.h" +#include "array2d.h" #include "sampler.h" -#include "mfcr.h" #include "corpus.h" -#include "ngram_base.h" +#include "pyp_tm.h" using namespace std; -using namespace tr1; namespace po = boost::program_options; void InitCommandLine(int argc, char** argv, po::variables_map* conf) { @@ -51,7 +42,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } } -shared_ptr prng; +MT19937* prng; struct LexicalAlignment { unsigned char src_index; @@ -66,159 +57,59 @@ struct AlignedSentencePair { Array2D posterior; }; -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-log(vocab_e_size)), l(1,prob_t::One()), v(1, prob_t::Zero()) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return Md::log_poisson(s.size(), 7.5) + s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - v[0].logeq(logp0(rule.e_)); - return r.prob(rule.e_, v.begin(), l.begin()); - } - - void Increment(const TRule& rule) { - v[0].logeq(logp0(rule.e_)); - if (r.increment(rule.e_, v.begin(), l.begin(), &*prng).count) { - base *= v[0] * l[0]; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_, &*prng).count) { - base /= prob_t(exp(logp0(rule.e_))); - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; +struct Aligner { + Aligner(const vector >& lets, int num_letters, vector* c) : + corpus(*c), + model(lets, num_letters), + kNULL(TD::Convert("NULL")) { + assert(lets[kNULL].size() == 0); } - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (d=" << r.discount() << ",s=" << r.strength() << ')' << endl; - for (MFCR<1,vector >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl; - } - - prob_t base; - MFCR<1,vector > r; - const double u0; - const vector l; - mutable vector v; -}; - -struct BasicLexicalAlignment { - explicit BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } + vector& corpus; + PYPLexicalTranslation model; + const WordID kNULL; - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; + void ResampleHyperparameters() { + model.ResampleHyperparameters(prng); } void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); cerr << "Initializing with random alignments ...\n"; for (unsigned i = 0; i < corpus.size(); ++i) { AlignedSentencePair& asp = corpus[i]; asp.a.resize(asp.trg.size()); for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); + unsigned char& a_j = asp.a[j].src_index; + a_j = prng->next() * (1 + 
asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); + model.Increment(f_a_j, asp.trg[j], &*prng); } } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << " (base d=" << up0.r.discount() << ",s=" << up0.r.strength() << ")\n"; + cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; } - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - HierarchicalWordBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - MConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r, &*prng)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); + void ResampleCorpus() { + for (unsigned i = 0; i < corpus.size(); ++i) { + AlignedSentencePair& asp = corpus[i]; + SampleSet ss; ss.resize(asp.src.size() + 1); + for (unsigned j = 0; j < asp.trg.size(); ++j) { + unsigned char& a_j = asp.a[j].src_index; + const WordID e_j = asp.trg[j]; + WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Decrement(f_a_j, e_j, prng); + + for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { + const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); + ss[prop_a_j] = model.Prob(prop_f, e_j); + } + a_j = prng->SelectSample(ss); + f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); + model.Increment(f_a_j, e_j, prng); } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r, &*prng)) - up0.Increment(r); } + cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; } - cerr << " LLH = " << Likelihood() << endl; -} +}; void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { for (set::const_iterator it = v.begin(); it != v.end(); ++it) { @@ -240,8 +131,10 @@ void ExtractLetters(const set& v, vector >* l, set a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) + for (unsigned j = 0; j < asp.trg.size(); ++j) { + assert(asp.a[j].src_index <= asp.src.size()); if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; + } cerr << a << endl; } @@ -275,10 +168,9 @@ int main(int argc, char** argv) { InitCommandLine(argc, argv, &conf); if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); + prng = new MT19937(conf["random_seed"].as()); else - prng.reset(new MT19937); -// MT19937& rng = *prng; + prng = new MT19937; vector > corpuse, corpusf; set vocabe, vocabf; @@ -304,23 +196,18 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); + Aligner aligner(letters, letset.size(), &corpus); + aligner.InitializeRandom(); + const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 7 == 6) x.ResampleHyperparemeters(); - x.ResampleCorpus(); + if (i % 7 == 6) aligner.ResampleHyperparameters(); + aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); return 0; } diff --git a/gi/pf/align-tl.cc b/gi/pf/align-tl.cc index fc9b7ca5..cbe8c6c8 100644 --- a/gi/pf/align-tl.cc +++ b/gi/pf/align-tl.cc @@ -6,6 +6,7 @@ #include #include +#include "backward.h" #include "array2d.h" #include "base_distributions.h" #include "monotonic_pseg.h" @@ -30,10 +31,11 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") ("input,i",po::value(),"Read parallel data from") + ("s2t", po::value(), "character level source-to-target prior transliteration probabilities") + ("t2s", po::value(), "character level target-to-source prior transliteration probabilities") ("max_src_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in source") ("max_trg_chunk", po::value()->default_value(4), "Maximum size of translitered chunk in target") - ("min_transliterated_src_length", po::value()->default_value(3), "Minimum length of source words considered for transliteration") - ("filter_ratio", po::value()->default_value(0.66), "Filter ratio: basically, if the lengths differ by less than this ratio, mark the pair as non-transliteratable") + ("expected_src_to_trg_ratio", po::value()->default_value(1.0), "If a word is transliterated, what is the expected length ratio from source to target?") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); clo.add_options() @@ -303,7 +305,7 @@ int main(int argc, 
char** argv) { corpusf.clear(); corpuse.clear(); vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); + vector > letters(TD::NumWords() + 1); set letset; ExtractLetters(vocabe, &letters, &letset); ExtractLetters(vocabf, &letters, NULL); @@ -312,9 +314,9 @@ int main(int argc, char** argv) { // TODO configure this const int max_src_chunk = conf["max_src_chunk"].as(); const int max_trg_chunk = conf["max_trg_chunk"].as(); - const double filter_rat = conf["filter_ratio"].as(); - const int min_trans_src = conf["min_transliterated_src_length"].as(); - Transliterations tl(max_src_chunk, max_trg_chunk, filter_rat); + const double s2t_rat = conf["expected_src_to_trg_ratio"].as(); + const BackwardEstimator be(conf["s2t"].as(), conf["t2s"].as()); + Transliterations tl(max_src_chunk, max_trg_chunk, s2t_rat, be); cerr << "Initializing transliteration graph structures ...\n"; for (int i = 0; i < corpus.size(); ++i) { @@ -325,8 +327,8 @@ int main(int argc, char** argv) { for (int k = 0; k < trg.size(); ++k) { const vector& trg_let = letters[trg[k]]; tl.Initialize(src[j], src_let, trg[k], trg_let); - if (src_let.size() < min_trans_src) - tl.Forbid(src[j], src_let, trg[k], trg_let); + //if (src_let.size() < min_trans_src) + // tl.Forbid(src[j], src_let, trg[k], trg_let); } } } diff --git a/gi/pf/backward.cc b/gi/pf/backward.cc new file mode 100644 index 00000000..b92629fd --- /dev/null +++ b/gi/pf/backward.cc @@ -0,0 +1,89 @@ +#include "backward.h" + +#include +#include + +#include "array2d.h" +#include "reachability.h" +#include "base_distributions.h" + +using namespace std; + +BackwardEstimator::BackwardEstimator(const string& s2t, + const string& t2s) : m1(new Model1(s2t)), m1inv(new Model1(t2s)) {} + +BackwardEstimator::~BackwardEstimator() { + delete m1; m1 = NULL; + delete m1inv; m1inv = NULL; +} + +float BackwardEstimator::ComputeBackwardProb(const std::vector& src, + const std::vector& trg, + unsigned src_covered, + unsigned trg_covered, + double s2t_ratio) const { + if (src_covered == src.size() || trg_covered == trg.size()) { + assert(src_covered == src.size()); + assert(trg_covered == trg.size()); + return 0; + } + static const WordID kNULL = TD::Convert(""); + const prob_t uniform_alignment(1.0 / (src.size() - src_covered + 1)); + // TODO factor in expected length ratio + prob_t e; e.logeq(Md::log_poisson(trg.size() - trg_covered, (src.size() - src_covered) * s2t_ratio)); // p(trg len remaining | src len remaining) + for (unsigned j = trg_covered; j < trg.size(); ++j) { + prob_t p = (*m1)(kNULL, trg[j]) + prob_t(1e-12); + for (unsigned i = src_covered; i < src.size(); ++i) + p += (*m1)(src[i], trg[j]); + if (p.is_0()) { + cerr << "ERROR: p(" << TD::Convert(trg[j]) << " | " << TD::GetString(src) << ") = 0!\n"; + assert(!"failed"); + } + p *= uniform_alignment; + e *= p; + } + // TODO factor in expected length ratio + const prob_t inv_uniform(1.0 / (trg.size() - trg_covered + 1.0)); + prob_t inv; + inv.logeq(Md::log_poisson(src.size() - src_covered, (trg.size() - trg_covered) / s2t_ratio)); + for (unsigned i = src_covered; i < src.size(); ++i) { + prob_t p = (*m1inv)(kNULL, src[i]) + prob_t(1e-12); + for (unsigned j = trg_covered; j < trg.size(); ++j) + p += (*m1inv)(trg[j], src[i]); + if (p.is_0()) { + cerr << "ERROR: p_inv(" << TD::Convert(src[i]) << " | " << TD::GetString(trg) << ") = 0!\n"; + assert(!"failed"); + } + p *= inv_uniform; + inv *= p; + } + return (log(e) + log(inv)) / 2; +} + +void BackwardEstimator::InitializeGrid(const vector& src, + const vector& trg, + const 
Reachability& r, + double s2t_ratio, + float* grid) const { + queue > q; + q.push(make_pair(0,0)); + Array2D done(src.size()+1, trg.size()+1, false); + //cerr << TD::GetString(src) << " ||| " << TD::GetString(trg) << endl; + while(!q.empty()) { + const pair n = q.front(); + q.pop(); + if (done(n.first,n.second)) continue; + done(n.first,n.second) = true; + + float lp = ComputeBackwardProb(src, trg, n.first, n.second, s2t_ratio); + if (n.first == 0 && n.second == 0) grid[0] = lp; + //cerr << " " << n.first << "," << n.second << "\t" << lp << endl; + + if (n.first == src.size() || n.second == trg.size()) continue; + const vector >& edges = r.valid_deltas[n.first][n.second]; + for (int i = 0; i < edges.size(); ++i) + q.push(make_pair(n.first + edges[i].first, n.second + edges[i].second)); + } + //static int cc = 0; ++cc; if (cc == 80) exit(1); +} + diff --git a/gi/pf/backward.h b/gi/pf/backward.h new file mode 100644 index 00000000..e67eff0c --- /dev/null +++ b/gi/pf/backward.h @@ -0,0 +1,33 @@ +#ifndef _BACKWARD_H_ +#define _BACKWARD_H_ + +#include +#include +#include "wordid.h" + +struct Reachability; +struct Model1; + +struct BackwardEstimator { + BackwardEstimator(const std::string& s2t, + const std::string& t2s); + ~BackwardEstimator(); + + void InitializeGrid(const std::vector& src, + const std::vector& trg, + const Reachability& r, + double src2trg_ratio, + float* grid) const; + + private: + float ComputeBackwardProb(const std::vector& src, + const std::vector& trg, + unsigned src_covered, + unsigned trg_covered, + double src2trg_ratio) const; + + Model1* m1; + Model1* m1inv; +}; + +#endif diff --git a/gi/pf/base_distributions.h b/gi/pf/base_distributions.h index 0d597c5c..84dacdf2 100644 --- a/gi/pf/base_distributions.h +++ b/gi/pf/base_distributions.h @@ -14,13 +14,7 @@ #include "tdict.h" #include "sampler.h" #include "m.h" - -inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { - os << '['; - for (int i = 0; i < p.size(); ++i) - os << (i==0 ? 
"" : " ") << TD::Convert(p[i]); - return os << ']'; -} +#include "os_phrase.h" struct Model1 { explicit Model1(const std::string& fname) : diff --git a/gi/pf/guess-translits.pl b/gi/pf/guess-translits.pl index aafec13a..d00c2168 100755 --- a/gi/pf/guess-translits.pl +++ b/gi/pf/guess-translits.pl @@ -69,4 +69,4 @@ for my $f (keys %fs) { } } print STDERR "Extracted $num pairs.\n"; -print STDERR "Recommend running:\n ../../training/model1 -t -99999 output.txt\n"; +print STDERR "Recommend running:\n ../../training/model1 -v -d -t -99999 output.txt\n"; diff --git a/gi/pf/nuisance_test.cc b/gi/pf/nuisance_test.cc index 0f44fe95..fc0af9cb 100644 --- a/gi/pf/nuisance_test.cc +++ b/gi/pf/nuisance_test.cc @@ -124,9 +124,9 @@ int main(int argc, char** argv) { WordID y = TD::Convert("remember"); vector src; TD::ConvertSentence("s o u v e n o n s", &src); vector trg; TD::ConvertSentence("r e m e m b e r", &trg); - Transliterations xx; - xx.Initialize(x, src, y, trg); - return 1; +// Transliterations xx; +// xx.Initialize(x, src, y, trg); +// return 1; for (int j = 0; j < ITERS; ++j) { Base b; diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 104f356b..52e6be2c 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -18,7 +18,7 @@ // I use templates to handle the recursive formalation of the prior, so // the order of the model has to be specified here, at compile time: -#define kORDER 4 +#define kORDER 3 using namespace std; using namespace tr1; diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc new file mode 100644 index 00000000..94cbe7c3 --- /dev/null +++ b/gi/pf/pyp_tm.cc @@ -0,0 +1,113 @@ +#include "pyp_tm.h" + +#include +#include +#include + +#include "base_distributions.h" +#include "monotonic_pseg.h" +#include "conditional_pseg.h" +#include "tdict.h" +#include "ccrp.h" +#include "pyp_word_model.h" + +using namespace std; +using namespace std::tr1; + +template +struct ConditionalPYPWordModel { + ConditionalPYPWordModel(Base* b) : base(*b) {} + + void Summary() const { + cerr << "Number of conditioning contexts: " << r.size() << endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + cerr << TD::Convert(it->first) << " \tPYP(d=" << it->second.discount() << ",s=" << it->second.strength() << ") --------------------------" << endl; + for (CCRP >::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + cerr << " " << i2->second.total_dish_count_ << '\t' << TD::GetString(i2->first) << endl; + } + } + + void ResampleHyperparameters(MT19937* rng) { + for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) + it->second.resample_hyperparameters(rng); + } + + prob_t Prob(const WordID src, const vector& trglets) const { + RuleModelHash::const_iterator it = r.find(src); + if (it == r.end()) { + return base(trglets); + } else { + return it->second.prob(trglets, base(trglets)); + } + } + + void Increment(const WordID src, const vector& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + if (it == r.end()) + it = r.insert(make_pair(src, CCRP >(1,1,1,1,0.5,1.0))).first; + if (it->second.increment(trglets, base(trglets), rng)) + base.Increment(trglets, rng); + } + + void Decrement(const WordID src, const vector& trglets, MT19937* rng) { + RuleModelHash::iterator it = r.find(src); + assert(it != r.end()); + if (it->second.decrement(trglets, rng)) { + base.Decrement(trglets, rng); + if (it->second.num_customers() == 0) + r.erase(it); + } + } + + prob_t Likelihood() const { + prob_t p = prob_t::One(); + for (RuleModelHash::const_iterator it = 
r.begin(); it != r.end(); ++it) { + prob_t q; q.logeq(it->second.log_crp_prob()); + p *= q; + } + return p; + } + + unsigned UniqueConditioningContexts() const { + return r.size(); + } + + Base& base; + typedef unordered_map > > RuleModelHash; + RuleModelHash r; +}; + +PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets, + const unsigned num_letters) : + letters(lets), + up0(new PYPWordModel(num_letters)), + tmodel(new ConditionalPYPWordModel(up0)), + kX(-TD::Convert("X")) {} + +prob_t PYPLexicalTranslation::Likelihood() const { + prob_t p = up0->Likelihood(); + p *= tmodel->Likelihood(); + return p; +} + +void PYPLexicalTranslation::ResampleHyperparameters(MT19937* rng) { + tmodel->ResampleHyperparameters(rng); + up0->ResampleHyperparameters(rng); +} + +unsigned PYPLexicalTranslation::UniqueConditioningContexts() const { + return tmodel->UniqueConditioningContexts(); +} + +prob_t PYPLexicalTranslation::Prob(WordID src, WordID trg) const { + return tmodel->Prob(src, letters[trg]); +} + +void PYPLexicalTranslation::Increment(WordID src, WordID trg, MT19937* rng) { + tmodel->Increment(src, letters[trg], rng); +} + +void PYPLexicalTranslation::Decrement(WordID src, WordID trg, MT19937* rng) { + tmodel->Decrement(src, letters[trg], rng); +} + diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h new file mode 100644 index 00000000..fa0fb28f --- /dev/null +++ b/gi/pf/pyp_tm.h @@ -0,0 +1,34 @@ +#ifndef PYP_LEX_TRANS +#define PYP_LEX_TRANS + +#include +#include "wordid.h" +#include "prob.h" +#include "sampler.h" + +struct TRule; +struct PYPWordModel; +template struct ConditionalPYPWordModel; + +struct PYPLexicalTranslation { + explicit PYPLexicalTranslation(const std::vector >& lets, + const unsigned num_letters); + + prob_t Likelihood() const; + + void ResampleHyperparameters(MT19937* rng); + prob_t Prob(WordID src, WordID trg) const; // return p(trg | src) + void Summary() const; + void Increment(WordID src, WordID trg, MT19937* rng); + void Decrement(WordID src, WordID trg, MT19937* rng); + unsigned UniqueConditioningContexts() const; + + private: + const std::vector >& letters; // spelling dictionary + PYPWordModel* up0; // base distribuction (model English word) + ConditionalPYPWordModel* tmodel; // translation distributions + // (model English word | French word) + const WordID kX; +}; + +#endif diff --git a/gi/pf/pyp_word_model.cc b/gi/pf/pyp_word_model.cc new file mode 100644 index 00000000..12df4abf --- /dev/null +++ b/gi/pf/pyp_word_model.cc @@ -0,0 +1,20 @@ +#include "pyp_word_model.h" + +#include + +using namespace std; + +void PYPWordModel::ResampleHyperparameters(MT19937* rng) { + r.resample_hyperparameters(rng); + cerr << " PYPWordModel(d=" << r.discount() << ",s=" << r.strength() << ")\n"; +} + +void PYPWordModel::Summary() const { + cerr << "PYPWordModel: generations=" << r.num_customers() + << " PYP(d=" << r.discount() << ",s=" << r.strength() << ')' << endl; + for (CCRP >::const_iterator it = r.begin(); it != r.end(); ++it) + cerr << " " << it->second.total_dish_count_ + << " (on " << it->second.table_counts_.size() << " tables) " + << TD::GetString(it->first) << endl; +} + diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h new file mode 100644 index 00000000..800a4fd7 --- /dev/null +++ b/gi/pf/pyp_word_model.h @@ -0,0 +1,58 @@ +#ifndef _PYP_WORD_MODEL_H_ +#define _PYP_WORD_MODEL_H_ + +#include +#include +#include +#include "prob.h" +#include "ccrp.h" +#include "m.h" +#include "tdict.h" +#include "os_phrase.h" + +// PYP(d,s,poisson-uniform) represented as a CRP 
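// Spelled out in the notation of the members below (u0 = -log(vocab_e_size),
// mean_length = expected word length), the base distribution over a letter
// string s is
//   p0(s) = Poisson(|s|; mean_length) * (1/vocab_e_size)^|s|
// and r is assumed to apply the usual two-parameter Pitman-Yor predictive rule
// on top of it (c_s, t_s are the customer/table counts for s and c, T their
// totals; this notation is introduced here, the actual bookkeeping lives in
// utils/ccrp.h):
//   P(s) = (c_s - discount*t_s + (strength + discount*T) * p0(s)) / (c + strength)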
+struct PYPWordModel { + explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) : + base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} + + void ResampleHyperparameters(MT19937* rng); + + inline prob_t operator()(const std::vector& s) const { + return r.prob(s, p0(s)); + } + + inline void Increment(const std::vector& s, MT19937* rng) { + if (r.increment(s, p0(s), rng)) + base *= p0(s); + } + + inline void Decrement(const std::vector& s, MT19937 *rng) { + if (r.decrement(s, rng)) + base /= p0(s); + } + + inline prob_t Likelihood() const { + prob_t p; p.logeq(r.log_crp_prob()); + p *= base; + return p; + } + + void Summary() const; + + private: + inline double logp0(const std::vector& s) const { + return Md::log_poisson(s.size(), mean_length) + s.size() * u0; + } + + inline prob_t p0(const std::vector& s) const { + prob_t p; p.logeq(logp0(s)); + return p; + } + + prob_t base; // keeps track of the draws from the base distribution + CCRP > r; + const double u0; // uniform log prob of generating a letter + const double mean_length; // mean length of a word in the base distribution +}; + +#endif diff --git a/gi/pf/reachability.cc b/gi/pf/reachability.cc index c10000f2..7d0d04ac 100644 --- a/gi/pf/reachability.cc +++ b/gi/pf/reachability.cc @@ -12,7 +12,7 @@ struct SState { int prev_trg_covered; }; -void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio) { +void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) { typedef boost::multi_array, 2> array_type; array_type a(boost::extents[srclen + 1][trglen + 1]); a[0][0].push_back(SState()); @@ -31,9 +31,9 @@ void Reachability::ComputeReachability(int srclen, int trglen, int src_max_phras } a[0][0].clear(); //cerr << srclen << "," << trglen << ": Final cell contains " << a[srclen][trglen].size() << " back pointers\n"; - size_t min_allowed = (src_max_phrase_len + 1) * (trg_max_phrase_len + 1) * (filter_ratio * filter_ratio); - if (a[srclen][trglen].size() < min_allowed) { - cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraint of min indegree " << min_allowed << " with " << a[srclen][trglen].size() << " in edges\n"; + if (a[srclen][trglen].empty()) { + cerr << "Sequence pair with lengths (" << srclen << ',' << trglen << ") violates reachability constraints\n"; + nodes = 0; return; } diff --git a/gi/pf/reachability.h b/gi/pf/reachability.h index 03967d44..1e22c76a 100644 --- a/gi/pf/reachability.h +++ b/gi/pf/reachability.h @@ -18,19 +18,17 @@ struct Reachability { boost::multi_array node_addresses; // na[src_covered][trg_covered] -- the index of the node in a one-dimensional array (of size "nodes") boost::multi_array >, 2> valid_deltas; // valid_deltas[src_covered][trg_covered] list of valid transitions leaving a particular node - // filter_ratio says if the number of outgoing edges from the first cell is less than - // src_max * trg_max * filter_rat^2 then mark as non reachable - Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio = 0.0) : + Reachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len) : nodes(), edges(boost::extents[srclen][trglen][src_max_phrase_len+1][trg_max_phrase_len+1]), max_src_delta(boost::extents[srclen][trglen]), node_addresses(boost::extents[srclen][trglen]), 
valid_deltas(boost::extents[srclen][trglen]) { - ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len, filter_ratio); + ComputeReachability(srclen, trglen, src_max_phrase_len, trg_max_phrase_len); } private: - void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len, double filter_ratio); + void ComputeReachability(int srclen, int trglen, int src_max_phrase_len, int trg_max_phrase_len); }; #endif diff --git a/gi/pf/transliterations.cc b/gi/pf/transliterations.cc index 8ea4ebd2..2200715e 100644 --- a/gi/pf/transliterations.cc +++ b/gi/pf/transliterations.cc @@ -5,14 +5,173 @@ #include "boost/shared_ptr.hpp" +#include "backward.h" #include "filelib.h" -#include "ccrp.h" +#include "tdict.h" +#include "trule.h" +#include "filelib.h" +#include "ccrp_nt.h" #include "m.h" #include "reachability.h" using namespace std; using namespace std::tr1; +struct TruncatedConditionalLengthModel { + TruncatedConditionalLengthModel(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + plens(max_src_size+1, vector(max_trg_size+1, 0.0)) { + for (unsigned i = 1; i <= max_src_size; ++i) { + prob_t z = prob_t::Zero(); + for (unsigned j = 1; j <= max_trg_size; ++j) + z += (plens[i][j] = prob_t(0.01 + exp(Md::log_poisson(j, i * expected_src_to_trg_ratio)))); + for (unsigned j = 1; j <= max_trg_size; ++j) + plens[i][j] /= z; + //for (unsigned j = 1; j <= max_trg_size; ++j) + // cerr << "P(trg_len=" << j << " | src_len=" << i << ") = " << plens[i][j] << endl; + } + } + + // return p(tlen | slen) for *chunks* not full words + inline const prob_t& operator()(int slen, int tlen) const { + return plens[slen][tlen]; + } + + vector > plens; +}; + +struct CondBaseDist { + CondBaseDist(unsigned max_src_size, unsigned max_trg_size, double expected_src_to_trg_ratio) : + tclm(max_src_size, max_trg_size, expected_src_to_trg_ratio) {} + + prob_t operator()(const vector& src, unsigned sf, unsigned st, + const vector& trg, unsigned tf, unsigned tt) const { + prob_t p = tclm(st - sf, tt - tf); // target len | source length ~ TCLM(source len) + assert(!"not impl"); + return p; + } + inline prob_t operator()(const vector& src, const vector& trg) const { + return (*this)(src, 0, src.size(), trg, 0, trg.size()); + } + TruncatedConditionalLengthModel tclm; +}; + +// represents transliteration phrase probabilities, e.g. +// p( a l - | A l ) , p( o | A w ) , ... 
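// Gloss of the model below (notation introduced here): each source chunk
// rule.f_ keys a CCRP_NoTable over target chunks, so RuleProbability reduces
// to the one-parameter CRP predictive rule from utils/ccrp_nt.h,
//   p(rule | f) = (count(f, rule) + alpha * p0(rule)) / (count(f) + alpha),
// where p0 is CondBaseDist: for now only a truncated Poisson over the target
// chunk length given the source chunk length (the per-character factor is
// still marked "not impl").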
+struct TransliterationChunkConditionalModel { + explicit TransliterationChunkConditionalModel(const CondBaseDist& pp0) : + d(0.0), + strength(1.0), + rp0(pp0) { + } + + void Summary() const { + std::cerr << "Number of conditioning contexts: " << r.size() << std::endl; + for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) { + std::cerr << TD::GetString(it->first) << " \t(\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl; + for (CCRP_NoTable::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2) + std::cerr << " " << i2->second << '\t' << i2->first << std::endl; + } + } + + int DecrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + assert(it != r.end()); + int count = it->second.decrement(rule); + if (count) { + if (it->second.num_customers() == 0) r.erase(it); + } + return count; + } + + int IncrementRule(const TRule& rule) { + RuleModelHash::iterator it = r.find(rule.f_); + if (it == r.end()) { + it = r.insert(make_pair(rule.f_, CCRP_NoTable(strength))).first; + } + int count = it->second.increment(rule); + return count; + } + + void IncrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + IncrementRule(*rules[i]); + } + + void DecrementRules(const std::vector& rules) { + for (int i = 0; i < rules.size(); ++i) + DecrementRule(*rules[i]); + } + + prob_t RuleProbability(const TRule& rule) const { + prob_t p; + RuleModelHash::const_iterator it = r.find(rule.f_); + if (it == r.end()) { + p = rp0(rule.f_, rule.e_); + } else { + p = it->second.prob(rule, rp0(rule.f_, rule.e_)); + } + return p; + } + + double LogLikelihood(const double& dd, const double& aa) const { + if (aa <= -dd) return -std::numeric_limits::infinity(); + //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); + double llh = //Md::log_beta_density(dd, 1, 1) + + Md::log_gamma_density(dd + aa, 1, 1); + typename std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::const_iterator it; + for (it = r.begin(); it != r.end(); ++it) + llh += it->second.log_crp_prob(aa); + return llh; + } + + struct AlphaResampler { + AlphaResampler(const TransliterationChunkConditionalModel& m) : m_(m) {} + const TransliterationChunkConditionalModel& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.d, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng) { + typename std::tr1::unordered_map, CCRP_NoTable, boost::hash > >::iterator it; + //const unsigned nloop = 5; + const unsigned niterations = 10; + //DiscountResampler dr(*this); + AlphaResampler ar(*this); +#if 0 + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + d = slice_sampler1d(dr, d, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } +#endif + strength = slice_sampler1d(ar, strength, *rng, -d, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "CTMModel(alpha=" << strength << ") = " << LogLikelihood(d, strength) << std::endl; + for (it = r.begin(); it != r.end(); ++it) { +#if 0 + it->second.set_discount(d); +#endif + it->second.set_alpha(strength); + } + } + + prob_t Likelihood() const { + prob_t p; p.logeq(LogLikelihood(d, strength)); + return p; + } + + const CondBaseDist& rp0; + 
typedef std::tr1::unordered_map, + CCRP_NoTable, + boost::hash > > RuleModelHash; + RuleModelHash r; + double d, strength; +}; + struct GraphStructure { GraphStructure() : r() {} // leak memory - these are basically static @@ -20,9 +179,9 @@ struct GraphStructure { bool IsReachable() const { return r->nodes > 0; } }; -struct BackwardEstimates { - BackwardEstimates() : gs(), backward() {} - explicit BackwardEstimates(const GraphStructure& g) : +struct ProbabilityEstimates { + ProbabilityEstimates() : gs(), backward() {} + explicit ProbabilityEstimates(const GraphStructure& g) : gs(&g), backward() { if (g.r->nodes > 0) backward = new float[g.r->nodes]; @@ -36,24 +195,32 @@ struct BackwardEstimates { } // returns an backward estimate - double operator()(int src_covered, int trg_covered) const { + double Backward(int src_covered, int trg_covered) const { if (!backward) return 0; int ind = gs->r->node_addresses[src_covered][trg_covered]; if (ind < 0) return 0; return backward[ind]; } + + prob_t estp; + float* backward; private: const GraphStructure* gs; - float* backward; }; struct TransliterationsImpl { - TransliterationsImpl(int max_src, int max_trg, double fr) : + TransliterationsImpl(int max_src, int max_trg, double sr, const BackwardEstimator& b) : + cp0(max_src, max_trg, sr), + tccm(cp0), + be(b), kMAX_SRC_CHUNK(max_src), kMAX_TRG_CHUNK(max_trg), - kFILTER_RATIO(fr), + kS2T_RATIO(sr), tot_pairs(), tot_mem() { } + const CondBaseDist cp0; + TransliterationChunkConditionalModel tccm; + const BackwardEstimator& be; void Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { const size_t src_len = src_lets.size(); @@ -63,20 +230,29 @@ struct TransliterationsImpl { if (src_len >= graphs.size()) graphs.resize(src_len + 1); if (trg_len >= graphs[src_len].size()) graphs[src_len].resize(trg_len + 1); GraphStructure& gs = graphs[src_len][trg_len]; - if (!gs.r) - gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK, kFILTER_RATIO); + if (!gs.r) { + double rat = exp(fabs(log(trg_len / (src_len * kS2T_RATIO)))); + if (rat > 1.5 || (rat > 2.4 && src_len < 6)) { + cerr << " ** Forbidding transliterations of size " << src_len << "," << trg_len << ": " << rat << endl; + gs.r = new Reachability(src_len, trg_len, 0, 0); + } else { + gs.r = new Reachability(src_len, trg_len, kMAX_SRC_CHUNK, kMAX_TRG_CHUNK); + } + } + const Reachability& r = *gs.r; // init backward estimates - if (src >= bes.size()) bes.resize(src + 1); - unordered_map::iterator it = bes[src].find(trg); - if (it != bes[src].end()) return; // already initialized + if (src >= ests.size()) ests.resize(src + 1); + unordered_map::iterator it = ests[src].find(trg); + if (it != ests[src].end()) return; // already initialized - it = bes[src].insert(make_pair(trg, BackwardEstimates(gs))).first; - BackwardEstimates& b = it->second; + it = ests[src].insert(make_pair(trg, ProbabilityEstimates(gs))).first; + ProbabilityEstimates& est = it->second; if (!gs.r->nodes) return; // not derivable subject to length constraints - // TODO + be.InitializeGrid(src_lets, trg_lets, r, kS2T_RATIO, est.backward); + cerr << TD::GetString(src_lets) << " ||| " << TD::GetString(trg_lets) << " ||| " << (est.backward[0] / trg_lets.size()) << endl; tot_pairs++; tot_mem += sizeof(float) * gs.r->nodes; } @@ -92,8 +268,11 @@ struct TransliterationsImpl { const vector& tv = graphs[src.size()]; assert(trg.size() < tv.size()); const GraphStructure& gs = tv[trg.size()]; - // TODO: do prob - return prob_t::Zero(); + if (gs.r->nodes == 0) 
+ return prob_t::Zero(); + const unordered_map::const_iterator it = ests[s].find(t); + assert(it != ests[s].end()); + return it->second.estp; } void GraphSummary() const { @@ -126,15 +305,15 @@ struct TransliterationsImpl { const int kMAX_SRC_CHUNK; const int kMAX_TRG_CHUNK; - const double kFILTER_RATIO; + const double kS2T_RATIO; unsigned tot_pairs; size_t tot_mem; vector > graphs; // graphs[src_len][trg_len] - vector > bes; // bes[src][trg] + vector > ests; // ests[src][trg] }; -Transliterations::Transliterations(int max_src, int max_trg, double fr) : - pimpl_(new TransliterationsImpl(max_src, max_trg, fr)) {} +Transliterations::Transliterations(int max_src, int max_trg, double sr, const BackwardEstimator& be) : + pimpl_(new TransliterationsImpl(max_src, max_trg, sr, be)) {} Transliterations::~Transliterations() { delete pimpl_; } void Transliterations::Initialize(WordID src, const vector& src_lets, WordID trg, const vector& trg_lets) { diff --git a/gi/pf/transliterations.h b/gi/pf/transliterations.h index ea9f9d3f..49d14684 100644 --- a/gi/pf/transliterations.h +++ b/gi/pf/transliterations.h @@ -5,11 +5,12 @@ #include "wordid.h" #include "prob.h" +struct BackwardEstimator; struct TransliterationsImpl; struct Transliterations { // max_src and max_trg indicate how big the transliteration phrases can be // see reachability.h for information about filter_ratio - explicit Transliterations(int max_src, int max_trg, double filter_ratio); + explicit Transliterations(int max_src, int max_trg, double s2t_rat, const BackwardEstimator& be); ~Transliterations(); void Initialize(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); void Forbid(WordID src, const std::vector& src_lets, WordID trg, const std::vector& trg_lets); diff --git a/utils/ccrp_nt.h b/utils/ccrp_nt.h index 79321493..6efbfc78 100644 --- a/utils/ccrp_nt.h +++ b/utils/ccrp_nt.h @@ -11,6 +11,7 @@ #include #include "sampler.h" #include "slice_sampler.h" +#include "m.h" // Chinese restaurant process (1 parameter) template > @@ -29,6 +30,7 @@ class CCRP_NoTable { alpha_prior_rate_(c_rate) {} double alpha() const { return alpha_; } + void set_alpha(const double& alpha) { alpha_ = alpha; assert(alpha_ > 0.0); } bool has_alpha_prior() const { return !std::isnan(alpha_prior_shape_); @@ -71,9 +73,10 @@ class CCRP_NoTable { return table_diff; } - double prob(const Dish& dish, const double& p0) const { + template + F prob(const Dish& dish, const F& p0) const { const unsigned at_table = num_customers(dish); - return (at_table + p0 * alpha_) / (num_customers_ + alpha_); + return (F(at_table) + p0 * F(alpha_)) / F(num_customers_ + alpha_); } double logprob(const Dish& dish, const double& logp0) const { @@ -85,20 +88,12 @@ class CCRP_NoTable { return log_crp_prob(alpha_); } - static double log_gamma_density(const double& x, const double& shape, const double& rate) { - assert(x >= 0.0); - assert(shape > 0.0); - assert(rate > 0.0); - const double lp = (shape-1)*log(x) - shape*log(rate) - x/rate - lgamma(shape); - return lp; - } - // taken from http://en.wikipedia.org/wiki/Chinese_restaurant_process // does not include P_0's double log_crp_prob(const double& alpha) const { double lp = 0.0; if (has_alpha_prior()) - lp += log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); + lp += Md::log_gamma_density(alpha, alpha_prior_shape_, alpha_prior_rate_); assert(lp <= 0.0); if (num_customers_) { lp += lgamma(alpha) - lgamma(alpha + num_customers_) + -- cgit v1.2.3 From 600ff8e60086c5cc197fe302bfcea113ebd15565 Mon Sep 
17 00:00:00 2001 From: Chris Dyer Date: Fri, 9 Mar 2012 22:27:12 -0500 Subject: forgotten file --- gi/pf/os_phrase.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 gi/pf/os_phrase.h diff --git a/gi/pf/os_phrase.h b/gi/pf/os_phrase.h new file mode 100644 index 00000000..dfe40cb1 --- /dev/null +++ b/gi/pf/os_phrase.h @@ -0,0 +1,15 @@ +#ifndef _OS_PHRASE_H_ +#define _OS_PHRASE_H_ + +#include +#include +#include "tdict.h" + +inline std::ostream& operator<<(std::ostream& os, const std::vector& p) { + os << '['; + for (int i = 0; i < p.size(); ++i) + os << (i==0 ? "" : " ") << TD::Convert(p[i]); + return os << ']'; +} + +#endif -- cgit v1.2.3 From ef614a1d968aebbf463ed57876fee395b4c24635 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 9 Mar 2012 23:13:09 -0500 Subject: logging after alignment --- gi/pf/align-lexonly-pyp.cc | 1 + gi/pf/pyp_tm.cc | 7 +++++-- gi/pf/pyp_word_model.h | 2 +- utils/ccrp.h | 1 + 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index d68a4b8f..4a1d1db6 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -208,6 +208,7 @@ int main(int argc, char** argv) { } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); + aligner.model.Summary(); return 0; } diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index 94cbe7c3..b5262f47 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -54,8 +54,6 @@ struct ConditionalPYPWordModel { assert(it != r.end()); if (it->second.decrement(trglets, rng)) { base.Decrement(trglets, rng); - if (it->second.num_customers() == 0) - r.erase(it); } } @@ -84,6 +82,11 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets tmodel(new ConditionalPYPWordModel(up0)), kX(-TD::Convert("X")) {} +void PYPLexicalTranslation::Summary() const { + tmodel->Summary(); + up0->Summary(); +} + prob_t PYPLexicalTranslation::Likelihood() const { prob_t p = up0->Likelihood(); p *= tmodel->Likelihood(); diff --git a/gi/pf/pyp_word_model.h b/gi/pf/pyp_word_model.h index 800a4fd7..ff366865 100644 --- a/gi/pf/pyp_word_model.h +++ b/gi/pf/pyp_word_model.h @@ -12,7 +12,7 @@ // PYP(d,s,poisson-uniform) represented as a CRP struct PYPWordModel { - explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 7.5) : + explicit PYPWordModel(const unsigned vocab_e_size, const double mean_len = 5) : base(prob_t::One()), r(1,1,1,1,0.66,50.0), u0(-std::log(vocab_e_size)), mean_length(mean_len) {} void ResampleHyperparameters(MT19937* rng); diff --git a/utils/ccrp.h b/utils/ccrp.h index 439d7e1e..4a8b80e7 100644 --- a/utils/ccrp.h +++ b/utils/ccrp.h @@ -221,6 +221,7 @@ class CCRP { void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { assert(has_discount_prior() || has_strength_prior()); + if (num_customers() == 0) return; DiscountResampler dr(*this); StrengthResampler sr(*this); for (int iter = 0; iter < nloop; ++iter) { -- cgit v1.2.3 From e320dd47380f8f3a628073f926a56e4321146ebd Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 00:00:27 -0500 Subject: use quasi model 2 instead of uniform alignments --- gi/pf/align-lexonly-pyp.cc | 6 ++++++ gi/pf/quasi_model2.h | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 gi/pf/quasi_model2.h diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 4a1d1db6..0c90b6ce 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -11,6 
+11,7 @@ #include "sampler.h" #include "corpus.h" #include "pyp_tm.h" +#include "quasi_model2.h" using namespace std; namespace po = boost::program_options; @@ -61,12 +62,14 @@ struct Aligner { Aligner(const vector >& lets, int num_letters, vector* c) : corpus(*c), model(lets, num_letters), + paj(4, 0.08), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); } vector& corpus; PYPLexicalTranslation model; + const QuasiModel2 paj; const WordID kNULL; void ResampleHyperparameters() { @@ -83,6 +86,7 @@ struct Aligner { a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, asp.trg[j], &*prng); + // TODO factor in alignment prob } } cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; @@ -101,6 +105,8 @@ struct Aligner { for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); ss[prop_a_j] = model.Prob(prop_f, e_j); + // TODO configurable + ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size()); } a_j = prng->SelectSample(ss); f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h new file mode 100644 index 00000000..0095289f --- /dev/null +++ b/gi/pf/quasi_model2.h @@ -0,0 +1,46 @@ +#ifndef _QUASI_MODEL2_H_ +#define _QUASI_MODEL2_H_ + +#include +#include +#include "prob.h" +#include "array2d.h" + +struct QuasiModel2 { + explicit QuasiModel2(double alpha, double pnull = 0.1) : + alpha_(alpha), + pnull_(pnull), + pnotnull_(1 - pnull), + z_(1000,1000) {} + // a_j = 0 => NULL; src_len does *not* include null + prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + if (!a_j) return pnull_; + std::vector& zv = z_(src_len, trg_len); + if (zv.size() == 0) + zv.resize(trg_len); + + prob_t& z = zv[j]; + if (z.is_0()) z = ComputeZ(j, src_len, trg_len); + + prob_t p; + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); + p *= pnotnull_; + p /= z; + return p; + } + private: + prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + prob_t p, z = prob_t::Zero(); + for (int a_j = 1; a_j <= src_len; ++a_j) { + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); + z += p; + } + return z; + } + double alpha_; + const prob_t pnull_; + const prob_t pnotnull_; + mutable Array2D > z_; +}; + +#endif -- cgit v1.2.3 From de136247bdedb960dc0f317cd65b28c02a441532 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 01:08:23 -0500 Subject: tie params --- gi/pf/pyp_lm.cc | 66 +++++++++------------------------------- gi/pf/pyp_tm.cc | 2 ++ gi/pf/tied_resampler.h | 82 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 99 insertions(+), 51 deletions(-) create mode 100644 gi/pf/tied_resampler.h diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 52e6be2c..85635b8f 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -11,6 +11,7 @@ #include "tdict.h" #include "sampler.h" #include "ccrp.h" +#include "tied_resampler.h" // A not very memory-efficient implementation of an N-gram LM based on PYPs // as described in Y.-W. Teh. 
(2006) A Hierarchical Bayesian Language Model @@ -66,7 +67,7 @@ template<> struct PYPLM<0> { void increment(WordID, const vector&, MT19937*) { ++draws; } void decrement(WordID, const vector&, MT19937*) { --draws; assert(draws >= 0); } double prob(WordID, const vector&) const { return p0; } - void resample_hyperparameters(MT19937*, const unsigned, const unsigned) {} + void resample_hyperparameters(MT19937*) {} double log_likelihood() const { return draws * log(p0); } const double p0; int draws; @@ -76,16 +77,17 @@ template<> struct PYPLM<0> { template struct PYPLM { PYPLM(unsigned vs, double da, double db, double ss, double sr) : backoff(vs, da, db, ss, sr), - discount_a(da), discount_b(db), - strength_s(ss), strength_r(sr), - d(0.8), strength(1.0), lookup(N-1) {} + tr(da, db, ss, sr, 0.8, 1.0), + lookup(N-1) {} void increment(WordID w, const vector& context, MT19937* rng) { const double bo = backoff.prob(w, context); for (unsigned i = 0; i < N-1; ++i) lookup[i] = context[context.size() - 1 - i]; typename unordered_map, CCRP, boost::hash > >::iterator it = p.find(lookup); - if (it == p.end()) - it = p.insert(make_pair(lookup, CCRP(d,strength))).first; + if (it == p.end()) { + it = p.insert(make_pair(lookup, CCRP(0.5,1))).first; + tr.Add(&it->second); // add to resampler + } if (it->second.increment(w, bo, rng)) backoff.increment(w, context, rng); } @@ -107,59 +109,21 @@ template struct PYPLM { } double log_likelihood() const { - return log_likelihood(d, strength) + backoff.log_likelihood(); - } - - double log_likelihood(const double& dd, const double& aa) const { - if (aa <= -dd) return -std::numeric_limits::infinity(); - //double llh = Md::log_beta_density(dd, 10, 3) + Md::log_gamma_density(aa, 1, 1); - double llh = Md::log_beta_density(dd, discount_a, discount_b) + - Md::log_gamma_density(aa + dd, strength_s, strength_r); + double llh = backoff.log_likelihood(); typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) - llh += it->second.log_crp_prob(dd, aa); + llh += it->second.log_crp_prob(); + // TODO parametric likelihood from TiedResampler return llh; } - struct DiscountResampler { - DiscountResampler(const PYPLM& m) : m_(m) {} - const PYPLM& m_; - double operator()(const double& proposed_discount) const { - return m_.log_likelihood(proposed_discount, m_.strength); - } - }; - - struct AlphaResampler { - AlphaResampler(const PYPLM& m) : m_(m) {} - const PYPLM& m_; - double operator()(const double& proposed_strength) const { - return m_.log_likelihood(m_.d, proposed_strength); - } - }; - - void resample_hyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { - DiscountResampler dr(*this); - AlphaResampler ar(*this); - for (int iter = 0; iter < nloop; ++iter) { - strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - double min_discount = std::numeric_limits::min(); - if (strength < 0.0) min_discount -= strength; - d = slice_sampler1d(dr, d, *rng, min_discount, - 1.0, 0.0, niterations, 100*niterations); - } - strength = slice_sampler1d(ar, strength, *rng, -d + std::numeric_limits::min(), - std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); - typename unordered_map, CCRP, boost::hash > >::iterator it; - cerr << "PYPLM<" << N << ">(d=" << d << ",a=" << strength << ") = " << log_likelihood(d, strength) << endl; - for (it = p.begin(); it != p.end(); ++it) { - it->second.set_discount(d); - 
it->second.set_strength(strength); - } - backoff.resample_hyperparameters(rng, nloop, niterations); + void resample_hyperparameters(MT19937* rng) { + tr.ResampleHyperparameters(rng); + backoff.resample_hyperparameters(rng); } PYPLM backoff; + TiedResampler > tr; double discount_a, discount_b, strength_s, strength_r; double d, strength; mutable vector lookup; // thread-local diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index b5262f47..73104fe9 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -11,6 +11,8 @@ #include "ccrp.h" #include "pyp_word_model.h" +#include "tied_resampler.h" + using namespace std; using namespace std::tr1; diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h new file mode 100644 index 00000000..208fb9c7 --- /dev/null +++ b/gi/pf/tied_resampler.h @@ -0,0 +1,82 @@ +#ifndef _TIED_RESAMPLER_H_ +#define _TIED_RESAMPLER_H_ + +#include +#include "sampler.h" +#include "slice_sampler.h" +#include "m.h" + +template +struct TiedResampler { + explicit TiedResampler(double da, double db, double ss, double sr, double d=0.5, double s=1.0) : + d_alpha(da), + d_beta(db), + s_shape(ss), + s_rate(sr), + discount(d), + strength(s) {} + + void Add(CRP* crp) { + crps.insert(crp); + crp->set_discount(discount); + crp->set_strength(strength); + assert(!crp->has_discount_prior()); + assert(!crp->has_strength_prior()); + } + + void Remove(CRP* crp) { + crps.erase(crp); + } + + double LogLikelihood(double d, double s) const { + if (s <= -d) return -std::numeric_limits::infinity(); + double llh = Md::log_beta_density(d, d_alpha, d_beta) + + Md::log_gamma_density(d + s, s_shape, s_rate); + for (typename std::set::iterator it = crps.begin(); it != crps.end(); ++it) + llh += (*it)->log_crp_prob(d, s); + return llh; + } + + struct DiscountResampler { + DiscountResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_discount) const { + return m_.LogLikelihood(proposed_discount, m_.strength); + } + }; + + struct AlphaResampler { + AlphaResampler(const TiedResampler& m) : m_(m) {} + const TiedResampler& m_; + double operator()(const double& proposed_strength) const { + return m_.LogLikelihood(m_.discount, proposed_strength); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + const DiscountResampler dr(*this); + const AlphaResampler ar(*this); + for (int iter = 0; iter < nloop; ++iter) { + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + double min_discount = std::numeric_limits::min(); + if (strength < 0.0) min_discount -= strength; + discount = slice_sampler1d(dr, discount, *rng, min_discount, + 1.0, 0.0, niterations, 100*niterations); + } + strength = slice_sampler1d(ar, strength, *rng, -discount + std::numeric_limits::min(), + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + std::cerr << "TiedCRPs(d=" << discount << ",s=" + << strength << ") = " << LogLikelihood(discount, strength) << std::endl; + for (typename std::set::iterator it = crps.begin(); it != crps.end(); ++it) { + (*it)->set_discount(discount); + (*it)->set_strength(strength); + } + } + private: + std::set crps; + const double d_alpha, d_beta, s_shape, s_rate; + double discount, strength; +}; + +#endif -- cgit v1.2.3 From 38f28be7cd2bada87ebad78994e3c938e10c2cce Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 12:56:15 -0500 Subject: ready to infer 
alignment parameters --- gi/pf/Makefile.am | 4 +- gi/pf/align-lexonly-pyp.cc | 22 ++- gi/pf/align-lexonly.cc | 332 --------------------------------------------- gi/pf/pyp_tm.cc | 6 +- gi/pf/quasi_model2.h | 115 ++++++++++++---- gi/pf/tied_resampler.h | 31 +++++ 6 files changed, 143 insertions(+), 367 deletions(-) delete mode 100644 gi/pf/align-lexonly.cc diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am index 4ce72ba1..f9c979d0 100644 --- a/gi/pf/Makefile.am +++ b/gi/pf/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl +bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly-pyp learn_cfg pyp_lm nuisance_test align-tl noinst_LIBRARIES = libpf.a @@ -7,8 +7,6 @@ libpf_a_SOURCES = base_distributions.cc reachability.cc cfg_wfst_composer.cc cor nuisance_test_SOURCES = nuisance_test.cc nuisance_test_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz -align_lexonly_SOURCES = align-lexonly.cc - align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc align_lexonly_pyp_LDADD = libpf.a $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a $(top_srcdir)/klm/lm/libklm.a $(top_srcdir)/klm/util/libklm_util.a -lz diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 0c90b6ce..68cb9192 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -61,15 +61,15 @@ struct AlignedSentencePair { struct Aligner { Aligner(const vector >& lets, int num_letters, vector* c) : corpus(*c), + paj_model(4, 0.08), model(lets, num_letters), - paj(4, 0.08), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); } vector& corpus; + QuasiModel2 paj_model; PYPLexicalTranslation model; - const QuasiModel2 paj; const WordID kNULL; void ResampleHyperparameters() { @@ -86,10 +86,12 @@ struct Aligner { a_j = prng->next() * (1 + asp.src.size()); const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, asp.trg[j], &*prng); - // TODO factor in alignment prob + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "Corpus intialized randomly. LLH = " << model.Likelihood() << endl; + cerr << "Corpus intialized randomly." << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } void ResampleCorpus() { @@ -101,19 +103,25 @@ struct Aligner { const WordID e_j = asp.trg[j]; WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); model.Decrement(f_a_j, e_j, prng); + paj_model.Decrement(a_j, j, asp.src.size(), asp.trg.size()); for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); ss[prop_a_j] = model.Prob(prop_f, e_j); - // TODO configurable - ss[prop_a_j] *= paj.Pa_j(prop_a_j, j, asp.src.size(), asp.trg.size()); + ss[prop_a_j] *= paj_model.Prob(prop_a_j, j, asp.src.size(), asp.trg.size()); } a_j = prng->SelectSample(ss); f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); model.Increment(f_a_j, e_j, prng); + paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << model.Likelihood() << " " << model.UniqueConditioningContexts() << endl; + cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() + << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; + } + + prob_t Likelihood() const { + return model.Likelihood() * paj_model.Likelihood(); } }; diff --git a/gi/pf/align-lexonly.cc b/gi/pf/align-lexonly.cc deleted file mode 100644 index dbc9dc07..00000000 --- a/gi/pf/align-lexonly.cc +++ /dev/null @@ -1,332 +0,0 @@ -#include -#include -#include - -#include -#include -#include - -#include "array2d.h" -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" -#include "trule.h" -#include "tdict.h" -#include "stringlib.h" -#include "filelib.h" -#include "dict.h" -#include "sampler.h" -#include "ccrp_nt.h" -#include "corpus.h" -#include "ngram_base.h" - -using namespace std; -using namespace tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("samples,s",po::value()->default_value(1000),"Number of samples") - ("input,i",po::value(),"Read parallel data from") - ("random_seed,S",po::value(), "Random seed"); - po::options_description clo("Command line options"); - clo.add_options() - ("config", po::value(), "Configuration file") - ("help,h", "Print this help message and exit"); - po::options_description dconfig_options, dcmdline_options; - dconfig_options.add(opts); - dcmdline_options.add(opts).add(clo); - - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("config")) { - ifstream config((*conf)["config"].as().c_str()); - po::store(po::parse_config_file(config, dconfig_options), *conf); - } - po::notify(*conf); - - if (conf->count("help") || (conf->count("input") == 0)) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -shared_ptr prng; - -struct LexicalAlignment { - unsigned char src_index; - bool is_transliteration; - vector > derivation; -}; - -struct AlignedSentencePair { - vector src; - vector trg; - vector a; - Array2D posterior; -}; - -struct HierarchicalWordBase { - explicit HierarchicalWordBase(const unsigned vocab_e_size) : - base(prob_t::One()), r(25,25,10), u0(-log(vocab_e_size)) {} - - void ResampleHyperparameters(MT19937* rng) { - r.resample_hyperparameters(rng); - } - - inline double logp0(const vector& s) const { - return s.size() * u0; - } - - // return p0 of rule.e_ - prob_t operator()(const TRule& rule) const { - prob_t p; p.logeq(r.logprob(rule.e_, logp0(rule.e_))); - return p; - } - - void Increment(const TRule& rule) { - if (r.increment(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base *= p; - } - } - - void Decrement(const TRule& rule) { - if (r.decrement(rule.e_)) { - prob_t p; p.logeq(logp0(rule.e_)); - base /= p; - } - } - - prob_t Likelihood() const { - prob_t p; p.logeq(r.log_crp_prob()); - p *= base; - return p; - } - - void Summary() const { - cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << " (\\alpha=" << r.alpha() << ')' << endl; - for (CCRP_NoTable >::const_iterator it = r.begin(); it != r.end(); ++it) - cerr << " " << it->second << '\t' << TD::GetString(it->first) << endl; - } - - prob_t base; - CCRP_NoTable > r; - const double u0; -}; - -struct BasicLexicalAlignment { - explicit 
BasicLexicalAlignment(const vector >& lets, - const unsigned words_e, - const unsigned letters_e, - vector* corp) : - letters(lets), - corpus(*corp), - up0("fr-en.10k.translit-base.txt.gz"), - //up0(words_e), - //up0("en.chars.1gram", letters_e), - //up0("en.words.1gram"), - //up0(letters_e), - //up0("en.chars.2gram"), - tmodel(up0) { - } - - void InstantiateRule(const WordID src, - const WordID trg, - TRule* rule) const { - static const WordID kX = TD::Convert("X") * -1; - rule->lhs_ = kX; - rule->e_ = letters[trg]; - rule->f_ = letters[src]; - } - - void InitializeRandom() { - const WordID kNULL = TD::Convert("NULL"); - cerr << "Initializing with random alignments ...\n"; - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - asp.a.resize(asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - const unsigned char a_j = prng->next() * (1 + asp.src.size()); - const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - TRule r; - InstantiateRule(f_a_j, asp.trg[j], &r); - asp.a[j].is_transliteration = false; - asp.a[j].src_index = a_j; - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << Likelihood() << endl; - } - - prob_t Likelihood() const { - prob_t p = tmodel.Likelihood(); - p *= up0.Likelihood(); - return p; - } - - void ResampleHyperparemeters() { - cerr << " LLH_prev = " << Likelihood() << flush; - tmodel.ResampleHyperparameters(&*prng); - up0.ResampleHyperparameters(&*prng); - cerr << "\tLLH_post = " << Likelihood() << endl; - } - - void ResampleCorpus(); - - const vector >& letters; // spelling dictionary - vector& corpus; - //PhraseConditionalUninformativeBase up0; - //PhraseConditionalUninformativeUnigramBase up0; - //UnigramWordBase up0; - //HierarchicalUnigramBase up0; - TableLookupBase up0; - //HierarchicalWordBase up0; - //PoissonUniformUninformativeBase up0; - //CompletelyUniformBase up0; - //FixedNgramBase up0; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; - //ConditionalTranslationModel tmodel; -}; - -void BasicLexicalAlignment::ResampleCorpus() { - static const WordID kNULL = TD::Convert("NULL"); - for (unsigned i = 0; i < corpus.size(); ++i) { - AlignedSentencePair& asp = corpus[i]; - SampleSet ss; ss.resize(asp.src.size() + 1); - for (unsigned j = 0; j < asp.trg.size(); ++j) { - TRule r; - unsigned char& a_j = asp.a[j].src_index; - WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.DecrementRule(r)) - up0.Decrement(r); - - for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) { - const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL); - InstantiateRule(prop_f, asp.trg[j], &r); - ss[prop_a_j] = tmodel.RuleProbability(r); - } - a_j = prng->SelectSample(ss); - f_a_j = (a_j ? 
asp.src[a_j - 1] : kNULL); - InstantiateRule(f_a_j, asp.trg[j], &r); - if (tmodel.IncrementRule(r)) - up0.Increment(r); - } - } - cerr << " LLH = " << tmodel.Likelihood() << endl; -} - -void ExtractLetters(const set& v, vector >* l, set* letset = NULL) { - for (set::const_iterator it = v.begin(); it != v.end(); ++it) { - if (*it >= l->size()) { l->resize(*it + 1); } - vector& letters = (*l)[*it]; - if (letters.size()) continue; // if e and f have the same word - - const string& w = TD::Convert(*it); - - size_t cur = 0; - while (cur < w.size()) { - const size_t len = UTF8Len(w[cur]); - letters.push_back(TD::Convert(w.substr(cur, len))); - if (letset) letset->insert(letters.back()); - cur += len; - } - } -} - -void Debug(const AlignedSentencePair& asp) { - cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl; - Array2D a(asp.src.size(), asp.trg.size()); - for (unsigned j = 0; j < asp.trg.size(); ++j) - if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true; - cerr << a << endl; -} - -void AddSample(AlignedSentencePair* asp) { - for (unsigned j = 0; j < asp->trg.size(); ++j) - asp->posterior(asp->a[j].src_index, j)++; -} - -void WriteAlignments(const AlignedSentencePair& asp) { - bool first = true; - for (unsigned j = 0; j < asp.trg.size(); ++j) { - int src_index = -1; - int mc = -1; - for (unsigned i = 0; i <= asp.src.size(); ++i) { - if (asp.posterior(i, j) > mc) { - mc = asp.posterior(i, j); - src_index = i; - } - } - - if (src_index) { - if (first) first = false; else cout << ' '; - cout << (src_index - 1) << '-' << j; - } - } - cout << endl; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - - if (conf.count("random_seed")) - prng.reset(new MT19937(conf["random_seed"].as())); - else - prng.reset(new MT19937); -// MT19937& rng = *prng; - - vector > corpuse, corpusf; - set vocabe, vocabf; - corpus::ReadParallelCorpus(conf["input"].as(), &corpusf, &corpuse, &vocabf, &vocabe); - cerr << "f-Corpus size: " << corpusf.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabf.size() << " types\n"; - cerr << "f-Corpus size: " << corpuse.size() << " sentences\n"; - cerr << "f-Vocabulary size: " << vocabe.size() << " types\n"; - assert(corpusf.size() == corpuse.size()); - - vector corpus(corpuse.size()); - for (unsigned i = 0; i < corpuse.size(); ++i) { - corpus[i].src.swap(corpusf[i]); - corpus[i].trg.swap(corpuse[i]); - corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size()); - } - corpusf.clear(); corpuse.clear(); - - vocabf.insert(TD::Convert("NULL")); - vector > letters(TD::NumWords()); - set letset; - ExtractLetters(vocabe, &letters, &letset); - ExtractLetters(vocabf, &letters, NULL); - letters[TD::Convert("NULL")].clear(); - - BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus); - x.InitializeRandom(); - const unsigned samples = conf["samples"].as(); - for (int i = 0; i < samples; ++i) { - for (int j = 395; j < 397; ++j) Debug(corpus[j]); - cerr << i << "\t" << x.tmodel.r.size() << "\t"; - if (i % 10 == 0) x.ResampleHyperparemeters(); - x.ResampleCorpus(); - if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); - } - for (unsigned i = 0; i < corpus.size(); ++i) - WriteAlignments(corpus[i]); - //ModelAndData posterior(x, &corpus, vocabe, vocabf); - x.tmodel.Summary(); - x.up0.Summary(); - - //posterior.Sample(); - - return 0; -} diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index 73104fe9..bf5a6497 100644 --- 
a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -10,7 +10,6 @@ #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" - #include "tied_resampler.h" using namespace std; @@ -18,7 +17,7 @@ using namespace std::tr1; template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b) {} + ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -32,6 +31,7 @@ struct ConditionalPYPWordModel { void ResampleHyperparameters(MT19937* rng) { for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) it->second.resample_hyperparameters(rng); + btr.ResampleHyperparameters(rng); } prob_t Prob(const WordID src, const vector& trglets) const { @@ -72,7 +72,9 @@ struct ConditionalPYPWordModel { return r.size(); } + // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; }; diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h index 0095289f..8ec0a400 100644 --- a/gi/pf/quasi_model2.h +++ b/gi/pf/quasi_model2.h @@ -3,44 +3,113 @@ #include #include +#include +#include "boost/functional.hpp" #include "prob.h" #include "array2d.h" +struct AlignmentObservation { + AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} + AlignmentObservation(unsigned sl, unsigned tl, unsigned tw, unsigned sw) : + src_len(sl), trg_len(tl), j(tw), a_j(sw) {} + unsigned short src_len; + unsigned short trg_len; + unsigned short j; + unsigned short a_j; +}; + +inline size_t hash_value(const AlignmentObservation& o) { + return reinterpret_cast(o); +} + +inline bool operator==(const AlignmentObservation& a, const AlignmentObservation& b) { + return hash_value(a) == hash_value(b); +} + struct QuasiModel2 { explicit QuasiModel2(double alpha, double pnull = 0.1) : alpha_(alpha), pnull_(pnull), - pnotnull_(1 - pnull), - z_(1000,1000) {} + pnotnull_(1 - pnull) {} + // a_j = 0 => NULL; src_len does *not* include null - prob_t Pa_j(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { + prob_t Prob(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) const { if (!a_j) return pnull_; - std::vector& zv = z_(src_len, trg_len); - if (zv.size() == 0) - zv.resize(trg_len); - - prob_t& z = zv[j]; - if (z.is_0()) z = ComputeZ(j, src_len, trg_len); - - prob_t p; - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - p *= pnotnull_; - p /= z; + return pnotnull_ * + prob_t(UnnormalizedProb(a_j, j, src_len, trg_len, alpha_) / GetOrComputeZ(j, src_len, trg_len)); + } + + void Increment(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + assert(a_j <= src_len); + assert(j < trg_len); + ++obs_[AlignmentObservation(src_len, trg_len, j, a_j)]; + } + + void Decrement(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len) { + const AlignmentObservation ao(src_len, trg_len, j, a_j); + int &cc = obs_[ao]; + assert(cc > 0); + --cc; + if (!cc) obs_.erase(ao); + } + + prob_t Likelihood() const { + return Likelihood(alpha_, pnull_.as_float()); + } + + prob_t Likelihood(double alpha, double ppnull) const { + const prob_t pnull(ppnull); + const prob_t pnotnull(1 - ppnull); + + prob_t p = prob_t::One(); + for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { + const AlignmentObservation& ao = it->first; + if (ao.a_j) { + double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, 
alpha); + prob_t pa(u / z); + pa *= pnotnull; + pa.poweq(it->second); + p *= pa; + } else { + p *= pnull.pow(it->second); + } + } return p; } + private: - prob_t ComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { - prob_t p, z = prob_t::Zero(); - for (int a_j = 1; a_j <= src_len; ++a_j) { - p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha_); - z += p; - } + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + } + + static double ComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + double z = 0; + for (int a_j = 1; a_j <= src_len; ++a_j) + z += UnnormalizedProb(a_j, j, src_len, trg_len, alpha); return z; } + + const double& GetOrComputeZ(unsigned j, unsigned src_len, unsigned trg_len) const { + if (src_len >= zcache_.size()) + zcache_.resize(src_len + 1); + if (trg_len >= zcache_[src_len].size()) + zcache_[src_len].resize(trg_len + 1); + std::vector& zv = zcache_[src_len][trg_len]; + if (zv.size() == 0) + zv.resize(trg_len); + double& z = zv[j]; + if (!z) + z = ComputeZ(j, src_len, trg_len, alpha_); + return z; + } + double alpha_; - const prob_t pnull_; - const prob_t pnotnull_; - mutable Array2D > z_; + prob_t pnull_; + prob_t pnotnull_; + mutable std::vector > > zcache_; + typedef std::tr1::unordered_map > ObsCount; + ObsCount obs_; }; #endif diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h index 208fb9c7..5a262f9d 100644 --- a/gi/pf/tied_resampler.h +++ b/gi/pf/tied_resampler.h @@ -2,6 +2,7 @@ #define _TIED_RESAMPLER_H_ #include +#include #include "sampler.h" #include "slice_sampler.h" #include "m.h" @@ -28,6 +29,10 @@ struct TiedResampler { crps.erase(crp); } + size_t size() const { + return crps.size(); + } + double LogLikelihood(double d, double s) const { if (s <= -d) return -std::numeric_limits::infinity(); double llh = Md::log_beta_density(d, d_alpha, d_beta) + @@ -54,6 +59,7 @@ struct TiedResampler { }; void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + if (size() == 0) { std::cerr << "EMPTY - not resampling\n"; return; } const DiscountResampler dr(*this); const AlphaResampler ar(*this); for (int iter = 0; iter < nloop; ++iter) { @@ -79,4 +85,29 @@ struct TiedResampler { double discount, strength; }; +// split according to some criterion +template +struct BinTiedResampler { + explicit BinTiedResampler(unsigned nbins) : + resamplers(nbins, TiedResampler(1,1,1,1)) {} + + void Add(unsigned bin, CRP* crp) { + resamplers[bin].Add(crp); + } + + void Remove(unsigned bin, CRP* crp) { + resamplers[bin].Remove(crp); + } + + void ResampleHyperparameters(MT19937* rng) { + for (unsigned i = 0; i < resamplers.size(); ++i) { + std::cerr << "BIN " << i << " (" << resamplers[i].size() << " CRPs): " << std::flush; + resamplers[i].ResampleHyperparameters(rng); + } + } + + private: + std::vector > resamplers; +}; + #endif -- cgit v1.2.3 From 289f96779e665ba24adca3461a624c68aa37bd99 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 14:10:04 -0500 Subject: do Bayesian inference on quasimodel2 hyperparameters --- gi/pf/align-lexonly-pyp.cc | 5 ++-- gi/pf/pyp_lm.cc | 2 +- gi/pf/pyp_tm.cc | 11 +++++---- gi/pf/quasi_model2.h | 57 +++++++++++++++++++++++++++++++++++++++++++--- gi/pf/tied_resampler.h | 11 +++++++++ 5 files changed, 75 insertions(+), 11 deletions(-) diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc 
index 68cb9192..6c054753 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -74,6 +74,7 @@ struct Aligner { void ResampleHyperparameters() { model.ResampleHyperparameters(prng); + paj_model.ResampleHyperparameters(prng); } void InitializeRandom() { @@ -216,9 +217,9 @@ int main(int argc, char** argv) { const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 7 == 6) aligner.ResampleHyperparameters(); + if (i % 10 == 9) aligner.ResampleHyperparameters(); aligner.ResampleCorpus(); - if (i > (samples / 5) && (i % 10 == 9)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); + if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } for (unsigned i = 0; i < corpus.size(); ++i) WriteAlignments(corpus[i]); diff --git a/gi/pf/pyp_lm.cc b/gi/pf/pyp_lm.cc index 85635b8f..91029688 100644 --- a/gi/pf/pyp_lm.cc +++ b/gi/pf/pyp_lm.cc @@ -113,7 +113,7 @@ template struct PYPLM { typename unordered_map, CCRP, boost::hash > >::const_iterator it; for (it = p.begin(); it != p.end(); ++it) llh += it->second.log_crp_prob(); - // TODO parametric likelihood from TiedResampler + llh += tr.LogLikelihood(); return llh; } diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index bf5a6497..34ef0ba2 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -17,7 +17,7 @@ using namespace std::tr1; template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(3) {} + ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -29,8 +29,6 @@ struct ConditionalPYPWordModel { } void ResampleHyperparameters(MT19937* rng) { - for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it) - it->second.resample_hyperparameters(rng); btr.ResampleHyperparameters(rng); } @@ -45,8 +43,11 @@ struct ConditionalPYPWordModel { void Increment(const WordID src, const vector& trglets, MT19937* rng) { RuleModelHash::iterator it = r.find(src); - if (it == r.end()) - it = r.insert(make_pair(src, CCRP >(1,1,1,1,0.5,1.0))).first; + if (it == r.end()) { + it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; + static const WordID kNULL = TD::Convert("NULL"); + btr.Add(src == kNULL ? 
0 : 1, &it->second); + } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); } diff --git a/gi/pf/quasi_model2.h b/gi/pf/quasi_model2.h index 8ec0a400..588c8f84 100644 --- a/gi/pf/quasi_model2.h +++ b/gi/pf/quasi_model2.h @@ -7,6 +7,8 @@ #include "boost/functional.hpp" #include "prob.h" #include "array2d.h" +#include "slice_sampler.h" +#include "m.h" struct AlignmentObservation { AlignmentObservation() : src_len(), trg_len(), j(), a_j() {} @@ -53,6 +55,37 @@ struct QuasiModel2 { if (!cc) obs_.erase(ao); } + struct PNullResampler { + PNullResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_pnull) const { + return log(m_.Likelihood(m_.alpha_, proposed_pnull)); + } + }; + + struct AlphaResampler { + AlphaResampler(const QuasiModel2& m) : m_(m) {} + const QuasiModel2& m_; + double operator()(const double& proposed_alpha) const { + return log(m_.Likelihood(proposed_alpha, m_.pnull_.as_float())); + } + }; + + void ResampleHyperparameters(MT19937* rng, const unsigned nloop = 5, const unsigned niterations = 10) { + const PNullResampler dr(*this); + const AlphaResampler ar(*this); + for (unsigned i = 0; i < nloop; ++i) { + double pnull = slice_sampler1d(dr, pnull_.as_float(), *rng, 0.00000001, + 1.0, 0.0, niterations, 100*niterations); + pnull_ = prob_t(pnull); + alpha_ = slice_sampler1d(ar, alpha_, *rng, 0.00000001, + std::numeric_limits::infinity(), 0.0, niterations, 100*niterations); + } + std::cerr << "QuasiModel2(alpha=" << alpha_ << ",p_null=" + << pnull_.as_float() << ") = " << Likelihood() << std::endl; + zcache_.clear(); + } + prob_t Likelihood() const { return Likelihood(alpha_, pnull_.as_float()); } @@ -61,12 +94,17 @@ struct QuasiModel2 { const prob_t pnull(ppnull); const prob_t pnotnull(1 - ppnull); - prob_t p = prob_t::One(); + prob_t p; + p.logeq(Md::log_gamma_density(alpha, 0.1, 25)); // TODO configure + assert(!p.is_0()); + prob_t prob_of_ppnull; prob_of_ppnull.logeq(Md::log_beta_density(ppnull, 2, 10)); + assert(!prob_of_ppnull.is_0()); + p *= prob_of_ppnull; for (ObsCount::const_iterator it = obs_.begin(); it != obs_.end(); ++it) { const AlignmentObservation& ao = it->first; if (ao.a_j) { - double u = UnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); - double z = ComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); + prob_t u = XUnnormalizedProb(ao.a_j, ao.j, ao.src_len, ao.trg_len, alpha); + prob_t z = XComputeZ(ao.j, ao.src_len, ao.trg_len, alpha); prob_t pa(u / z); pa *= pnotnull; pa.poweq(it->second); @@ -79,6 +117,19 @@ struct QuasiModel2 { } private: + static prob_t XUnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t p; + p.logeq(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); + return p; + } + + static prob_t XComputeZ(unsigned j, unsigned src_len, unsigned trg_len, double alpha) { + prob_t z = prob_t::Zero(); + for (int a_j = 1; a_j <= src_len; ++a_j) + z += XUnnormalizedProb(a_j, j, src_len, trg_len, alpha); + return z; + } + static double UnnormalizedProb(unsigned a_j, unsigned j, unsigned src_len, unsigned trg_len, double alpha) { return exp(-fabs(double(a_j - 1) / src_len - double(j) / trg_len) * alpha); } diff --git a/gi/pf/tied_resampler.h b/gi/pf/tied_resampler.h index 5a262f9d..6f45fbce 100644 --- a/gi/pf/tied_resampler.h +++ b/gi/pf/tied_resampler.h @@ -42,6 +42,10 @@ struct TiedResampler { return llh; } + double LogLikelihood() const { + return LogLikelihood(discount, strength); + } + struct 
DiscountResampler { DiscountResampler(const TiedResampler& m) : m_(m) {} const TiedResampler& m_; @@ -106,6 +110,13 @@ struct BinTiedResampler { } } + double LogLikelihood() const { + double llh = 0; + for (unsigned i = 0; i < resamplers.size(); ++i) + llh += resamplers[i].LogLikelihood(); + return llh; + } + private: std::vector > resamplers; }; -- cgit v1.2.3 From dfbc278c1057555fda9312291c8024049e00b7d8 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 16:42:12 -0500 Subject: frequency-based binning --- decoder/Makefile.am | 1 - decoder/ff_csplit.cc | 2 +- decoder/freqdict.cc | 29 ----------------------------- decoder/freqdict.h | 37 ++++++++++++++++++++++++++++++++----- gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++------- gi/pf/make-freq-bins.pl | 26 ++++++++++++++++++++++++++ gi/pf/pyp_tm.cc | 24 +++++++++++++++++------- gi/pf/pyp_tm.h | 7 ++++--- 8 files changed, 97 insertions(+), 53 deletions(-) delete mode 100644 decoder/freqdict.cc create mode 100755 gi/pf/make-freq-bins.pl diff --git a/decoder/Makefile.am b/decoder/Makefile.am index a00b18af..ec51d643 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -76,7 +76,6 @@ libcdec_a_SOURCES = \ ff_source_syntax.cc \ ff_bleu.cc \ ff_factory.cc \ - freqdict.cc \ lexalign.cc \ lextrans.cc \ tagger.cc \ diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index 3991d38f..c9ed996c 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl { const int fl1_; const int fl2_; const int bad_; - FreqDict freq_dict_; + FreqDict freq_dict_; set bad_words_; }; diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc deleted file mode 100644 index 9e25d346..00000000 --- a/decoder/freqdict.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include -#include "freqdict.h" -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -void FreqDict::Load(const std::string& fname) { - cerr << "Reading word frequencies: " << fname << endl; - ReadFile rf(fname); - istream& ifs = *rf.stream(); - int cc=0; - while (ifs) { - std::string word; - ifs >> word; - if (word.size() == 0) continue; - if (word[0] == '#') continue; - double count = 0; - ifs >> count; - assert(count > 0.0); // use -log(f) - counts_[TD::Convert(word)]=count; - ++cc; - if (cc % 10000 == 0) { std::cerr << "."; } - } - std::cerr << "\n"; - std::cerr << "Loaded " << cc << " words\n"; -} diff --git a/decoder/freqdict.h b/decoder/freqdict.h index 9acf0c33..4e03fadd 100644 --- a/decoder/freqdict.h +++ b/decoder/freqdict.h @@ -1,20 +1,47 @@ #ifndef _FREQDICT_H_ #define _FREQDICT_H_ +#include #include #include #include "wordid.h" +#include "filelib.h" +#include "tdict.h" +template class FreqDict { public: - void Load(const std::string& fname); - float LookUp(const WordID& word) const { - std::map::const_iterator i = counts_.find(word); - if (i == counts_.end()) return 0; + FreqDict() : max_() {} + T Max() const { return max_; } + void Load(const std::string& fname) { + std::cerr << "Reading word statistics from: " << fname << std::endl; + ReadFile rf(fname); + std::istream& ifs = *rf.stream(); + int cc=0; + std::string word; + while (ifs) { + ifs >> word; + if (word.size() == 0) continue; + if (word[0] == '#') continue; + T count = 0; + ifs >> count; + if (count > max_) max_ = count; + counts_[TD::Convert(word)]=count; + ++cc; + if (cc % 10000 == 0) { std::cerr << "."; } + } + std::cerr << "\n"; + std::cerr << "Loaded " << cc << " words\n"; + } + + T LookUp(const WordID& word) const { + typename 
std::map::const_iterator i = counts_.find(word); + if (i == counts_.end()) return T(); return i->second; } private: - std::map counts_; + T max_; + std::map counts_; }; #endif diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 6c054753..942dcf51 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") + ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") + ("p_null,0", po::value()->default_value(0.08), "probability of aligning to null") + ("align_alpha,a", po::value()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") ("input,i",po::value(),"Read parallel data from") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); @@ -59,9 +62,13 @@ struct AlignedSentencePair { }; struct Aligner { - Aligner(const vector >& lets, int num_letters, vector* c) : + Aligner(const vector >& lets, + int num_letters, + const po::variables_map& conf, + vector* c) : corpus(*c), - paj_model(4, 0.08), + paj_model(conf["align_alpha"].as(), conf["p_null"].as()), + infer_paj(conf.count("infer_alignment_hyperparameters") > 0), model(lets, num_letters), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); @@ -69,12 +76,13 @@ struct Aligner { vector& corpus; QuasiModel2 paj_model; + const bool infer_paj; PYPLexicalTranslation model; const WordID kNULL; void ResampleHyperparameters() { model.ResampleHyperparameters(prng); - paj_model.ResampleHyperparameters(prng); + if (infer_paj) paj_model.ResampleHyperparameters(prng); } void InitializeRandom() { @@ -117,8 +125,6 @@ struct Aligner { paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() - << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } prob_t Likelihood() const { @@ -211,13 +217,17 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Aligner aligner(letters, letset.size(), &corpus); + Aligner aligner(letters, letset.size(), conf, &corpus); aligner.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 10 == 9) aligner.ResampleHyperparameters(); + if (i % 10 == 9) { + aligner.ResampleHyperparameters(); + cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() + << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; + } aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i #include -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" #include 
"tdict.h" #include "ccrp.h" #include "pyp_word_model.h" @@ -15,9 +12,19 @@ using namespace std; using namespace std::tr1; -template +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict fd_; +}; + +template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -46,7 +53,9 @@ struct ConditionalPYPWordModel { if (it == r.end()) { it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; static const WordID kNULL = TD::Convert("NULL"); - btr.Add(src == kNULL ? 0 : 1, &it->second); + unsigned bin = (src == kNULL ? 0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); @@ -75,6 +84,7 @@ struct ConditionalPYPWordModel { // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + const Binner* binner; BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; @@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets const unsigned num_letters) : letters(lets), up0(new PYPWordModel(num_letters)), - tmodel(new ConditionalPYPWordModel(up0)), + tmodel(new ConditionalPYPWordModel(up0, new FreqBinner("10k.freq"))), kX(-TD::Convert("X")) {} void PYPLexicalTranslation::Summary() const { diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h index fa0fb28f..63e7c96d 100644 --- a/gi/pf/pyp_tm.h +++ b/gi/pf/pyp_tm.h @@ -5,10 +5,11 @@ #include "wordid.h" #include "prob.h" #include "sampler.h" +#include "freqdict.h" -struct TRule; +struct FreqBinner; struct PYPWordModel; -template struct ConditionalPYPWordModel; +template struct ConditionalPYPWordModel; struct PYPLexicalTranslation { explicit PYPLexicalTranslation(const std::vector >& lets, @@ -26,7 +27,7 @@ struct PYPLexicalTranslation { private: const std::vector >& letters; // spelling dictionary PYPWordModel* up0; // base distribuction (model English word) - ConditionalPYPWordModel* tmodel; // translation distributions + ConditionalPYPWordModel* tmodel; // translation distributions // (model English word | French word) const WordID kX; }; -- cgit v1.2.3