fixes

author: Chris Dyer <cdyer@Chriss-MacBook-Air.local> 2013-11-10 03:56:11 -0500
committer: Chris Dyer <cdyer@Chriss-MacBook-Air.local> 2013-11-10 03:56:11 -0500
commit: 43571516d8ba27599bd8ed58944d06715ea8533d (patch)
tree: 807ff7422338dac0e85a644be44ee69212790c4d
parent: 1e9afb904a57ff0b03edd0e94d634ef98e7d4b2a (diff)
3 files changed, 48 insertions, 391 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 39a13ad8..f02299e6 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -55,12 +55,18 @@ libcdec_a_SOURCES = \
   ff_klm.h \
   ff_lm.h \
   ff_ngrams.h \
+  ff_parse_match.h \
   ff_register.h \
   ff_rules.h \
   ff_ruleshape.h \
   ff_sample_fsa.h \
+  ff_soft_syntax.h \
+  ff_soft_syntax2.h \
   ff_source_path.h \
   ff_source_syntax.h \
+  ff_source_syntax2.h \
+  ff_source_syntax2_p.h \
+  ff_source_syntax_p.h \
   ff_spans.h \
   ff_tagger.h \
   ff_wordalign.h \
@@ -94,66 +100,66 @@ libcdec_a_SOURCES = \
   sentences.h \
   tagger.h \
   translator.h \
-  tromble_loss.h \
   trule.h \
   viterbi.h \
-  forest_writer.cc \
-  maxtrans_blunsom.cc \
+  aligner.cc \
+  apply_models.cc \
+  bottom_up_parser.cc \
+  cdec.cc \
   cdec_ff.cc \
   cfg.cc \
-  ff_external.cc \
-  rule_lexer.cc \
-  fst_translator.cc \
   csplit.cc \
-  translator.cc \
-  rescore_translator.cc \
-  scfg_translator.cc \
-  hg.cc \
-  hg_io.cc \
-  hg_remove_eps.cc \
   decoder.cc \
-  hg_intersect.cc \
-  hg_union.cc \
-  hg_sampler.cc \
-  factored_lexicon_helper.cc \
-  viterbi.cc \
-  lattice.cc \
-  aligner.cc \
-  apply_models.cc \
   earley_composer.cc \
-  phrasetable_fst.cc \
-  trule.cc \
+  factored_lexicon_helper.cc \
   ff.cc \
-  ffset.cc \
   ff_basic.cc \
-  ff_rules.cc \
-  ff_wordset.cc \
-  ff_context.cc \
+  ff_bleu.cc \
   ff_charset.cc \
-  ff_lm.cc \
+  ff_context.cc \
+  ff_csplit.cc \
+  ff_external.cc \
+  ff_factory.cc \
   ff_klm.cc \
+  ff_lm.cc \
   ff_ngrams.cc \
-  ff_spans.cc \
+  ff_parse_match.cc \
+  ff_rules.cc \
   ff_ruleshape.cc \
-  ff_wordalign.cc \
-  ff_csplit.cc \
-  ff_tagger.cc \
+  ff_soft_syntax.cc \
+  ff_soft_syntax2.cc \
   ff_source_path.cc \
-  ff_parse_match.cc \
-	ff_soft_syntax.cc \
-	ff_soft_syntax2.cc \
   ff_source_syntax.cc \
-	ff_source_syntax_p.cc \
   ff_source_syntax2.cc \
   ff_source_syntax2_p.cc \
-  ff_bleu.cc \
-  ff_factory.cc \
+  ff_source_syntax_p.cc \
+  ff_spans.cc \
+  ff_tagger.cc \
+  ff_wordalign.cc \
+  ff_wordset.cc \
+  ffset.cc \
+  forest_writer.cc \
+  fst_translator.cc \
+  grammar.cc \
+  hg.cc \
+  hg_intersect.cc \
+  hg_io.cc \
+  hg_remove_eps.cc \
+  hg_sampler.cc \
+  hg_union.cc \
   incremental.cc \
+  json_parse.cc \
+  lattice.cc \
   lexalign.cc \
   lextrans.cc \
-  tagger.cc \
-  bottom_up_parser.cc \
+  maxtrans_blunsom.cc \
   phrasebased_translator.cc \
-  JSON_parser.c \
-  json_parse.cc \
-  grammar.cc
+  phrasetable_fst.cc \
+  rescore_translator.cc \
+  rule_lexer.cc \
+  scfg_translator.cc \
+  tagger.cc \
+  translator.cc \
+  trule.cc \
+  viterbi.cc \
+  JSON_parser.c
diff --git a/decoder/tromble_loss.cc b/decoder/tromble_loss.cc
deleted file mode 100644
index 24cfef5f..00000000
--- a/decoder/tromble_loss.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-#include "tromble_loss.h"
-#include "fast_lexical_cast.hpp"
-
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/circular_buffer.hpp>
-#include <boost/functional/hash.hpp>
-#include <boost/range/iterator_range.hpp>
-#include <boost/tokenizer.hpp>
-#include <boost/unordered_map.hpp>
-
-#include <cmath>
-#include <fstream>
-#include <vector>
-
-#include "sentence_metadata.h"
-#include "trule.h"
-#include "tdict.h"
-
-using namespace std;
-
-namespace {
-
-typedef unsigned char GramCount;
-
-struct RefCounts {
-  GramCount max;
-  std::vector<GramCount> refs;
-  size_t length;
-};
-
-typedef boost::unordered_map<std::vector<WordID>, size_t, boost::hash<std::vector<WordID> > > NGramMap;
-
-// Take all the n-grams in the references and stuff them into ngrams.
-void MakeNGramMapFromReferences(const vector<vector<WordID> > &references,
-                                int n,
-                                vector<RefCounts> *counts,
-                                NGramMap *ngrams) {
-  ngrams->clear();
-  std::pair<vector<WordID>, size_t> insert_me;
-  vector<WordID> &ngram = insert_me.first;
-  ngram.reserve(n);
-  size_t &id = insert_me.second;
-  id = 0;
-  for (int refi = 0; refi < references.size(); ++refi) {
-    const vector<WordID>& ref = references[refi];
-    const int s = ref.size();
-    for (int j=0; j<s; ++j) {
-      const int remaining = s-j;
-      const int k = (n < remaining ? n : remaining);
-      ngram.clear();
-      for (unsigned int i = 0; i < k; ++i) {
-        ngram.push_back(ref[j + i]);
-        std::pair<NGramMap::iterator, bool> ret(ngrams->insert(insert_me));
-        if (ret.second) {
-          counts->resize(id + 1);
-          RefCounts &ref_counts = counts->back();
-          ref_counts.max = 1;
-          ref_counts.refs.resize(references.size());
-          ref_counts.refs[refi] = 1;
-          ref_counts.length = ngram.size();
-          ++id;
-        } else {
-          RefCounts &ref_counts = (*counts)[ret.first->second];
-          ref_counts.max = std::max(ref_counts.max, ++ref_counts.refs[refi]);
-        }
-      }
-    }
-  }
-}
-
-struct MutableState {
-  MutableState(void *from, size_t n) : length(reinterpret_cast<size_t*>(from)), left(reinterpret_cast<WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<GramCount *>(right + n - 1)) {}
-  size_t *length;
-  WordID *left, *right;
-  GramCount *counts;
-  static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
-};
-
-struct ConstState {
-  ConstState(const void *from, size_t n) : length(reinterpret_cast<const size_t*>(from)), left(reinterpret_cast<const WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<const GramCount *>(right + n - 1)) {}
-  const size_t *length;
-  const WordID *left, *right;
-  const GramCount *counts;
-  static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
-};
-
-template <class T> struct CompatibleHashRange : public std::unary_function<const boost::iterator_range<T> &, size_t> {
-  size_t operator()(const boost::iterator_range<T> &range) const {
-    return boost::hash_range(range.begin(), range.end());
-  }
-};
-
-template <class T> struct CompatibleEqualsRange : public std::binary_function<const boost::iterator_range<T> &, const std::vector<WordID> &, size_t> {
-  size_t operator()(const boost::iterator_range<T> &range, const std::vector<WordID> &vec) const {
-    return boost::algorithm::equals(range, vec);
-  }
-  size_t operator()(const std::vector<WordID> &vec, const boost::iterator_range<T> &range) const {
-    return boost::algorithm::equals(range, vec);
-  }
-};
-
-void AddWord(const boost::circular_buffer<WordID> &segment, size_t min_length, const NGramMap &ref_grams, GramCount *counters) {
-  typedef boost::circular_buffer<WordID>::const_iterator BufferIt;
-  typedef boost::iterator_range<BufferIt> SegmentRange;
-  if (segment.size() < min_length) return;
-#if 0
-  CompatibleHashRange<BufferIt> hasher;
-  CompatibleEqualsRange<BufferIt> equals;
-  for (BufferIt seg_start(segment.end() - min_length); ; --seg_start) {
-    NGramMap::const_iterator found = ref_grams.find(SegmentRange(seg_start, segment.end()));
-    if (found == ref_grams.end()) break;
-    ++counters[found->second];
-    if (seg_start == segment.begin()) break;
-  }
-#endif
-}
-
-} // namespace
-
-class TrombleLossComputerImpl {
- public:
-  explicit TrombleLossComputerImpl(const std::string &params) : star_(TD::Convert("<{STAR}>")) {
-    typedef boost::tokenizer<boost::char_separator<char> > Tokenizer;
-    // Argument parsing
-    std::string ref_file_name;
-    Tokenizer tok(params, boost::char_separator<char>(" "));
-    Tokenizer::iterator i = tok.begin();
-    if (i == tok.end()) {
-      std::cerr << "TrombleLossComputer needs a reference file name." << std::endl;
-      exit(1);
-    }
-    ref_file_name = *i++;
-    if (i == tok.end()) {
-      std::cerr << "TrombleLossComputer needs to know how many references." << std::endl;
-      exit(1);
-    }
-    num_refs_ = boost::lexical_cast<unsigned int>(*i++);
-    for (; i != tok.end(); ++i) {
-     thetas_.push_back(boost::lexical_cast<double>(*i));
-    }
-    if (thetas_.empty()) {
-      std::cerr << "TrombleLossComputer is pointless with no weight on n-grams." << std::endl;
-      exit(1);
-    }
-
-    // Read references file.
-    std::ifstream ref_file(ref_file_name.c_str());
-    if (!ref_file) {
-      std::cerr << "Could not open TrombleLossComputer file " << ref_file_name << std::endl;
-      exit(1);
-    }
-    std::string ref;
-    vector<vector<WordID> > references(num_refs_);
-    bound_ngram_id_ = 0;
-    for (unsigned int sentence = 0; ref_file; ++sentence) {
-      for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
-        if (!getline(ref_file, ref)) {
-          if (refidx == 0) break;
-          std::cerr << "Short read of " << refidx << " references for sentence " << sentence << std::endl;
-          exit(1);
-        }
-        TD::ConvertSentence(ref, &references[refidx]);
-      }
-      ref_ids_.resize(sentence + 1);
-      ref_counts_.resize(sentence + 1);
-      MakeNGramMapFromReferences(references, thetas_.size(), &ref_counts_.back(), &ref_ids_.back());
-      bound_ngram_id_ = std::max(bound_ngram_id_, ref_ids_.back().size());
-    }
-  }
-
-  size_t StateSize() const {
-    // n-1 boundary words plus counts for n-grams currently rendered as bytes even though most would fit in bits.
-    // Also, this is cached by higher up classes so no need to cache here.
-    return MutableState::Size(thetas_.size(), bound_ngram_id_);
-  }
-
-  double Traversal(
-      const SentenceMetadata &smeta,
-      const TRule &rule,
-      const vector<const void*> &ant_contexts,
-      void *out_context) const {
-    // TODO: get refs from sentence metadata.
-    // This will require resizable features.
-    if (smeta.GetSentenceID() >= ref_ids_.size()) {
-      std::cerr << "Sentence ID " << smeta.GetSentenceID() << " doesn't have references; there are only " << ref_ids_.size() << " references." << std::endl;
-      exit(1);
-    }
-    const NGramMap &ngrams = ref_ids_[smeta.GetSentenceID()];
-    MutableState out_state(out_context, thetas_.size());
-    memset(out_state.counts, 0, bound_ngram_id_ * sizeof(GramCount));
-    boost::circular_buffer<WordID> history(thetas_.size());
-    std::vector<const void*>::const_iterator ant_context = ant_contexts.begin();
-    *out_state.length = 0;
-    size_t pushed = 0;
-    const size_t keep = thetas_.size() - 1;
-    for (vector<WordID>::const_iterator rhs = rule.e().begin(); rhs != rule.e().end(); ++rhs) {
-      if (*rhs < 1) {
-        assert(ant_context != ant_contexts.end());
-        // Constituent
-        ConstState rhs_state(*ant_context, thetas_.size());
-        *out_state.length += *rhs_state.length;
-        {
-          GramCount *accum = out_state.counts;
-          for (const GramCount *c = rhs_state.counts; c != rhs_state.counts + ngrams.size(); ++c, ++accum) {
-            *accum += *c;
-          }
-        }
-        const WordID *w = rhs_state.left;
-        bool long_constit = true;
-        for (size_t i = 1; i <= keep; ++i, ++w) {
-          if (*w == star_) {
-            long_constit = false;
-            break;
-          }
-          history.push_back(*w);
-          if (++pushed == keep) {
-            std::copy(history.begin(), history.end(), out_state.left);
-          }
-          // Now i is the length of the history coming from this constituent.  So it needs at least i+1 words to have a cross-child add.
-          AddWord(history, i + 1, ngrams, out_state.counts);
-        }
-        // If the consituent is shorter than thetas_.size(), then the
-        // constituent's left is the entire constituent, so history is already
-        // correct.  Otherwise, the entire right hand side is the entire
-        // history.
-        if (long_constit) {
-          history.assign(thetas_.size(), rhs_state.right, rhs_state.right + keep);
-        }
-        ++ant_context;
-      } else {
-        // Word
-        ++*out_state.length;
-        history.push_back(*rhs);
-        if (++pushed == keep) {
-          std::copy(history.begin(), history.end(), out_state.left);
-        }
-        AddWord(history, 1, ngrams, out_state.counts);
-      }
-    }
-    // Fill in left and right constituents.
-    if (pushed < keep) {
-      std::copy(history.begin(), history.end(), out_state.left);
-      for (WordID *i = out_state.left + pushed; i != out_state.left + keep; ++i) {
-        *i = star_;
-      }
-      std::copy(out_state.left, out_state.left + keep, out_state.right);
-    } else if(pushed == keep) {
-      std::copy(history.begin(), history.end(), out_state.right);
-    } else if ((pushed > keep) && !history.empty()) {
-      std::copy(history.begin() + 1, history.end(), out_state.right);
-    }
-    std::vector<RefCounts>::const_iterator ref_info = ref_counts_[smeta.GetSentenceID()].begin();
-    // Clip the counts and count matches.
-    // Indexed by reference then by length.
-    std::vector<std::vector<unsigned int> > matches(num_refs_, std::vector<unsigned int>(thetas_.size()));
-    for (GramCount *c = out_state.counts; c != out_state.counts + ngrams.size(); ++c, ++ref_info) {
-      *c = std::min(*c, ref_info->max);
-      if (*c) {
-        for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
-          assert(ref_info->length >= 1);
-          assert(ref_info->length - 1 < thetas_.size());
-          matches[refidx][ref_info->length - 1] += std::min(*c, ref_info->refs[refidx]);
-        }
-      }
-    }
-    double best_score = 0.0;
-    for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
-      double score = 0.0;
-      for (unsigned int j = 0; j < std::min(*out_state.length, thetas_.size()); ++j) {
-        score += thetas_[j] * static_cast<double>(matches[refidx][j]) / static_cast<double>(*out_state.length - j);
-      }
-      best_score = std::max(best_score, score);
-    }
-    return best_score;
-  }
-
- private:
-  unsigned int num_refs_;
-  // Indexed by sentence id.
-  std::vector<NGramMap> ref_ids_;
-  // Then by id from ref_ids_.
-  std::vector<std::vector<RefCounts> > ref_counts_;
-
-  // thetas_[0] is the weight for 1-grams
-  std::vector<double> thetas_;
-
-  // All ngram ids in ref_ids_ are < this value.
-  size_t bound_ngram_id_;
-
-  const WordID star_;
-};
-
-TrombleLossComputer::TrombleLossComputer(const std::string &params) :
-    boost::base_from_member<PImpl>(new TrombleLossComputerImpl(params)),
-    FeatureFunction(boost::base_from_member<PImpl>::member->StateSize()),
-    fid_(FD::Convert("TrombleLossComputer")) {}
-
-TrombleLossComputer::~TrombleLossComputer() {}
-
-void TrombleLossComputer::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* out_context) const {
-  (void) estimated_features;
-  const double loss = boost::base_from_member<PImpl>::member->Traversal(smeta, *edge.rule_, ant_contexts, out_context);
-  features->set_value(fid_, loss);
-}
diff --git a/decoder/tromble_loss.h b/decoder/tromble_loss.h
deleted file mode 100644
index fde33100..00000000
--- a/decoder/tromble_loss.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef _TROMBLE_LOSS_H_
-#define _TROMBLE_LOSS_H_
-
-#include <vector>
-#include <boost/scoped_ptr.hpp>
-#include <boost/utility/base_from_member.hpp>
-
-#include "ff.h"
-#include "wordid.h"
-
-// this may not be the most elegant way to implement this computation, but since we
-// may need cube pruning and state splitting, we reuse the feature detector framework.
-// the loss is then stored in a feature #0 (which is guaranteed to have weight 0 and
-// never be a "real" feature).
-class TrombleLossComputerImpl;
-class TrombleLossComputer : private boost::base_from_member<boost::scoped_ptr<TrombleLossComputerImpl> >, public FeatureFunction {
- private:
-  typedef boost::scoped_ptr<TrombleLossComputerImpl> PImpl;
-  typedef FeatureFunction Base;
-
- public:
-  // String parameters are ref.txt num_ref weight1 weight2 ... weightn
-  // where ref.txt contains references on per line, with num_ref references per sentence
-  // The weights are the weight on each length n-gram.
-  explicit TrombleLossComputer(const std::string &params);
-
-  ~TrombleLossComputer();
-
- protected:
-  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                     const HG::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* out_context) const;
- private:
-  const int fid_;
-};
-
-#endif
author	Chris Dyer <cdyer@Chriss-MacBook-Air.local>	2013-11-10 03:56:11 -0500
committer	Chris Dyer <cdyer@Chriss-MacBook-Air.local>	2013-11-10 03:56:11 -0500
commit	43571516d8ba27599bd8ed58944d06715ea8533d (patch)
tree	807ff7422338dac0e85a644be44ee69212790c4d
parent	1e9afb904a57ff0b03edd0e94d634ef98e7d4b2a (diff)