few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec

author: Chris Dyer <redpony@gmail.com> 2009-12-14 20:35:11 -0500
committer: Chris Dyer <redpony@gmail.com> 2009-12-14 20:35:11 -0500
commit: 851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree: 8c68ee77205badc056b8ab5b332e67e3e98017df /src/ff_csplit.cc
parent: dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)
1 files changed, 0 insertions, 212 deletions
diff --git a/src/ff_csplit.cc b/src/ff_csplit.cc
deleted file mode 100644
index cac4bb8e..00000000
--- a/src/ff_csplit.cc
+++ /dev/null
@@ -1,212 +0,0 @@
-#include "ff_csplit.h"
-
-#include <set>
-#include <cstring>
-
-#include "Vocab.h"
-#include "Ngram.h"
-
-#include "sentence_metadata.h"
-#include "lattice.h"
-#include "tdict.h"
-#include "freqdict.h"
-#include "filelib.h"
-#include "stringlib.h"
-#include "tdict.h"
-
-using namespace std;
-
-struct BasicCSplitFeaturesImpl {
-  BasicCSplitFeaturesImpl(const string& param) :
-      word_count_(FD::Convert("WordCount")),
-      letters_sq_(FD::Convert("LettersSq")),
-      letters_sqrt_(FD::Convert("LettersSqrt")),
-      in_dict_(FD::Convert("InDict")),
-      short_(FD::Convert("Short")),
-      long_(FD::Convert("Long")),
-      oov_(FD::Convert("OOV")),
-      short_range_(FD::Convert("ShortRange")),
-      high_freq_(FD::Convert("HighFreq")),
-      med_freq_(FD::Convert("MedFreq")),
-      freq_(FD::Convert("Freq")),
-      fl1_(FD::Convert("FreqLen1")),
-      fl2_(FD::Convert("FreqLen2")),
-      bad_(FD::Convert("Bad")) {
-    vector<string> argv;
-    int argc = SplitOnWhitespace(param, &argv);
-    if (argc != 1 && argc != 2) {
-      cerr << "Expected: freqdict.txt [badwords.txt]\n";
-      abort();
-    }
-    freq_dict_.Load(argv[0]);
-    if (argc == 2) {
-      ReadFile rf(argv[1]);
-      istream& in = *rf.stream();
-      while(in) {
-        string badword;
-        in >> badword;
-        if (badword.empty()) continue;
-        bad_words_.insert(TD::Convert(badword));
-      }
-    }
-  }
-
-  void TraversalFeaturesImpl(const Hypergraph::Edge& edge,
-                             SparseVector<double>* features) const;
-
-  const int word_count_;
-  const int letters_sq_;
-  const int letters_sqrt_;
-  const int in_dict_;
-  const int short_;
-  const int long_;
-  const int oov_;
-  const int short_range_;
-  const int high_freq_;
-  const int med_freq_;
-  const int freq_;
-  const int fl1_;
-  const int fl2_;
-  const int bad_;
-  FreqDict freq_dict_;
-  set<WordID> bad_words_;
-};
-
-BasicCSplitFeatures::BasicCSplitFeatures(const string& param) :
-  pimpl_(new BasicCSplitFeaturesImpl(param)) {}
-
-void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
-                                     const Hypergraph::Edge& edge,
-                                     SparseVector<double>* features) const {
-  features->set_value(word_count_, 1.0);
-  features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_));
-  features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_));
-  const WordID word = edge.rule_->e_[1];
-  const char* sword = TD::Convert(word);
-  const int len = strlen(sword);
-  int cur = 0;
-  int chars = 0;
-  while(cur < len) {
-    cur += UTF8Len(sword[cur]);
-    ++chars;
-  }
-
-  // these are corrections that attempt to make chars
-  // more like a phoneme count than a letter count, they
-  // are only really meaningful for german and should
-  // probably be gotten rid of
-  bool has_sch = strstr(sword, "sch");
-  bool has_ch = (!has_sch && strstr(sword, "ch"));
-  bool has_ie = strstr(sword, "ie");
-  bool has_zw = strstr(sword, "zw");
-  if (has_sch) chars -= 2;
-  if (has_ch) --chars;
-  if (has_ie) --chars;
-  if (has_zw) --chars;
-
-  float freq = freq_dict_.LookUp(word);
-  if (freq) {
-    features->set_value(freq_, freq);
-    features->set_value(in_dict_, 1.0);
-  } else {
-    features->set_value(oov_, 1.0);
-    freq = 99.0f;
-  }
-  if (bad_words_.count(word) != 0)
-    features->set_value(bad_, 1.0);
-  if (chars < 5)
-    features->set_value(short_, 1.0);
-  if (chars > 10)
-    features->set_value(long_, 1.0);
-  if (freq < 7.0f)
-    features->set_value(high_freq_, 1.0);
-  if (freq > 8.0f && freq < 10.f)
-    features->set_value(med_freq_, 1.0);
-  if (freq < 10.0f && chars < 5)
-    features->set_value(short_range_, 1.0);
-
-  // i don't understand these features, but they really help!
-  features->set_value(fl1_, sqrt(chars * freq));
-  features->set_value(fl2_, freq / chars);
-}
-
-void BasicCSplitFeatures::TraversalFeaturesImpl(
-                                     const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* out_context) const {
-  (void) smeta;
-  (void) ant_contexts;
-  (void) out_context;
-  (void) estimated_features;
-  if (edge.Arity() == 0) return;
-  if (edge.rule_->EWords() != 1) return;
-  pimpl_->TraversalFeaturesImpl(edge, features);
-}
-
-struct ReverseCharLMCSplitFeatureImpl {
-  ReverseCharLMCSplitFeatureImpl(const string& param) :
-      order_(5),
-      vocab_(*TD::dict_),
-      ngram_(vocab_, order_) {
-    kBOS = vocab_.getIndex("<s>");
-    kEOS = vocab_.getIndex("</s>");
-    File file(param.c_str(), "r", 0);
-    assert(file);
-    cerr << "Reading " << order_ << "-gram LM from " << param << endl;
-    ngram_.read(file);
-  }
-
-  double LeftPhonotacticProb(const Lattice& inword, const int start) {
-    const int end = inword.size();
-    for (int i = 0; i < order_; ++i)
-      sc[i] = kBOS;
-    int sp = min(end - start, order_ - 1);
-    // cerr << "[" << start << "," << sp << "]\n";
-    int ci = (order_ - sp - 1);
-    int wi = start;
-    while (sp > 0) {
-      sc[ci] = inword[wi][0].label;
-      // cerr << " CHAR: " << TD::Convert(sc[ci]) << "  ci=" << ci << endl;
-      ++wi;
-      ++ci;
-      --sp;
-    }
-    // cerr << "  END ci=" << ci << endl;
-    sc[ci] = Vocab_None;
-    const double startprob = ngram_.wordProb(kEOS, sc);
-    // cerr << "  PROB=" << startprob << endl;
-    return startprob;
-  }
- private:
-  const int order_;
-  Vocab& vocab_;
-  VocabIndex kBOS;
-  VocabIndex kEOS;
-  Ngram ngram_;
-  VocabIndex sc[80];
-};
-
-ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) :
-  pimpl_(new ReverseCharLMCSplitFeatureImpl(param)),
-  fid_(FD::Convert("RevCharLM")) {}
-
-void ReverseCharLMCSplitFeature::TraversalFeaturesImpl(
-                                     const SentenceMetadata& smeta,
-                                     const Hypergraph::Edge& edge,
-                                     const std::vector<const void*>& ant_contexts,
-                                     SparseVector<double>* features,
-                                     SparseVector<double>* estimated_features,
-                                     void* out_context) const {
-  (void) ant_contexts;
-  (void) estimated_features;
-  (void) out_context;
-
-  if (edge.Arity() != 1) return;
-  if (edge.rule_->EWords() != 1) return;
-  const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_);
-  features->set_value(fid_, lpp);
-}
-
author	Chris Dyer <redpony@gmail.com>	2009-12-14 20:35:11 -0500
committer	Chris Dyer <redpony@gmail.com>	2009-12-14 20:35:11 -0500
commit	851e389dffdd6996ea32d70defb8906de80b9edc (patch)
tree	8c68ee77205badc056b8ab5b332e67e3e98017df /src/ff_csplit.cc
parent	dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)