diff options
author | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2009-12-14 20:35:11 -0500 |
commit | 851e389dffdd6996ea32d70defb8906de80b9edc (patch) | |
tree | 8c68ee77205badc056b8ab5b332e67e3e98017df /src/ff_csplit.cc | |
parent | dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff) |
few small fixes of alignment tools, add new orthographic similarity feature for word aligner, final naming of directories, libraries in cdec
Diffstat (limited to 'src/ff_csplit.cc')
-rw-r--r-- | src/ff_csplit.cc | 212 |
1 files changed, 0 insertions, 212 deletions
diff --git a/src/ff_csplit.cc b/src/ff_csplit.cc deleted file mode 100644 index cac4bb8e..00000000 --- a/src/ff_csplit.cc +++ /dev/null @@ -1,212 +0,0 @@ -#include "ff_csplit.h" - -#include <set> -#include <cstring> - -#include "Vocab.h" -#include "Ngram.h" - -#include "sentence_metadata.h" -#include "lattice.h" -#include "tdict.h" -#include "freqdict.h" -#include "filelib.h" -#include "stringlib.h" -#include "tdict.h" - -using namespace std; - -struct BasicCSplitFeaturesImpl { - BasicCSplitFeaturesImpl(const string& param) : - word_count_(FD::Convert("WordCount")), - letters_sq_(FD::Convert("LettersSq")), - letters_sqrt_(FD::Convert("LettersSqrt")), - in_dict_(FD::Convert("InDict")), - short_(FD::Convert("Short")), - long_(FD::Convert("Long")), - oov_(FD::Convert("OOV")), - short_range_(FD::Convert("ShortRange")), - high_freq_(FD::Convert("HighFreq")), - med_freq_(FD::Convert("MedFreq")), - freq_(FD::Convert("Freq")), - fl1_(FD::Convert("FreqLen1")), - fl2_(FD::Convert("FreqLen2")), - bad_(FD::Convert("Bad")) { - vector<string> argv; - int argc = SplitOnWhitespace(param, &argv); - if (argc != 1 && argc != 2) { - cerr << "Expected: freqdict.txt [badwords.txt]\n"; - abort(); - } - freq_dict_.Load(argv[0]); - if (argc == 2) { - ReadFile rf(argv[1]); - istream& in = *rf.stream(); - while(in) { - string badword; - in >> badword; - if (badword.empty()) continue; - bad_words_.insert(TD::Convert(badword)); - } - } - } - - void TraversalFeaturesImpl(const Hypergraph::Edge& edge, - SparseVector<double>* features) const; - - const int word_count_; - const int letters_sq_; - const int letters_sqrt_; - const int in_dict_; - const int short_; - const int long_; - const int oov_; - const int short_range_; - const int high_freq_; - const int med_freq_; - const int freq_; - const int fl1_; - const int fl2_; - const int bad_; - FreqDict freq_dict_; - set<WordID> bad_words_; -}; - -BasicCSplitFeatures::BasicCSplitFeatures(const string& param) : - pimpl_(new BasicCSplitFeaturesImpl(param)) {} - -void BasicCSplitFeaturesImpl::TraversalFeaturesImpl( - const Hypergraph::Edge& edge, - SparseVector<double>* features) const { - features->set_value(word_count_, 1.0); - features->set_value(letters_sq_, (edge.j_ - edge.i_) * (edge.j_ - edge.i_)); - features->set_value(letters_sqrt_, sqrt(edge.j_ - edge.i_)); - const WordID word = edge.rule_->e_[1]; - const char* sword = TD::Convert(word); - const int len = strlen(sword); - int cur = 0; - int chars = 0; - while(cur < len) { - cur += UTF8Len(sword[cur]); - ++chars; - } - - // these are corrections that attempt to make chars - // more like a phoneme count than a letter count, they - // are only really meaningful for german and should - // probably be gotten rid of - bool has_sch = strstr(sword, "sch"); - bool has_ch = (!has_sch && strstr(sword, "ch")); - bool has_ie = strstr(sword, "ie"); - bool has_zw = strstr(sword, "zw"); - if (has_sch) chars -= 2; - if (has_ch) --chars; - if (has_ie) --chars; - if (has_zw) --chars; - - float freq = freq_dict_.LookUp(word); - if (freq) { - features->set_value(freq_, freq); - features->set_value(in_dict_, 1.0); - } else { - features->set_value(oov_, 1.0); - freq = 99.0f; - } - if (bad_words_.count(word) != 0) - features->set_value(bad_, 1.0); - if (chars < 5) - features->set_value(short_, 1.0); - if (chars > 10) - features->set_value(long_, 1.0); - if (freq < 7.0f) - features->set_value(high_freq_, 1.0); - if (freq > 8.0f && freq < 10.f) - features->set_value(med_freq_, 1.0); - if (freq < 10.0f && chars < 5) - features->set_value(short_range_, 1.0); - - // i don't understand these features, but they really help! - features->set_value(fl1_, sqrt(chars * freq)); - features->set_value(fl2_, freq / chars); -} - -void BasicCSplitFeatures::TraversalFeaturesImpl( - const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* out_context) const { - (void) smeta; - (void) ant_contexts; - (void) out_context; - (void) estimated_features; - if (edge.Arity() == 0) return; - if (edge.rule_->EWords() != 1) return; - pimpl_->TraversalFeaturesImpl(edge, features); -} - -struct ReverseCharLMCSplitFeatureImpl { - ReverseCharLMCSplitFeatureImpl(const string& param) : - order_(5), - vocab_(*TD::dict_), - ngram_(vocab_, order_) { - kBOS = vocab_.getIndex("<s>"); - kEOS = vocab_.getIndex("</s>"); - File file(param.c_str(), "r", 0); - assert(file); - cerr << "Reading " << order_ << "-gram LM from " << param << endl; - ngram_.read(file); - } - - double LeftPhonotacticProb(const Lattice& inword, const int start) { - const int end = inword.size(); - for (int i = 0; i < order_; ++i) - sc[i] = kBOS; - int sp = min(end - start, order_ - 1); - // cerr << "[" << start << "," << sp << "]\n"; - int ci = (order_ - sp - 1); - int wi = start; - while (sp > 0) { - sc[ci] = inword[wi][0].label; - // cerr << " CHAR: " << TD::Convert(sc[ci]) << " ci=" << ci << endl; - ++wi; - ++ci; - --sp; - } - // cerr << " END ci=" << ci << endl; - sc[ci] = Vocab_None; - const double startprob = ngram_.wordProb(kEOS, sc); - // cerr << " PROB=" << startprob << endl; - return startprob; - } - private: - const int order_; - Vocab& vocab_; - VocabIndex kBOS; - VocabIndex kEOS; - Ngram ngram_; - VocabIndex sc[80]; -}; - -ReverseCharLMCSplitFeature::ReverseCharLMCSplitFeature(const string& param) : - pimpl_(new ReverseCharLMCSplitFeatureImpl(param)), - fid_(FD::Convert("RevCharLM")) {} - -void ReverseCharLMCSplitFeature::TraversalFeaturesImpl( - const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* out_context) const { - (void) ant_contexts; - (void) estimated_features; - (void) out_context; - - if (edge.Arity() != 1) return; - if (edge.rule_->EWords() != 1) return; - const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_); - features->set_value(fid_, lpp); -} - |