diff options
-rw-r--r-- | src/csplit.cc | 2 | ||||
-rw-r--r-- | src/csplit.h | 7 | ||||
-rw-r--r-- | src/ff_csplit.cc | 67 | ||||
-rw-r--r-- | src/sentence_metadata.h | 7 |
4 files changed, 53 insertions, 30 deletions
diff --git a/src/csplit.cc b/src/csplit.cc index 788f3112..21e1b711 100644 --- a/src/csplit.cc +++ b/src/csplit.cc @@ -145,6 +145,8 @@ bool CompoundSplit::Translate(const string& input, vector<string> in; SplitUTF8String(input, &in); smeta->SetSourceLength(in.size()); // TODO do utf8 or somethign + for (int i = 0; i < in.size(); ++i) + smeta->src_lattice_.push_back(vector<LatticeArc>(1, LatticeArc(TD::Convert(in[i]), 0.0, 1))); pimpl_->BuildTrellis(in, forest); forest->Reweight(weights); return true; diff --git a/src/csplit.h b/src/csplit.h index 5911af77..54e5329d 100644 --- a/src/csplit.h +++ b/src/csplit.h @@ -4,6 +4,13 @@ #include "translator.h" #include "lattice.h" +// this "translator" takes single words (with NO SPACES) and segments +// them using the approach described in: +// +// C. Dyer. (2009) Using a maximum entropy model to build segmentation +// lattices for MT. In Proceedings of NAACL HLT 2009. +// note, an extra word space marker # is inserted at the left edge of +// the forest! struct CompoundSplitImpl; struct CompoundSplit : public Translator { CompoundSplit(const boost::program_options::variables_map& conf); diff --git a/src/ff_csplit.cc b/src/ff_csplit.cc index e24d7d1d..eb106047 100644 --- a/src/ff_csplit.cc +++ b/src/ff_csplit.cc @@ -3,15 +3,17 @@ #include <set> #include <cstring> +#include "Vocab.h" +#include "Ngram.h" + +#include "sentence_metadata.h" +#include "lattice.h" #include "tdict.h" #include "freqdict.h" #include "filelib.h" #include "stringlib.h" #include "tdict.h" -#include "Vocab.h" -#include "Ngram.h" - using namespace std; struct BasicCSplitFeaturesImpl { @@ -25,6 +27,8 @@ struct BasicCSplitFeaturesImpl { high_freq_(FD::Convert("HighFreq")), med_freq_(FD::Convert("MedFreq")), freq_(FD::Convert("Freq")), + fl1_(FD::Convert("FreqLen1")), + fl2_(FD::Convert("FreqLen2")), bad_(FD::Convert("Bad")) { vector<string> argv; int argc = SplitOnWhitespace(param, &argv); @@ -57,6 +61,8 @@ struct BasicCSplitFeaturesImpl { const int high_freq_; const int med_freq_; const int freq_; + const int fl1_; + const int fl2_; const int bad_; FreqDict freq_dict_; set<WordID> bad_words_; @@ -78,6 +84,11 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl( cur += UTF8Len(sword[cur]); ++chars; } + + // these are corrections that attempt to make chars + // more like a phoneme count than a letter count, they + // are only really meaningful for german and should + // probably be gotten rid of bool has_sch = strstr(sword, "sch"); bool has_ch = (!has_sch && strstr(sword, "ch")); bool has_ie = strstr(sword, "ie"); @@ -107,6 +118,10 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl( features->set_value(med_freq_, 1.0); if (freq < 10.0f && chars < 5) features->set_value(short_range_, 1.0); + + // i don't understand these features, but they really help! + features->set_value(fl1_, sqrt(chars * freq)); + features->set_value(fl2_, freq / chars); } void BasicCSplitFeatures::TraversalFeaturesImpl( @@ -128,6 +143,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl( struct ReverseCharLMCSplitFeatureImpl { ReverseCharLMCSplitFeatureImpl(const string& param) : order_(5), + vocab_(*TD::dict_), ngram_(vocab_, order_) { kBOS = vocab_.getIndex("<s>"); kEOS = vocab_.getIndex("</s>"); @@ -137,41 +153,30 @@ struct ReverseCharLMCSplitFeatureImpl { ngram_.read(file); } - double LeftPhonotacticProb(const char* word) { + double LeftPhonotacticProb(const Lattice& inword, const int start) { + const int end = inword.size(); for (int i = 0; i < order_; ++i) sc[i] = kBOS; - const int len = strlen(word); - int cur = 0; - int chars = 0; - while(cur < len) { - cur += UTF8Len(word[cur]); - ++chars; - } - const int sp = min(chars, order_-1); - int wend = 0; cur = 0; - while(cur < sp) { - wend += UTF8Len(word[wend]); - ++cur; - } - int wi = 0; + int sp = min(end - start, order_ - 1); + // cerr << "[" << start << "," << sp << "]\n"; int ci = (order_ - sp - 1); - // cerr << "WORD: " << word << endl; - while (wi != wend) { - const int clen = UTF8Len(word[wi]); - string cur_char(&word[wi], clen); - wi += clen; - // cerr << " char: " << cur_char << " ci=" << ci << endl; - sc[ci++] = vocab_.getIndex(cur_char.c_str()); + int wi = start; + while (sp > 0) { + sc[ci] = inword[wi][0].label; + // cerr << " CHAR: " << TD::Convert(sc[ci]) << " ci=" << ci << endl; + ++wi; + ++ci; + --sp; } - // cerr << " END sp=" << sp << endl; - sc[sp] = Vocab_None; + // cerr << " END ci=" << ci << endl; + sc[ci] = Vocab_None; const double startprob = ngram_.wordProb(kEOS, sc); // cerr << " PROB=" << startprob << endl; return startprob; } private: const int order_; - Vocab vocab_; + Vocab& vocab_; VocabIndex kBOS; VocabIndex kEOS; Ngram ngram_; @@ -189,9 +194,13 @@ void ReverseCharLMCSplitFeature::TraversalFeaturesImpl( SparseVector<double>* features, SparseVector<double>* estimated_features, void* out_context) const { + (void) ant_contexts; + (void) estimated_features; + (void) out_context; + if (edge.Arity() != 1) return; if (edge.rule_->EWords() != 1) return; - const double lpp = pimpl_->LeftPhonotacticProb(TD::Convert(edge.rule_->e_[1])); + const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_); features->set_value(fid_, lpp); } diff --git a/src/sentence_metadata.h b/src/sentence_metadata.h index 0178f1f5..ef9eb388 100644 --- a/src/sentence_metadata.h +++ b/src/sentence_metadata.h @@ -27,11 +27,16 @@ struct SentenceMetadata { int GetSourceLength() const { return src_len_; } int GetTargetLength() const { return trg_len_; } int GetSentenceID() const { return sent_id_; } + // this will be empty if the translator accepts non FS input! + const Lattice& GetSourceLattice() const { return src_lattice_; } private: const int sent_id_; + // the following should be set, if possible, by the Translator int src_len_; - + public: + Lattice src_lattice_; // this will only be set if inputs are finite state! + private: // you need to be very careful when depending on these values // they will only be set during training / alignment contexts const bool has_reference_; |