summary refs log tree commit diff
diff options
context:
space:
mode:
author Chris Dyer <redpony@gmail.com> 2009-12-07 01:50:49 -0500
committer Chris Dyer <redpony@gmail.com> 2009-12-07 01:50:49 -0500
commit ec7edcc7e398bdb040d810094b8416ad9f279d98 (patch)
tree bdaea01e4317d04946de432d8b3a187881e63e96
parent d6c82eaee9489a4d187a266a7d76a3e09cde119f (diff)
minor changes to the way the phonotactic prob is calculated
-rw-r--r-- src/csplit.cc 2
-rw-r--r-- src/csplit.h 7
-rw-r--r-- src/ff_csplit.cc 67
-rw-r--r-- src/sentence_metadata.h 7
4 files changed, 53 insertions, 30 deletions
diff --git a/src/csplit.cc b/src/csplit.cc
index 788f3112..21e1b711 100644
--- a/src/csplit.cc
+++ b/src/csplit.cc
@@ -145,6 +145,8 @@ bool CompoundSplit::Translate(const string& input,
vector<string> in;
SplitUTF8String(input, &in);
smeta->SetSourceLength(in.size()); // TODO do utf8 or somethign
+ for (int i = 0; i < in.size(); ++i)
+ smeta->src_lattice_.push_back(vector<LatticeArc>(1, LatticeArc(TD::Convert(in[i]), 0.0, 1)));
pimpl_->BuildTrellis(in, forest);
forest->Reweight(weights);
return true;
diff --git a/src/csplit.h b/src/csplit.h
index 5911af77..54e5329d 100644
--- a/src/csplit.h
+++ b/src/csplit.h
@@ -4,6 +4,13 @@
#include "translator.h"
#include "lattice.h"
+// this "translator" takes single words (with NO SPACES) and segments
+// them using the approach described in:
+//
+// C. Dyer. (2009) Using a maximum entropy model to build segmentation
+// lattices for MT. In Proceedings of NAACL HLT 2009.
+// note, an extra word space marker # is inserted at the left edge of
+// the forest!
struct CompoundSplitImpl;
struct CompoundSplit : public Translator {
CompoundSplit(const boost::program_options::variables_map& conf);
diff --git a/src/ff_csplit.cc b/src/ff_csplit.cc
index e24d7d1d..eb106047 100644
--- a/src/ff_csplit.cc
+++ b/src/ff_csplit.cc
@@ -3,15 +3,17 @@
#include <set>
#include <cstring>
+#include "Vocab.h"
+#include "Ngram.h"
+
+#include "sentence_metadata.h"
+#include "lattice.h"
#include "tdict.h"
#include "freqdict.h"
#include "filelib.h"
#include "stringlib.h"
#include "tdict.h"
-#include "Vocab.h"
-#include "Ngram.h"
-
using namespace std;
struct BasicCSplitFeaturesImpl {
@@ -25,6 +27,8 @@ struct BasicCSplitFeaturesImpl {
high_freq_(FD::Convert("HighFreq")),
med_freq_(FD::Convert("MedFreq")),
freq_(FD::Convert("Freq")),
+ fl1_(FD::Convert("FreqLen1")),
+ fl2_(FD::Convert("FreqLen2")),
bad_(FD::Convert("Bad")) {
vector<string> argv;
int argc = SplitOnWhitespace(param, &argv);
@@ -57,6 +61,8 @@ struct BasicCSplitFeaturesImpl {
const int high_freq_;
const int med_freq_;
const int freq_;
+ const int fl1_;
+ const int fl2_;
const int bad_;
FreqDict freq_dict_;
set<WordID> bad_words_;
@@ -78,6 +84,11 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
cur += UTF8Len(sword[cur]);
++chars;
}
+
+ // these are corrections that attempt to make chars
+ // more like a phoneme count than a letter count, they
+ // are only really meaningful for german and should
+ // probably be gotten rid of
bool has_sch = strstr(sword, "sch");
bool has_ch = (!has_sch && strstr(sword, "ch"));
bool has_ie = strstr(sword, "ie");
@@ -107,6 +118,10 @@ void BasicCSplitFeaturesImpl::TraversalFeaturesImpl(
features->set_value(med_freq_, 1.0);
if (freq < 10.0f && chars < 5)
features->set_value(short_range_, 1.0);
+
+ // i don't understand these features, but they really help!
+ features->set_value(fl1_, sqrt(chars * freq));
+ features->set_value(fl2_, freq / chars);
}
void BasicCSplitFeatures::TraversalFeaturesImpl(
@@ -128,6 +143,7 @@ void BasicCSplitFeatures::TraversalFeaturesImpl(
struct ReverseCharLMCSplitFeatureImpl {
ReverseCharLMCSplitFeatureImpl(const string& param) :
order_(5),
+ vocab_(*TD::dict_),
ngram_(vocab_, order_) {
kBOS = vocab_.getIndex("<s>");
kEOS = vocab_.getIndex("</s>");
@@ -137,41 +153,30 @@ struct ReverseCharLMCSplitFeatureImpl {
ngram_.read(file);
}
- double LeftPhonotacticProb(const char* word) {
+ double LeftPhonotacticProb(const Lattice& inword, const int start) {
+ const int end = inword.size();
for (int i = 0; i < order_; ++i)
sc[i] = kBOS;
- const int len = strlen(word);
- int cur = 0;
- int chars = 0;
- while(cur < len) {
- cur += UTF8Len(word[cur]);
- ++chars;
- }
- const int sp = min(chars, order_-1);
- int wend = 0; cur = 0;
- while(cur < sp) {
- wend += UTF8Len(word[wend]);
- ++cur;
- }
- int wi = 0;
+ int sp = min(end - start, order_ - 1);
+ // cerr << "[" << start << "," << sp << "]\n";
int ci = (order_ - sp - 1);
- // cerr << "WORD: " << word << endl;
- while (wi != wend) {
- const int clen = UTF8Len(word[wi]);
- string cur_char(&word[wi], clen);
- wi += clen;
- // cerr << " char: " << cur_char << " ci=" << ci << endl;
- sc[ci++] = vocab_.getIndex(cur_char.c_str());
+ int wi = start;
+ while (sp > 0) {
+ sc[ci] = inword[wi][0].label;
+ // cerr << " CHAR: " << TD::Convert(sc[ci]) << " ci=" << ci << endl;
+ ++wi;
+ ++ci;
+ --sp;
}
- // cerr << " END sp=" << sp << endl;
- sc[sp] = Vocab_None;
+ // cerr << " END ci=" << ci << endl;
+ sc[ci] = Vocab_None;
const double startprob = ngram_.wordProb(kEOS, sc);
// cerr << " PROB=" << startprob << endl;
return startprob;
}
private:
const int order_;
- Vocab vocab_;
+ Vocab& vocab_;
VocabIndex kBOS;
VocabIndex kEOS;
Ngram ngram_;
@@ -189,9 +194,13 @@ void ReverseCharLMCSplitFeature::TraversalFeaturesImpl(
SparseVector<double>* features,
SparseVector<double>* estimated_features,
void* out_context) const {
+ (void) ant_contexts;
+ (void) estimated_features;
+ (void) out_context;
+
if (edge.Arity() != 1) return;
if (edge.rule_->EWords() != 1) return;
- const double lpp = pimpl_->LeftPhonotacticProb(TD::Convert(edge.rule_->e_[1]));
+ const double lpp = pimpl_->LeftPhonotacticProb(smeta.GetSourceLattice(), edge.i_);
features->set_value(fid_, lpp);
}
diff --git a/src/sentence_metadata.h b/src/sentence_metadata.h
index 0178f1f5..ef9eb388 100644
--- a/src/sentence_metadata.h
+++ b/src/sentence_metadata.h
@@ -27,11 +27,16 @@ struct SentenceMetadata {
int GetSourceLength() const { return src_len_; }
int GetTargetLength() const { return trg_len_; }
int GetSentenceID() const { return sent_id_; }
+ // this will be empty if the translator accepts non FS input!
+ const Lattice& GetSourceLattice() const { return src_lattice_; }
private:
const int sent_id_;
+ // the following should be set, if possible, by the Translator
int src_len_;
-
+ public:
+ Lattice src_lattice_; // this will only be set if inputs are finite state!
+ private:
// you need to be very careful when depending on these values
// they will only be set during training / alignment contexts
const bool has_reference_;