Felix' https://github.com/felleh lexical word alignment features

author: Patrick Simianer <p@simianer.de> 2014-01-13 17:15:24 +0100
committer: Patrick Simianer <p@simianer.de> 2014-01-13 17:15:24 +0100
commit: 411ad2eb4a09ef04a7529bc1a178e83f624c6569 (patch)
tree: d34f3ef450614f061bd87ac6229e833f155e3abb
parent: 757ffb75e9a860ccd10843dcf099787cefec781a (diff)
5 files changed, 191 insertions, 58 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index b735756d..c0371081 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -48,6 +48,7 @@ libcdec_a_SOURCES = \
   ff_external.h \
   ff_factory.h \
   ff_klm.h \
+	ff_lexical.h \
   ff_lm.h \
   ff_ngrams.h \
   ff_parse_match.h \
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index b2541722..80b42d22 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -24,6 +24,7 @@
 #include "ff_charset.h"
 #include "ff_wordset.h"
 #include "ff_external.h"
+#include "ff_lexical.h"
 
 
 void register_feature_functions() {
@@ -39,6 +40,7 @@ void register_feature_functions() {
   RegisterFF<SourceWordPenalty>();
   RegisterFF<ArityPenalty>();
   RegisterFF<BLEUModel>();
+  RegisterFF<LexicalFeatures>();
 
   //TODO: use for all features the new Register which requires static FF::usage(false,false) give name
   ff_registry.Register("SpanFeatures", new FFFactory<SpanFeatures>());
diff --git a/decoder/ff_lexical.h b/decoder/ff_lexical.h
new file mode 100644
index 00000000..21c85b27
--- /dev/null
+++ b/decoder/ff_lexical.h
@@ -0,0 +1,128 @@
+#ifndef FF_LEXICAL_H_
+#define FF_LEXICAL_H_
+
+#include <vector>
+#include <map>
+#include "trule.h"
+#include "ff.h"
+#include "hg.h"
+#include "array2d.h"
+#include "wordid.h"
+#include <sstream>
+#include <cassert>
+#include <cmath>
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "sentence_metadata.h"
+#include "lattice.h"
+#include "fdict.h"
+#include "verbose.h"
+#include "tdict.h"
+#include "hg.h"
+
+using namespace std;
+
+namespace {
+  // Returns a copy of x in which every '=' and ';' is replaced by '_'.
+  std::string Escape(const std::string& x) {
+    std::string y = x;
+    for (std::string::size_type i = 0; i < y.size(); ++i) {
+      if (y[i] == '=' || y[i] == ';') y[i] = '_';
+    }
+    return y;
+  }
+}
+
+// Sparse lexical indicator features; param is empty (translations, insertions, deletions all on) or three whitespace-separated 0/1 flags in that order.
+class LexicalFeatures : public FeatureFunction {
+public:
+	LexicalFeatures(const std::string& param) {
+		if (param.empty()) {
+			cerr << "LexicalFeatures: using T,D,I\n";
+			T_ = true; I_ = true; D_ = true;
+		} else {
+			const vector<string> argv = SplitOnWhitespace(param);
+			if (argv.size() != 3) { // runtime check: an assert() vanishes under NDEBUG and argv[i] would then read out of bounds
+				cerr << "LexicalFeatures: expected 3 arguments (translations insertions deletions), got " << argv.size() << endl;
+				abort();
+			}
+			T_ = atoi(argv[0].c_str()) != 0;
+			I_ = atoi(argv[1].c_str()) != 0;
+			D_ = atoi(argv[2].c_str()) != 0;
+			cerr << "T=" << T_ << " I=" << I_ << " D=" << D_ << endl;
+		}
+	}
+	static std::string usage(bool p,bool d) {
+	    return usage_helper("LexicalFeatures","[0/1 0/1 0/1]","Sparse lexical word translation indicator features. If arguments are supplied, specify like this: translations insertions deletions",p,d);
+	}
+protected:
+	virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const HG::Edge& edge, const std::vector<const void*>& ant_contexts,
+			SparseVector<double>* features, SparseVector<double>* estimated_features, void* context) const;
+	virtual void PrepareForInput(const SentenceMetadata& smeta);
+private:
+	mutable std::map<const TRule*, SparseVector<double> > rule2feats_; // per-rule feature cache: filled by TraversalFeaturesImpl, emptied by PrepareForInput
+	bool T_; // emit LT: (translation) features
+	bool I_; // emit LI: (insertion) features
+	bool D_; // emit LD: (deletion) features
+};
+
+inline void LexicalFeatures::PrepareForInput(const SentenceMetadata& smeta) { // inline: this definition lives in a header
+  rule2feats_.clear(); // drop all cached per-rule feature vectors (cache is keyed by TRule pointer); smeta is unused
+}
+
+// Adds the lexical indicator features of edge.rule_ to *features. A rule's feature vector is built on
+// first use and cached in rule2feats_ (keyed by rule pointer). Every feature fires with 1.0 per occurrence:
+//   LT:<src>:<tgt>  alignment point (translation), if T_
+//   LD:<src>        source terminal without an alignment point (deletion), if D_
+//   LI:<tgt>        target terminal without an alignment point (insertion), if I_
+// smeta, ant_contexts, estimated_features and context are not used here. Marked inline because the definition lives in a header.
+inline void LexicalFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+	const HG::Edge& edge,
+	const std::vector<const void*>& ant_contexts,
+	SparseVector<double>* features,
+	SparseVector<double>* estimated_features,
+	void* context) const {
+	map<const TRule*, SparseVector<double> >::iterator it = rule2feats_.find(edge.rule_.get());
+	if (it == rule2feats_.end()) {
+		const TRule& rule = *edge.rule_;
+		it = rule2feats_.insert(make_pair(&rule, SparseVector<double>())).first;
+		SparseVector<double>& f = it->second;
+		std::vector<bool> sf(rule.FLength(), false); // source positions touched by an alignment point
+		std::vector<bool> se(rule.ELength(), false); // target positions touched by an alignment point
+		// translations; coverage is recorded even when T_ is off because the deletion/insertion passes read it
+		for (unsigned i = 0; i < rule.a_.size(); ++i) {
+			const AlignmentPoint& ap = rule.a_[i];
+			const size_t s = static_cast<size_t>(ap.s_);
+			const size_t t = static_cast<size_t>(ap.t_);
+			if (s >= sf.size() || t >= se.size()) continue; // malformed point: skip instead of writing out of bounds
+			sf[s] = true;
+			se[t] = true;
+			if (!T_) continue; // feature type disabled: do not build or register the name
+			ostringstream os;
+			os << "LT:" << Escape(TD::Convert(rule.f_[s])) << ":" << Escape(TD::Convert(rule.e_[t]));
+			const int fid = FD::Convert(os.str());
+			if (fid > 0) f.add_value(fid, 1.0);
+		}
+		// word deletions
+		for (unsigned i = 0; i < sf.size(); ++i) {
+			if (!D_ || sf[i] || rule.f_[i] <= 0) continue; // disabled, aligned, or not a terminal
+			ostringstream os;
+			os << "LD:" << Escape(TD::Convert(rule.f_[i]));
+			const int fid = FD::Convert(os.str());
+			if (fid > 0) f.add_value(fid, 1.0);
+		}
+		// word insertions
+		for (unsigned i = 0; i < se.size(); ++i) {
+			if (!I_ || se[i] || rule.e_[i] < 1) continue; // disabled, aligned, or not a terminal
+			ostringstream os;
+			os << "LI:" << Escape(TD::Convert(rule.e_[i]));
+			const int fid = FD::Convert(os.str());
+			if (fid > 0) f.add_value(fid, 1.0);
+		}
+	}
+	(*features) += it->second;
+}
+
+
+#endif
diff --git a/training/dtrain/examples/standard/cdec.ini b/training/dtrain/examples/standard/cdec.ini
index 6cba9e1e..044ae2f5 100644
--- a/training/dtrain/examples/standard/cdec.ini
+++ b/training/dtrain/examples/standard/cdec.ini
@@ -21,7 +21,8 @@ feature_function=RuleIdentityFeatures
 feature_function=RuleSourceBigramFeatures
 feature_function=RuleTargetBigramFeatures
 feature_function=RuleShape
-feature_function=RuleWordAlignmentFeatures
+#feature_function=RuleWordAlignmentFeatures
+feature_function=LexicalFeatures 1 1 1
 #feature_function=SourceSpanSizeFeatures
 #feature_function=SourceWordPenalty
 #feature_function=SpanFeatures
diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output
index fa831221..2460cfbb 100644
--- a/training/dtrain/examples/standard/expected-output
+++ b/training/dtrain/examples/standard/expected-output
@@ -4,7 +4,8 @@ Reading ./nc-wmt11.en.srilm.gz
 ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
 ****************************************************************************************************
   Example feature: Shape_S00000_T00000
-Seeding random number sequence to 4138446869
+T=1 I=1 D=1
+Seeding random number sequence to 2327685089
 
 dtrain
 Parameters:
@@ -36,87 +37,87 @@ Iteration #1 of 3.
  . 10
 Stopping after 10 input sentences.
 WEIGHTS
-              Glue = -80.3
-       WordPenalty = -51.247
-     LanguageModel = +282.46
- LanguageModel_OOV = -85.8
-     PhraseModel_0 = -100.06
-     PhraseModel_1 = -98.692
-     PhraseModel_2 = -9.4958
-     PhraseModel_3 = +18.535
-     PhraseModel_4 = +62.35
-     PhraseModel_5 = +7
-     PhraseModel_6 = +31.4
-       PassThrough = -126.5
+              Glue = +6.9
+       WordPenalty = -46.426
+     LanguageModel = +535.12
+ LanguageModel_OOV = -123.5
+     PhraseModel_0 = -160.73
+     PhraseModel_1 = -350.13
+     PhraseModel_2 = -187.81
+     PhraseModel_3 = +172.04
+     PhraseModel_4 = +0.90108
+     PhraseModel_5 = +21.6
+     PhraseModel_6 = +67.2
+       PassThrough = -149.7
         ---
-       1best avg score: 0.25631 (+0.25631)
- 1best avg model score: -4843.6 (-4843.6)
-           avg # pairs: 744.4
+       1best avg score: 0.23327 (+0.23327)
+ 1best avg model score: -9084.9 (-9084.9)
+           avg # pairs: 780.7
         avg # rank err: 0 (meaningless)
      avg # margin viol: 0
        k-best loss imp: 100%
-    non0 feature count: 1274
+    non0 feature count: 1389
            avg list sz: 91.3
-           avg f count: 143.72
-(time 0.4 min, 2.4 s/S)
+           avg f count: 146.2
+(time 0.37 min, 2.2 s/S)
 
 Iteration #2 of 3.
  . 10
 WEIGHTS
-              Glue = -117.4
-       WordPenalty = -99.584
-     LanguageModel = +395.05
- LanguageModel_OOV = -136.8
-     PhraseModel_0 = +40.614
-     PhraseModel_1 = -123.29
-     PhraseModel_2 = -152
-     PhraseModel_3 = -161.13
-     PhraseModel_4 = -76.379
-     PhraseModel_5 = +39.1
-     PhraseModel_6 = +137.7
-       PassThrough = -162.1
+              Glue = -43
+       WordPenalty = -22.019
+     LanguageModel = +591.53
+ LanguageModel_OOV = -252.1
+     PhraseModel_0 = -120.21
+     PhraseModel_1 = -43.589
+     PhraseModel_2 = +73.53
+     PhraseModel_3 = +113.7
+     PhraseModel_4 = -223.81
+     PhraseModel_5 = +64
+     PhraseModel_6 = +54.8
+       PassThrough = -331.1
         ---
-       1best avg score: 0.26751 (+0.011198)
- 1best avg model score: -10061 (-5216.9)
-           avg # pairs: 639.1
+       1best avg score: 0.29568 (+0.062413)
+ 1best avg model score: -15879 (-6794.1)
+           avg # pairs: 566.1
         avg # rank err: 0 (meaningless)
      avg # margin viol: 0
        k-best loss imp: 100%
-    non0 feature count: 1845
+    non0 feature count: 1931
            avg list sz: 91.3
-           avg f count: 139.88
-(time 0.35 min, 2.1 s/S)
+           avg f count: 139.89
+(time 0.33 min, 2 s/S)
 
 Iteration #3 of 3.
  . 10
 WEIGHTS
-              Glue = -101.1
-       WordPenalty = -139.97
-     LanguageModel = +327.98
- LanguageModel_OOV = -234.7
-     PhraseModel_0 = -144.49
-     PhraseModel_1 = -263.88
-     PhraseModel_2 = -149.25
-     PhraseModel_3 = -38.805
-     PhraseModel_4 = +50.575
-     PhraseModel_5 = -52.4
-     PhraseModel_6 = +41.6
-       PassThrough = -230.2
+              Glue = -44.3
+       WordPenalty = -131.85
+     LanguageModel = +230.91
+ LanguageModel_OOV = -285.4
+     PhraseModel_0 = -194.27
+     PhraseModel_1 = -294.83
+     PhraseModel_2 = -92.043
+     PhraseModel_3 = -140.24
+     PhraseModel_4 = +85.613
+     PhraseModel_5 = +238.1
+     PhraseModel_6 = +158.7
+       PassThrough = -359.6
         ---
-       1best avg score: 0.36222 (+0.094717)
- 1best avg model score: -17416 (-7355.5)
-           avg # pairs: 661.2
+       1best avg score: 0.37375 (+0.078067)
+ 1best avg model score: -14519 (+1359.7)
+           avg # pairs: 545.4
         avg # rank err: 0 (meaningless)
      avg # margin viol: 0
        k-best loss imp: 100%
-    non0 feature count: 2163
+    non0 feature count: 2218
            avg list sz: 91.3
-           avg f count: 132.53
-(time 0.33 min, 2 s/S)
+           avg f count: 137.77
+(time 0.35 min, 2.1 s/S)
 
 Writing weights file to '-' ...
 done
 
 ---
-Best iteration: 3 [SCORE 'fixed_stupid_bleu'=0.36222].
-This took 1.0833 min.
+Best iteration: 3 [SCORE 'fixed_stupid_bleu'=0.37375].
+This took 1.05 min.
author	Patrick Simianer <p@simianer.de>	2014-01-13 17:15:24 +0100
committer	Patrick Simianer <p@simianer.de>	2014-01-13 17:15:24 +0100
commit	411ad2eb4a09ef04a7529bc1a178e83f624c6569 (patch)
tree	d34f3ef450614f061bd87ac6229e833f155e3abb
parent	757ffb75e9a860ccd10843dcf099787cefec781a (diff)