summaryrefslogtreecommitdiff
path: root/dtrain/test
diff options
context:
space:
mode:
Diffstat (limited to 'dtrain/test')
-rw-r--r--dtrain/test/example/cdec.ini21
-rw-r--r--dtrain/test/example/dtrain.ini20
-rw-r--r--dtrain/test/example/nc-wmt11.1k.gzbin0 -> 21185883 bytes
-rw-r--r--dtrain/test/example/nc-wmt11.en.srilm.gzbin0 -> 16017291 bytes
-rw-r--r--dtrain/test/logreg_cd/bin_class.cc4
-rw-r--r--dtrain/test/logreg_cd/bin_class.h22
-rw-r--r--dtrain/test/logreg_cd/log_reg.cc39
-rw-r--r--dtrain/test/logreg_cd/log_reg.h14
-rw-r--r--dtrain/test/test.in3
-rw-r--r--dtrain/test/toy/cdec.ini2
-rw-r--r--dtrain/test/toy/dtrain.ini11
-rw-r--r--dtrain/test/toy/in2
12 files changed, 138 insertions, 0 deletions
diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini
new file mode 100644
index 00000000..d88779fa
--- /dev/null
+++ b/dtrain/test/example/cdec.ini
@@ -0,0 +1,21 @@
+formalism=scfg
+add_pass_through_rules=true
+scfg_max_span_limit=15
+intersection_strategy=cube_pruning
+cubepruning_pop_limit=30
+feature_function=WordPenalty
+feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz
+#feature_function=ArityPenalty
+#feature_function=CMR2008ReorderingFeatures
+#feature_function=InputIndicator
+#feature_function=LexNullJump
+#feature_function=NewJump
+#feature_function=NgramFeatures
+#feature_function=NonLatinCount
+#feature_function=OutputIndicator
+#feature_function=RuleIdentityFeatures
+#feature_function=RuleNgramFeatures
+#feature_function=RuleShape
+#feature_function=SourceSpanSizeFeatures
+#feature_function=SourceWordPenalty
+#feature_function=SpanFeatures
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
new file mode 100644
index 00000000..9b9f45e7
--- /dev/null
+++ b/dtrain/test/example/dtrain.ini
@@ -0,0 +1,20 @@
+input=test/example/nc-wmt11.1k.gz # use '-' for stdin
+output=- # a weights file
+decoder_config=test/example/cdec.ini # an ini file for cdec
+# the weights of these features will be printed on each iteration
+print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
+tmp=/tmp
+#stop_after=10
+
+# interesting stuff
+epochs=10
+k=100
+N=4
+learning_rate=0.0001
+gamma=0.00001
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
+pair_sampling=108010
+pair_threshold=0
+select_weights=VOID
diff --git a/dtrain/test/example/nc-wmt11.1k.gz b/dtrain/test/example/nc-wmt11.1k.gz
new file mode 100644
index 00000000..45496cd8
--- /dev/null
+++ b/dtrain/test/example/nc-wmt11.1k.gz
Binary files differ
diff --git a/dtrain/test/example/nc-wmt11.en.srilm.gz b/dtrain/test/example/nc-wmt11.en.srilm.gz
new file mode 100644
index 00000000..7ce81057
--- /dev/null
+++ b/dtrain/test/example/nc-wmt11.en.srilm.gz
Binary files differ
diff --git a/dtrain/test/logreg_cd/bin_class.cc b/dtrain/test/logreg_cd/bin_class.cc
new file mode 100644
index 00000000..19bcde25
--- /dev/null
+++ b/dtrain/test/logreg_cd/bin_class.cc
@@ -0,0 +1,4 @@
+#include "bin_class.h"
+
+Objective::~Objective() {} // out-of-line (empty) definition of the virtual destructor declared in bin_class.h
+
diff --git a/dtrain/test/logreg_cd/bin_class.h b/dtrain/test/logreg_cd/bin_class.h
new file mode 100644
index 00000000..3466109a
--- /dev/null
+++ b/dtrain/test/logreg_cd/bin_class.h
@@ -0,0 +1,22 @@
+#ifndef _BIN_CLASS_H_
+#define _BIN_CLASS_H_
+
+#include <vector>
+#include "sparse_vector.h"
+
+struct TrainingInstance { // one labeled training example: a sparse feature vector plus a boolean label
+ // TODO add other info? loss for MIRA-type updates?
+ SparseVector<double> x_feature_map; // sparse features of this instance (dotted with the weight vector in log_reg.cc)
+ bool y; // label; log_reg.cc treats true as the "true label" case and false as the "false label" case
+};
+
+struct Objective { // abstract interface for an objective function evaluated over a set of training instances
+ virtual ~Objective(); // virtual destructor; defined in bin_class.cc
+
+ // Returns the objective value f(x); g is the output parameter for the gradient f'(x).
+ virtual double ObjectiveAndGradient(const SparseVector<double>& x, // point (weight vector) at which to evaluate
+ const std::vector<TrainingInstance>& training_instances, // labeled examples the objective is computed over
+ SparseVector<double>* g) const = 0; // pure virtual; the implementation in log_reg.cc adds to *g without clearing it first
+};
+
+#endif
diff --git a/dtrain/test/logreg_cd/log_reg.cc b/dtrain/test/logreg_cd/log_reg.cc
new file mode 100644
index 00000000..ec2331fe
--- /dev/null
+++ b/dtrain/test/logreg_cd/log_reg.cc
@@ -0,0 +1,39 @@
+#include "log_reg.h"
+
+#include <vector>
+#include <cmath>
+
+#include "sparse_vector.h"
+
+using namespace std;
+
+// Computes the negative conditional log-likelihood of the training data under
+// a logistic regression model with weights x, and accumulates its gradient.
+//
+// x: weight vector; the score of an instance is x_feature_map.dot(x).
+// training_instances: labeled examples; y == true is the positive class.
+// g: output, must be non-null. The gradient is added to *g; *g is not cleared
+//    here, so the caller decides what value it starts from.
+// Returns the sum over all instances of -log p(y | features).
+double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x,
+    const vector<TrainingInstance>& training_instances,
+    SparseVector<double>* g) const {
+  double cll = 0;
+  // size() is unsigned: use an unsigned index instead of int to avoid a
+  // signed/unsigned comparison and overflow past INT_MAX instances.
+  for (size_t i = 0; i < training_instances.size(); ++i) {
+    const TrainingInstance& inst = training_instances[i];
+    const double dotprod = inst.x_feature_map.dot(x); // TODO no bias, if bias, add x[0]
+    // Compute log p(true) = log sigmoid(dotprod) and log p(false) = log sigmoid(-dotprod).
+    // Each branch only exponentiates a non-positive value, so exp() cannot overflow.
+    double lp_false = dotprod;
+    double lp_true = -dotprod;
+    if (0 < lp_true) {
+      lp_true += log1p(exp(-lp_true));
+      lp_false = log1p(exp(lp_false));
+    } else {
+      lp_true = log1p(exp(lp_true));
+      lp_false += log1p(exp(-lp_false));
+    }
+    // Up to here both hold -log p(.); flip the sign to get log-probabilities.
+    lp_true *= -1;
+    lp_false *= -1;
+    if (inst.y) { // true label
+      cll -= lp_true;
+      // d/dx of -log p(true) is -p(false) * features
+      (*g) -= inst.x_feature_map * exp(lp_false);
+      // (*g)[0] -= exp(lp_false); // bias
+    } else { // false label
+      cll -= lp_false;
+      // d/dx of -log p(false) is +p(true) * features
+      (*g) += inst.x_feature_map * exp(lp_true);
+    }
+  }
+  return cll;
+}
+
diff --git a/dtrain/test/logreg_cd/log_reg.h b/dtrain/test/logreg_cd/log_reg.h
new file mode 100644
index 00000000..ecc560b8
--- /dev/null
+++ b/dtrain/test/logreg_cd/log_reg.h
@@ -0,0 +1,14 @@
+#ifndef _LOG_REG_H_
+#define _LOG_REG_H_
+
+#include <vector>
+#include "sparse_vector.h"
+#include "bin_class.h"
+
+struct LogisticRegression : public Objective { // Objective implementation whose ObjectiveAndGradient is defined in log_reg.cc
+ double ObjectiveAndGradient(const SparseVector<double>& x, // weight vector; dotted with each instance's x_feature_map
+ const std::vector<TrainingInstance>& training_instances, // labeled examples that are summed over
+ SparseVector<double>* g) const; // gradient contributions are added to *g (it is dereferenced unconditionally in log_reg.cc)
+};
+
+#endif
diff --git a/dtrain/test/test.in b/dtrain/test/test.in
new file mode 100644
index 00000000..4f53335e
--- /dev/null
+++ b/dtrain/test/test.in
@@ -0,0 +1,3 @@
+0 vorrichtung means [X] ||| vorrichtung ||| apparatus ||| LogP=0 ||| 0-0 __NEXT_RULE__ [X] ||| vorrichtung ||| means ||| LogP=-100 ||| 0-0
+1 Test test [X] ||| Test ||| test ||| LogP=0 ||| 0-0 __NEXT_RULE__ [X] ||| Test ||| xxx ||| LogP=-100 ||| 0-0
+2 kaputt broken
diff --git a/dtrain/test/toy/cdec.ini b/dtrain/test/toy/cdec.ini
new file mode 100644
index 00000000..98b02d44
--- /dev/null
+++ b/dtrain/test/toy/cdec.ini
@@ -0,0 +1,2 @@
+formalism=scfg
+add_pass_through_rules=true
diff --git a/dtrain/test/toy/dtrain.ini b/dtrain/test/toy/dtrain.ini
new file mode 100644
index 00000000..3548bbb6
--- /dev/null
+++ b/dtrain/test/toy/dtrain.ini
@@ -0,0 +1,11 @@
+decoder_config=test/toy/cdec.ini
+input=test/toy/in
+output=-
+print_weights=logp use_shell use_house PassThrough
+
+k=4
+N=3
+epochs=2
+scorer=stupid_bleu
+sample_from=kbest
+filter=uniq
diff --git a/dtrain/test/toy/in b/dtrain/test/toy/in
new file mode 100644
index 00000000..d7b7d080
--- /dev/null
+++ b/dtrain/test/toy/in
@@ -0,0 +1,2 @@
+0 ich sah ein kleines haus i saw a little house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 [JJ] ||| kleines ||| small ||| logp=0 [JJ] ||| kleines ||| little ||| logp=0 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0
+1 ich fand ein grosses haus i found a large house [S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 [NP] ||| ich ||| i ||| logp=0 [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 [JJ] ||| kleines ||| small ||| logp=0 [JJ] ||| kleines ||| little ||| logp=0 [JJ] ||| grosses ||| big ||| logp=0 [JJ] ||| grosses ||| large ||| logp=0 [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 [V] ||| sah ||| saw ||| logp=0 [V] ||| fand ||| found ||| logp=0