diff options
Diffstat (limited to 'dtrain/test')
| -rw-r--r-- | dtrain/test/example/cdec.ini | 21 | ||||
| -rw-r--r-- | dtrain/test/example/dtrain.ini | 20 | ||||
| -rw-r--r-- | dtrain/test/example/nc-wmt11.1k.gz | bin | 0 -> 21185883 bytes | |||
| -rw-r--r-- | dtrain/test/example/nc-wmt11.en.srilm.gz | bin | 0 -> 16017291 bytes | |||
| -rw-r--r-- | dtrain/test/logreg_cd/bin_class.cc | 4 | ||||
| -rw-r--r-- | dtrain/test/logreg_cd/bin_class.h | 22 | ||||
| -rw-r--r-- | dtrain/test/logreg_cd/log_reg.cc | 39 | ||||
| -rw-r--r-- | dtrain/test/logreg_cd/log_reg.h | 14 | ||||
| -rw-r--r-- | dtrain/test/test.in | 3 | ||||
| -rw-r--r-- | dtrain/test/toy/cdec.ini | 2 | ||||
| -rw-r--r-- | dtrain/test/toy/dtrain.ini | 11 | ||||
| -rw-r--r-- | dtrain/test/toy/in | 2 | 
12 files changed, 138 insertions, 0 deletions
| diff --git a/dtrain/test/example/cdec.ini b/dtrain/test/example/cdec.ini new file mode 100644 index 00000000..d88779fa --- /dev/null +++ b/dtrain/test/example/cdec.ini @@ -0,0 +1,21 @@ +formalism=scfg +add_pass_through_rules=true +scfg_max_span_limit=15 +intersection_strategy=cube_pruning +cubepruning_pop_limit=30 +feature_function=WordPenalty +feature_function=KLanguageModel test/example/nc-wmt11.en.srilm.gz +#feature_function=ArityPenalty +#feature_function=CMR2008ReorderingFeatures +#feature_function=InputIndicator +#feature_function=LexNullJump +#feature_function=NewJump +#feature_function=NgramFeatures +#feature_function=NonLatinCount +#feature_function=OutputIndicator +#feature_function=RuleIdentityFeatures +#feature_function=RuleNgramFeatures +#feature_function=RuleShape +#feature_function=SourceSpanSizeFeatures +#feature_function=SourceWordPenalty +#feature_function=SpanFeatures diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini new file mode 100644 index 00000000..9b9f45e7 --- /dev/null +++ b/dtrain/test/example/dtrain.ini @@ -0,0 +1,20 @@ +input=test/example/nc-wmt11.1k.gz    # use '-' for stdin +output=-                             # a weights file +decoder_config=test/example/cdec.ini # a ini for cdec +# these will be printed on each iteration +print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough +tmp=/tmp +#stop_after=10 + +# interesting stuff +epochs=10 +k=100 +N=4 +learning_rate=0.0001 +gamma=0.00001 +scorer=stupid_bleu +sample_from=kbest +filter=uniq +pair_sampling=108010 +pair_threshold=0 +select_weights=VOID diff --git a/dtrain/test/example/nc-wmt11.1k.gz b/dtrain/test/example/nc-wmt11.1k.gzBinary files differ new file mode 100644 index 00000000..45496cd8 --- /dev/null +++ b/dtrain/test/example/nc-wmt11.1k.gz diff --git a/dtrain/test/example/nc-wmt11.en.srilm.gz b/dtrain/test/example/nc-wmt11.en.srilm.gzBinary files differ new file mode 100644 index 00000000..7ce81057 --- /dev/null +++ b/dtrain/test/example/nc-wmt11.en.srilm.gz diff --git a/dtrain/test/logreg_cd/bin_class.cc b/dtrain/test/logreg_cd/bin_class.cc new file mode 100644 index 00000000..19bcde25 --- /dev/null +++ b/dtrain/test/logreg_cd/bin_class.cc @@ -0,0 +1,4 @@ +#include "bin_class.h" + +Objective::~Objective() {} + diff --git a/dtrain/test/logreg_cd/bin_class.h b/dtrain/test/logreg_cd/bin_class.h new file mode 100644 index 00000000..3466109a --- /dev/null +++ b/dtrain/test/logreg_cd/bin_class.h @@ -0,0 +1,22 @@ +#ifndef _BIN_CLASS_H_ +#define _BIN_CLASS_H_ + +#include <vector> +#include "sparse_vector.h" + +struct TrainingInstance { +  // TODO add other info? loss for MIRA-type updates? +  SparseVector<double> x_feature_map; +  bool y; +}; + +struct Objective { +  virtual ~Objective(); + +  // returns f(x) and f'(x) +  virtual double ObjectiveAndGradient(const SparseVector<double>& x, +                  const std::vector<TrainingInstance>& training_instances, +                  SparseVector<double>* g) const = 0; +}; + +#endif diff --git a/dtrain/test/logreg_cd/log_reg.cc b/dtrain/test/logreg_cd/log_reg.cc new file mode 100644 index 00000000..ec2331fe --- /dev/null +++ b/dtrain/test/logreg_cd/log_reg.cc @@ -0,0 +1,39 @@ +#include "log_reg.h" + +#include <vector> +#include <cmath> + +#include "sparse_vector.h" + +using namespace std; + +double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x, +                              const vector<TrainingInstance>& training_instances, +                              SparseVector<double>* g) const { +  double cll = 0; +  for (int i = 0; i < training_instances.size(); ++i) { +    const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0] +    double lp_false = dotprod; +    double lp_true = -dotprod; +    if (0 < lp_true) { +      lp_true += log1p(exp(-lp_true)); +      lp_false = log1p(exp(lp_false)); +    } else { +      lp_true = log1p(exp(lp_true)); +      lp_false += log1p(exp(-lp_false)); +    } +    lp_true *= -1; +    lp_false *= -1; +    if (training_instances[i].y) {  // true label +      cll -= lp_true; +      (*g) -= training_instances[i].x_feature_map * exp(lp_false); +      // (*g)[0] -= exp(lp_false); // bias +    } else {                  // false label +      cll -= lp_false; +      (*g) += training_instances[i].x_feature_map * exp(lp_true); +      // g += corpus[i].second * exp(lp_true); +    } +  } +  return cll; +} + diff --git a/dtrain/test/logreg_cd/log_reg.h b/dtrain/test/logreg_cd/log_reg.h new file mode 100644 index 00000000..ecc560b8 --- /dev/null +++ b/dtrain/test/logreg_cd/log_reg.h @@ -0,0 +1,14 @@ +#ifndef _LOG_REG_H_ +#define _LOG_REG_H_ + +#include <vector> +#include "sparse_vector.h" +#include "bin_class.h" + +struct LogisticRegression : public Objective { +  double ObjectiveAndGradient(const SparseVector<double>& x, +                              const std::vector<TrainingInstance>& training_instances, +                              SparseVector<double>* g) const; +}; + +#endif diff --git a/dtrain/test/test.in b/dtrain/test/test.in new file mode 100644 index 00000000..4f53335e --- /dev/null +++ b/dtrain/test/test.in @@ -0,0 +1,3 @@ +0	vorrichtung	means	[X] ||| vorrichtung ||| apparatus ||| LogP=0 ||| 0-0 __NEXT_RULE__ [X] ||| vorrichtung ||| means ||| LogP=-100 ||| 0-0 +1	Test	test	[X] ||| Test ||| test ||| LogP=0 ||| 0-0 __NEXT_RULE__ [X] ||| Test ||| xxx ||| LogP=-100 ||| 0-0 +2	kaputt	broken	 diff --git a/dtrain/test/toy/cdec.ini b/dtrain/test/toy/cdec.ini new file mode 100644 index 00000000..98b02d44 --- /dev/null +++ b/dtrain/test/toy/cdec.ini @@ -0,0 +1,2 @@ +formalism=scfg +add_pass_through_rules=true diff --git a/dtrain/test/toy/dtrain.ini b/dtrain/test/toy/dtrain.ini new file mode 100644 index 00000000..3548bbb6 --- /dev/null +++ b/dtrain/test/toy/dtrain.ini @@ -0,0 +1,11 @@ +decoder_config=test/toy/cdec.ini +input=test/toy/in +output=- +print_weights=logp use_shell use_house PassThrough + +k=4 +N=3 +epochs=2 +scorer=stupid_bleu +sample_from=kbest +filter=uniq diff --git a/dtrain/test/toy/in b/dtrain/test/toy/in new file mode 100644 index 00000000..d7b7d080 --- /dev/null +++ b/dtrain/test/toy/in @@ -0,0 +1,2 @@ +0	ich sah ein kleines haus	i saw a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0	[NP] ||| ich ||| i ||| logp=0	[NP] ||| ein [NN,1] ||| a [1] ||| logp=0	[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1	[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1	[JJ] ||| kleines ||| small ||| logp=0	[JJ] ||| kleines ||| little ||| logp=0	[JJ] ||| grosses ||| big ||| logp=0	[JJ] ||| grosses ||| large ||| logp=0	[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0	[V] ||| sah ||| saw ||| logp=0	[V] ||| fand ||| found ||| logp=0 +1	ich fand ein grosses haus	i found a large house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0	[NP] ||| ich ||| i ||| logp=0	[NP] ||| ein [NN,1] ||| a [1] ||| logp=0	[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1	[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1	[JJ] ||| kleines ||| small ||| logp=0	[JJ] ||| kleines ||| little ||| logp=0	[JJ] ||| grosses ||| big ||| logp=0	[JJ] ||| grosses ||| large ||| logp=0	[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0	[V] ||| sah ||| saw ||| logp=0	[V] ||| fand ||| found ||| logp=0 | 
