diff options
Diffstat (limited to 'dtrain')
| -rw-r--r-- | dtrain/README | 15 | ||||
| -rw-r--r-- | dtrain/dtrain.cc | 87 | ||||
| -rw-r--r-- | dtrain/kbestget.h | 12 | ||||
| -rwxr-xr-x | dtrain/run.sh | 8 | ||||
| -rw-r--r-- | dtrain/sample.h | 52 | ||||
| -rw-r--r-- | dtrain/score.h | 16 | ||||
| -rw-r--r-- | dtrain/test/EXAMPLE/cdec.ini | 1 | ||||
| -rw-r--r-- | dtrain/test/EXAMPLE/dtrain.ini | 6 | ||||
| -rw-r--r-- | dtrain/test/log_reg/bin_class.cc | 4 | ||||
| -rw-r--r-- | dtrain/test/log_reg/bin_class.h | 22 | ||||
| -rw-r--r-- | dtrain/test/log_reg/log_reg.cc | 39 | ||||
| -rw-r--r-- | dtrain/test/log_reg/log_reg.h | 14 | ||||
| -rw-r--r-- | dtrain/test/nc-wmt11/dtrain.ini | 2 | ||||
| -rw-r--r-- | dtrain/test/toy.dtrain.ini | 3 | ||||
| -rw-r--r-- | dtrain/test/toy.in | 4 | ||||
| -rw-r--r-- | dtrain/test/toy_cdec/cdec.ini | 3 | ||||
| -rw-r--r-- | dtrain/test/toy_cdec/grammar | 12 | ||||
| -rw-r--r-- | dtrain/test/toy_cdec/in | 1 | ||||
| -rw-r--r-- | dtrain/test/toy_cdec/weights | 2 | 
19 files changed, 238 insertions, 65 deletions
| diff --git a/dtrain/README b/dtrain/README index 74bac6a0..b3f513be 100644 --- a/dtrain/README +++ b/dtrain/README @@ -1,7 +1,7 @@  NOTES   learner gets all used features (binary! and dense (logprob is sum of logprobs!))   weights: see decoder/decoder.cc line 548 - 40k sents, k=100 = ~400M mem, 1 iteration 45min + (40k sents, k=100 = ~400M mem, 1 iteration 45min)?   utils/weights.cc: why wv_?   FD, Weights::wv_ grow too large, see utils/weights.cc;       decoder/hg.h; decoder/scfg_translator.cc; utils/fdict.cc @@ -15,25 +15,26 @@ TODO   GENERATED data? (multi-task, ability to learn, perfect translation in nbest, at first all modelscore 1)   CACHING (ngrams for scoring)   hadoop PIPES imlementation - SHARED LM? + SHARED LM (kenlm actually does this!)?   ITERATION variants    once -> average    shuffle resulting weights   weights AVERAGING in reducer (global Ngram counts)   BATCH implementation (no update after each Kbest list) - SOFIA --eta_type explicit   set REFERENCE for cdec (rescoring)?   MORE THAN ONE reference for BLEU?   kbest NICER (do not iterate twice)!? -> shared_ptr?   DO NOT USE Decoder::Decode (input caching as WordID)!?    sparse vector instead of vector<double> for weights in Decoder(::SetWeights)?   reactivate DTEST and tests - non deterministic, high variance, RANDOWM RESTARTS + non deterministic, high variance, RANDOM RESTARTS   use separate TEST SET  KNOWN BUGS PROBLEMS - does probably OVERFIT - cdec kbest vs 1best (no -k param) fishy! + cdec kbest vs 1best (no -k param), rescoring? => ok(?) + no sparse vector in decoder => ok + ? ok   sh: error while loading shared libraries: libreadline.so.6: cannot open shared object file: Error 24 - PhraseModel_* features (0..99 seem to be generated, default?) + PhraseModel_* features (0..99 seem to be generated, why 99?) + flex scanner jams on malicious input, we could skip that diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 30ced234..4554e417 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,7 +1,7 @@  #include "common.h"  #include "kbestget.h" -#include "updater.h"  #include "util.h" +#include "sample.h"  // boost compression  #include <boost/iostreams/device/file.hpp>  @@ -85,18 +85,21 @@ init(int argc, char** argv, po::variables_map* cfg)  } +// output formatting  ostream& _nopos( ostream& out ) { return out << resetiosflags( ios::showpos ); }  ostream& _pos( ostream& out ) { return out << setiosflags( ios::showpos ); }  ostream& _prec2( ostream& out ) { return out << setprecision(2); }  ostream& _prec5( ostream& out ) { return out << setprecision(5); } + +  /* - * main + * dtrain   *   */  int -main(int argc, char** argv) +main( int argc, char** argv )  {    // handle most parameters    po::variables_map cfg; @@ -202,11 +205,14 @@ main(int argc, char** argv)    bool next = false, stop = false;    double score = 0.;    size_t cand_len = 0; -  Scores scores;    double overall_time = 0.;    cout << setprecision( 5 ); +  // for the perceptron +  double eta = 0.5; // TODO as parameter +  lambdas.add_value( FD::Convert("__bias"), 0 ); +    for ( size_t t = 0; t < T; t++ ) // T epochs    { @@ -278,12 +284,15 @@ main(int argc, char** argv)      weights.InitVector( &dense_weights );      decoder.SetWeights( dense_weights ); +    srand ( time(NULL) ); +      switch ( t ) {        case 0:          // handling input          in_split.clear();          boost::split( in_split, in, boost::is_any_of("\t") );          // in_split[0] is id +        //cout << in_split[0] << endl;          // getting reference          ref_tok.clear(); ref_ids.clear();          boost::split( ref_tok, in_split[2], boost::is_any_of(" ") ); @@ -291,7 +300,7 @@ main(int argc, char** argv)          ref_ids_buf.push_back( ref_ids );          // process and set grammar          //grammar_buf << in_split[3] << endl; -        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; +        grammar_str = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ) + "\n"; // FIXME copy, __          grammar_buf << grammar_str << DTRAIN_GRAMMAR_DELIM << endl;          decoder.SetSentenceGrammarFromString( grammar_str );          // decode, kbest @@ -316,14 +325,16 @@ main(int argc, char** argv)      }      // get kbest list -    KBestList* kb = observer.GetKBest(); +    KBestList* kb; +    //if ( ) { // TODO get from forest +      kb = observer.GetKBest(); +    //}      // scoring kbest -    scores.clear();      if ( t > 0 ) ref_ids = ref_ids_buf[sid]; -    for ( size_t i = 0; i < kb->sents.size(); i++ ) { +    for ( size_t i = 0; i < kb->GetSize(); i++ ) {        NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[i], N ); -      // for approx bleu +      // this is for approx bleu        if ( scorer_str == "approx_bleu" ) {          if ( i == 0 ) { // 'context of 1best translations'            global_counts  += counts; @@ -346,29 +357,54 @@ main(int argc, char** argv)                          kb->sents[i].size(), N, bleu_weights );        } +      kb->scores.push_back( score ); +        if ( i == 0 ) {          acc_1best_score += score; -        acc_1best_model += kb->scores[i]; +        acc_1best_model += kb->model_scores[i];        } -      // scorer score and model score -      ScorePair sp( kb->scores[i], score ); -      scores.push_back( sp ); -        if ( verbose ) { -        cout << "k=" << i+1 << " '" << TD::GetString( ref_ids ) << "'[ref] vs '"; -        cout << _prec5 << _nopos << TD::GetString( kb->sents[i] ) << "'[hyp]"; -        cout << " [SCORE=" << score << ",model="<< kb->scores[i] << "]" << endl; -        //cout << kb->feats[i] << endl; // this is maybe too verbose +        if ( i == 0 ) cout << "'" << TD::GetString( ref_ids ) << "' [ref]" << endl; +        cout << _prec5 << _nopos << "[hyp " << i << "] " << "'" << TD::GetString( kb->sents[i] ) << "'"; +        cout << " [SCORE=" << score << ",model="<< kb->model_scores[i] << "]" << endl; +        cout << kb->feats[i] << endl; // this is maybe too verbose        }      } // Nbest loop +      if ( verbose ) cout << endl; -    // update weights; TODO other updaters + +    // UPDATE WEIGHTS      if ( !noup ) { -      SofiaUpdater updater; -      updater.Init( sid, kb->feats, scores ); -      updater.Update( lambdas ); + +      TrainingInstances pairs; + +      sample_all(kb, pairs); +             +      for ( TrainingInstances::iterator ti = pairs.begin(); +            ti != pairs.end(); ti++ ) { +        // perceptron +        SparseVector<double> dv; +        if ( ti->type == -1 ) { +          dv = ti->second - ti->first; +        } else { +          dv = ti->first - ti->second; +        } +        dv.add_value(FD::Convert("__bias"), -1); +        lambdas += dv * eta; + +        /*if ( verbose ) { +          cout << "{{ f(i) > f(j) but g(i) < g(j), so update" << endl; +          cout << " i  " << TD::GetString(kb->sents[ii]) << endl; +          cout << "    " << kb->feats[ii] << endl; +          cout << " j  " << TD::GetString(kb->sents[jj]) << endl; +          cout << "    " << kb->feats[jj] << endl;  +          cout << " dv " << dv << endl; +          cout << "}}" << endl; +        }*/ +      } +      }      ++sid; @@ -426,7 +462,7 @@ main(int argc, char** argv)    } // outer loop -  //unlink( grammar_buf_tmp_fn ); +  unlink( grammar_buf_tmp_fn );    if ( !noup ) {      if ( !quiet ) cout << endl << "writing weights file '" << cfg["output"].as<string>() << "' ...";      weights.WriteToFile( cfg["output"].as<string>(), true ); @@ -439,11 +475,6 @@ main(int argc, char** argv)      cout << _prec2 << "This took " << overall_time/60. << " min." << endl;    } -  // don't do this with many features... -  /*for ( size_t i = 0; i < FD::NumFeats(); i++ ) { -      cout << FD::Convert(i) << " " << dense_weights[i] << endl; -  }*/ -    return 0;  } diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index bb430b85..ae4588c9 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -14,7 +14,9 @@ namespace dtrain  struct KBestList {    vector<SparseVector<double> > feats;    vector<vector<WordID> > sents; +  vector<double> model_scores;    vector<double> scores; +  size_t GetSize() { return sents.size(); }  }; @@ -52,9 +54,10 @@ struct KBestGetter : public DecoderObserver    void    KBestUnique( const Hypergraph& forest )    { -    kb.scores.clear();      kb.sents.clear();      kb.feats.clear(); +    kb.model_scores.clear(); +    kb.scores.clear();      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest( forest, k_ );      for ( size_t i = 0; i < k_; ++i ) {        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d = @@ -62,16 +65,17 @@ struct KBestGetter : public DecoderObserver        if (!d) break;        kb.sents.push_back( d->yield);        kb.feats.push_back( d->feature_values ); -      kb.scores.push_back( d->score ); +      kb.model_scores.push_back( d->score );      }    }    void    KBestNoFilter( const Hypergraph& forest )    { -    kb.scores.clear();      kb.sents.clear();      kb.feats.clear(); +    kb.model_scores.clear(); +    kb.scores.clear();      KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest( forest, k_ );      for ( size_t i = 0; i < k_; ++i ) {        const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d = @@ -79,7 +83,7 @@ struct KBestGetter : public DecoderObserver        if (!d) break;        kb.sents.push_back( d->yield);        kb.feats.push_back( d->feature_values ); -      kb.scores.push_back( d->score ); +      kb.model_scores.push_back( d->score );      }    }  }; diff --git a/dtrain/run.sh b/dtrain/run.sh index cdaea067..b2012bcf 100755 --- a/dtrain/run.sh +++ b/dtrain/run.sh @@ -1,8 +1,10 @@  #!/bin/sh -INI=test/blunsom08.dtrain.ini -#INI=test/nc-wmt11/nc-wmt11.loo.dtrain.ini +#INI=test/blunsom08.dtrain.ini +#INI=test/nc-wmt11/dtrain.ini +#INI=test/EXAMPLE/dtrain.ini +INI=test/toy.dtrain.ini  rm /tmp/dtrain-* -./dtrain -c $INI $1 $2 $3 $4 2>/dev/null +./dtrain -c $INI $1 $2 $3 $4  diff --git a/dtrain/sample.h b/dtrain/sample.h new file mode 100644 index 00000000..b9bc4461 --- /dev/null +++ b/dtrain/sample.h @@ -0,0 +1,52 @@ +#include "kbestget.h" + + +namespace dtrain +{ + + +struct TPair +{ +  double type; +  SparseVector<double> first; +  SparseVector<double> second; +}; + +typedef vector<TPair> TrainingInstances; + + +void +sample_all( KBestList* kb, TrainingInstances &training ) +{ +  double type; +  for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { +   for ( size_t j = i+1; j < kb->GetSize(); j++ ) { +     if ( kb->scores[i] - kb->scores[j] < 0 ) { +       type = -1;  +     } else { +       type = 1; +     } +     TPair p; +     p.type = type; +     p.first = kb->feats[i]; +     p.second = kb->feats[j]; +     training.push_back( p ); +   } + } +} + +/*void +sample_all_only_neg(, vector<pair<SparSparseVector<double> > pairs) +{ + +} + +void +sample_random_pos() +{ +  if ( rand() % 2 ) { // sample it? +}*/ + + +} // namespace + diff --git a/dtrain/score.h b/dtrain/score.h index 4314157b..e88387c5 100644 --- a/dtrain/score.h +++ b/dtrain/score.h @@ -18,22 +18,6 @@ namespace dtrain  /* - * ScorePair - * - */ -struct ScorePair -{ -  ScorePair(double modelscore, double score) : modelscore_(modelscore), score_(score) {}  -  double modelscore_, score_; -  double GetModelScore() { return modelscore_; } -  double GetScore() { return score_; } -}; - - -typedef vector<ScorePair> Scores; - - -/*   * NgramCounts   *   */ diff --git a/dtrain/test/EXAMPLE/cdec.ini b/dtrain/test/EXAMPLE/cdec.ini index b6e92b5f..e57138b0 100644 --- a/dtrain/test/EXAMPLE/cdec.ini +++ b/dtrain/test/EXAMPLE/cdec.ini @@ -2,5 +2,6 @@ formalism=scfg  add_pass_through_rules=true  feature_function=WordPenalty  cubepruning_pop_limit=30 +feature_function=KLanguageModel data/nc-wmt11.en.srilm.gz  scfg_max_span_limit=15 diff --git a/dtrain/test/EXAMPLE/dtrain.ini b/dtrain/test/EXAMPLE/dtrain.ini index 1467b332..ffafd0b8 100644 --- a/dtrain/test/EXAMPLE/dtrain.ini +++ b/dtrain/test/EXAMPLE/dtrain.ini @@ -1,10 +1,10 @@  decoder_config=test/EXAMPLE/cdec.ini  kbest=100  ngrams=3 -epochs=22 +epochs=8  input=test/EXAMPLE/dtrain.nc-1k  scorer=approx_bleu  output=test/EXAMPLE/weights.gz -stop_after=5 -wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 +stop_after=1000 +wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough diff --git a/dtrain/test/log_reg/bin_class.cc b/dtrain/test/log_reg/bin_class.cc new file mode 100644 index 00000000..19bcde25 --- /dev/null +++ b/dtrain/test/log_reg/bin_class.cc @@ -0,0 +1,4 @@ +#include "bin_class.h" + +Objective::~Objective() {} + diff --git a/dtrain/test/log_reg/bin_class.h b/dtrain/test/log_reg/bin_class.h new file mode 100644 index 00000000..3466109a --- /dev/null +++ b/dtrain/test/log_reg/bin_class.h @@ -0,0 +1,22 @@ +#ifndef _BIN_CLASS_H_ +#define _BIN_CLASS_H_ + +#include <vector> +#include "sparse_vector.h" + +struct TrainingInstance { +  // TODO add other info? loss for MIRA-type updates? +  SparseVector<double> x_feature_map; +  bool y; +}; + +struct Objective { +  virtual ~Objective(); + +  // returns f(x) and f'(x) +  virtual double ObjectiveAndGradient(const SparseVector<double>& x, +                  const std::vector<TrainingInstance>& training_instances, +                  SparseVector<double>* g) const = 0; +}; + +#endif diff --git a/dtrain/test/log_reg/log_reg.cc b/dtrain/test/log_reg/log_reg.cc new file mode 100644 index 00000000..ec2331fe --- /dev/null +++ b/dtrain/test/log_reg/log_reg.cc @@ -0,0 +1,39 @@ +#include "log_reg.h" + +#include <vector> +#include <cmath> + +#include "sparse_vector.h" + +using namespace std; + +double LogisticRegression::ObjectiveAndGradient(const SparseVector<double>& x, +                              const vector<TrainingInstance>& training_instances, +                              SparseVector<double>* g) const { +  double cll = 0; +  for (int i = 0; i < training_instances.size(); ++i) { +    const double dotprod = training_instances[i].x_feature_map.dot(x); // TODO no bias, if bias, add x[0] +    double lp_false = dotprod; +    double lp_true = -dotprod; +    if (0 < lp_true) { +      lp_true += log1p(exp(-lp_true)); +      lp_false = log1p(exp(lp_false)); +    } else { +      lp_true = log1p(exp(lp_true)); +      lp_false += log1p(exp(-lp_false)); +    } +    lp_true *= -1; +    lp_false *= -1; +    if (training_instances[i].y) {  // true label +      cll -= lp_true; +      (*g) -= training_instances[i].x_feature_map * exp(lp_false); +      // (*g)[0] -= exp(lp_false); // bias +    } else {                  // false label +      cll -= lp_false; +      (*g) += training_instances[i].x_feature_map * exp(lp_true); +      // g += corpus[i].second * exp(lp_true); +    } +  } +  return cll; +} + diff --git a/dtrain/test/log_reg/log_reg.h b/dtrain/test/log_reg/log_reg.h new file mode 100644 index 00000000..ecc560b8 --- /dev/null +++ b/dtrain/test/log_reg/log_reg.h @@ -0,0 +1,14 @@ +#ifndef _LOG_REG_H_ +#define _LOG_REG_H_ + +#include <vector> +#include "sparse_vector.h" +#include "bin_class.h" + +struct LogisticRegression : public Objective { +  double ObjectiveAndGradient(const SparseVector<double>& x, +                              const std::vector<TrainingInstance>& training_instances, +                              SparseVector<double>* g) const; +}; + +#endif diff --git a/dtrain/test/nc-wmt11/dtrain.ini b/dtrain/test/nc-wmt11/dtrain.ini index 51033f2d..ddbf5da7 100644 --- a/dtrain/test/nc-wmt11/dtrain.ini +++ b/dtrain/test/nc-wmt11/dtrain.ini @@ -2,7 +2,7 @@ decoder_config=test/nc-wmt11/cdec.ini  kbest=100  ngrams=3  epochs=8 -input=data/nc-wmt11.loo.localf.p0.500.rule-id #nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.localf.p0 +input=data/nc-wmt11.loo.localf.p0.500.rule-id  scorer=approx_bleu  output=data/w/nc-wmt11.loo.p0.weights.gz  #stop_after=100 diff --git a/dtrain/test/toy.dtrain.ini b/dtrain/test/toy.dtrain.ini index cacb3a2c..35f76281 100644 --- a/dtrain/test/toy.dtrain.ini +++ b/dtrain/test/toy.dtrain.ini @@ -2,8 +2,9 @@ decoder_config=test/cdec.ini  kbest=4  ngrams=1  epochs=3 -input=data/in.toy +input=test/toy.in  scorer=bleu  output=toy.gz  #stop_after=1000 +wprint=logp use_shell use_house PassThrough diff --git a/dtrain/test/toy.in b/dtrain/test/toy.in index 63f97158..989a1f77 100644 --- a/dtrain/test/toy.in +++ b/dtrain/test/toy.in @@ -1,2 +1,2 @@ -0	ich sah ein kleines haus	i saw a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 -1	ich fand ein grosses haus	i found a large house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT_RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT_RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT_RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT_RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT_RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT_RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT_RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT_RULE__ [V] ||| fand ||| found ||| logp=0 +0	ich sah ein kleines haus	i saw a little house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0 +1	ich fand ein grosses haus	i found a large house	[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [NP] ||| ich ||| i ||| logp=0 __NEXT__RULE__ [NP] ||| ein [NN,1] ||| a [1] ||| logp=0 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 __NEXT__RULE__ [NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 __NEXT__RULE__ [JJ] ||| kleines ||| small ||| logp=0 __NEXT__RULE__ [JJ] ||| kleines ||| little ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| big ||| logp=0 __NEXT__RULE__ [JJ] ||| grosses ||| large ||| logp=0 __NEXT__RULE__ [VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 __NEXT__RULE__ [V] ||| sah ||| saw ||| logp=0 __NEXT__RULE__ [V] ||| fand ||| found ||| logp=0 diff --git a/dtrain/test/toy_cdec/cdec.ini b/dtrain/test/toy_cdec/cdec.ini new file mode 100644 index 00000000..3a6bab68 --- /dev/null +++ b/dtrain/test/toy_cdec/cdec.ini @@ -0,0 +1,3 @@ +formalism=scfg +grammar=../dtrain/test/toy_cdec/grammar +add_pass_through_rules=true diff --git a/dtrain/test/toy_cdec/grammar b/dtrain/test/toy_cdec/grammar new file mode 100644 index 00000000..aeed75ef --- /dev/null +++ b/dtrain/test/toy_cdec/grammar @@ -0,0 +1,12 @@ +[S] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0 +[NP] ||| ich ||| i ||| logp=0 +[NP] ||| ein [NN,1] ||| a [1] ||| logp=0 +[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0 use_house=1 +[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0 use_shell=1 +[JJ] ||| kleines ||| small ||| logp=0 +[JJ] ||| kleines ||| little ||| logp=0 +[JJ] ||| grosses ||| big ||| logp=0 +[JJ] ||| grosses ||| large ||| logp=0 +[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0 +[V] ||| sah ||| saw ||| logp=0 +[V] ||| fand ||| found ||| logp=0 diff --git a/dtrain/test/toy_cdec/in b/dtrain/test/toy_cdec/in new file mode 100644 index 00000000..e6df9275 --- /dev/null +++ b/dtrain/test/toy_cdec/in @@ -0,0 +1 @@ +ich sah ein kleines haus diff --git a/dtrain/test/toy_cdec/weights b/dtrain/test/toy_cdec/weights new file mode 100644 index 00000000..10d7ed83 --- /dev/null +++ b/dtrain/test/toy_cdec/weights @@ -0,0 +1,2 @@ +logp 1 +use_shell 1 | 
