Diffstat (limited to 'dtrain')
-rw-r--r--  dtrain/dtrain.cc                | 61
-rw-r--r--  dtrain/dtrain.h                 | 12
-rw-r--r--  dtrain/kbestget.h               | 14
-rw-r--r--  dtrain/ksampler.h               |  6
-rw-r--r--  dtrain/pairsampling.h           |  8
-rw-r--r--  dtrain/score.cc                 | 63
-rw-r--r--  dtrain/score.h                  | 32
-rw-r--r--  dtrain/test/example/dtrain.ini  |  4
-rw-r--r--  dtrain/test/example/weights.gz  | bin 255 -> 248 bytes
9 files changed, 95 insertions, 105 deletions
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index 0481cf96..44090242 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,23 +6,24 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
 {
   po::options_description ini("Configuration File Options");
   ini.add_options()
-    ("input",          po::value<string>()->default_value("-"),                          "input file")
-    ("output",         po::value<string>()->default_value("-"),       "output weights file (or VOID)")
-    ("input_weights",  po::value<string>(),       "input weights file (e.g. from previous iteration)")
-    ("decoder_config", po::value<string>(),                             "configuration file for cdec")
-    ("k",              po::value<size_t>()->default_value(100), "size of kbest or sample from forest")
-    ("sample_from",    po::value<string>()->default_value("kbest"),  "where to get translations from")
-    ("filter",         po::value<string>()->default_value("unique"),              "filter kbest list")
-    ("pair_sampling",  po::value<string>()->default_value("all"),    "how to sample pairs: all, rand")
-    ("N",              po::value<size_t>()->default_value(3),                          "N for Ngrams")
-    ("epochs",         po::value<size_t>()->default_value(2),                     "# of iterations T")
-    ("scorer",         po::value<string>()->default_value("stupid_bleu"),            "scoring metric")
-    ("stop_after",     po::value<size_t>()->default_value(0),          "stop after X input sentences")
-    ("print_weights",  po::value<string>(),                      "weights to print on each iteration")
-    ("hstreaming",     po::value<bool>()->zero_tokens(),               "run in hadoop streaming mode")
-    ("learning_rate",  po::value<double>()->default_value(0.0005),                    "learning rate")
-    ("gamma",          po::value<double>()->default_value(0.),     "gamma for SVM (0 for perceptron)")
-    ("noup",           po::value<bool>()->zero_tokens(),                      "do not update weights");
+    ("input",          po::value<string>()->default_value("-"),                            "input file")
+    ("output",         po::value<string>()->default_value("-"),         "output weights file (or VOID)")
+    ("input_weights",  po::value<string>(),         "input weights file (e.g. from previous iteration)")
+    ("decoder_config", po::value<string>(),                               "configuration file for cdec")
+    ("k",              po::value<unsigned>()->default_value(100), "size of kbest or sample from forest")
+    ("sample_from",    po::value<string>()->default_value("kbest"),    "where to get translations from")
+    ("filter",         po::value<string>()->default_value("unique"),                "filter kbest list")
+    ("pair_sampling",  po::value<string>()->default_value("all"),      "how to sample pairs: all, rand")
+    ("N",              po::value<unsigned>()->default_value(3),                          "N for Ngrams")
+    ("epochs",         po::value<unsigned>()->default_value(2),                     "# of iterations T")
+    ("scorer",         po::value<string>()->default_value("stupid_bleu"),              "scoring metric")
+    ("stop_after",     po::value<unsigned>()->default_value(0),          "stop after X input sentences")
+    ("print_weights",  po::value<string>(),                        "weights to print on each iteration")
+    ("hstreaming",     po::value<bool>()->zero_tokens(),                 "run in hadoop streaming mode")
+    ("learning_rate",  po::value<double>()->default_value(0.0005),                      "learning rate")
+    ("gamma",          po::value<double>()->default_value(0.),       "gamma for SVM (0 for perceptron)")
+    ("tmp",            po::value<string>()->default_value("/tmp"),                    "temp dir to use") // FIXME
+    ("noup",           po::value<bool>()->zero_tokens(),                        "do not update weights");
   po::options_description cl("Command Line Options");
   cl.add_options()
     ("config,c",         po::value<string>(),              "dtrain config file")
@@ -75,10 +76,10 @@ main(int argc, char** argv)
     hstreaming = true;
     quiet = true;
   }
-  const size_t k = cfg["k"].as<size_t>();
-  const size_t N = cfg["N"].as<size_t>();
-  const size_t T = cfg["epochs"].as<size_t>();
-  const size_t stop_after = cfg["stop_after"].as<size_t>();
+  const unsigned k = cfg["k"].as<unsigned>();
+  const unsigned N = cfg["N"].as<unsigned>();
+  const unsigned T = cfg["epochs"].as<unsigned>();
+  const unsigned stop_after = cfg["stop_after"].as<unsigned>();
   const string filter_type = cfg["filter"].as<string>();
   const string sample_from = cfg["sample_from"].as<string>();
   const string pair_sampling = cfg["pair_sampling"].as<string>();
@@ -105,7 +106,7 @@ main(int argc, char** argv)
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
-  score_t (*scorer)(NgramCounts&, const size_t, const size_t, size_t, vector<score_t>);
+  score_t (*scorer)(NgramCounts&, const unsigned, const unsigned, unsigned, vector<score_t>);
   if (scorer_str == "bleu") {
     scorer = &bleu;
   } else if (scorer_str == "stupid_bleu") {
@@ -119,8 +120,8 @@ main(int argc, char** argv)
     exit(1);
   }
   NgramCounts global_counts(N); // counts for 1 best translations
-  size_t global_hyp_len = 0;    // sum hypothesis lengths
-  size_t global_ref_len = 0;    // sum reference lengths
+  unsigned global_hyp_len = 0;    // sum hypothesis lengths
+  unsigned global_ref_len = 0;    // sum reference lengths
   // ^^^ global_* for approx_bleu
   vector<score_t> bleu_weights;   // we leave this empty -> 1/N
   if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
@@ -149,10 +150,10 @@ main(int argc, char** argv)
   ogzstream grammar_buf_out;
   grammar_buf_out.open(grammar_buf_fn);
-  size_t in_sz = 999999999; // input index, input size
+  unsigned in_sz = 999999999; // input index, input size
   vector<pair<score_t,score_t> > all_scores;
   score_t max_score = 0.;
-  size_t best_it = 0;
+  unsigned best_it = 0;
   float overall_time = 0.;
   // output cfg
@@ -178,7 +179,7 @@ main(int argc, char** argv)
   }
-  for (size_t t = 0; t < T; t++) // T epochs
+  for (unsigned t = 0; t < T; t++) // T epochs
   {
   time_t start, end;
@@ -186,7 +187,7 @@ main(int argc, char** argv)
   igzstream grammar_buf_in;
   if (t > 0) grammar_buf_in.open(grammar_buf_fn);
   score_t score_sum = 0., model_sum = 0.;
-  size_t ii = 0;
+  unsigned ii = 0;
   if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl;
   while(true)
@@ -279,10 +280,10 @@ main(int argc, char** argv)
     // (local) scoring
     if (t > 0) ref_ids = ref_ids_buf[ii];
     score_t score = 0.;
-    for (size_t i = 0; i < samples->size(); i++) {
+    for (unsigned i = 0; i < samples->size(); i++) {
       NgramCounts counts = make_ngram_counts(ref_ids, (*samples)[i].w, N);
       if (scorer_str == "approx_bleu") {
-        size_t hyp_len = 0;
+        unsigned hyp_len = 0;
         if (i == 0) { // 'context of 1best translations'
           global_counts  += counts;
           global_hyp_len += (*samples)[i].w.size();
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 9bc5be93..ed75a297 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -18,8 +18,8 @@
 #include "ksampler.h"
 #include "pairsampling.h"
-#define DTRAIN_DOTS 100                     // when to display a '.'
-#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local"               // put this on a SSD?
+#define DTRAIN_DOTS 100 // when to display a '.'
+#define DTRAIN_TMP_DIR "/tmp"
 #define DTRAIN_GRAMMAR_DELIM "########EOS########"
 using namespace std;
@@ -36,20 +36,20 @@ inline ostream& _p(ostream& out)  { return out << setiosflags(ios::showpos); }
 inline ostream& _p2(ostream& out) { return out << setprecision(2); }
 inline ostream& _p5(ostream& out) { return out << setprecision(5); }
 inline ostream& _p9(ostream& out) { return out << setprecision(9); }
-inline void strsplit(string &s, vector<string>& v, char d = '\t', size_t parts = 0) {
+inline void strsplit(string &s, vector<string>& v, char d = '\t', unsigned parts = 0) {
   stringstream ss(s);
   string t;
-  size_t c = 0;
+  unsigned i = 0;
   while(true)
   {
-    if (parts > 0 && c == parts-1) {
+    if (parts > 0 && i == parts-1) {
       getline(ss, t);
       v.push_back(t);
       break;
     }
     if (!getline(ss, t, d)) break;
     v.push_back(t);
-    c++;
+    i++;
   }
 }
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 403384de..935998a0 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -22,11 +22,11 @@ struct HypSampler : public DecoderObserver
 struct KBestGetter : public HypSampler
 {
-  const size_t k_;
+  const unsigned k_;
   const string filter_type_;
   vector<ScoredHyp> s_;
-  KBestGetter(const size_t k, const string filter_type) :
+  KBestGetter(const unsigned k, const string filter_type) :
     k_(k), filter_type_(filter_type) {}
   virtual void
@@ -51,9 +51,11 @@ struct KBestGetter : public HypSampler
   KBestUnique(const Hypergraph& forest)
   {
     s_.clear();
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
-    for (size_t i = 0; i < k_; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique, prob_t, EdgeProb>::Derivation* d =
+    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,
+      KBest::FilterUnique, prob_t, EdgeProb> kbest(forest, k_);
+    for (unsigned i = 0; i < k_; ++i) {
+      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique,
+              prob_t, EdgeProb>::Derivation* d =
             kbest.LazyKthBest(forest.nodes_.size() - 1, i);
       if (!d) break;
       ScoredHyp h;
@@ -69,7 +71,7 @@ struct KBestGetter : public HypSampler
   {
     s_.clear();
     KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k_);
-    for (size_t i = 0; i < k_; ++i) {
+    for (unsigned i = 0; i < k_; ++i) {
       const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
             kbest.LazyKthBest(forest.nodes_.size() - 1, i);
       if (!d) break;
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 08bf1498..17b0ba56 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -11,11 +11,11 @@ namespace dtrain
 struct KSampler : public HypSampler
 {
-  const size_t k_;
+  const unsigned k_;
   vector<ScoredHyp> s_;
   MT19937* prng_;
-  explicit KSampler(const size_t k, MT19937* prng) :
+  explicit KSampler(const unsigned k, MT19937* prng) :
     k_(k), prng_(prng) {}
   virtual void
@@ -30,7 +30,7 @@ struct KSampler : public HypSampler
     s_.clear();
     std::vector<HypergraphSampler::Hypothesis> samples;
     HypergraphSampler::sample_hypotheses(forest, k_, prng_, &samples);
-    for (size_t i = 0; i < k_; ++i) {
+    for (unsigned i = 0; i < k_; ++i) {
       ScoredHyp h;
       h.w = samples[i].words;
       h.f = samples[i].fmap;
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index 2e4ab155..9546a945 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -11,8 +11,8 @@ namespace dtrain
 inline void
 sample_all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training)
 {
-  for (size_t i = 0; i < s->size()-1; i++) {
-    for (size_t j = i+1; j < s->size(); j++) {
+  for (unsigned i = 0; i < s->size()-1; i++) {
+    for (unsigned j = i+1; j < s->size(); j++) {
       pair<ScoredHyp,ScoredHyp> p;
       p.first = (*s)[i];
       p.second = (*s)[j];
@@ -25,8 +25,8 @@ inline void
 sample_rand_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> > &training,
                   MT19937* prng)
 {
-  for (size_t i = 0; i < s->size()-1; i++) {
-    for (size_t j = i+1; j < s->size(); j++) {
+  for (unsigned i = 0; i < s->size()-1; i++) {
+    for (unsigned j = i+1; j < s->size(); j++) {
       if (prng->next() < .5) {
         pair<ScoredHyp,ScoredHyp> p;
         p.first = (*s)[i];
diff --git a/dtrain/score.cc b/dtrain/score.cc
index c6d3a05f..52644250 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -5,13 +5,13 @@ namespace dtrain
 Ngrams
-make_ngrams(vector<WordID>& s, size_t N)
+make_ngrams(vector<WordID>& s, unsigned N)
 {
   Ngrams ngrams;
   vector<WordID> ng;
   for (size_t i = 0; i < s.size(); i++) {
     ng.clear();
-    for (size_t j = i; j < min(i+N, s.size()); j++) {
+    for (unsigned j = i; j < min(i+N, s.size()); j++) {
       ng.push_back(s[j]);
       ngrams[ng]++;
     }
@@ -20,7 +20,7 @@ make_ngrams(vector<WordID>& s, size_t N)
 }
 NgramCounts
-make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
+make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N)
 {
   Ngrams hyp_ngrams = make_ngrams(hyp, N);
   Ngrams ref_ngrams = make_ngrams(ref, N);
@@ -48,26 +48,22 @@ make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N)
  * NOTE: 0 if one n in {1..N} has 0 count
  */
 score_t
-brevity_penaly(const size_t hyp_len, const size_t ref_len)
+brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
 {
   if (hyp_len > ref_len) return 1;
-  return exp(1 - (score_t)ref_len/(score_t)hyp_len);
+  return exp(1 - (score_t)ref_len/hyp_len);
 }
 score_t
-bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-      size_t N, vector<score_t> weights )
+bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+      unsigned N, vector<score_t> weights )
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
   if (ref_len < N) N = ref_len;
-  score_t N_ = (score_t)N;
-  if (weights.empty())
-  {
-    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
-  }
+  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
   score_t sum = 0;
-  for (size_t i = 0; i < N; i++) {
+  for (unsigned i = 0; i < N; i++) {
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
-    sum += weights[i] * log((score_t)counts.clipped[i] / (score_t)counts.sum[i]);
+    sum += weights[i] * log((score_t)counts.clipped[i] / counts.sum[i]);
   }
   return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
@@ -83,21 +79,16 @@ bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
  * NOTE: 0 iff no 1gram match
  */
 score_t
-stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-             size_t N, vector<score_t> weights )
+stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+             unsigned N, vector<score_t> weights )
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
   if (ref_len < N) N = ref_len;
-  score_t N_ = (score_t)N;
-  if (weights.empty())
-  {
-    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
-  }
-  score_t sum = 0;
-  score_t add = 0;
-  for (size_t i = 0; i < N; i++) {
+  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
+  score_t sum = 0, add = 0;
+  for (unsigned i = 0; i < N; i++) {
     if (i == 1) add = 1;
-    sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((score_t)counts.sum[i] + add));
+    sum += weights[i] * log(((score_t)counts.clipped[i] + add) / ((counts.sum[i] + add)));
   }
   return brevity_penaly(hyp_len, ref_len) * exp(sum);
 }
@@ -112,20 +103,16 @@ stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
  * NOTE: max is 0.9375
  */
 score_t
-smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-            const size_t N, vector<score_t> weights )
+smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+            const unsigned N, vector<score_t> weights )
 {
   if (hyp_len == 0 || ref_len == 0) return 0;
-  score_t N_ = (score_t)N;
-  if (weights.empty())
-  {
-    for (size_t i = 0; i < N; i++) weights.push_back(1/N_);
-  }
+  if (weights.empty()) for (unsigned i = 0; i < N; i++) weights.push_back(1./N);
   score_t sum = 0;
-  score_t j = 1;
-  for (size_t i = 0; i < N; i++) {
+  unsigned j = 1;
+  for (unsigned i = 0; i < N; i++) {
     if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
-    sum += exp((weights[i] * log((score_t)counts.clipped[i]/(score_t)counts.sum[i]))) / pow(2, N_-j+1);
+    sum += exp((weights[i] * log((score_t)counts.clipped[i]/counts.sum[i]))) / pow(2, N-j+1);
     j++;
   }
   return brevity_penaly(hyp_len, ref_len) * sum;
@@ -139,11 +126,11 @@ smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
  * (Chiang et al. '08)
  */
 score_t
-approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len,
-            const size_t N, vector<score_t> weights)
+approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len,
+            const unsigned N, vector<score_t> weights)
 {
   return brevity_penaly(hyp_len, ref_len)
-         * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
+           * 0.9 * bleu(counts, hyp_len, ref_len, N, weights);
 }
diff --git a/dtrain/score.h b/dtrain/score.h
index bff0b10c..3e5d82a9 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -19,17 +19,17 @@ typedef double score_t; // float
 struct NgramCounts
 {
-  size_t N_;
-  map<size_t, size_t> clipped;
-  map<size_t, size_t> sum;
+  unsigned N_;
+  map<unsigned, unsigned> clipped;
+  map<unsigned, unsigned> sum;
-  NgramCounts(const size_t N) : N_(N) { reset(); }
+  NgramCounts(const unsigned N) : N_(N) { reset(); }
   void
   operator+=(const NgramCounts& rhs)
   {
     assert(N_ == rhs.N_);
-    for (size_t i = 0; i < N_; i++) {
+    for (unsigned i = 0; i < N_; i++) {
       this->clipped[i] += rhs.clipped.find(i)->second;
       this->sum[i] += rhs.sum.find(i)->second;
     }
@@ -44,7 +44,7 @@ struct NgramCounts
   }
   void
-  add(size_t count, size_t ref_count, size_t i)
+  add(unsigned count, unsigned ref_count, unsigned i)
   {
     assert(i < N_);
     if (count > ref_count) {
@@ -59,7 +59,7 @@ struct NgramCounts
   void
   reset()
   {
-    size_t i;
+    unsigned i;
     for (i = 0; i < N_; i++) {
       clipped[i] = 0;
       sum[i] = 0;
@@ -69,26 +69,26 @@ struct NgramCounts
   void
   print()
   {
-    for (size_t i = 0; i < N_; i++) {
+    for (unsigned i = 0; i < N_; i++) {
       cout << i+1 << "grams (clipped):\t" << clipped[i] << endl;
       cout << i+1 << "grams:\t\t\t" << sum[i] << endl;
     }
   }
 };
-typedef map<vector<WordID>, size_t> Ngrams;
+typedef map<vector<WordID>, unsigned> Ngrams;
-Ngrams make_ngrams(vector<WordID>& s, size_t N);
-NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, size_t N);
+Ngrams make_ngrams(vector<WordID>& s, unsigned N);
+NgramCounts make_ngram_counts(vector<WordID> hyp, vector<WordID> ref, unsigned N);
-score_t brevity_penaly(const size_t hyp_len, const size_t ref_len);
-score_t bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t brevity_penaly(const unsigned hyp_len, const unsigned ref_len);
+score_t bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
              vector<score_t> weights = vector<score_t>());
-score_t stupid_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, size_t N,
+score_t stupid_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, unsigned N,
                     vector<score_t> weights = vector<score_t>());
-score_t smooth_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t smooth_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
                     vector<score_t> weights = vector<score_t>());
-score_t approx_bleu(NgramCounts& counts, const size_t hyp_len, const size_t ref_len, const size_t N,
+score_t approx_bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref_len, const unsigned N,
                     vector<score_t> weights = vector<score_t>());
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 00ba72f9..fbddb915 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -5,7 +5,7 @@ epochs=1000
 input=test/example/nc-1k.gz
 scorer=stupid_bleu
 output=test/example/weights.gz
-stop_after=100
-sample_from=forest
+stop_after=10
+sample_from=kbest
 pair_sampling=all
 print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
diff --git a/dtrain/test/example/weights.gz b/dtrain/test/example/weights.gz
index 21157427..e2e1ecce 100644
--- a/dtrain/test/example/weights.gz
+++ b/dtrain/test/example/weights.gz
Binary files differ
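
Aside (not part of the commit): the "NOTE: max is 0.9375" comment kept on smooth_bleu above can be checked by hand. When every n-gram precision counts.clipped[i]/counts.sum[i] equals 1, each exp(weights[i] * log(1)) term is 1, so the sum collapses to 1/2 + 1/4 + 1/8 + 1/16 = 1 - 1/2^N, i.e. 0.9375 when the note's assumed N = 4 is used, and the brevity penalty can only scale this down. A minimal standalone C++ sketch of that arithmetic (hypothetical example, not repository code):

// verify smooth_bleu's 0.9375 ceiling for N = 4 (the value the NOTE assumes)
#include <cmath>
#include <iostream>

int main()
{
  const unsigned N = 4;
  double sum = 0.;
  for (unsigned j = 1; j <= N; j++)
    sum += 1. / std::pow(2, N - j + 1); // same 1/2^(N-j+1) weighting as smooth_bleu
  std::cout << sum << std::endl;        // prints 0.9375
  return 0;
}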
