author    Chris Dyer <cdyer@cab.ark.cs.cmu.edu>  2012-06-19 00:05:18 -0400
committer Chris Dyer <cdyer@cab.ark.cs.cmu.edu>  2012-06-19 00:05:18 -0400
commit    5975dcaa50adb5ce7a05b83583b8f9ddc45f3f0a (patch)
tree      2bc2eb4e17576e0726d7a2fa7f20eac9061c311d
parent    78cc819168b2a550e52e9cac06dbbed41a3b04b2 (diff)
parent    ee1520c5095ea8648617a3658b20eedfd4dd2007 (diff)

Merge branch 'master' of https://github.com/pks/cdec-dtrain
-rw-r--r--  .gitignore                            |   1
-rw-r--r--  decoder/decoder.cc                    |  22
-rw-r--r--  decoder/viterbi.cc                    |  12
-rw-r--r--  decoder/viterbi.h                     |   5
-rw-r--r--  dtrain/Makefile.am                    |   2
-rw-r--r--  dtrain/README.md                      |  10
-rw-r--r--  dtrain/dtrain.cc                      |  97
-rw-r--r--  dtrain/dtrain.h                       |  14
-rw-r--r--  dtrain/ksampler.h                     |   7
-rw-r--r--  dtrain/pairsampling.h                 |  49
-rw-r--r--  dtrain/score.cc                       | 117
-rw-r--r--  dtrain/score.h                        |  64
-rw-r--r--  dtrain/test/example/README            |   4
-rw-r--r--  dtrain/test/example/dtrain.ini        |   3
-rw-r--r--  dtrain/test/example/expected-output   | 125

15 files changed, 452 insertions, 80 deletions
diff --git a/.gitignore b/.gitignore
index 27c6a739..943e6dc5 100644
--- a/.gitignore
+++ b/.gitignore
@@ -128,6 +128,7 @@ decoder/rule_lexer.cc
training/atools
training/collapse_weights
training/lbfgs_test
+training/libtraining.a
training/mr_optimize_reduce
training/mr_em_adapted_reduce
training/mr_em_map_adapter
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index cbb97a0d..333f0fb6 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -3,6 +3,7 @@
#include <tr1/unordered_map>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
+#include <boost/make_shared.hpp>
#include "program_options.h"
#include "stringlib.h"
@@ -187,8 +188,8 @@ struct DecoderImpl {
}
void SetId(int next_sent_id) { sent_id = next_sent_id - 1; }
- void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_deriv=false) {
- cerr << viterbi_stats(forest,name,true,show_tree,show_deriv);
+ void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_deriv=false, bool extract_rules=false, boost::shared_ptr<WriteFile> extract_file = boost::make_shared<WriteFile>()) {
+ cerr << viterbi_stats(forest,name,true,show_tree,show_deriv,extract_rules, extract_file);
cerr << endl;
}
@@ -424,7 +425,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
- ("extract_rules", po::value<string>(), "Extract the rules used in translation (de-duped) to this file")
+ ("extract_rules", po::value<string>(), "Extract the rules used in translation (not de-duped!) to a file in this directory")
("show_derivations", po::value<string>(), "Directory to print the derivation structures to")
("graphviz","Show (constrained) translation forest in GraphViz format")
("max_translation_beam,x", po::value<int>(), "Beam approximation to get max translation from the chart")
@@ -570,6 +571,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
// cube pruning pop-limit: we may want to configure this on a per-pass basis
pop_limit = conf["cubepruning_pop_limit"].as<int>();
+ if (conf.count("extract_rules")) {
+ if (!DirectoryExists(conf["extract_rules"].as<string>()))
+ MkDirP(conf["extract_rules"].as<string>());
+ }
+
// determine the number of rescoring/pruning/weighting passes configured
const int MAX_PASSES = 3;
for (int pass = 0; pass < MAX_PASSES; ++pass) {
@@ -712,9 +718,11 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
cfg_options.Validate();
#endif
- if (conf.count("extract_rules"))
- extract_file.reset(new WriteFile(str("extract_rules",conf)));
-
+ if (conf.count("extract_rules")) {
+ stringstream ss;
+ ss << sent_id;
+ extract_file.reset(new WriteFile(str("extract_rules",conf)+"/"+ss.str()));
+ }
combine_size = conf["combine_size"].as<int>();
if (combine_size < 1) combine_size = 1;
sent_id = -1;
@@ -851,7 +859,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
#endif
forest.swap(rescored_forest);
forest.Reweight(cur_weights);
- if (!SILENT) forest_stats(forest," " + passtr +" forest",show_tree_structure,oracle.show_derivation);
+ if (!SILENT) forest_stats(forest," " + passtr +" forest",show_tree_structure,oracle.show_derivation, conf.count("extract_rules"), extract_file);
}
if (conf.count("show_partition")) {
diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc
index 9d19914b..1b9c6665 100644
--- a/decoder/viterbi.cc
+++ b/decoder/viterbi.cc
@@ -5,11 +5,12 @@
#include <vector>
#include "hg.h"
+
//#define DEBUG_VITERBI_SORT
using namespace std;
-std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool estring, bool etree,bool show_derivation)
+std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool estring, bool etree,bool show_derivation, bool extract_rules, boost::shared_ptr<WriteFile> extract_file)
{
ostringstream o;
o << hg.stats(name);
@@ -22,6 +23,9 @@ std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool es
if (etree) {
o<<name<<" tree: "<<ViterbiETree(hg)<<endl;
}
+ if (extract_rules) {
+ ViterbiRules(hg, extract_file->stream());
+ }
if (show_derivation) {
o<<name<<" derivation: ";
o << hg.show_viterbi_tree(false); // last item should be goal (or at least depend on prev items). TODO: this doesn't actually reorder the nodes in hg.
@@ -36,6 +40,12 @@ std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool es
return o.str();
}
+void ViterbiRules(const Hypergraph& hg, ostream* o) {
+ vector<Hypergraph::Edge const*> edges;
+ Viterbi<ViterbiPathTraversal>(hg, &edges);
+ for (unsigned i = 0; i < edges.size(); i++)
+ (*o) << edges[i]->rule_->AsString(true) << endl;
+}
string ViterbiETree(const Hypergraph& hg) {
vector<WordID> tmp;
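
ViterbiRules, added above, walks the edges of the single best derivation and prints each edge's rule, one per line and without de-duplication (hence the updated --extract_rules help text). A self-contained analogue, with Rule and Edge as stand-ins for the hypergraph types:

    #include <iostream>
    #include <string>
    #include <vector>

    struct Rule { std::string AsString(bool) const { return "[X] ||| ..."; } };  // stand-in
    struct Edge { const Rule* rule_; };                                          // stand-in

    // Print one rule per derivation edge, in traversal order, duplicates kept.
    void DumpRules(const std::vector<const Edge*>& best_path, std::ostream* o) {
      for (unsigned i = 0; i < best_path.size(); ++i)
        (*o) << best_path[i]->rule_->AsString(true) << std::endl;
    }
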
diff --git a/decoder/viterbi.h b/decoder/viterbi.h
index 3092f6da..03e961a2 100644
--- a/decoder/viterbi.h
+++ b/decoder/viterbi.h
@@ -5,8 +5,10 @@
#include "prob.h"
#include "hg.h"
#include "tdict.h"
+#include "filelib.h"
+#include <boost/make_shared.hpp>
-std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest", bool estring=true, bool etree=false, bool derivation_tree=false);
+std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest", bool estring=true, bool etree=false, bool derivation_tree=false, bool extract_rules=false, boost::shared_ptr<WriteFile> extract_file = boost::make_shared<WriteFile>());
/// computes for each hg node the best (according to WeightType/WeightFunction) derivation, and some homomorphism (bottom up expression tree applied through Traversal) of it. T is the "return type" of Traversal, which is called only once for the best edge for a node's result (i.e. result will start default constructed)
//TODO: make T a typename inside Traversal and WeightType a typename inside WeightFunction?
@@ -201,6 +203,7 @@ struct FeatureVectorTraversal {
std::string JoshuaVisualizationString(const Hypergraph& hg);
prob_t ViterbiESentence(const Hypergraph& hg, std::vector<WordID>* result);
std::string ViterbiETree(const Hypergraph& hg);
+void ViterbiRules(const Hypergraph& hg, std::ostream* s);
prob_t ViterbiFSentence(const Hypergraph& hg, std::vector<WordID>* result);
std::string ViterbiFTree(const Hypergraph& hg);
int ViterbiELength(const Hypergraph& hg);
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index f39d161e..64fef489 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
dtrain_SOURCES = dtrain.cc score.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-AM_CPPFLAGS = -O3 -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README.md b/dtrain/README.md
index 9580df6d..7edabbf1 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -39,16 +39,6 @@ For an example of local usage (with the 'distributed' format)
see test/example/ . This expects dtrain to be built without
DTRAIN_LOCAL.
-Next
-----
-+ (dtrain|decoder) more meta-parameters testing
-+ feature selection directly in dtrain
-+ feature template: target side rule ngrams
-+ sa-extract -> leave-one-out for grammar of training set?
-+ make svm doable; no subgradient?
-+ reranking while sgd?
-+ try PRO, mira emulations
-
Legal
-----
Copyright (c) 2012 by Patrick Simianer <p@simianer.de>
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index d9bce843..b3e62914 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -6,37 +6,39 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
{
po::options_description ini("Configuration File Options");
ini.add_options()
- ("input", po::value<string>()->default_value("-"), "input file")
- ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
- ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
- ("decoder_config", po::value<string>(), "configuration file for cdec")
- ("print_weights", po::value<string>(), "weights to print on each iteration")
- ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
- ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
- ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
- ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
- ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
- ("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
- ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
- ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'")
- ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'")
- ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5")
- ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs")
- ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
- ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_")
- ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate")
- ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)")
- ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
- ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
- ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
- ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
- ("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent (or by threshold) of features after each epoch")
- ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
- ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
+ ("input", po::value<string>()->default_value("-"), "input file")
+ ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT")
+ ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)")
+ ("decoder_config", po::value<string>(), "configuration file for cdec")
+ ("print_weights", po::value<string>(), "weights to print on each iteration")
+ ("stop_after", po::value<unsigned>()->default_value(0), "stop after X input sentences")
+ ("tmp", po::value<string>()->default_value("/tmp"), "temp dir to use")
+ ("keep", po::value<bool>()->zero_tokens(), "keep weights files for each iteration")
+ ("hstreaming", po::value<string>(), "run in hadoop streaming mode, arg is a task id")
+ ("epochs", po::value<unsigned>()->default_value(10), "# of iterations T (per shard)")
+ ("k", po::value<unsigned>()->default_value(100), "how many translations to sample")
+ ("sample_from", po::value<string>()->default_value("kbest"), "where to sample translations from: 'kbest', 'forest'")
+ ("filter", po::value<string>()->default_value("uniq"), "filter kbest list: 'not', 'uniq'")
+ ("pair_sampling", po::value<string>()->default_value("XYX"), "how to sample pairs: 'all', 'XYX' or 'PRO'")
+ ("hi_lo", po::value<float>()->default_value(0.1), "hi and lo (X) for XYX (default 0.1), <= 0.5")
+ ("pair_threshold", po::value<score_t>()->default_value(0.), "bleu [0,1] threshold to filter pairs")
+ ("N", po::value<unsigned>()->default_value(4), "N for Ngrams (BLEU)")
+ ("scorer", po::value<string>()->default_value("stupid_bleu"), "scoring: bleu, stupid_, smooth_, approx_, lc_")
+ ("learning_rate", po::value<weight_t>()->default_value(0.0001), "learning rate")
+ ("gamma", po::value<weight_t>()->default_value(0.), "gamma for SVM (0 for perceptron)")
+ ("select_weights", po::value<string>()->default_value("last"), "output best, last, avg weights ('VOID' to throw away)")
+ ("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
+ ("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
+ ("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
+ ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent (or by threshold) of features after each epoch NOT IMPL") // TODO
+ ("approx_bleu_d", po::value<score_t>()->default_value(0.9), "discount for approx. BLEU")
+ ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair")
+ ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near")
+ ("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. # of pairs per Sent.")
#ifdef DTRAIN_LOCAL
- ("refs,r", po::value<string>(), "references in local mode")
+ ("refs,r", po::value<string>(), "references in local mode")
#endif
- ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
+ ("noup", po::value<bool>()->zero_tokens(), "do not update weights");
po::options_description cl("Command Line Options");
cl.add_options()
("config,c", po::value<string>(), "dtrain config file")
@@ -134,6 +136,9 @@ main(int argc, char** argv)
const string select_weights = cfg["select_weights"].as<string>();
const float hi_lo = cfg["hi_lo"].as<float>();
const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>();
+ const unsigned max_pairs = cfg["max_pairs"].as<unsigned>();
+ weight_t loss_margin = cfg["loss_margin"].as<weight_t>();
+ if (loss_margin > 9998.) loss_margin = std::numeric_limits<float>::max();
bool scale_bleu_diff = false;
if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true;
bool average = false;
@@ -160,8 +165,16 @@ main(int argc, char** argv)
scorer = dynamic_cast<StupidBleuScorer*>(new StupidBleuScorer);
} else if (scorer_str == "smooth_bleu") {
scorer = dynamic_cast<SmoothBleuScorer*>(new SmoothBleuScorer);
+ } else if (scorer_str == "sum_bleu") {
+ scorer = dynamic_cast<SumBleuScorer*>(new SumBleuScorer);
+ } else if (scorer_str == "sumexp_bleu") {
+ scorer = dynamic_cast<SumExpBleuScorer*>(new SumExpBleuScorer);
+ } else if (scorer_str == "sumwhatever_bleu") {
+ scorer = dynamic_cast<SumWhateverBleuScorer*>(new SumWhateverBleuScorer);
} else if (scorer_str == "approx_bleu") {
scorer = dynamic_cast<ApproxBleuScorer*>(new ApproxBleuScorer(N, approx_bleu_d));
+ } else if (scorer_str == "lc_bleu") {
+ scorer = dynamic_cast<LinearBleuScorer*>(new LinearBleuScorer(N));
} else {
cerr << "Don't know scoring metric: '" << scorer_str << "', exiting." << endl;
exit(1);
@@ -220,7 +233,7 @@ main(int argc, char** argv)
grammar_buf_out.open(grammar_buf_fn.c_str());
#endif
- unsigned in_sz = UINT_MAX; // input index, input size
+ unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size
vector<pair<score_t, score_t> > all_scores;
score_t max_score = 0.;
unsigned best_it = 0;
@@ -242,6 +255,7 @@ main(int argc, char** argv)
if (!scale_bleu_diff) cerr << setw(25) << "learning rate " << eta << endl;
else cerr << setw(25) << "learning rate " << "bleu diff" << endl;
cerr << setw(25) << "gamma " << gamma << endl;
+ cerr << setw(25) << "loss margin " << loss_margin << endl;
cerr << setw(25) << "pairs " << "'" << pair_sampling << "'" << endl;
if (pair_sampling == "XYX")
cerr << setw(25) << "hi lo " << hi_lo << endl;
@@ -251,6 +265,7 @@ main(int argc, char** argv)
cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
if (rescale)
cerr << setw(25) << "rescale " << rescale << endl;
+ cerr << setw(25) << "max pairs " << max_pairs << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
cerr << setw(25) << "input " << "'" << input_fn << "'" << endl;
#ifdef DTRAIN_LOCAL
@@ -415,21 +430,27 @@ main(int argc, char** argv)
// get pairs
vector<pair<ScoredHyp,ScoredHyp> > pairs;
if (pair_sampling == "all")
- all_pairs(samples, pairs, pair_threshold);
+ all_pairs(samples, pairs, pair_threshold, max_pairs);
if (pair_sampling == "XYX")
- partXYX(samples, pairs, pair_threshold, hi_lo);
+ partXYX(samples, pairs, pair_threshold, max_pairs, hi_lo);
if (pair_sampling == "PRO")
- PROsampling(samples, pairs, pair_threshold);
+ PROsampling(samples, pairs, pair_threshold, max_pairs);
npairs += pairs.size();
for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin();
it != pairs.end(); it++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+ bool rank_error = true; // pair sampling already did this for us
+ rank_errors++;
+ score_t margin = std::numeric_limits<float>::max();
+#else
bool rank_error = it->first.model <= it->second.model;
if (rank_error) rank_errors++;
- score_t margin = fabs(it->first.model - it->second.model);
- if (!rank_error && margin < 1) margin_violations++;
+ score_t margin = fabs(fabs(it->first.model) - fabs(it->second.model));
+ if (!rank_error && margin < loss_margin) margin_violations++;
+#endif
if (scale_bleu_diff) eta = it->first.score - it->second.score;
- if (rank_error || (gamma && margin<1)) {
+ if (rank_error || margin < loss_margin) {
SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
lambdas.plus_eq_v_times_s(diff_vec, eta);
if (gamma)
@@ -486,7 +507,7 @@ main(int argc, char** argv)
if (average) w_average += lambdas;
- if (scorer_str == "approx_bleu") scorer->Reset();
+ if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset();
if (t == 0) {
in_sz = ii; // remember size of input (# lines)
@@ -534,8 +555,10 @@ main(int argc, char** argv)
cerr << _np << npairs/(float)in_sz << endl;
cerr << " avg # rank err: ";
cerr << rank_errors/(float)in_sz << endl;
+#ifndef DTRAIN_FASTER_PERCEPTRON
cerr << " avg # margin viol: ";
cerr << margin_violations/(float)in_sz << endl;
+#endif
cerr << " non0 feature count: " << nonz << endl;
cerr << " avg list sz: " << list_sz/(float)in_sz << endl;
cerr << " avg f count: " << f_count/(float)list_sz << endl;
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 94d149ce..7e084a79 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -1,6 +1,14 @@
#ifndef _DTRAIN_H_
#define _DTRAIN_H_
+#undef DTRAIN_FASTER_PERCEPTRON // only look at misranked pairs
+ // DO NOT USE WITH SVM!
+#define DTRAIN_LOCAL
+#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
+#define DTRAIN_GRAMMAR_DELIM "########EOS########"
+#define DTRAIN_SCALE 100000
+
+
#include <iomanip>
#include <climits>
#include <string.h>
@@ -13,11 +21,7 @@
#include "filelib.h"
-#undef DTRAIN_LOCAL
-#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
-#define DTRAIN_SCALE 100000
using namespace std;
using namespace dtrain;
@@ -32,7 +36,7 @@ inline void register_and_convert(const vector<string>& strs, vector<WordID>& ids
inline string gettmpf(const string path, const string infix)
{
- char fn[1024];
+ char fn[path.size() + infix.size() + 8];
strcpy(fn, path.c_str());
strcat(fn, "/");
strcat(fn, infix.c_str());
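
The new buffer size in gettmpf is exact, but char fn[path.size() + ...] is a variable-length array, a GCC extension rather than standard C++. A portable sketch of the same idea, assuming the usual mkstemp template pattern (the diff truncates the rest of the function body):

    #include <cstdlib>   // mkstemp (POSIX)
    #include <string>
    #include <vector>

    std::string gettmpf_portable(const std::string& path, const std::string& infix) {
      std::string tmpl = path + "/" + infix + "-XXXXXX";  // assumed template suffix
      std::vector<char> fn(tmpl.c_str(), tmpl.c_str() + tmpl.size() + 1);  // writable copy
      if (mkstemp(&fn[0]) == -1) return "";               // fills in the XXXXXX
      return std::string(&fn[0]);
    }
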
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index f52fb649..bc2f56cd 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -8,6 +8,11 @@
namespace dtrain
{
+bool
+cmp_hyp_by_model_d(ScoredHyp a, ScoredHyp b)
+{
+ return a.model > b.model;
+}
struct KSampler : public HypSampler
{
@@ -44,6 +49,8 @@ struct KSampler : public HypSampler
sz_++;
f_count_ += h.f.size();
}
+ sort(s_.begin(), s_.end(), cmp_hyp_by_model_d);
+ for (unsigned i = 0; i < s_.size(); i++) s_[i].rank = i;
}
};
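
The ksampler.h addition sorts the sampled hypotheses by model score (descending) and records each one's position as its rank, which the pair samplers can then rely on. A standalone sketch of that post-processing step:

    #include <algorithm>
    #include <vector>

    struct HypLite { double model; unsigned rank; };  // stand-in for ScoredHyp

    static bool by_model_desc(const HypLite& a, const HypLite& b) {
      return a.model > b.model;
    }

    void rank_by_model(std::vector<HypLite>& s) {
      std::sort(s.begin(), s.end(), by_model_desc);
      for (unsigned i = 0; i < s.size(); ++i) s[i].rank = i;  // 0 = best model score
    }
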
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
index bac132c6..84be1efb 100644
--- a/dtrain/pairsampling.h
+++ b/dtrain/pairsampling.h
@@ -19,10 +19,12 @@ cmp_hyp_by_score_d(ScoredHyp a, ScoredHyp b)
}
inline void
-all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
+all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
{
sort(s->begin(), s->end(), cmp_hyp_by_score_d);
unsigned sz = s->size();
+ bool b = false;
+ unsigned count = 0;
for (unsigned i = 0; i < sz-1; i++) {
for (unsigned j = i+1; j < sz; j++) {
if (threshold > 0) {
@@ -32,7 +34,12 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
if ((*s)[i].score != (*s)[j].score)
training.push_back(make_pair((*s)[i], (*s)[j]));
}
+ if (++count == max) {
+ b = true;
+ break;
+ }
}
+ if (b) break;
}
}
@@ -44,13 +51,22 @@ all_pairs(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, sc
*/
inline void
-partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float hi_lo)
+partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float hi_lo)
{
- sort(s->begin(), s->end(), cmp_hyp_by_score_d);
unsigned sz = s->size();
+ if (sz < 2) return;
+ sort(s->begin(), s->end(), cmp_hyp_by_score_d);
unsigned sep = round(sz*hi_lo);
- for (unsigned i = 0; i < sep; i++) {
- for (unsigned j = sep; j < sz; j++) {
+ unsigned sep_hi = sep;
+ if (sz > 4) while (sep_hi < sz && (*s)[sep_hi-1].score == (*s)[sep_hi].score) ++sep_hi;
+ else sep_hi = 1;
+ bool b = false;
+ unsigned count = 0;
+ for (unsigned i = 0; i < sep_hi; i++) {
+ for (unsigned j = sep_hi; j < sz; j++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+ if ((*s)[i].model <= (*s)[j].model) {
+#endif
if (threshold > 0) {
if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -58,10 +74,23 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
if ((*s)[i].score != (*s)[j].score)
training.push_back(make_pair((*s)[i], (*s)[j]));
}
+ if (++count == max) {
+ b = true;
+ break;
+ }
+#ifdef DTRAIN_FASTER_PERCEPTRON
+ }
+#endif
}
+ if (b) break;
}
- for (unsigned i = sep; i < sz-sep; i++) {
- for (unsigned j = sz-sep; j < sz; j++) {
+ unsigned sep_lo = sz-sep;
+ while (sep_lo > 0 && (*s)[sep_lo-1].score == (*s)[sep_lo].score) --sep_lo;
+ for (unsigned i = sep_hi; i < sz-sep_lo; i++) {
+ for (unsigned j = sz-sep_lo; j < sz; j++) {
+#ifdef DTRAIN_FASTER_PERCEPTRON
+ if ((*s)[i].model <= (*s)[j].model) {
+#endif
if (threshold > 0) {
if (accept_pair((*s)[i].score, (*s)[j].score, threshold))
training.push_back(make_pair((*s)[i], (*s)[j]));
@@ -69,6 +98,10 @@ partXYX(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, scor
if ((*s)[i].score != (*s)[j].score)
training.push_back(make_pair((*s)[i], (*s)[j]));
}
+ if (++count == max) return;
+#ifdef DTRAIN_FASTER_PERCEPTRON
+ }
+#endif
}
}
}
@@ -86,7 +119,7 @@ _PRO_cmp_pair_by_diff_d(pair<ScoredHyp,ScoredHyp> a, pair<ScoredHyp,ScoredHyp> b
return (fabs(a.first.score - a.second.score)) > (fabs(b.first.score - b.second.score));
}
inline void
-PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, float _unused=1)
+PROsampling(vector<ScoredHyp>* s, vector<pair<ScoredHyp,ScoredHyp> >& training, score_t threshold, unsigned max, float _unused=1)
{
unsigned max_count = 5000, count = 0, sz = s->size();
bool b = false;
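
All three pair samplers now take a max cap and bail out of their nested loops once it is reached, using a bool flag to break the outer loop (or a plain return where nothing follows the loops). A generic sketch of the capping pattern:

    #include <utility>
    #include <vector>

    // Emit at most max index pairs (i, j) with i < j, stopping early.
    void capped_pairs(unsigned sz, unsigned max,
                      std::vector<std::pair<unsigned, unsigned> >& out) {
      unsigned count = 0;
      for (unsigned i = 0; i + 1 < sz; ++i)
        for (unsigned j = i + 1; j < sz; ++j) {
          out.push_back(std::make_pair(i, j));
          if (++count == max) return;  // replaces the b-flag double break
        }
    }
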
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 7b1f6be4..4a7cac6e 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -80,7 +80,7 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
* to Machine Translation"
* (Liang et al. '06)
*
- * NOTE: max is 0.9375
+ * NOTE: max is 0.9375 (with N=4)
*/
score_t
SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
@@ -103,7 +103,83 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
i_bleu[j] += (1/((score_t)j+1)) * i_ng;
}
}
- sum += exp(i_bleu[i])/(pow(2.0, static_cast<double>(N_-i)));
+ sum += exp(i_bleu[i])/(pow(2.0, N_-i));
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+/*
+ * 'sum' bleu
+ *
+ * sum up Ngram precisions
+ */
+score_t
+SumBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ score_t sum = 0.;
+ unsigned j = 1;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+ sum += ((score_t)counts.clipped_[i]/counts.sum_[i])/pow(2., N_-j+1);
+ j++;
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+/*
+ * 'sum' (exp) bleu
+ *
+ * sum up exp(Ngram precisions)
+ */
+score_t
+SumExpBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ score_t sum = 0.;
+ unsigned j = 1;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+ sum += exp(((score_t)counts.clipped_[i]/counts.sum_[i]))/pow(2., N_-j+1);
+ j++;
+ }
+ return brevity_penalty(hyp_len, ref_len) * sum;
+}
+
+/*
+ * 'sum' (whatever) bleu
+ *
+ * sum up exp(weight * log(Ngram precisions))
+ */
+score_t
+SumWhateverBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned /*rank*/, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (hyp_len == 0 || ref_len == 0) return 0.;
+ NgramCounts counts = make_ngram_counts(hyp, ref, N_);
+ unsigned M = N_;
+ vector<score_t> v = w_;
+ if (ref_len < N_) {
+ M = ref_len;
+ for (unsigned i = 0; i < M; i++) v[i] = 1/((score_t)M);
+ }
+ score_t sum = 0.;
+ unsigned j = 1;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || counts.clipped_[i] == 0) break;
+ sum += exp(v[i] * log(((score_t)counts.clipped_[i]/counts.sum_[i])))/pow(2., N_-j+1);
+ j++;
}
return brevity_penalty(hyp_len, ref_len) * sum;
}
@@ -115,7 +191,8 @@ SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
* and Structural Translation Features"
* (Chiang et al. '08)
*
- * NOTE: needs some more code in dtrain.cc
+ * NOTE: Needs some more code in dtrain.cc .
+ * No scaling by src len.
*/
score_t
ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
@@ -137,7 +214,39 @@ ApproxBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
glob_ref_len_ = discount_ * (glob_ref_len_ + ref_len);
glob_src_len_ = discount_ * (glob_src_len_ + src_len);
}
- return (score_t)glob_src_len_ * score;
+ return score;
+}
+
+/*
+ * Linear (Corpus) Bleu
+ *
+ * as in "Lattice Minimum Bayes-Risk Decoding
+ * for Statistical Machine Translation"
+ * (Tromble et al. '08)
+ *
+ */
+score_t
+LinearBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
+ const unsigned rank, const unsigned /*src_len*/)
+{
+ unsigned hyp_len = hyp.size(), ref_len = ref.size();
+ if (ref_len == 0) return 0.;
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ NgramCounts counts(M);
+ if (hyp_len > 0)
+ counts = make_ngram_counts(hyp, ref, M);
+ score_t ret = 0.;
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.sum_[i] == 0 || onebest_counts_.sum_[i] == 0) break;
+ ret += counts.sum_[i]/onebest_counts_.sum_[i];
+ }
+ ret = -(hyp_len/(score_t)onebest_len_) + (1./M) * ret;
+ if (rank == 0) {
+ onebest_len_ += hyp_len;
+ onebest_counts_ += counts;
+ }
+ return ret;
}
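
The three new 'sum' scorers share one scheme: instead of BLEU's geometric mean, the n-gram precisions (raw, exponentiated, or log-weighted) are summed with geometrically decaying weights 1/2^(N-n) and scaled by the brevity penalty. A sketch of the plain-sum core, assuming prec[i] holds the (i+1)-gram precision:

    #include <cmath>
    #include <vector>

    double sum_bleu_core(const std::vector<double>& prec, unsigned N, double bp) {
      double sum = 0.;
      for (unsigned i = 0; i < prec.size() && i < N; ++i) {
        if (prec[i] <= 0.) break;                      // stop at first empty order
        sum += prec[i] / std::pow(2., double(N - i));  // 1-grams get weight 1/2^N
      }
      return bp * sum;                                 // brevity penalty on top
    }
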
diff --git a/dtrain/score.h b/dtrain/score.h
index eb8ad912..f317c903 100644
--- a/dtrain/score.h
+++ b/dtrain/score.h
@@ -20,7 +20,7 @@ struct NgramCounts
inline void
operator+=(const NgramCounts& rhs)
{
- assert(N_ == rhs.N_);
+ if (rhs.N_ > N_) Resize(rhs.N_);
for (unsigned i = 0; i < N_; i++) {
this->clipped_[i] += rhs.clipped_.find(i)->second;
this->sum_[i] += rhs.sum_.find(i)->second;
@@ -59,14 +59,22 @@ struct NgramCounts
inline void
Zero()
{
- unsigned i;
- for (i = 0; i < N_; i++) {
+ for (unsigned i = 0; i < N_; i++) {
clipped_[i] = 0.;
sum_[i] = 0.;
}
}
inline void
+ One()
+ {
+ for (unsigned i = 0; i < N_; i++) {
+ clipped_[i] = 1.;
+ sum_[i] = 1.;
+ }
+ }
+
+ inline void
Print()
{
for (unsigned i = 0; i < N_; i++) {
@@ -74,6 +82,23 @@ struct NgramCounts
cout << i+1 << "grams:\t\t\t" << sum_[i] << endl;
}
}
+
+ inline void Resize(unsigned N)
+ {
+ if (N == N_) return;
+ else if (N > N_) {
+ for (unsigned i = N_; i < N; i++) {
+ clipped_[i] = 0.;
+ sum_[i] = 0.;
+ }
+ } else { // N < N_
+ for (unsigned i = N_-1; i > N-1; i--) {
+ clipped_.erase(i);
+ sum_.erase(i);
+ }
+ }
+ N_ = N;
+ }
};
typedef map<vector<WordID>, unsigned> Ngrams;
@@ -128,6 +153,21 @@ struct SmoothBleuScorer : public LocalScorer
score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
};
+struct SumBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumExpBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
+struct SumWhateverBleuScorer : public LocalScorer
+{
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned /*rank*/, const unsigned /*src_len*/);
+};
+
struct ApproxBleuScorer : public BleuScorer
{
NgramCounts glob_onebest_counts_;
@@ -147,6 +187,24 @@ struct ApproxBleuScorer : public BleuScorer
score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned src_len);
};
+struct LinearBleuScorer : public BleuScorer
+{
+ unsigned onebest_len_;
+ NgramCounts onebest_counts_;
+
+ LinearBleuScorer(unsigned N) : onebest_len_(1), onebest_counts_(N)
+ {
+ onebest_counts_.One();
+ }
+
+ score_t Score(vector<WordID>& hyp, vector<WordID>& ref, const unsigned rank, const unsigned /*src_len*/);
+
+ inline void Reset() {
+ onebest_len_ = 1;
+ onebest_counts_.One();
+ }
+};
+
} // namespace
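
NgramCounts::operator+= now resizes instead of asserting equal orders, so counts of different n-gram orders can be accumulated (LinearBleuScorer's one-best counts rely on this, together with the new One() initializer). A standalone analogue of the resize-on-add behaviour; note it only sums up to the rhs order, so missing higher orders contribute nothing:

    #include <map>

    struct CountsLite {
      unsigned N_;
      std::map<unsigned, double> sum_;
      explicit CountsLite(unsigned N) : N_(N) {
        for (unsigned i = 0; i < N_; ++i) sum_[i] = 0.;
      }
      void operator+=(const CountsLite& rhs) {
        if (rhs.N_ > N_) {                             // was: assert(N_ == rhs.N_)
          for (unsigned i = N_; i < rhs.N_; ++i) sum_[i] = 0.;
          N_ = rhs.N_;
        }
        for (unsigned i = 0; i < rhs.N_; ++i)          // rhs.N_ <= N_ holds here
          sum_[i] += rhs.sum_.find(i)->second;
      }
    };
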
diff --git a/dtrain/test/example/README b/dtrain/test/example/README
index b3ea5f06..6937b11b 100644
--- a/dtrain/test/example/README
+++ b/dtrain/test/example/README
@@ -1,8 +1,8 @@
Small example of input format for distributed training.
Call dtrain from cdec/dtrain/ with ./dtrain -c test/example/dtrain.ini .
-For this to work, disable '#define DTRAIN_LOCAL' from dtrain.h
+For this to work, undef 'DTRAIN_LOCAL' in dtrain.h
and recompile.
-Data is here: http://simianer.de/dtrain
+Data is here: http://simianer.de/#dtrain
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index f87ee9cf..c8ac7c3f 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -5,7 +5,7 @@ decoder_config=test/example/cdec.ini # config for cdec
# weights for these features will be printed on each iteration
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
tmp=/tmp
-stop_after=10 # stop epoch after 20 inputs
+stop_after=10 # stop epoch after 10 inputs
# interesting stuff
epochs=3 # run over input 3 times
@@ -19,3 +19,4 @@ filter=uniq # only unique entries in kbest (surface form)
pair_sampling=XYX
hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here
pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0)
+loss_margin=0
diff --git a/dtrain/test/example/expected-output b/dtrain/test/example/expected-output
new file mode 100644
index 00000000..25d2c069
--- /dev/null
+++ b/dtrain/test/example/expected-output
@@ -0,0 +1,125 @@
+ cdec cfg 'test/example/cdec.ini'
+feature: WordPenalty (no config parameters)
+State is 0 bytes for feature WordPenalty
+feature: KLanguageModel (with config parameters 'test/example/nc-wmt11.en.srilm.gz')
+Loading the LM will be faster if you build a binary file.
+Reading test/example/nc-wmt11.en.srilm.gz
+----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100
+****************************************************************************************************
+Loaded 5-gram KLM from test/example/nc-wmt11.en.srilm.gz (MapSize=49581)
+State is 98 bytes for feature KLanguageModel test/example/nc-wmt11.en.srilm.gz
+feature: RuleIdentityFeatures (no config parameters)
+State is 0 bytes for feature RuleIdentityFeatures
+feature: RuleNgramFeatures (no config parameters)
+State is 0 bytes for feature RuleNgramFeatures
+feature: RuleShape (no config parameters)
+ Example feature: Shape_S00000_T00000
+State is 0 bytes for feature RuleShape
+Seeding random number sequence to 1072059181
+
+dtrain
+Parameters:
+ k 100
+ N 4
+ T 3
+ scorer 'stupid_bleu'
+ sample from 'kbest'
+ filter 'uniq'
+ learning rate 0.0001
+ gamma 0
+ loss margin 0
+ pairs 'XYX'
+ hi lo 0.1
+ pair threshold 0
+ select weights 'VOID'
+ l1 reg 0 'none'
+ cdec cfg 'test/example/cdec.ini'
+ input 'test/example/nc-wmt11.1k.gz'
+ output '-'
+ stop_after 10
+(a dot represents 10 inputs)
+Iteration #1 of 3.
+ . 10
+Stopping after 10 input sentences.
+WEIGHTS
+ Glue = -0.0293
+ WordPenalty = +0.049075
+ LanguageModel = +0.24345
+ LanguageModel_OOV = -0.2029
+ PhraseModel_0 = +0.0084102
+ PhraseModel_1 = +0.021729
+ PhraseModel_2 = +0.014922
+ PhraseModel_3 = +0.104
+ PhraseModel_4 = -0.14308
+ PhraseModel_5 = +0.0247
+ PhraseModel_6 = -0.012
+ PassThrough = -0.2161
+ ---
+ 1best avg score: 0.16872 (+0.16872)
+ 1best avg model score: -1.8276 (-1.8276)
+ avg # pairs: 1121.1
+ avg # rank err: 555.6
+ avg # margin viol: 0
+ non0 feature count: 277
+ avg list sz: 77.2
+ avg f count: 90.96
+(time 0.1 min, 0.6 s/S)
+
+Iteration #2 of 3.
+ . 10
+WEIGHTS
+ Glue = -0.3526
+ WordPenalty = +0.067576
+ LanguageModel = +1.155
+ LanguageModel_OOV = -0.2728
+ PhraseModel_0 = -0.025529
+ PhraseModel_1 = +0.095869
+ PhraseModel_2 = +0.094567
+ PhraseModel_3 = +0.12482
+ PhraseModel_4 = -0.36533
+ PhraseModel_5 = +0.1068
+ PhraseModel_6 = -0.1517
+ PassThrough = -0.286
+ ---
+ 1best avg score: 0.18394 (+0.015221)
+ 1best avg model score: 3.205 (+5.0326)
+ avg # pairs: 1168.3
+ avg # rank err: 594.8
+ avg # margin viol: 0
+ non0 feature count: 543
+ avg list sz: 77.5
+ avg f count: 85.916
+(time 0.083 min, 0.5 s/S)
+
+Iteration #3 of 3.
+ . 10
+WEIGHTS
+ Glue = -0.392
+ WordPenalty = +0.071963
+ LanguageModel = +0.81266
+ LanguageModel_OOV = -0.4177
+ PhraseModel_0 = -0.2649
+ PhraseModel_1 = -0.17931
+ PhraseModel_2 = +0.038261
+ PhraseModel_3 = +0.20261
+ PhraseModel_4 = -0.42621
+ PhraseModel_5 = +0.3198
+ PhraseModel_6 = -0.1437
+ PassThrough = -0.4309
+ ---
+ 1best avg score: 0.2962 (+0.11225)
+ 1best avg model score: -36.274 (-39.479)
+ avg # pairs: 1109.6
+ avg # rank err: 515.9
+ avg # margin viol: 0
+ non0 feature count: 741
+ avg list sz: 77
+ avg f count: 88.982
+(time 0.083 min, 0.5 s/S)
+
+Writing weights file to '-' ...
+done
+
+---
+Best iteration: 3 [SCORE 'stupid_bleu'=0.2962].
+This took 0.26667 min.