-rw-r--r--  dtrain/Makefile.am               2
-rw-r--r--  dtrain/NEXT                     11
-rw-r--r--  dtrain/README.md                29
-rw-r--r--  dtrain/dtrain.cc                53
-rw-r--r--  dtrain/dtrain.h                 13
-rw-r--r--  dtrain/kbestget.h                6
-rw-r--r--  dtrain/ksampler.h                2
-rw-r--r--  dtrain/score.cc                 34
-rw-r--r--  dtrain/test/example/dtrain.ini  22
9 files changed, 97 insertions, 75 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index 471977e1..64fef489 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain
dtrain_SOURCES = dtrain.cc score.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -O3
+AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/NEXT b/dtrain/NEXT
index 24939cf3..eccfb313 100644
--- a/dtrain/NEXT
+++ b/dtrain/NEXT
@@ -1,6 +1,7 @@
-cuda vecs?
-target side rule ngrams
-decoder meta-parameters testing
-cdyer -> sa-extract -> loo?
-reranking while sgd
+make svm faster (cuda)?
+ other learning frameworks
+target side rule ngram feature template
+decoder meta-parameters test
+sa-extract -> leave-one-out?
+rerank while sgd?
diff --git a/dtrain/README.md b/dtrain/README.md
index f4e1abed..2a24ec22 100644
--- a/dtrain/README.md
+++ b/dtrain/README.md
@@ -1,13 +1,20 @@
-This is a simple (but parallelizable) tuning method for cdec, as used here:
+This is a simple (and parallelizable) tuning method for cdec
+which is able to train the weights of very many (sparse) features.
+It was used here:
"Joint Feature Selection in Distributed Stochastic
Learning for Large-Scale Discriminative Training in
- SMT" Simianer, Riezler, Dyer
- ACL 2012
+ SMT" Simianer, Riezler, Dyer; ACL 2012
Building
--------
-builds when building cdec, see ../BUILDING
+Builds when building cdec, see ../BUILDING .
+To build only parts needed for dtrain do
+```
+ autoreconf -ifv
+ ./configure [--disable-test]
+ cd dtrain/; make
+```
Running
-------
@@ -15,10 +22,10 @@ To run this on a dev set locally:
```
#define DTRAIN_LOCAL
```
-otherwise remove that line or undef. You need a single grammar file
-or per-sentence-grammars (psg) as you would use with cdec.
-Additionally you need to give dtrain a file with
-references (--refs).
+otherwise remove that line or undef, then recompile. You need a single
+grammar file or input annotated with per-sentence grammars (psg) as you
+would use with cdec. Additionally you need to give dtrain a file with
+references (--refs) when running locally.
The input for use with hadoop streaming looks like this:
```
@@ -27,12 +34,12 @@ The input for use with hadoop streaming looks like this:
To convert a psg to this format you need to replace all "\n"
by "\t". Make sure there are no tabs in your data.
-For an example of local usage (with 'distributed' format)
+For an example of local usage (with the 'distributed' format)
see test/example/ . This expects dtrain to be built without
DTRAIN_LOCAL.
-Legal stuff
------------
+Legal
+-----
Copyright (c) 2012 by Patrick Simianer <p@simianer.de>
See the file ../LICENSE.txt for the licensing terms that this software is
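The tab-separated streaming format the README describes is just the per-sentence grammar with newlines flattened to tabs; dtrain.cc (further down in this diff) undoes it with boost::replace_all(in, "\t", "\n"). A minimal sketch of the forward conversion, assuming Boost is available; the helper name is made up for illustration and is not part of this commit:
```
// Sketch only: flatten a per-sentence grammar block into the single
// tab-separated line expected by hadoop streaming. dtrain.cc reverses
// this with boost::replace_all(in, "\t", "\n").
#include <string>
#include <boost/algorithm/string/replace.hpp>

std::string psg_to_streaming_line(std::string block)
{
  boost::replace_all(block, "\n", "\t");
  return block;
}
```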
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index fb6c6880..e7a1244c 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -29,8 +29,8 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg)
("rescale", po::value<bool>()->zero_tokens(), "rescale weight vector after each input")
("l1_reg", po::value<string>()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)")
("l1_reg_strength", po::value<weight_t>(), "l1 regularization strength")
- ("funny", po::value<bool>()->zero_tokens(), "include correctly ranked pairs into updates")
- ("fselect", po::value<weight_t>()->default_value(-1), "select top x percent of features after each epoch")
+ ("inc_correct", po::value<bool>()->zero_tokens(), "include correctly ranked pairs into updates")
+ ("fselect", po::value<weight_t>()->default_value(-1), "TODO select top x percent of features after each epoch")
#ifdef DTRAIN_LOCAL
("refs,r", po::value<string>(), "references in local mode")
#endif
@@ -113,9 +113,9 @@ main(int argc, char** argv)
HSReporter rep(task_id);
bool keep = false;
if (cfg.count("keep")) keep = true;
- bool funny = false;
- if (cfg.count("funny"))
- funny = true;
+ bool inc_correct = false;
+ if (cfg.count("inc_correct"))
+ inc_correct = true;
const unsigned k = cfg["k"].as<unsigned>();
const unsigned N = cfg["N"].as<unsigned>();
@@ -158,10 +158,9 @@ main(int argc, char** argv)
}
vector<score_t> bleu_weights;
scorer->Init(N, bleu_weights);
- if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl;
// setup decoder observer
- MT19937 rng; // random number generator
+ MT19937 rng; // random number generator, only for forest sampling
HypSampler* observer;
if (sample_from == "kbest")
observer = dynamic_cast<KBestGetter*>(new KBestGetter(k, filter_type));
@@ -225,6 +224,7 @@ main(int argc, char** argv)
cerr << setw(25) << "k " << k << endl;
cerr << setw(25) << "N " << N << endl;
cerr << setw(25) << "T " << T << endl;
+ cerr << setw(25) << "scorer '" << scorer_str << "'" << endl;
cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl;
if (sample_from == "kbest")
cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl;
@@ -235,8 +235,8 @@ main(int argc, char** argv)
cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl;
if (cfg.count("l1_reg"))
cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl;
- if (funny)
- cerr << setw(25) << "funny " << funny << endl;
+ if (inc_correct)
+ cerr << setw(25) << "inc. correct " << inc_correct << endl;
if (rescale)
cerr << setw(25) << "rescale " << rescale << endl;
cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
@@ -246,7 +246,7 @@ main(int argc, char** argv)
#endif
cerr << setw(25) << "output " << "'" << output_fn << "'" << endl;
if (cfg.count("input_weights"))
- cerr << setw(25) << "weights in" << cfg["input_weights"].as<string>() << endl;
+ cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl;
if (cfg.count("stop-after"))
cerr << setw(25) << "stop_after " << stop_after << endl;
if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl;
@@ -279,7 +279,7 @@ main(int argc, char** argv)
} else {
if (ii == in_sz) next = true; // stop if we reach the end of our input
}
- // stop after X sentences (but still iterate for those)
+ // stop after X sentences (but still go on for those)
if (stop_after > 0 && stop_after == ii && !next) stop = true;
// produce some pretty output
@@ -323,14 +323,17 @@ main(int argc, char** argv)
register_and_convert(ref_tok, ref_ids);
ref_ids_buf.push_back(ref_ids);
// process and set grammar
- bool broken_grammar = true;
+ bool broken_grammar = true; // ignore broken grammars
for (string::iterator it = in.begin(); it != in.end(); it++) {
if (!isspace(*it)) {
broken_grammar = false;
break;
}
}
- if (broken_grammar) continue;
+ if (broken_grammar) {
+ cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl;
+ continue;
+ }
boost::replace_all(in, "\t", "\n");
in += "\n";
grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl;
@@ -389,7 +392,7 @@ main(int argc, char** argv)
}
}
- score_sum += (*samples)[0].score;
+ score_sum += (*samples)[0].score; // stats for 1best
model_sum += (*samples)[0].model;
// weight updates
@@ -415,7 +418,7 @@ main(int argc, char** argv)
lambdas.plus_eq_v_times_s(diff_vec, eta);
rank_errors++;
} else {
- if (funny) {
+ if (inc_correct) {
SparseVector<weight_t> diff_vec = it->first.f - it->second.f;
lambdas.plus_eq_v_times_s(diff_vec, eta);
}
@@ -453,7 +456,7 @@ main(int argc, char** argv)
}
}
} else if (l1cumul) {
- weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input
+ weight_t acc_penalty = (ii+1) * l1_reg; // Note: ii is the index of the current input
for (unsigned d = 0; d < lambdas.size(); d++) {
if (lambdas.nonzero(d)) {
weight_t v = lambdas.get(d);
@@ -515,7 +518,7 @@ main(int argc, char** argv)
model_diff = model_avg;
}
- unsigned nonz;
+ unsigned nonz = 0;
if (!quiet || hstreaming) nonz = (unsigned)lambdas.size_nonzero();
if (!quiet) {
@@ -524,18 +527,18 @@ main(int argc, char** argv)
cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl;
}
cerr << " ---" << endl;
- cerr << _np << " 1best avg score: " << score_avg;
+ cerr << _np << " 1best avg score: " << score_avg;
cerr << _p << " (" << score_diff << ")" << endl;
- cerr << _np << "1best avg model score: " << model_avg;
+ cerr << _np << " 1best avg model score: " << model_avg;
cerr << _p << " (" << model_diff << ")" << endl;
- cerr << " avg #pairs: ";
+ cerr << " avg # pairs: ";
cerr << _np << npairs/(float)in_sz << endl;
- cerr << " avg #rank err: ";
+ cerr << " avg # rank err: ";
cerr << rank_errors/(float)in_sz << endl;
- cerr << " avg #margin viol: ";
+ cerr << " avg # margin viol: ";
cerr << margin_violations/(float)in_sz << endl;
- cerr << " non0 feature count: " << nonz << endl;
- cerr << " avg f count: ";
+ cerr << " non0 feature count: " << nonz << endl;
+ cerr << " avg f count: ";
cerr << feature_count/(float)pair_count << endl;
}
@@ -628,7 +631,5 @@ main(int argc, char** argv)
cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl;
cerr << _p2 << "This took " << overall_time/60. << " min." << endl;
}
-
- return 0;
}
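For orientation, the update that the renamed inc_correct option feeds into is the plain pairwise step visible above (diff_vec = it->first.f - it->second.f; lambdas.plus_eq_v_times_s(diff_vec, eta)). As a sketch in standard notation, with h+ the member of a sampled pair that the pair sampler puts first (the one with the better gold BLEU score) and eta the learning rate:
```
\lambda \;\leftarrow\; \lambda + \eta\,\bigl(f(h^{+}) - f(h^{-})\bigr)
```
The step is applied to pairs the current model ranks incorrectly; with inc_correct set, the same step is also applied to correctly ranked pairs.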
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
index 783aa179..61d60657 100644
--- a/dtrain/dtrain.h
+++ b/dtrain/dtrain.h
@@ -1,5 +1,5 @@
-#ifndef _DTRAIN_COMMON_H_
-#define _DTRAIN_COMMON_H_
+#ifndef _DTRAIN_H_
+#define _DTRAIN_H_
#include <iomanip>
#include <climits>
@@ -13,9 +13,9 @@
#include "filelib.h"
-#define DTRAIN_LOCAL
+//#define DTRAIN_LOCAL
-#define DTRAIN_DOTS 10 // when to display a '.'
+#define DTRAIN_DOTS 10 // after how many inputs to display a '.'
#define DTRAIN_GRAMMAR_DELIM "########EOS########"
#define DTRAIN_SCALE 100000
@@ -35,7 +35,10 @@ inline string gettmpf(const string path, const string infix) {
strcat(fn, "/");
strcat(fn, infix.c_str());
strcat(fn, "-XXXXXX");
- mkstemp(fn);
+ if (mkstemp(fn) == -1) {
cerr << "Cannot make temp file in " << path << ", exiting." << endl;
+ exit(1);
+ }
return string(fn);
}
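One note on the error check added to gettmpf(): POSIX mkstemp() rewrites the trailing XXXXXX in place and returns an open file descriptor, with -1 (not 0) signalling failure. A self-contained sketch of that pattern, with illustrative names not taken from the commit:
```
// Standalone sketch of the mkstemp() pattern used by gettmpf().
#include <cstdio>      // perror
#include <cstdlib>     // exit, mkstemp (POSIX)
#include <string>
#include <vector>
#include <unistd.h>    // close

std::string make_temp_file(const std::string& dir, const std::string& infix)
{
  std::string templ = dir + "/" + infix + "-XXXXXX";
  std::vector<char> fn(templ.begin(), templ.end());
  fn.push_back('\0');
  int fd = mkstemp(&fn[0]);   // returns an open fd, or -1 on error
  if (fd == -1) {
    perror("mkstemp");
    exit(1);
  }
  close(fd);                  // only the generated name is needed here
  return std::string(&fn[0]);
}
```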
diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h
index 1b96bbf4..0c2da994 100644
--- a/dtrain/kbestget.h
+++ b/dtrain/kbestget.h
@@ -46,7 +46,7 @@ struct LocalScorer
}
inline score_t
- brevity_penaly(const unsigned hyp_len, const unsigned ref_len)
+ brevity_penalty(const unsigned hyp_len, const unsigned ref_len)
{
if (hyp_len > ref_len) return 1;
return exp(1 - (score_t)ref_len/hyp_len);
@@ -61,7 +61,7 @@ struct HypSampler : public DecoderObserver
inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; }
inline void SetRef(vector<WordID>& ref) { ref_ = &ref; }
};
-///////////////////////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////////////////////////
@@ -76,7 +76,7 @@ struct KBestGetter : public HypSampler
k_(k), filter_type_(filter_type) {}
virtual void
- NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+ NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
{
KBestScored(*hg);
}
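For reference, the renamed brevity_penalty() above computes the usual BLEU brevity penalty, with |h| and |r| the hypothesis and reference lengths:
```
BP(h, r) \;=\;
  \begin{cases}
    1 & \text{if } |h| > |r| \\
    \exp\!\left(1 - |r|/|h|\right) & \text{otherwise}
  \end{cases}
```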
diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h
index 8b1c09f2..c45c8f64 100644
--- a/dtrain/ksampler.h
+++ b/dtrain/ksampler.h
@@ -20,7 +20,7 @@ struct KSampler : public HypSampler
k_(k), prng_(prng) {}
virtual void
- NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg)
+ NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg)
{
ScoredSamples(*hg);
}
diff --git a/dtrain/score.cc b/dtrain/score.cc
index 4cde638a..ec844437 100644
--- a/dtrain/score.cc
+++ b/dtrain/score.cc
@@ -24,12 +24,12 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref
if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0;
sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]);
}
- return brevity_penaly(hyp_len, ref_len) * exp(sum);
+ return brevity_penalty(hyp_len, ref_len) * exp(sum);
}
score_t
BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
- const unsigned rank)
+ const unsigned /*rank*/)
{
unsigned hyp_len = hyp.size(), ref_len = ref.size();
if (hyp_len == 0 || ref_len == 0) return 0;
@@ -49,7 +49,7 @@ BleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
*/
score_t
StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
- const unsigned rank)
+ const unsigned /*rank*/)
{
unsigned hyp_len = hyp.size(), ref_len = ref.size();
if (hyp_len == 0 || ref_len == 0) return 0;
@@ -58,10 +58,11 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
if (ref_len < N_) M = ref_len;
score_t sum = 0, add = 0;
for (unsigned i = 0; i < M; i++) {
+ if (i == 0 && (counts.clipped[i] == 0 || counts.sum[i] == 0)) return 0;
if (i == 1) add = 1;
sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add)));
}
- return brevity_penaly(hyp_len, ref_len) * exp(sum);
+ return brevity_penalty(hyp_len, ref_len) * exp(sum);
}
/*
@@ -75,19 +76,28 @@ StupidBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
*/
score_t
SmoothBleuScorer::Score(vector<WordID>& hyp, vector<WordID>& ref,
- const unsigned rank)
+ const unsigned /*rank*/)
{
unsigned hyp_len = hyp.size(), ref_len = ref.size();
if (hyp_len == 0 || ref_len == 0) return 0;
NgramCounts counts = make_ngram_counts(hyp, ref, N_);
- score_t sum = 0;
- unsigned j = 1;
- for (unsigned i = 0; i < N_; i++) {
- if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue;
- sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1);
- j++;
+ unsigned M = N_;
+ if (ref_len < N_) M = ref_len;
+ score_t sum = 0.;
+ vector<score_t> i_bleu;
+ for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.);
+ for (unsigned i = 0; i < M; i++) {
+ if (counts.clipped[i] == 0 || counts.sum[i] == 0) {
+ break;
+ } else {
+ score_t i_ng = log((score_t)counts.clipped[i]/counts.sum[i]);
+ for (unsigned j = i; j < M; j++) {
+ i_bleu[j] += (1/((score_t)j+1)) * i_ng;
+ }
+ }
+ sum += exp(i_bleu[i])/(pow(2, N_-i));
}
- return brevity_penaly(hyp_len, ref_len) * sum;
+ return brevity_penalty(hyp_len, ref_len) * sum;
}
/*
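Reading off the rewritten loop above, the new SmoothBleuScorer accumulates, for each n-gram order n up to M = min(N, |r|) and stopping at the first order whose clipped precision p_n is zero, the geometric mean of the first n precisions, down-weighted by 2^(N-n+1); a sketch of the resulting formula:
```
\mathrm{SmoothBLEU}(h, r) \;=\; BP(|h|, |r|) \cdot
  \sum_{n=1}^{M} \frac{1}{2^{\,N-n+1}}
  \exp\!\Big( \frac{1}{n} \sum_{k=1}^{n} \log p_k \Big),
  \qquad M = \min(N, |r|)
```
That is, each term is now a BLEU-n geometric mean rather than a single n-gram precision, which is the main change from the previous loop.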
diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini
index 68173e11..66be6bf2 100644
--- a/dtrain/test/example/dtrain.ini
+++ b/dtrain/test/example/dtrain.ini
@@ -1,20 +1,20 @@
input=test/example/nc-wmt11.1k.gz # use '-' for stdin
-output=- # a weights file or stdout
-decoder_config=test/example/cdec.ini # ini for cdec
-# these will be printed on each iteration
+output=weights.gz # a weights file (add .gz for gzip compression) or '-' for STDOUT
+decoder_config=test/example/cdec.ini # config for cdec
+# weights for these features will be printed on each iteration
print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough
tmp=/tmp
-stop_after=10 # stop iteration after 10 inputs
+stop_after=100 # stop epoch after 100 inputs
# interesting stuff
-epochs=3 # run over input 3 times
-k=200 # use 100best lists
-N=4 # optimize (approx) BLEU4
+epochs=100 # run over input 100 times
+k=100 # use 100best lists
+N=4 # optimize (approx) BLEU4
learning_rate=0.0001 # learning rate
-gamma=0.00001 # use SVM reg
-scorer=stupid_bleu # use stupid BLEU+1 approx.
+gamma=0 # SVM reg strength (0: no SVM reg)
+scorer=smooth_bleu # use smooth BLEU (Liang et al. '06)
sample_from=kbest # use kbest lists (as opposed to forest)
-filter=uniq # only uniq entries in kbest
+filter=uniq # only unique entries in kbest (surface form)
pair_sampling=108010 # 10 vs 80 vs 10 and 80 vs 10
-pair_threshold=0 # minimum distance in BLEU
+pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0)
select_weights=last # just output last weights