From 8271eb7cfc10e58b2b8ff50d27d6680dedfba043 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Thu, 26 Apr 2012 09:31:34 +0200 Subject: merge older changes, more polishing --- dtrain/Makefile.am | 2 +- dtrain/NEXT | 11 +++++---- dtrain/README.md | 29 ++++++++++++++--------- dtrain/dtrain.cc | 53 +++++++++++++++++++++--------------------- dtrain/dtrain.h | 13 +++++++---- dtrain/kbestget.h | 6 ++--- dtrain/ksampler.h | 2 +- dtrain/score.cc | 34 +++++++++++++++++---------- dtrain/test/example/dtrain.ini | 22 +++++++++--------- 9 files changed, 97 insertions(+), 75 deletions(-) (limited to 'dtrain') diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index 471977e1..64fef489 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -3,5 +3,5 @@ bin_PROGRAMS = dtrain dtrain_SOURCES = dtrain.cc score.cc dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval -O3 +AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/NEXT b/dtrain/NEXT index 24939cf3..eccfb313 100644 --- a/dtrain/NEXT +++ b/dtrain/NEXT @@ -1,6 +1,7 @@ -cuda vecs? -target side rule ngrams -decoder meta-parameters testing -cdyer -> sa-extract -> loo? -reranking while sgd +make svm faster (cuda)? + other learning frameworks +target side rule ngram feature template +decoder meta-parameters test +sa-extract -> leave-one-out? +rerank while sgd? diff --git a/dtrain/README.md b/dtrain/README.md index f4e1abed..2a24ec22 100644 --- a/dtrain/README.md +++ b/dtrain/README.md @@ -1,13 +1,20 @@ -This is a simple (but parallelizable) tuning method for cdec, as used here: +This is a simple (and parallelizable) tuning method for cdec +which is able to train the weights of very many (sparse) features. 
+It was used here: "Joint Feature Selection in Distributed Stochastic Learning for Large-Scale Discriminative Training in - SMT" Simianer, Riezler, Dyer - ACL 2012 + SMT" Simianer, Riezler, Dyer; ACL 2012 Building -------- -builds when building cdec, see ../BUILDING +Builds when building cdec, see ../BUILDING . +To build only parts needed for dtrain do +``` + autoreconf -ifv + ./configure [--disable-test] + cd dtrain/; make +``` Running ------- @@ -15,10 +22,10 @@ To run this on a dev set locally: ``` #define DTRAIN_LOCAL ``` -otherwise remove that line or undef. You need a single grammar file -or per-sentence-grammars (psg) as you would use with cdec. -Additionally you need to give dtrain a file with -references (--refs). +otherwise remove that line or undef, then recompile. You need a single +grammar file or input annotated with per-sentence grammars (psg) as you +would use with cdec. Additionally you need to give dtrain a file with +references (--refs) when running locally. The input for use with hadoop streaming looks like this: ``` @@ -27,12 +34,12 @@ The input for use with hadoop streaming looks like this: To convert a psg to this format you need to replace all "\n" by "\t". Make sure there are no tabs in your data. -For an example of local usage (with 'distributed' format) +For an example of local usage (with the 'distributed' format) the see test/example/ . This expects dtrain to be built without DTRAIN_LOCAL. 
-Legal stuff ------------ +Legal +----- Copyright (c) 2012 by Patrick Simianer See the file ../LICENSE.txt for the licensing terms that this software is diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index fb6c6880..e7a1244c 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -29,8 +29,8 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("rescale", po::value()->zero_tokens(), "rescale weight vector after each input") ("l1_reg", po::value()->default_value("none"), "apply l1 regularization as in 'Tsuroka et al' (2010)") ("l1_reg_strength", po::value(), "l1 regularization strength") - ("funny", po::value()->zero_tokens(), "include correctly ranked pairs into updates") - ("fselect", po::value()->default_value(-1), "select top x percent of features after each epoch") + ("inc_correct", po::value()->zero_tokens(), "include correctly ranked pairs into updates") + ("fselect", po::value()->default_value(-1), "TODO select top x percent of features after each epoch") #ifdef DTRAIN_LOCAL ("refs,r", po::value(), "references in local mode") #endif @@ -113,9 +113,9 @@ main(int argc, char** argv) HSReporter rep(task_id); bool keep = false; if (cfg.count("keep")) keep = true; - bool funny = false; - if (cfg.count("funny")) - funny = true; + bool inc_correct = false; + if (cfg.count("inc_correct")) + inc_correct = true; const unsigned k = cfg["k"].as(); const unsigned N = cfg["N"].as(); @@ -158,10 +158,9 @@ main(int argc, char** argv) } vector bleu_weights; scorer->Init(N, bleu_weights); - if (!quiet) cerr << setw(26) << "scorer '" << scorer_str << "'" << endl << endl; // setup decoder observer - MT19937 rng; // random number generator + MT19937 rng; // random number generator, only for forest sampling HypSampler* observer; if (sample_from == "kbest") observer = dynamic_cast(new KBestGetter(k, filter_type)); @@ -225,6 +224,7 @@ main(int argc, char** argv) cerr << setw(25) << "k " << k << endl; cerr << setw(25) << "N " << N << endl; cerr << setw(25) << "T " << T << 
endl; + cerr << setw(25) << "scorer '" << scorer_str << "'" << endl; cerr << setw(25) << "sample from " << "'" << sample_from << "'" << endl; if (sample_from == "kbest") cerr << setw(25) << "filter " << "'" << filter_type << "'" << endl; @@ -235,8 +235,8 @@ main(int argc, char** argv) cerr << setw(25) << "select weights " << "'" << select_weights << "'" << endl; if (cfg.count("l1_reg")) cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as() << "'" << endl; - if (funny) - cerr << setw(25) << "funny " << funny << endl; + if (inc_correct) + cerr << setw(25) << "inc. correct " << inc_correct << endl; if (rescale) cerr << setw(25) << "rescale " << rescale << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; @@ -246,7 +246,7 @@ main(int argc, char** argv) #endif cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; if (cfg.count("input_weights")) - cerr << setw(25) << "weights in" << cfg["input_weights"].as() << endl; + cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as() << "'" << endl; if (cfg.count("stop-after")) cerr << setw(25) << "stop_after " << stop_after << endl; if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " lines of input)" << endl; @@ -279,7 +279,7 @@ main(int argc, char** argv) } else { if (ii == in_sz) next = true; // stop if we reach the end of our input } - // stop after X sentences (but still iterate for those) + // stop after X sentences (but still go on for those) if (stop_after > 0 && stop_after == ii && !next) stop = true; // produce some pretty output @@ -323,14 +323,17 @@ main(int argc, char** argv) register_and_convert(ref_tok, ref_ids); ref_ids_buf.push_back(ref_ids); // process and set grammar - bool broken_grammar = true; + bool broken_grammar = true; // ignore broken grammars for (string::iterator it = in.begin(); it != in.end(); it++) { if (!isspace(*it)) { broken_grammar = false; break; } } - if (broken_grammar) continue; + if 
(broken_grammar) { + cerr << "Broken grammar for " << ii+1 << "! Ignoring this input." << endl; + continue; + } boost::replace_all(in, "\t", "\n"); in += "\n"; grammar_buf_out << in << DTRAIN_GRAMMAR_DELIM << " " << in_split[0] << endl; @@ -389,7 +392,7 @@ main(int argc, char** argv) } } - score_sum += (*samples)[0].score; + score_sum += (*samples)[0].score; // stats for 1best model_sum += (*samples)[0].model; // weight updates @@ -415,7 +418,7 @@ main(int argc, char** argv) lambdas.plus_eq_v_times_s(diff_vec, eta); rank_errors++; } else { - if (funny) { + if (inc_correct) { SparseVector diff_vec = it->first.f - it->second.f; lambdas.plus_eq_v_times_s(diff_vec, eta); } @@ -453,7 +456,7 @@ main(int argc, char** argv) } } } else if (l1cumul) { - weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input + weight_t acc_penalty = (ii+1) * l1_reg; // Note: ii is the index of the current input for (unsigned d = 0; d < lambdas.size(); d++) { if (lambdas.nonzero(d)) { weight_t v = lambdas.get(d); @@ -515,7 +518,7 @@ main(int argc, char** argv) model_diff = model_avg; } - unsigned nonz; + unsigned nonz = 0; if (!quiet || hstreaming) nonz = (unsigned)lambdas.size_nonzero(); if (!quiet) { @@ -524,18 +527,18 @@ main(int argc, char** argv) cerr << setw(18) << *it << " = " << lambdas.get(FD::Convert(*it)) << endl; } cerr << " ---" << endl; - cerr << _np << " 1best avg score: " << score_avg; + cerr << _np << " 1best avg score: " << score_avg; cerr << _p << " (" << score_diff << ")" << endl; - cerr << _np << "1best avg model score: " << model_avg; + cerr << _np << " 1best avg model score: " << model_avg; cerr << _p << " (" << model_diff << ")" << endl; - cerr << " avg #pairs: "; + cerr << " avg # pairs: "; cerr << _np << npairs/(float)in_sz << endl; - cerr << " avg #rank err: "; + cerr << " avg # rank err: "; cerr << rank_errors/(float)in_sz << endl; - cerr << " avg #margin viol: "; + cerr << " avg # margin viol: "; cerr << margin_violations/(float)in_sz << 
endl; - cerr << " non0 feature count: " << nonz << endl; - cerr << " avg f count: "; + cerr << " non0 feature count: " << nonz << endl; + cerr << " avg f count: "; cerr << feature_count/(float)pair_count << endl; } @@ -628,7 +631,5 @@ main(int argc, char** argv) cerr << best_it+1 << " [SCORE '" << scorer_str << "'=" << max_score << "]." << endl; cerr << _p2 << "This took " << overall_time/60. << " min." << endl; } - - return 0; } diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index 783aa179..61d60657 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -1,5 +1,5 @@ -#ifndef _DTRAIN_COMMON_H_ -#define _DTRAIN_COMMON_H_ +#ifndef _DTRAIN_H_ +#define _DTRAIN_H_ #include #include @@ -13,9 +13,9 @@ #include "filelib.h" -#define DTRAIN_LOCAL +//#define DTRAIN_LOCAL -#define DTRAIN_DOTS 10 // when to display a '.' +#define DTRAIN_DOTS 10 // after how many inputs to display a '.' #define DTRAIN_GRAMMAR_DELIM "########EOS########" #define DTRAIN_SCALE 100000 @@ -35,7 +35,10 @@ inline string gettmpf(const string path, const string infix) { strcat(fn, "/"); strcat(fn, infix.c_str()); strcat(fn, "-XXXXXX"); - mkstemp(fn); + if (!mkstemp(fn)) { + cerr << "Cannot make temp file in" << path << " , exiting." 
<< endl; + exit(1); + } return string(fn); } diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index 1b96bbf4..0c2da994 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -46,7 +46,7 @@ struct LocalScorer } inline score_t - brevity_penaly(const unsigned hyp_len, const unsigned ref_len) + brevity_penalty(const unsigned hyp_len, const unsigned ref_len) { if (hyp_len > ref_len) return 1; return exp(1 - (score_t)ref_len/hyp_len); @@ -61,7 +61,7 @@ struct HypSampler : public DecoderObserver inline void SetScorer(LocalScorer* scorer) { scorer_ = scorer; } inline void SetRef(vector& ref) { ref_ = &ref; } }; -/////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// @@ -76,7 +76,7 @@ struct KBestGetter : public HypSampler k_(k), filter_type_(filter_type) {} virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg) { KBestScored(*hg); } diff --git a/dtrain/ksampler.h b/dtrain/ksampler.h index 8b1c09f2..c45c8f64 100644 --- a/dtrain/ksampler.h +++ b/dtrain/ksampler.h @@ -20,7 +20,7 @@ struct KSampler : public HypSampler k_(k), prng_(prng) {} virtual void - NotifyTranslationForest(const SentenceMetadata& smeta, Hypergraph* hg) + NotifyTranslationForest(const SentenceMetadata& /*smeta*/, Hypergraph* hg) { ScoredSamples(*hg); } diff --git a/dtrain/score.cc b/dtrain/score.cc index 4cde638a..ec844437 100644 --- a/dtrain/score.cc +++ b/dtrain/score.cc @@ -24,12 +24,12 @@ BleuScorer::Bleu(NgramCounts& counts, const unsigned hyp_len, const unsigned ref if (counts.clipped[i] == 0 || counts.sum[i] == 0) return 0; sum += w_[i] * log((score_t)counts.clipped[i]/counts.sum[i]); } - return brevity_penaly(hyp_len, ref_len) * exp(sum); + return brevity_penalty(hyp_len, ref_len) * exp(sum); } score_t BleuScorer::Score(vector& hyp, vector& ref, - const unsigned rank) + const 
unsigned /*rank*/) { unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; @@ -49,7 +49,7 @@ BleuScorer::Score(vector& hyp, vector& ref, */ score_t StupidBleuScorer::Score(vector& hyp, vector& ref, - const unsigned rank) + const unsigned /*rank*/) { unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; @@ -58,10 +58,11 @@ StupidBleuScorer::Score(vector& hyp, vector& ref, if (ref_len < N_) M = ref_len; score_t sum = 0, add = 0; for (unsigned i = 0; i < M; i++) { + if (i == 0 && (counts.clipped[i] == 0 || counts.sum[i] == 0)) return 0; if (i == 1) add = 1; sum += w_[i] * log(((score_t)counts.clipped[i] + add)/((counts.sum[i] + add))); } - return brevity_penaly(hyp_len, ref_len) * exp(sum); + return brevity_penalty(hyp_len, ref_len) * exp(sum); } /* @@ -75,19 +76,28 @@ StupidBleuScorer::Score(vector& hyp, vector& ref, */ score_t SmoothBleuScorer::Score(vector& hyp, vector& ref, - const unsigned rank) + const unsigned /*rank*/) { unsigned hyp_len = hyp.size(), ref_len = ref.size(); if (hyp_len == 0 || ref_len == 0) return 0; NgramCounts counts = make_ngram_counts(hyp, ref, N_); - score_t sum = 0; - unsigned j = 1; - for (unsigned i = 0; i < N_; i++) { - if (counts.clipped[i] == 0 || counts.sum[i] == 0) continue; - sum += exp((w_[i] * log((score_t)counts.clipped[i]/counts.sum[i])))/pow(2, N_-j+1); - j++; + unsigned M = N_; + if (ref_len < N_) M = ref_len; + score_t sum = 0.; + vector i_bleu; + for (unsigned i = 0; i < M; i++) i_bleu.push_back(0.); + for (unsigned i = 0; i < M; i++) { + if (counts.clipped[i] == 0 || counts.sum[i] == 0) { + break; + } else { + score_t i_ng = log((score_t)counts.clipped[i]/counts.sum[i]); + for (unsigned j = i; j < M; j++) { + i_bleu[j] += (1/((score_t)j+1)) * i_ng; + } + } + sum += exp(i_bleu[i])/(pow(2, N_-i)); } - return brevity_penaly(hyp_len, ref_len) * sum; + return brevity_penalty(hyp_len, ref_len) * sum; } /* diff --git 
a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index 68173e11..66be6bf2 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,20 +1,20 @@ input=test/example/nc-wmt11.1k.gz # use '-' for stdin -output=- # a weights file or stdout -decoder_config=test/example/cdec.ini # ini for cdec -# these will be printed on each iteration +output=weights.gz # a weights file (add .gz for gzip compression) or STDOUT '-' +decoder_config=test/example/cdec.ini # config for cdec +# weights for these features will be printed on each iteration print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PhraseModel_5 PhraseModel_6 PassThrough tmp=/tmp -stop_after=10 # stop iteration after 10 inputs +stop_after=100 # stop epoch after 100 inputs # interesting stuff -epochs=3 # run over input 3 times -k=200 # use 100best lists -N=4 # optimize (approx) BLEU4 +epochs=100 # run over input 100 times +k=100 # use 100best lists +N=4 # optimize (approx) BLEU4 learning_rate=0.0001 # learning rate -gamma=0.00001 # use SVM reg -scorer=stupid_bleu # use stupid BLEU+1 approx. +gamma=0 # use SVM reg +scorer=smooth_bleu # use smooth BLEU of (Liang et al. '06) sample_from=kbest # use kbest lists (as opposed to forest) -filter=uniq # only uniq entries in kbest +filter=uniq # only unique entries in kbest (surface form) pair_sampling=108010 # 10 vs 80 vs 10 and 80 vs 10 -pair_threshold=0 # minimum distance in BLEU +pair_threshold=0 # minimum distance in BLEU (this will still only use pairs with diff > 0) select_weights=last # just output last weights -- cgit v1.2.3