From dc9fd7a3adc863510d79a718e919b6833a86729c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 23 Sep 2011 20:53:15 +0200 Subject: begin refactoring --- .gitignore | 1 + dtrain/Makefile.am | 7 +- dtrain/README | 1 + dtrain/avgweights.rb | 27 ------ dtrain/common.h | 43 --------- dtrain/dtest.cc | 94 -------------------- dtrain/dtrain.cc | 44 ++++------ dtrain/dtrain.h | 60 +++++++++++++ dtrain/hstreaming/avgweights.rb | 27 ++++++ dtrain/hstreaming/cdec.ini | 8 ++ dtrain/hstreaming/dtrain.ini | 10 +++ dtrain/hstreaming/hadoop-streaming-job.sh | 23 +++++ dtrain/job/avgweights.rb | 30 ------- dtrain/job/cdec.ini | 8 -- dtrain/job/dtrain.ini | 10 --- dtrain/job/dtrain.sh | 6 -- dtrain/job/hadoop-streaming-job.sh | 23 ----- dtrain/job2/avgweights.rb | 30 ------- dtrain/job2/cdec.ini | 8 -- dtrain/job2/dtrain.ini | 10 --- dtrain/job2/dtrain.sh | 6 -- dtrain/job2/hadoop-streaming-job.sh | 23 ----- dtrain/pairsampling.h | 64 ++++++++++++++ dtrain/run.sh | 12 --- dtrain/sample.h | 64 -------------- dtrain/test-reducer | 7 -- dtrain/tests.cc | 141 ------------------------------ dtrain/tests.h | 26 ------ dtrain/updater.h | 107 ----------------------- dtrain/util.cc | 34 ------- dtrain/util.h | 28 ------ 31 files changed, 213 insertions(+), 769 deletions(-) delete mode 100755 dtrain/avgweights.rb delete mode 100644 dtrain/common.h delete mode 100644 dtrain/dtest.cc create mode 100644 dtrain/dtrain.h create mode 100755 dtrain/hstreaming/avgweights.rb create mode 100644 dtrain/hstreaming/cdec.ini create mode 100644 dtrain/hstreaming/dtrain.ini create mode 100755 dtrain/hstreaming/hadoop-streaming-job.sh delete mode 100755 dtrain/job/avgweights.rb delete mode 100644 dtrain/job/cdec.ini delete mode 100644 dtrain/job/dtrain.ini delete mode 100755 dtrain/job/dtrain.sh delete mode 100755 dtrain/job/hadoop-streaming-job.sh delete mode 100755 dtrain/job2/avgweights.rb delete mode 100644 dtrain/job2/cdec.ini delete mode 100644 dtrain/job2/dtrain.ini delete mode 100755 dtrain/job2/dtrain.sh delete mode 100755 dtrain/job2/hadoop-streaming-job.sh create mode 100644 dtrain/pairsampling.h delete mode 100755 dtrain/run.sh delete mode 100644 dtrain/sample.h delete mode 100644 dtrain/test-reducer delete mode 100644 dtrain/tests.cc delete mode 100644 dtrain/tests.h delete mode 100644 dtrain/updater.h delete mode 100644 dtrain/util.cc delete mode 100644 dtrain/util.h diff --git a/.gitignore b/.gitignore index 95262a09..2a5979cb 100644 --- a/.gitignore +++ b/.gitignore @@ -130,3 +130,4 @@ training/mpi_em_optimize training/test_ngram utils/ts training/compute_cllh +dtrain/dtrain diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index c08cd1ea..9b5df8bf 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,11 +1,8 @@ # TODO I'm sure I can leave something out. -bin_PROGRAMS = dtrain dtest +bin_PROGRAMS = dtrain -dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc +dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams -dtest_SOURCES = dtest.cc score.cc util.cc -dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/README b/dtrain/README index b3f513be..0cc52acc 100644 --- a/dtrain/README +++ b/dtrain/README @@ -31,6 +31,7 @@ TODO use separate TEST SET KNOWN BUGS PROBLEMS + if size of candidate < N => 0 score cdec kbest vs 1best (no -k param), rescoring? => ok(?) no sparse vector in decoder => ok ? ok diff --git a/dtrain/avgweights.rb b/dtrain/avgweights.rb deleted file mode 100755 index d5cfaa4d..00000000 --- a/dtrain/avgweights.rb +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env ruby1.9.1 - - -STDIN.set_encoding 'utf-8' - -#shard_count_key = "__SHARD_COUNT__" - -w = {} -c = {} -w.default = 0 -c.default = 0 -while line = STDIN.gets - key, val = line.split /\t/ - w[key] += val.to_f - c[key] += 1.0 -end - -#shard_count = w["__SHARD_COUNT__"] - -w.each_key { |k| - #if k == shard_count_key then next end - #if k == "__bias" then next end - puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}" -} - -#puts "#{shard_count_key}\t#{w[shard_count_key]}" - diff --git a/dtrain/common.h b/dtrain/common.h deleted file mode 100644 index 49dc85b7..00000000 --- a/dtrain/common.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef _DTRAIN_COMMON_H_ -#define _DTRAIN_COMMON_H_ - - -#include -#include -#include -#include -#include -#include - -// cdec includes -#include "sentence_metadata.h" -#include "verbose.h" -#include "viterbi.h" -#include "kbest.h" -#include "ff_register.h" -#include "decoder.h" -#include "weights.h" - -// boost includes -#include -#include - -// own headers -#include "score.h" - -#define DTRAIN_DEFAULT_K 100 // k for kbest lists -#define DTRAIN_DEFAULT_N 4 // N for ngrams (e.g. BLEU) -#define DTRAIN_DEFAULT_T 1 // iterations -#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer -#define DTRAIN_DOTS 100 // when to display a '.' -#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD? -#define DTRAIN_GRAMMAR_DELIM "########EOS########" - - -using namespace std; -using namespace dtrain; -namespace po = boost::program_options; - - -#endif - diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc deleted file mode 100644 index 36c880a3..00000000 --- a/dtrain/dtest.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "common.h" -#include "kbestget.h" -#include "util.h" - - -/* - * init - * - */ -bool -init(int argc, char** argv, po::variables_map* conf) -{ - int N; - po::options_description opts( "Command Line Options" ); - opts.add_options() - ( "decoder-config,c", po::value(), "configuration file for cdec" ) - ( "weights,w", po::value(), "weights file" ) - ( "ngrams,n", po::value(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" ); - po::options_description cmdline_options; - cmdline_options.add(opts); - po::store( parse_command_line(argc, argv, cmdline_options), *conf ); - po::notify( *conf ); - if ( ! (conf->count("decoder-config") || conf->count("weights")) ) { - cerr << cmdline_options << endl; - return false; - } - return true; -} - - -/* - * main - * - */ -int -main(int argc, char** argv) -{ - SetSilent( true ); - po::variables_map conf; - if ( !init(argc, argv, &conf) ) return 1; - register_feature_functions(); - size_t k = 1; - ReadFile ini_rf( conf["decoder-config"].as() ); - Decoder decoder( ini_rf.stream() ); - KBestGetter observer( k, "no" ); - size_t N = conf["ngrams"].as(); - - Weights weights; - if ( conf.count("weights") ) weights.InitFromFile( conf["weights"].as() ); - vector w; - weights.InitVector( &w ); - decoder.SetWeights( w ); - - vector in_split, ref_strs; - vector ref_ids; - string in, psg; - size_t sn = 0; - double overall = 0.0; - double overall1 = 0.0; - double overall2 = 0.0; - while( getline(cin, in) ) { - in_split.clear(); - boost::split( in_split, in, boost::is_any_of("\t") ); - // grammar - psg = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ); psg += "\n"; - decoder.SetSentenceGrammarFromString( psg ); - decoder.Decode( in_split[1], &observer ); - KBestList* kb = observer.GetKBest(); - // reference - ref_strs.clear(); ref_ids.clear(); - boost::split( ref_strs, in_split[2], boost::is_any_of(" ") ); - register_and_convert( ref_strs, ref_ids ); - // scoring kbest - double score = 0.0; - double score1 = 0.0; - double score2 = 0.0; - NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], N ); - score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - cout << TD::GetString( kb->sents[0] ) << endl; - overall += score; - overall1 += score1; - overall2 += score2; - sn += 1; - } - cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl; - cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl; - cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl; - cerr << endl; - - return 0; -} - diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index f005008e..01821b30 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,24 +1,19 @@ -#include "common.h" -#include "kbestget.h" -#include "util.h" -#include "sample.h" +#include "dtrain.h" -#include "ksampler.h" -// boost compression -#include -#include -#include -//#include -//#include -using namespace boost::iostreams; -#include -#include - -#ifdef DTRAIN_DEBUG -#include "tests.h" -#endif +/* + * register_and_convert + * + */ +void +register_and_convert(const vector& strs, vector& ids) +{ + vector::const_iterator it; + for ( it = strs.begin(); it < strs.end(); it++ ) { + ids.push_back( TD::Convert( *it ) ); + } +} /* @@ -49,12 +44,7 @@ init(int argc, char** argv, po::variables_map* cfg) clo.add_options() ( "config,c", po::value(), "dtrain config file" ) ( "quiet,q", po::value()->zero_tokens(), "be quiet" ) - ( "verbose,v", po::value()->zero_tokens(), "be verbose" ) -#ifndef DTRAIN_DEBUG - ; -#else - ( "test", "run tests and exit"); -#endif + ( "verbose,v", po::value()->zero_tokens(), "be verbose" ); po::options_description config_options, cmdline_options; config_options.add(conff); @@ -149,9 +139,9 @@ main( int argc, char** argv ) if ( !quiet ) cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as() << "'" << endl; Decoder decoder( ini_rf.stream() ); - //KBestGetter observer( k, filter_type ); + KBestGetter observer( k, filter_type ); MT19937 rng; - KSampler observer( k, &rng ); + //KSampler observer( k, &rng ); // scoring metric/scorer string scorer_str = cfg["scorer"].as(); @@ -433,7 +423,7 @@ main( int argc, char** argv ) } ++sid; - cerr << "reporter:counter:dtrain,sent," << sid << endl; + //cerr << "reporter:counter:dtrain,sent," << sid << endl; } // input loop diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h new file mode 100644 index 00000000..3d319233 --- /dev/null +++ b/dtrain/dtrain.h @@ -0,0 +1,60 @@ +#ifndef _DTRAIN_COMMON_H_ +#define _DTRAIN_COMMON_H_ + + +#include +#include +#include +#include +#include +#include + +// cdec includes +#include "sentence_metadata.h" +#include "verbose.h" +#include "viterbi.h" +#include "kbest.h" +#include "ff_register.h" +#include "decoder.h" +#include "weights.h" + +// boost includes +#include +#include + +// own headers +#include "score.h" + +#define DTRAIN_DEFAULT_K 100 // k for kbest lists +#define DTRAIN_DEFAULT_N 4 // N for ngrams (e.g. BLEU) +#define DTRAIN_DEFAULT_T 1 // iterations +#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer +#define DTRAIN_DOTS 100 // when to display a '.' +#define DTRAIN_TMP_DIR "/tmp" // put this on a SSD? +#define DTRAIN_GRAMMAR_DELIM "########EOS########" + + +#include "kbestget.h" +#include "pairsampling.h" + +#include "ksampler.h" + +// boost compression +#include +#include +#include +//#include +//#include +using namespace boost::iostreams; + +#include +#include + + +using namespace std; +using namespace dtrain; +namespace po = boost::program_options; + + +#endif + diff --git a/dtrain/hstreaming/avgweights.rb b/dtrain/hstreaming/avgweights.rb new file mode 100755 index 00000000..d5cfaa4d --- /dev/null +++ b/dtrain/hstreaming/avgweights.rb @@ -0,0 +1,27 @@ +#!/usr/bin/env ruby1.9.1 + + +STDIN.set_encoding 'utf-8' + +#shard_count_key = "__SHARD_COUNT__" + +w = {} +c = {} +w.default = 0 +c.default = 0 +while line = STDIN.gets + key, val = line.split /\t/ + w[key] += val.to_f + c[key] += 1.0 +end + +#shard_count = w["__SHARD_COUNT__"] + +w.each_key { |k| + #if k == shard_count_key then next end + #if k == "__bias" then next end + puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}" +} + +#puts "#{shard_count_key}\t#{w[shard_count_key]}" + diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini new file mode 100644 index 00000000..0d32f0b7 --- /dev/null +++ b/dtrain/hstreaming/cdec.ini @@ -0,0 +1,8 @@ +formalism=scfg +add_pass_through_rules=true +feature_function=WordPenalty +cubepruning_pop_limit=30 +feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz +feature_function=RuleIdentityFeatures +scfg_max_span_limit=15 + diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini new file mode 100644 index 00000000..079d7d69 --- /dev/null +++ b/dtrain/hstreaming/dtrain.ini @@ -0,0 +1,10 @@ +decoder_config=cdec.ini +kbest=100 +ngrams=4 +epochs=10 +input=- +scorer=stupid_bleu +output=- +#stop_after=100 +#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough + diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh new file mode 100755 index 00000000..2cf3f50a --- /dev/null +++ b/dtrain/hstreaming/hadoop-streaming-job.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +HADOOP_HOME=/usr/lib/hadoop-0.20 +JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar +HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" + +IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m +OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m + +$HSTREAMING \ + -mapper "dtrain.sh" \ + -reducer "avgweights.rb" \ + -input $IN \ + -output $OUT \ + -file avgweights.rb \ + -file dtrain.sh \ + -file dtrain \ + -file dtrain.ini \ + -file cdec.ini \ + -file nc-wmt11.en.srilm.3.gz \ + -jobconf mapred.reduce.tasks=1 \ + -jobconf mapred.max.map.failures.percent=100 + diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb deleted file mode 100755 index e635aab4..00000000 --- a/dtrain/job/avgweights.rb +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env ruby1.9.1 - - -STDIN.set_encoding 'utf-8' - -#shard_count_key = "__SHARD_COUNT__" - -w = {} -#c = {} -w.default = 0 -#c.default = 0 -while line = STDIN.gets - key, val = line.split /\t/ - w[key] += val.to_f - #c[key] += 1.0 -end - -#shard_count = w["__SHARD_COUNT__"] - -num_map = 104.0 - -w.each_key { |k| - #if k == shard_count_key then next end - #if k == "__bias" then next end - puts "#{k}\t#{w[k]/num_map}" - #/c[k]}" #{w[k]/shard_count}" -} - -#puts "#{shard_count_key}\t#{w[shard_count_key]}" - diff --git a/dtrain/job/cdec.ini b/dtrain/job/cdec.ini deleted file mode 100644 index 0d32f0b7..00000000 --- a/dtrain/job/cdec.ini +++ /dev/null @@ -1,8 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -feature_function=WordPenalty -cubepruning_pop_limit=30 -feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz -feature_function=RuleIdentityFeatures -scfg_max_span_limit=15 - diff --git a/dtrain/job/dtrain.ini b/dtrain/job/dtrain.ini deleted file mode 100644 index 079d7d69..00000000 --- a/dtrain/job/dtrain.ini +++ /dev/null @@ -1,10 +0,0 @@ -decoder_config=cdec.ini -kbest=100 -ngrams=4 -epochs=10 -input=- -scorer=stupid_bleu -output=- -#stop_after=100 -#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough - diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh deleted file mode 100755 index 75ec29ea..00000000 --- a/dtrain/job/dtrain.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -./dtrain -q -c dtrain.ini - -exit 0 - diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/job/hadoop-streaming-job.sh deleted file mode 100755 index 2cf3f50a..00000000 --- a/dtrain/job/hadoop-streaming-job.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -HADOOP_HOME=/usr/lib/hadoop-0.20 -JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar -HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" - -IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m -OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m - -$HSTREAMING \ - -mapper "dtrain.sh" \ - -reducer "avgweights.rb" \ - -input $IN \ - -output $OUT \ - -file avgweights.rb \ - -file dtrain.sh \ - -file dtrain \ - -file dtrain.ini \ - -file cdec.ini \ - -file nc-wmt11.en.srilm.3.gz \ - -jobconf mapred.reduce.tasks=1 \ - -jobconf mapred.max.map.failures.percent=100 - diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb deleted file mode 100755 index 31048f16..00000000 --- a/dtrain/job2/avgweights.rb +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env ruby1.9.1 - - -STDIN.set_encoding 'utf-8' - -#shard_count_key = "__SHARD_COUNT__" - -w = {} -#c = {} -w.default = 0 -#c.default = 0 -while line = STDIN.gets - key, val = line.split /\t/ - w[key] += val.to_f - #c[key] += 1.0 -end - -#shard_count = w["__SHARD_COUNT__"] - -num_map = 2107.0 - -w.each_key { |k| - #if k == shard_count_key then next end - #if k == "__bias" then next end - puts "#{k}\t#{w[k]/num_map}" - #/c[k]}" #{w[k]/shard_count}" -} - -#puts "#{shard_count_key}\t#{w[shard_count_key]}" - diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini deleted file mode 100644 index 0d32f0b7..00000000 --- a/dtrain/job2/cdec.ini +++ /dev/null @@ -1,8 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -feature_function=WordPenalty -cubepruning_pop_limit=30 -feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz -feature_function=RuleIdentityFeatures -scfg_max_span_limit=15 - diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini deleted file mode 100644 index ec005e46..00000000 --- a/dtrain/job2/dtrain.ini +++ /dev/null @@ -1,10 +0,0 @@ -decoder_config=cdec.ini -kbest=100 -ngrams=3 -epochs=10 -input=- -scorer=stupid_bleu -output=- -#stop_after=100 -#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough - diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh deleted file mode 100755 index 75ec29ea..00000000 --- a/dtrain/job2/dtrain.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -./dtrain -q -c dtrain.ini - -exit 0 - diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh deleted file mode 100755 index 9ee70a33..00000000 --- a/dtrain/job2/hadoop-streaming-job.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -HADOOP_HOME=/usr/lib/hadoop-0.20 -JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar -HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" - -IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain -OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50 - -$HSTREAMING \ - -mapper "dtrain.sh" \ - -reducer "avgweights.rb" \ - -input $IN \ - -output $OUT \ - -file avgweights.rb \ - -file dtrain.sh \ - -file dtrain \ - -file dtrain.ini \ - -file cdec.ini \ - -file nc-wmt11.en.srilm.3.gz \ - -jobconf mapred.reduce.tasks=1 \ - -jobconf mapred.max.map.failures.percent=100 - diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h new file mode 100644 index 00000000..502901af --- /dev/null +++ b/dtrain/pairsampling.h @@ -0,0 +1,64 @@ +#ifndef _DTRAIN_SAMPLE_H_ +#define _DTRAIN_SAMPLE_H_ + + +#include "kbestget.h" + + +namespace dtrain +{ + + +struct TPair +{ + SparseVector first, second; + size_t first_rank, second_rank; + double first_score, second_score; +}; + +typedef vector TrainingInstances; + + +void +sample_all( KBestList* kb, TrainingInstances &training ) +{ + for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { + for ( size_t j = i+1; j < kb->GetSize(); j++ ) { + TPair p; + p.first = kb->feats[i]; + p.second = kb->feats[j]; + p.first_rank = i; + p.second_rank = j; + p.first_score = kb->scores[i]; + p.second_score = kb->scores[j]; + training.push_back( p ); + } + } +} + +void +sample_rand( KBestList* kb, TrainingInstances &training ) +{ + srand( time(NULL) ); + for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { + for ( size_t j = i+1; j < kb->GetSize(); j++ ) { + if ( rand() % 2 ) { + TPair p; + p.first = kb->feats[i]; + p.second = kb->feats[j]; + p.first_rank = i; + p.second_rank = j; + p.first_score = kb->scores[i]; + p.second_score = kb->scores[j]; + training.push_back( p ); + } + } + } +} + + +} // namespace + + +#endif + diff --git a/dtrain/run.sh b/dtrain/run.sh deleted file mode 100755 index 72e56f3e..00000000 --- a/dtrain/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh - -#INI=test/blunsom08.dtrain.ini -#INI=test/nc-wmt11/dtrain.ini -#INI=test/EXAMPLE/dtrain.ini -#INI=test/EXAMPLE/dtrain.ruleids.ini -INI=test/toy.dtrain.ini -#INI=test/EXAMPLE/dtrain.cdecrid.ini - -#rm /tmp/dtrain-* -./dtrain -c $INI $1 $2 $3 $4 - diff --git a/dtrain/sample.h b/dtrain/sample.h deleted file mode 100644 index 502901af..00000000 --- a/dtrain/sample.h +++ /dev/null @@ -1,64 +0,0 @@ -#ifndef _DTRAIN_SAMPLE_H_ -#define _DTRAIN_SAMPLE_H_ - - -#include "kbestget.h" - - -namespace dtrain -{ - - -struct TPair -{ - SparseVector first, second; - size_t first_rank, second_rank; - double first_score, second_score; -}; - -typedef vector TrainingInstances; - - -void -sample_all( KBestList* kb, TrainingInstances &training ) -{ - for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { - for ( size_t j = i+1; j < kb->GetSize(); j++ ) { - TPair p; - p.first = kb->feats[i]; - p.second = kb->feats[j]; - p.first_rank = i; - p.second_rank = j; - p.first_score = kb->scores[i]; - p.second_score = kb->scores[j]; - training.push_back( p ); - } - } -} - -void -sample_rand( KBestList* kb, TrainingInstances &training ) -{ - srand( time(NULL) ); - for ( size_t i = 0; i < kb->GetSize()-1; i++ ) { - for ( size_t j = i+1; j < kb->GetSize(); j++ ) { - if ( rand() % 2 ) { - TPair p; - p.first = kb->feats[i]; - p.second = kb->feats[j]; - p.first_rank = i; - p.second_rank = j; - p.first_score = kb->scores[i]; - p.second_score = kb->scores[j]; - training.push_back( p ); - } - } - } -} - - -} // namespace - - -#endif - diff --git a/dtrain/test-reducer b/dtrain/test-reducer deleted file mode 100644 index b86e7894..00000000 --- a/dtrain/test-reducer +++ /dev/null @@ -1,7 +0,0 @@ -a 1 -b 2 -c 3.5 -a 1 -b 2 -c 3.5 -__SHARD_COUNT__ 2 diff --git a/dtrain/tests.cc b/dtrain/tests.cc deleted file mode 100644 index 997eafbb..00000000 --- a/dtrain/tests.cc +++ /dev/null @@ -1,141 +0,0 @@ -#include "tests.h" - - -namespace dtrain -{ - - -/* - * approx_equal - * - */ -double -approx_equal( double x, double y ) -{ - const double EPSILON = 1E-5; - if ( x == 0 ) return fabs( y ) <= EPSILON; - if ( y == 0 ) return fabs( x ) <= EPSILON; - return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; -} - - -/* - * test_ngrams - * - */ -void -test_ngrams() -{ - cout << "Testing ngrams..." << endl << endl; - size_t N = 5; - cout << "N = " << N << endl; - vector a; // hyp - vector b; // ref - cout << "a "; - for (size_t i = 1; i <= 8; i++) { - cout << i << " "; - a.push_back(i); - } - cout << endl << "b "; - for (size_t i = 1; i <= 4; i++) { - cout << i << " "; - b.push_back(i); - } - cout << endl << endl; - NgramCounts c = make_ngram_counts( a, b, N ); - assert( c.clipped[N-1] == 0 ); - assert( c.sum[N-1] == 4 ); - c.print(); - c += c; - cout << endl; - c.print(); - cout << endl; -} - - -/* - * test_metrics - * - */ -void -test_metrics() -{ - cout << "Testing metrics..." << endl << endl; - using namespace boost::assign; - vector a, b; - vector expect_vanilla, expect_smooth, expect_stupid; - a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp - b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref - expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0; - expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587; - expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707; - vector aa, bb; - vector aai, bbi; - double vanilla, smooth, stupid; - size_t N = 4; - cout << "N = " << N << endl << endl; - for ( size_t i = 0; i < a.size(); i++ ) { - cout << " hyp: " << a[i] << endl; - cout << " ref: " << b[i] << endl; - aa.clear(); bb.clear(); aai.clear(); bbi.clear(); - boost::split( aa, a[i], boost::is_any_of(" ") ); - boost::split( bb, b[i], boost::is_any_of(" ") ); - register_and_convert( aa, aai ); - register_and_convert( bb, bbi ); - NgramCounts counts = make_ngram_counts( aai, bbi, N ); - vanilla = bleu( counts, aa.size(), bb.size(), N); - smooth = smooth_bleu( counts, aa.size(), bb.size(), N); - stupid = stupid_bleu( counts, aa.size(), bb.size(), N); - assert( approx_equal(vanilla, expect_vanilla[i]) ); - assert( approx_equal(smooth, expect_smooth[i]) ); - assert( approx_equal(stupid, expect_stupid[i]) ); - cout << setw(14) << "bleu = " << vanilla << endl; - cout << setw(14) << "smooth bleu = " << smooth << endl; - cout << setw(14) << "stupid bleu = " << stupid << endl << endl; - } - cout << endl; -} - - -/* - * test_SetWeights - * - */ -void -test_SetWeights() -{ - cout << "Testing Weights::SetWeight..." << endl << endl; - Weights weights; - SparseVector lambdas; - weights.InitSparseVector( &lambdas ); - weights.SetWeight( &lambdas, "test", 0 ); - weights.SetWeight( &lambdas, "test1", 1 ); - WordID fid = FD::Convert( "test2" ); - weights.SetWeight( &lambdas, fid, 2 ); - string fn = "weights-test"; - cout << "FD::NumFeats() " << FD::NumFeats() << endl; - assert( FD::NumFeats() == 4 ); - weights.WriteToFile( fn, true ); - cout << endl; -} - - -/* - * run_tests - * - */ -void -run_tests() -{ - cout << endl; - test_ngrams(); - cout << endl; - test_metrics(); - cout << endl; - test_SetWeights(); - exit(0); -} - - -} // namespace - diff --git a/dtrain/tests.h b/dtrain/tests.h deleted file mode 100644 index 9853e3c3..00000000 --- a/dtrain/tests.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _DTRAIN_TESTS_H_ -#define _DTRAIN_TESTS_H_ - -#include -#include - -#include "common.h" -#include "util.h" - - -namespace dtrain -{ - - -double approx_equal( double x, double y ); -void test_ngrams(); -void test_metrics(); -void test_SetWeights(); -void run_tests(); - - -} // namespace - - -#endif - diff --git a/dtrain/updater.h b/dtrain/updater.h deleted file mode 100644 index b54c25de..00000000 --- a/dtrain/updater.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef _DTRAIN_LEARNER_H_ -#define _DTRAIN_LEARNER_H_ - -#include -#include -#include - -#include "sparse_vector.h" -#include "score.h" - - -namespace dtrain -{ - - -class Updater -{ - public: - virtual void Init( const vector >& kbest, const Scores& scores, - const bool invert_score = false ) {}; - virtual void Update( SparseVector& lambdas ) {}; -}; - - -class SofiaUpdater : public Updater -{ - public: - void - Init( const size_t sid, const vector >& kbest, /*const FIXME operator[]*/ Scores& scores, - const bool invert_score = false ) - { - assert( kbest.size() == scores.size() ); - ofstream o; - char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-data-XXXXXX"; - mkstemp( tmp ); - tmp_data_fn = tmp; - o.open( tmp_data_fn.c_str(), ios::trunc ); - int fid = 0; - map::iterator ff; - double score; - for ( size_t k = 0; k < kbest.size(); ++k ) { - map m; - SparseVector::const_iterator it = kbest[k].begin(); - score = scores[k].GetScore(); - if ( invert_score ) score = -score; - o << score; - for ( ; it != kbest[k].end(); ++it ) { - ff = fmap.find( it->first ); - if ( ff == fmap.end() ) { - fmap.insert( pair(it->first, fid) ); - fmap1.insert( pair(fid, it->first) ); - fid++; - } - m.insert( pair(fmap[it->first], it->second) ); - } - map::iterator ti = m.begin(); - for ( ; ti != m.end(); ++ti ) { - o << " " << ti->first << ":" << ti->second; - } - o << endl; - } - o.close(); - } - - void - Update(SparseVector& lambdas) - { - char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-model-XXXXXX"; - mkstemp(tmp); - tmp_model_fn = tmp; - //--random_seed 123456789010 - string call = "./sofia-ml --training_file " + tmp_data_fn; - call += " --model_out " + tmp_model_fn; - call += " --loop_type rank --lambda 100 --eta_type constant --dimensionality "; - std::stringstream out; - out << fmap.size(); - call += out.str(); - call += " &>/dev/null"; - system ( call.c_str() ); - ifstream i; - i.open( tmp_model_fn.c_str(), ios::in ); - string model; - getline( i, model ); - vector strs; - boost::split( strs, model, boost::is_any_of(" ") ); - int j = 0; - for ( vector::iterator it = strs.begin(); it != strs.end(); ++it ) { - lambdas.set_value(fmap1[j], atof( it->c_str() ) ); - j++; - } - i.close(); - unlink( tmp_data_fn.c_str() ); - unlink( tmp_model_fn.c_str() ); - } - - private: - string tmp_data_fn; - string tmp_model_fn; - map fmap; - map fmap1; -}; - - -} // namespace - -#endif - diff --git a/dtrain/util.cc b/dtrain/util.cc deleted file mode 100644 index 7b3bbe3d..00000000 --- a/dtrain/util.cc +++ /dev/null @@ -1,34 +0,0 @@ -#include "util.h" - - -namespace dtrain -{ - - -/* - * register_and_convert - * - */ -void -register_and_convert(const vector& strs, vector& ids) -{ - vector::const_iterator it; - for ( it = strs.begin(); it < strs.end(); it++ ) { - ids.push_back( TD::Convert( *it ) ); - } -} - - -/* - * print_FD - * - */ -void -print_FD() -{ - for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; -} - - -} // namespace - diff --git a/dtrain/util.h b/dtrain/util.h deleted file mode 100644 index 6a548519..00000000 --- a/dtrain/util.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _DTRAIN_UTIL_H_ -#define _DTRAIN_UTIL_H_ - - -#include -#include -#include - -#include "fdict.h" -#include "tdict.h" -#include "wordid.h" - -using namespace std; - - -namespace dtrain -{ - - -void register_and_convert(const vector& strs, vector& ids); -void print_FD(); - - -} // namespace - - -#endif - -- cgit v1.2.3