diff options
author | Patrick Simianer <p@simianer.de> | 2011-09-23 20:53:15 +0200 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2011-09-23 20:53:15 +0200 |
commit | 9bde56ed23b4b97f8193f9f8f582f18086ff17c1 (patch) | |
tree | 83bd5687f2069405537f7f8fbdfbe208a634ca54 /dtrain | |
parent | 4433886ac335e6db7ded081b5ef673490ee27718 (diff) |
begin refactoring
Diffstat (limited to 'dtrain')
-rw-r--r-- | dtrain/Makefile.am | 7 | ||||
-rw-r--r-- | dtrain/README | 1 | ||||
-rw-r--r-- | dtrain/dtest.cc | 94 | ||||
-rw-r--r-- | dtrain/dtrain.cc | 44 | ||||
-rw-r--r-- | dtrain/dtrain.h (renamed from dtrain/common.h) | 19 | ||||
-rwxr-xr-x | dtrain/hstreaming/avgweights.rb (renamed from dtrain/avgweights.rb) | 0 | ||||
-rw-r--r-- | dtrain/hstreaming/cdec.ini (renamed from dtrain/job/cdec.ini) | 0 | ||||
-rw-r--r-- | dtrain/hstreaming/dtrain.ini (renamed from dtrain/job/dtrain.ini) | 0 | ||||
-rwxr-xr-x | dtrain/hstreaming/hadoop-streaming-job.sh (renamed from dtrain/job/hadoop-streaming-job.sh) | 0 | ||||
-rwxr-xr-x | dtrain/job/avgweights.rb | 30 | ||||
-rwxr-xr-x | dtrain/job/dtrain.sh | 6 | ||||
-rwxr-xr-x | dtrain/job2/avgweights.rb | 30 | ||||
-rw-r--r-- | dtrain/job2/cdec.ini | 8 | ||||
-rw-r--r-- | dtrain/job2/dtrain.ini | 10 | ||||
-rwxr-xr-x | dtrain/job2/dtrain.sh | 6 | ||||
-rwxr-xr-x | dtrain/job2/hadoop-streaming-job.sh | 23 | ||||
-rw-r--r-- | dtrain/pairsampling.h (renamed from dtrain/sample.h) | 0 | ||||
-rwxr-xr-x | dtrain/run.sh | 12 | ||||
-rw-r--r-- | dtrain/test-reducer | 7 | ||||
-rw-r--r-- | dtrain/tests.cc | 141 | ||||
-rw-r--r-- | dtrain/tests.h | 26 | ||||
-rw-r--r-- | dtrain/updater.h | 107 | ||||
-rw-r--r-- | dtrain/util.cc | 34 | ||||
-rw-r--r-- | dtrain/util.h | 28 |
24 files changed, 38 insertions, 595 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am index c08cd1ea..9b5df8bf 100644 --- a/dtrain/Makefile.am +++ b/dtrain/Makefile.am @@ -1,11 +1,8 @@ # TODO I'm sure I can leave something out. -bin_PROGRAMS = dtrain dtest +bin_PROGRAMS = dtrain -dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc +dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams -dtest_SOURCES = dtest.cc score.cc util.cc -dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz - AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/dtrain/README b/dtrain/README index b3f513be..0cc52acc 100644 --- a/dtrain/README +++ b/dtrain/README @@ -31,6 +31,7 @@ TODO use separate TEST SET KNOWN BUGS PROBLEMS + if size of candidate < N => 0 score cdec kbest vs 1best (no -k param), rescoring? => ok(?) no sparse vector in decoder => ok ? ok diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc deleted file mode 100644 index 36c880a3..00000000 --- a/dtrain/dtest.cc +++ /dev/null @@ -1,94 +0,0 @@ -#include "common.h" -#include "kbestget.h" -#include "util.h" - - -/* - * init - * - */ -bool -init(int argc, char** argv, po::variables_map* conf) -{ - int N; - po::options_description opts( "Command Line Options" ); - opts.add_options() - ( "decoder-config,c", po::value<string>(), "configuration file for cdec" ) - ( "weights,w", po::value<string>(), "weights file" ) - ( "ngrams,n", po::value<int>(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" ); - po::options_description cmdline_options; - cmdline_options.add(opts); - po::store( parse_command_line(argc, argv, cmdline_options), *conf ); - po::notify( *conf ); - if ( ! (conf->count("decoder-config") || conf->count("weights")) ) { - cerr << cmdline_options << endl; - return false; - } - return true; -} - - -/* - * main - * - */ -int -main(int argc, char** argv) -{ - SetSilent( true ); - po::variables_map conf; - if ( !init(argc, argv, &conf) ) return 1; - register_feature_functions(); - size_t k = 1; - ReadFile ini_rf( conf["decoder-config"].as<string>() ); - Decoder decoder( ini_rf.stream() ); - KBestGetter observer( k, "no" ); - size_t N = conf["ngrams"].as<int>(); - - Weights weights; - if ( conf.count("weights") ) weights.InitFromFile( conf["weights"].as<string>() ); - vector<double> w; - weights.InitVector( &w ); - decoder.SetWeights( w ); - - vector<string> in_split, ref_strs; - vector<WordID> ref_ids; - string in, psg; - size_t sn = 0; - double overall = 0.0; - double overall1 = 0.0; - double overall2 = 0.0; - while( getline(cin, in) ) { - in_split.clear(); - boost::split( in_split, in, boost::is_any_of("\t") ); - // grammar - psg = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ); psg += "\n"; - decoder.SetSentenceGrammarFromString( psg ); - decoder.Decode( in_split[1], &observer ); - KBestList* kb = observer.GetKBest(); - // reference - ref_strs.clear(); ref_ids.clear(); - boost::split( ref_strs, in_split[2], boost::is_any_of(" ") ); - register_and_convert( ref_strs, ref_ids ); - // scoring kbest - double score = 0.0; - double score1 = 0.0; - double score2 = 0.0; - NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], N ); - score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N ); - cout << TD::GetString( kb->sents[0] ) << endl; - overall += score; - overall1 += score1; - overall2 += score2; - sn += 1; - } - cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl; - cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl; - cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl; - cerr << endl; - - return 0; -} - diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index f005008e..01821b30 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -1,24 +1,19 @@ -#include "common.h" -#include "kbestget.h" -#include "util.h" -#include "sample.h" +#include "dtrain.h" -#include "ksampler.h" -// boost compression -#include <boost/iostreams/device/file.hpp> -#include <boost/iostreams/filtering_stream.hpp> -#include <boost/iostreams/filter/gzip.hpp> -//#include <boost/iostreams/filter/zlib.hpp> -//#include <boost/iostreams/filter/bzip2.hpp> -using namespace boost::iostreams; -#include <boost/algorithm/string/predicate.hpp> -#include <boost/lexical_cast.hpp> - -#ifdef DTRAIN_DEBUG -#include "tests.h" -#endif +/* + * register_and_convert + * + */ +void +register_and_convert(const vector<string>& strs, vector<WordID>& ids) +{ + vector<string>::const_iterator it; + for ( it = strs.begin(); it < strs.end(); it++ ) { + ids.push_back( TD::Convert( *it ) ); + } +} /* @@ -49,12 +44,7 @@ init(int argc, char** argv, po::variables_map* cfg) clo.add_options() ( "config,c", po::value<string>(), "dtrain config file" ) ( "quiet,q", po::value<bool>()->zero_tokens(), "be quiet" ) - ( "verbose,v", po::value<bool>()->zero_tokens(), "be verbose" ) -#ifndef DTRAIN_DEBUG - ; -#else - ( "test", "run tests and exit"); -#endif + ( "verbose,v", po::value<bool>()->zero_tokens(), "be verbose" ); po::options_description config_options, cmdline_options; config_options.add(conff); @@ -149,9 +139,9 @@ main( int argc, char** argv ) if ( !quiet ) cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; Decoder decoder( ini_rf.stream() ); - //KBestGetter observer( k, filter_type ); + KBestGetter observer( k, filter_type ); MT19937 rng; - KSampler observer( k, &rng ); + //KSampler observer( k, &rng ); // scoring metric/scorer string scorer_str = cfg["scorer"].as<string>(); @@ -433,7 +423,7 @@ main( int argc, char** argv ) } ++sid; - cerr << "reporter:counter:dtrain,sent," << sid << endl; + //cerr << "reporter:counter:dtrain,sent," << sid << endl; } // input loop diff --git a/dtrain/common.h b/dtrain/dtrain.h index 49dc85b7..3d319233 100644 --- a/dtrain/common.h +++ b/dtrain/dtrain.h @@ -30,10 +30,27 @@ #define DTRAIN_DEFAULT_T 1 // iterations #define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer #define DTRAIN_DOTS 100 // when to display a '.' -#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD? +#define DTRAIN_TMP_DIR "/tmp" // put this on a SSD? #define DTRAIN_GRAMMAR_DELIM "########EOS########" +#include "kbestget.h" +#include "pairsampling.h" + +#include "ksampler.h" + +// boost compression +#include <boost/iostreams/device/file.hpp> +#include <boost/iostreams/filtering_stream.hpp> +#include <boost/iostreams/filter/gzip.hpp> +//#include <boost/iostreams/filter/zlib.hpp> +//#include <boost/iostreams/filter/bzip2.hpp> +using namespace boost::iostreams; + +#include <boost/algorithm/string/predicate.hpp> +#include <boost/lexical_cast.hpp> + + using namespace std; using namespace dtrain; namespace po = boost::program_options; diff --git a/dtrain/avgweights.rb b/dtrain/hstreaming/avgweights.rb index d5cfaa4d..d5cfaa4d 100755 --- a/dtrain/avgweights.rb +++ b/dtrain/hstreaming/avgweights.rb diff --git a/dtrain/job/cdec.ini b/dtrain/hstreaming/cdec.ini index 0d32f0b7..0d32f0b7 100644 --- a/dtrain/job/cdec.ini +++ b/dtrain/hstreaming/cdec.ini diff --git a/dtrain/job/dtrain.ini b/dtrain/hstreaming/dtrain.ini index 079d7d69..079d7d69 100644 --- a/dtrain/job/dtrain.ini +++ b/dtrain/hstreaming/dtrain.ini diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh index 2cf3f50a..2cf3f50a 100755 --- a/dtrain/job/hadoop-streaming-job.sh +++ b/dtrain/hstreaming/hadoop-streaming-job.sh diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb deleted file mode 100755 index e635aab4..00000000 --- a/dtrain/job/avgweights.rb +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env ruby1.9.1 - - -STDIN.set_encoding 'utf-8' - -#shard_count_key = "__SHARD_COUNT__" - -w = {} -#c = {} -w.default = 0 -#c.default = 0 -while line = STDIN.gets - key, val = line.split /\t/ - w[key] += val.to_f - #c[key] += 1.0 -end - -#shard_count = w["__SHARD_COUNT__"] - -num_map = 104.0 - -w.each_key { |k| - #if k == shard_count_key then next end - #if k == "__bias" then next end - puts "#{k}\t#{w[k]/num_map}" - #/c[k]}" #{w[k]/shard_count}" -} - -#puts "#{shard_count_key}\t#{w[shard_count_key]}" - diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh deleted file mode 100755 index 75ec29ea..00000000 --- a/dtrain/job/dtrain.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -./dtrain -q -c dtrain.ini - -exit 0 - diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb deleted file mode 100755 index 31048f16..00000000 --- a/dtrain/job2/avgweights.rb +++ /dev/null @@ -1,30 +0,0 @@ -#!/usr/bin/env ruby1.9.1 - - -STDIN.set_encoding 'utf-8' - -#shard_count_key = "__SHARD_COUNT__" - -w = {} -#c = {} -w.default = 0 -#c.default = 0 -while line = STDIN.gets - key, val = line.split /\t/ - w[key] += val.to_f - #c[key] += 1.0 -end - -#shard_count = w["__SHARD_COUNT__"] - -num_map = 2107.0 - -w.each_key { |k| - #if k == shard_count_key then next end - #if k == "__bias" then next end - puts "#{k}\t#{w[k]/num_map}" - #/c[k]}" #{w[k]/shard_count}" -} - -#puts "#{shard_count_key}\t#{w[shard_count_key]}" - diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini deleted file mode 100644 index 0d32f0b7..00000000 --- a/dtrain/job2/cdec.ini +++ /dev/null @@ -1,8 +0,0 @@ -formalism=scfg -add_pass_through_rules=true -feature_function=WordPenalty -cubepruning_pop_limit=30 -feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz -feature_function=RuleIdentityFeatures -scfg_max_span_limit=15 - diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini deleted file mode 100644 index ec005e46..00000000 --- a/dtrain/job2/dtrain.ini +++ /dev/null @@ -1,10 +0,0 @@ -decoder_config=cdec.ini -kbest=100 -ngrams=3 -epochs=10 -input=- -scorer=stupid_bleu -output=- -#stop_after=100 -#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough - diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh deleted file mode 100755 index 75ec29ea..00000000 --- a/dtrain/job2/dtrain.sh +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/sh - -./dtrain -q -c dtrain.ini - -exit 0 - diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh deleted file mode 100755 index 9ee70a33..00000000 --- a/dtrain/job2/hadoop-streaming-job.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash - -HADOOP_HOME=/usr/lib/hadoop-0.20 -JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar -HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR" - -IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain -OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50 - -$HSTREAMING \ - -mapper "dtrain.sh" \ - -reducer "avgweights.rb" \ - -input $IN \ - -output $OUT \ - -file avgweights.rb \ - -file dtrain.sh \ - -file dtrain \ - -file dtrain.ini \ - -file cdec.ini \ - -file nc-wmt11.en.srilm.3.gz \ - -jobconf mapred.reduce.tasks=1 \ - -jobconf mapred.max.map.failures.percent=100 - diff --git a/dtrain/sample.h b/dtrain/pairsampling.h index 502901af..502901af 100644 --- a/dtrain/sample.h +++ b/dtrain/pairsampling.h diff --git a/dtrain/run.sh b/dtrain/run.sh deleted file mode 100755 index 72e56f3e..00000000 --- a/dtrain/run.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/sh - -#INI=test/blunsom08.dtrain.ini -#INI=test/nc-wmt11/dtrain.ini -#INI=test/EXAMPLE/dtrain.ini -#INI=test/EXAMPLE/dtrain.ruleids.ini -INI=test/toy.dtrain.ini -#INI=test/EXAMPLE/dtrain.cdecrid.ini - -#rm /tmp/dtrain-* -./dtrain -c $INI $1 $2 $3 $4 - diff --git a/dtrain/test-reducer b/dtrain/test-reducer deleted file mode 100644 index b86e7894..00000000 --- a/dtrain/test-reducer +++ /dev/null @@ -1,7 +0,0 @@ -a 1 -b 2 -c 3.5 -a 1 -b 2 -c 3.5 -__SHARD_COUNT__ 2 diff --git a/dtrain/tests.cc b/dtrain/tests.cc deleted file mode 100644 index 997eafbb..00000000 --- a/dtrain/tests.cc +++ /dev/null @@ -1,141 +0,0 @@ -#include "tests.h" - - -namespace dtrain -{ - - -/* - * approx_equal - * - */ -double -approx_equal( double x, double y ) -{ - const double EPSILON = 1E-5; - if ( x == 0 ) return fabs( y ) <= EPSILON; - if ( y == 0 ) return fabs( x ) <= EPSILON; - return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON; -} - - -/* - * test_ngrams - * - */ -void -test_ngrams() -{ - cout << "Testing ngrams..." << endl << endl; - size_t N = 5; - cout << "N = " << N << endl; - vector<int> a; // hyp - vector<int> b; // ref - cout << "a "; - for (size_t i = 1; i <= 8; i++) { - cout << i << " "; - a.push_back(i); - } - cout << endl << "b "; - for (size_t i = 1; i <= 4; i++) { - cout << i << " "; - b.push_back(i); - } - cout << endl << endl; - NgramCounts c = make_ngram_counts( a, b, N ); - assert( c.clipped[N-1] == 0 ); - assert( c.sum[N-1] == 4 ); - c.print(); - c += c; - cout << endl; - c.print(); - cout << endl; -} - - -/* - * test_metrics - * - */ -void -test_metrics() -{ - cout << "Testing metrics..." << endl << endl; - using namespace boost::assign; - vector<string> a, b; - vector<double> expect_vanilla, expect_smooth, expect_stupid; - a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp - b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref - expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0; - expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587; - expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707; - vector<string> aa, bb; - vector<WordID> aai, bbi; - double vanilla, smooth, stupid; - size_t N = 4; - cout << "N = " << N << endl << endl; - for ( size_t i = 0; i < a.size(); i++ ) { - cout << " hyp: " << a[i] << endl; - cout << " ref: " << b[i] << endl; - aa.clear(); bb.clear(); aai.clear(); bbi.clear(); - boost::split( aa, a[i], boost::is_any_of(" ") ); - boost::split( bb, b[i], boost::is_any_of(" ") ); - register_and_convert( aa, aai ); - register_and_convert( bb, bbi ); - NgramCounts counts = make_ngram_counts( aai, bbi, N ); - vanilla = bleu( counts, aa.size(), bb.size(), N); - smooth = smooth_bleu( counts, aa.size(), bb.size(), N); - stupid = stupid_bleu( counts, aa.size(), bb.size(), N); - assert( approx_equal(vanilla, expect_vanilla[i]) ); - assert( approx_equal(smooth, expect_smooth[i]) ); - assert( approx_equal(stupid, expect_stupid[i]) ); - cout << setw(14) << "bleu = " << vanilla << endl; - cout << setw(14) << "smooth bleu = " << smooth << endl; - cout << setw(14) << "stupid bleu = " << stupid << endl << endl; - } - cout << endl; -} - - -/* - * test_SetWeights - * - */ -void -test_SetWeights() -{ - cout << "Testing Weights::SetWeight..." << endl << endl; - Weights weights; - SparseVector<double> lambdas; - weights.InitSparseVector( &lambdas ); - weights.SetWeight( &lambdas, "test", 0 ); - weights.SetWeight( &lambdas, "test1", 1 ); - WordID fid = FD::Convert( "test2" ); - weights.SetWeight( &lambdas, fid, 2 ); - string fn = "weights-test"; - cout << "FD::NumFeats() " << FD::NumFeats() << endl; - assert( FD::NumFeats() == 4 ); - weights.WriteToFile( fn, true ); - cout << endl; -} - - -/* - * run_tests - * - */ -void -run_tests() -{ - cout << endl; - test_ngrams(); - cout << endl; - test_metrics(); - cout << endl; - test_SetWeights(); - exit(0); -} - - -} // namespace - diff --git a/dtrain/tests.h b/dtrain/tests.h deleted file mode 100644 index 9853e3c3..00000000 --- a/dtrain/tests.h +++ /dev/null @@ -1,26 +0,0 @@ -#ifndef _DTRAIN_TESTS_H_ -#define _DTRAIN_TESTS_H_ - -#include <iomanip> -#include <boost/assign/std/vector.hpp> - -#include "common.h" -#include "util.h" - - -namespace dtrain -{ - - -double approx_equal( double x, double y ); -void test_ngrams(); -void test_metrics(); -void test_SetWeights(); -void run_tests(); - - -} // namespace - - -#endif - diff --git a/dtrain/updater.h b/dtrain/updater.h deleted file mode 100644 index b54c25de..00000000 --- a/dtrain/updater.h +++ /dev/null @@ -1,107 +0,0 @@ -#ifndef _DTRAIN_LEARNER_H_ -#define _DTRAIN_LEARNER_H_ - -#include <string> -#include <vector> -#include <map> - -#include "sparse_vector.h" -#include "score.h" - - -namespace dtrain -{ - - -class Updater -{ - public: - virtual void Init( const vector<SparseVector<double> >& kbest, const Scores& scores, - const bool invert_score = false ) {}; - virtual void Update( SparseVector<double>& lambdas ) {}; -}; - - -class SofiaUpdater : public Updater -{ - public: - void - Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const FIXME operator[]*/ Scores& scores, - const bool invert_score = false ) - { - assert( kbest.size() == scores.size() ); - ofstream o; - char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-data-XXXXXX"; - mkstemp( tmp ); - tmp_data_fn = tmp; - o.open( tmp_data_fn.c_str(), ios::trunc ); - int fid = 0; - map<int,int>::iterator ff; - double score; - for ( size_t k = 0; k < kbest.size(); ++k ) { - map<int,double> m; - SparseVector<double>::const_iterator it = kbest[k].begin(); - score = scores[k].GetScore(); - if ( invert_score ) score = -score; - o << score; - for ( ; it != kbest[k].end(); ++it ) { - ff = fmap.find( it->first ); - if ( ff == fmap.end() ) { - fmap.insert( pair<int,int>(it->first, fid) ); - fmap1.insert( pair<int,int>(fid, it->first) ); - fid++; - } - m.insert( pair<int,double>(fmap[it->first], it->second) ); - } - map<int,double>::iterator ti = m.begin(); - for ( ; ti != m.end(); ++ti ) { - o << " " << ti->first << ":" << ti->second; - } - o << endl; - } - o.close(); - } - - void - Update(SparseVector<double>& lambdas) - { - char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-model-XXXXXX"; - mkstemp(tmp); - tmp_model_fn = tmp; - //--random_seed 123456789010 - string call = "./sofia-ml --training_file " + tmp_data_fn; - call += " --model_out " + tmp_model_fn; - call += " --loop_type rank --lambda 100 --eta_type constant --dimensionality "; - std::stringstream out; - out << fmap.size(); - call += out.str(); - call += " &>/dev/null"; - system ( call.c_str() ); - ifstream i; - i.open( tmp_model_fn.c_str(), ios::in ); - string model; - getline( i, model ); - vector<string> strs; - boost::split( strs, model, boost::is_any_of(" ") ); - int j = 0; - for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) { - lambdas.set_value(fmap1[j], atof( it->c_str() ) ); - j++; - } - i.close(); - unlink( tmp_data_fn.c_str() ); - unlink( tmp_model_fn.c_str() ); - } - - private: - string tmp_data_fn; - string tmp_model_fn; - map<int,int> fmap; - map<int,int> fmap1; -}; - - -} // namespace - -#endif - diff --git a/dtrain/util.cc b/dtrain/util.cc deleted file mode 100644 index 7b3bbe3d..00000000 --- a/dtrain/util.cc +++ /dev/null @@ -1,34 +0,0 @@ -#include "util.h" - - -namespace dtrain -{ - - -/* - * register_and_convert - * - */ -void -register_and_convert(const vector<string>& strs, vector<WordID>& ids) -{ - vector<string>::const_iterator it; - for ( it = strs.begin(); it < strs.end(); it++ ) { - ids.push_back( TD::Convert( *it ) ); - } -} - - -/* - * print_FD - * - */ -void -print_FD() -{ - for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl; -} - - -} // namespace - diff --git a/dtrain/util.h b/dtrain/util.h deleted file mode 100644 index 6a548519..00000000 --- a/dtrain/util.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef _DTRAIN_UTIL_H_ -#define _DTRAIN_UTIL_H_ - - -#include <iostream> -#include <string> -#include <vector> - -#include "fdict.h" -#include "tdict.h" -#include "wordid.h" - -using namespace std; - - -namespace dtrain -{ - - -void register_and_convert(const vector<string>& strs, vector<WordID>& ids); -void print_FD(); - - -} // namespace - - -#endif - |