summaryrefslogtreecommitdiff
path: root/dtrain
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2011-09-23 20:53:15 +0200
committerPatrick Simianer <p@simianer.de>2011-09-23 20:53:15 +0200
commitdc9fd7a3adc863510d79a718e919b6833a86729c (patch)
tree4baf0c6cadff000a20039994237ccaf468daee40 /dtrain
parent1ad0eb820ee946e5a142567380fc0488c9a5d6de (diff)
begin refactoring
Diffstat (limited to 'dtrain')
-rw-r--r--dtrain/Makefile.am7
-rw-r--r--dtrain/README1
-rw-r--r--dtrain/dtest.cc94
-rw-r--r--dtrain/dtrain.cc44
-rw-r--r--dtrain/dtrain.h (renamed from dtrain/common.h)19
-rwxr-xr-xdtrain/hstreaming/avgweights.rb (renamed from dtrain/avgweights.rb)0
-rw-r--r--dtrain/hstreaming/cdec.ini (renamed from dtrain/job/cdec.ini)0
-rw-r--r--dtrain/hstreaming/dtrain.ini (renamed from dtrain/job/dtrain.ini)0
-rwxr-xr-xdtrain/hstreaming/hadoop-streaming-job.sh (renamed from dtrain/job/hadoop-streaming-job.sh)0
-rwxr-xr-xdtrain/job/avgweights.rb30
-rwxr-xr-xdtrain/job/dtrain.sh6
-rwxr-xr-xdtrain/job2/avgweights.rb30
-rw-r--r--dtrain/job2/cdec.ini8
-rw-r--r--dtrain/job2/dtrain.ini10
-rwxr-xr-xdtrain/job2/dtrain.sh6
-rwxr-xr-xdtrain/job2/hadoop-streaming-job.sh23
-rw-r--r--dtrain/pairsampling.h (renamed from dtrain/sample.h)0
-rwxr-xr-xdtrain/run.sh12
-rw-r--r--dtrain/test-reducer7
-rw-r--r--dtrain/tests.cc141
-rw-r--r--dtrain/tests.h26
-rw-r--r--dtrain/updater.h107
-rw-r--r--dtrain/util.cc34
-rw-r--r--dtrain/util.h28
24 files changed, 38 insertions, 595 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index c08cd1ea..9b5df8bf 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,11 +1,8 @@
# TODO I'm sure I can leave something out.
-bin_PROGRAMS = dtrain dtest
+bin_PROGRAMS = dtrain
-dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc
+dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
-dtest_SOURCES = dtest.cc score.cc util.cc
-dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README b/dtrain/README
index b3f513be..0cc52acc 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -31,6 +31,7 @@ TODO
use separate TEST SET
KNOWN BUGS PROBLEMS
+ if size of candidate < N => 0 score
cdec kbest vs 1best (no -k param), rescoring? => ok(?)
no sparse vector in decoder => ok
? ok
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
deleted file mode 100644
index 36c880a3..00000000
--- a/dtrain/dtest.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-
-
-/*
- * init
- *
- */
-bool
-init(int argc, char** argv, po::variables_map* conf)
-{
- int N;
- po::options_description opts( "Command Line Options" );
- opts.add_options()
- ( "decoder-config,c", po::value<string>(), "configuration file for cdec" )
- ( "weights,w", po::value<string>(), "weights file" )
- ( "ngrams,n", po::value<int>(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" );
- po::options_description cmdline_options;
- cmdline_options.add(opts);
- po::store( parse_command_line(argc, argv, cmdline_options), *conf );
- po::notify( *conf );
- if ( ! (conf->count("decoder-config") || conf->count("weights")) ) {
- cerr << cmdline_options << endl;
- return false;
- }
- return true;
-}
-
-
-/*
- * main
- *
- */
-int
-main(int argc, char** argv)
-{
- SetSilent( true );
- po::variables_map conf;
- if ( !init(argc, argv, &conf) ) return 1;
- register_feature_functions();
- size_t k = 1;
- ReadFile ini_rf( conf["decoder-config"].as<string>() );
- Decoder decoder( ini_rf.stream() );
- KBestGetter observer( k, "no" );
- size_t N = conf["ngrams"].as<int>();
-
- Weights weights;
- if ( conf.count("weights") ) weights.InitFromFile( conf["weights"].as<string>() );
- vector<double> w;
- weights.InitVector( &w );
- decoder.SetWeights( w );
-
- vector<string> in_split, ref_strs;
- vector<WordID> ref_ids;
- string in, psg;
- size_t sn = 0;
- double overall = 0.0;
- double overall1 = 0.0;
- double overall2 = 0.0;
- while( getline(cin, in) ) {
- in_split.clear();
- boost::split( in_split, in, boost::is_any_of("\t") );
- // grammar
- psg = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ); psg += "\n";
- decoder.SetSentenceGrammarFromString( psg );
- decoder.Decode( in_split[1], &observer );
- KBestList* kb = observer.GetKBest();
- // reference
- ref_strs.clear(); ref_ids.clear();
- boost::split( ref_strs, in_split[2], boost::is_any_of(" ") );
- register_and_convert( ref_strs, ref_ids );
- // scoring kbest
- double score = 0.0;
- double score1 = 0.0;
- double score2 = 0.0;
- NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], N );
- score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- cout << TD::GetString( kb->sents[0] ) << endl;
- overall += score;
- overall1 += score1;
- overall2 += score2;
- sn += 1;
- }
- cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl;
- cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl;
- cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl;
- cerr << endl;
-
- return 0;
-}
-
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index f005008e..01821b30 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,24 +1,19 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-#include "sample.h"
+#include "dtrain.h"
-#include "ksampler.h"
-// boost compression
-#include <boost/iostreams/device/file.hpp>
-#include <boost/iostreams/filtering_stream.hpp>
-#include <boost/iostreams/filter/gzip.hpp>
-//#include <boost/iostreams/filter/zlib.hpp>
-//#include <boost/iostreams/filter/bzip2.hpp>
-using namespace boost::iostreams;
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/lexical_cast.hpp>
-
-#ifdef DTRAIN_DEBUG
-#include "tests.h"
-#endif
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+ vector<string>::const_iterator it;
+ for ( it = strs.begin(); it < strs.end(); it++ ) {
+ ids.push_back( TD::Convert( *it ) );
+ }
+}
/*
@@ -49,12 +44,7 @@ init(int argc, char** argv, po::variables_map* cfg)
clo.add_options()
( "config,c", po::value<string>(), "dtrain config file" )
( "quiet,q", po::value<bool>()->zero_tokens(), "be quiet" )
- ( "verbose,v", po::value<bool>()->zero_tokens(), "be verbose" )
-#ifndef DTRAIN_DEBUG
- ;
-#else
- ( "test", "run tests and exit");
-#endif
+ ( "verbose,v", po::value<bool>()->zero_tokens(), "be verbose" );
po::options_description config_options, cmdline_options;
config_options.add(conff);
@@ -149,9 +139,9 @@ main( int argc, char** argv )
if ( !quiet )
cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
Decoder decoder( ini_rf.stream() );
- //KBestGetter observer( k, filter_type );
+ KBestGetter observer( k, filter_type );
MT19937 rng;
- KSampler observer( k, &rng );
+ //KSampler observer( k, &rng );
// scoring metric/scorer
string scorer_str = cfg["scorer"].as<string>();
@@ -433,7 +423,7 @@ main( int argc, char** argv )
}
++sid;
- cerr << "reporter:counter:dtrain,sent," << sid << endl;
+ //cerr << "reporter:counter:dtrain,sent," << sid << endl;
} // input loop
diff --git a/dtrain/common.h b/dtrain/dtrain.h
index 49dc85b7..3d319233 100644
--- a/dtrain/common.h
+++ b/dtrain/dtrain.h
@@ -30,10 +30,27 @@
#define DTRAIN_DEFAULT_T 1 // iterations
#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer
#define DTRAIN_DOTS 100 // when to display a '.'
-#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD?
+#define DTRAIN_TMP_DIR "/tmp" // put this on a SSD?
#define DTRAIN_GRAMMAR_DELIM "########EOS########"
+#include "kbestget.h"
+#include "pairsampling.h"
+
+#include "ksampler.h"
+
+// boost compression
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+//#include <boost/iostreams/filter/zlib.hpp>
+//#include <boost/iostreams/filter/bzip2.hpp>
+using namespace boost::iostreams;
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/lexical_cast.hpp>
+
+
using namespace std;
using namespace dtrain;
namespace po = boost::program_options;
diff --git a/dtrain/avgweights.rb b/dtrain/hstreaming/avgweights.rb
index d5cfaa4d..d5cfaa4d 100755
--- a/dtrain/avgweights.rb
+++ b/dtrain/hstreaming/avgweights.rb
diff --git a/dtrain/job/cdec.ini b/dtrain/hstreaming/cdec.ini
index 0d32f0b7..0d32f0b7 100644
--- a/dtrain/job/cdec.ini
+++ b/dtrain/hstreaming/cdec.ini
diff --git a/dtrain/job/dtrain.ini b/dtrain/hstreaming/dtrain.ini
index 079d7d69..079d7d69 100644
--- a/dtrain/job/dtrain.ini
+++ b/dtrain/hstreaming/dtrain.ini
diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
index 2cf3f50a..2cf3f50a 100755
--- a/dtrain/job/hadoop-streaming-job.sh
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb
deleted file mode 100755
index e635aab4..00000000
--- a/dtrain/job/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 104.0
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/num_map}"
- #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb
deleted file mode 100755
index 31048f16..00000000
--- a/dtrain/job2/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 2107.0
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/num_map}"
- #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini
deleted file mode 100644
index 0d32f0b7..00000000
--- a/dtrain/job2/cdec.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-feature_function=WordPenalty
-cubepruning_pop_limit=30
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
-scfg_max_span_limit=15
-
diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini
deleted file mode 100644
index ec005e46..00000000
--- a/dtrain/job2/dtrain.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-decoder_config=cdec.ini
-kbest=100
-ngrams=3
-epochs=10
-input=-
-scorer=stupid_bleu
-output=-
-#stop_after=100
-#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-
diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job2/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh
deleted file mode 100755
index 9ee70a33..00000000
--- a/dtrain/job2/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
-IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain
-OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50
-
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "avgweights.rb" \
- -input $IN \
- -output $OUT \
- -file avgweights.rb \
- -file dtrain.sh \
- -file dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file nc-wmt11.en.srilm.3.gz \
- -jobconf mapred.reduce.tasks=1 \
- -jobconf mapred.max.map.failures.percent=100
-
diff --git a/dtrain/sample.h b/dtrain/pairsampling.h
index 502901af..502901af 100644
--- a/dtrain/sample.h
+++ b/dtrain/pairsampling.h
diff --git a/dtrain/run.sh b/dtrain/run.sh
deleted file mode 100755
index 72e56f3e..00000000
--- a/dtrain/run.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh
-
-#INI=test/blunsom08.dtrain.ini
-#INI=test/nc-wmt11/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ruleids.ini
-INI=test/toy.dtrain.ini
-#INI=test/EXAMPLE/dtrain.cdecrid.ini
-
-#rm /tmp/dtrain-*
-./dtrain -c $INI $1 $2 $3 $4
-
diff --git a/dtrain/test-reducer b/dtrain/test-reducer
deleted file mode 100644
index b86e7894..00000000
--- a/dtrain/test-reducer
+++ /dev/null
@@ -1,7 +0,0 @@
-a 1
-b 2
-c 3.5
-a 1
-b 2
-c 3.5
-__SHARD_COUNT__ 2
diff --git a/dtrain/tests.cc b/dtrain/tests.cc
deleted file mode 100644
index 997eafbb..00000000
--- a/dtrain/tests.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-#include "tests.h"
-
-
-namespace dtrain
-{
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
- const double EPSILON = 1E-5;
- if ( x == 0 ) return fabs( y ) <= EPSILON;
- if ( y == 0 ) return fabs( x ) <= EPSILON;
- return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
- cout << "Testing ngrams..." << endl << endl;
- size_t N = 5;
- cout << "N = " << N << endl;
- vector<int> a; // hyp
- vector<int> b; // ref
- cout << "a ";
- for (size_t i = 1; i <= 8; i++) {
- cout << i << " ";
- a.push_back(i);
- }
- cout << endl << "b ";
- for (size_t i = 1; i <= 4; i++) {
- cout << i << " ";
- b.push_back(i);
- }
- cout << endl << endl;
- NgramCounts c = make_ngram_counts( a, b, N );
- assert( c.clipped[N-1] == 0 );
- assert( c.sum[N-1] == 4 );
- c.print();
- c += c;
- cout << endl;
- c.print();
- cout << endl;
-}
-
-
-/*
- * test_metrics
- *
- */
-void
-test_metrics()
-{
- cout << "Testing metrics..." << endl << endl;
- using namespace boost::assign;
- vector<string> a, b;
- vector<double> expect_vanilla, expect_smooth, expect_stupid;
- a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
- b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
- expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
- expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
- expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
- vector<string> aa, bb;
- vector<WordID> aai, bbi;
- double vanilla, smooth, stupid;
- size_t N = 4;
- cout << "N = " << N << endl << endl;
- for ( size_t i = 0; i < a.size(); i++ ) {
- cout << " hyp: " << a[i] << endl;
- cout << " ref: " << b[i] << endl;
- aa.clear(); bb.clear(); aai.clear(); bbi.clear();
- boost::split( aa, a[i], boost::is_any_of(" ") );
- boost::split( bb, b[i], boost::is_any_of(" ") );
- register_and_convert( aa, aai );
- register_and_convert( bb, bbi );
- NgramCounts counts = make_ngram_counts( aai, bbi, N );
- vanilla = bleu( counts, aa.size(), bb.size(), N);
- smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
- stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
- assert( approx_equal(vanilla, expect_vanilla[i]) );
- assert( approx_equal(smooth, expect_smooth[i]) );
- assert( approx_equal(stupid, expect_stupid[i]) );
- cout << setw(14) << "bleu = " << vanilla << endl;
- cout << setw(14) << "smooth bleu = " << smooth << endl;
- cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
- }
- cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
- cout << "Testing Weights::SetWeight..." << endl << endl;
- Weights weights;
- SparseVector<double> lambdas;
- weights.InitSparseVector( &lambdas );
- weights.SetWeight( &lambdas, "test", 0 );
- weights.SetWeight( &lambdas, "test1", 1 );
- WordID fid = FD::Convert( "test2" );
- weights.SetWeight( &lambdas, fid, 2 );
- string fn = "weights-test";
- cout << "FD::NumFeats() " << FD::NumFeats() << endl;
- assert( FD::NumFeats() == 4 );
- weights.WriteToFile( fn, true );
- cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
- cout << endl;
- test_ngrams();
- cout << endl;
- test_metrics();
- cout << endl;
- test_SetWeights();
- exit(0);
-}
-
-
-} // namespace
-
diff --git a/dtrain/tests.h b/dtrain/tests.h
deleted file mode 100644
index 9853e3c3..00000000
--- a/dtrain/tests.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _DTRAIN_TESTS_H_
-#define _DTRAIN_TESTS_H_
-
-#include <iomanip>
-#include <boost/assign/std/vector.hpp>
-
-#include "common.h"
-#include "util.h"
-
-
-namespace dtrain
-{
-
-
-double approx_equal( double x, double y );
-void test_ngrams();
-void test_metrics();
-void test_SetWeights();
-void run_tests();
-
-
-} // namespace
-
-
-#endif
-
diff --git a/dtrain/updater.h b/dtrain/updater.h
deleted file mode 100644
index b54c25de..00000000
--- a/dtrain/updater.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef _DTRAIN_LEARNER_H_
-#define _DTRAIN_LEARNER_H_
-
-#include <string>
-#include <vector>
-#include <map>
-
-#include "sparse_vector.h"
-#include "score.h"
-
-
-namespace dtrain
-{
-
-
-class Updater
-{
- public:
- virtual void Init( const vector<SparseVector<double> >& kbest, const Scores& scores,
- const bool invert_score = false ) {};
- virtual void Update( SparseVector<double>& lambdas ) {};
-};
-
-
-class SofiaUpdater : public Updater
-{
- public:
- void
- Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const FIXME operator[]*/ Scores& scores,
- const bool invert_score = false )
- {
- assert( kbest.size() == scores.size() );
- ofstream o;
- char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-data-XXXXXX";
- mkstemp( tmp );
- tmp_data_fn = tmp;
- o.open( tmp_data_fn.c_str(), ios::trunc );
- int fid = 0;
- map<int,int>::iterator ff;
- double score;
- for ( size_t k = 0; k < kbest.size(); ++k ) {
- map<int,double> m;
- SparseVector<double>::const_iterator it = kbest[k].begin();
- score = scores[k].GetScore();
- if ( invert_score ) score = -score;
- o << score;
- for ( ; it != kbest[k].end(); ++it ) {
- ff = fmap.find( it->first );
- if ( ff == fmap.end() ) {
- fmap.insert( pair<int,int>(it->first, fid) );
- fmap1.insert( pair<int,int>(fid, it->first) );
- fid++;
- }
- m.insert( pair<int,double>(fmap[it->first], it->second) );
- }
- map<int,double>::iterator ti = m.begin();
- for ( ; ti != m.end(); ++ti ) {
- o << " " << ti->first << ":" << ti->second;
- }
- o << endl;
- }
- o.close();
- }
-
- void
- Update(SparseVector<double>& lambdas)
- {
- char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-model-XXXXXX";
- mkstemp(tmp);
- tmp_model_fn = tmp;
- //--random_seed 123456789010
- string call = "./sofia-ml --training_file " + tmp_data_fn;
- call += " --model_out " + tmp_model_fn;
- call += " --loop_type rank --lambda 100 --eta_type constant --dimensionality ";
- std::stringstream out;
- out << fmap.size();
- call += out.str();
- call += " &>/dev/null";
- system ( call.c_str() );
- ifstream i;
- i.open( tmp_model_fn.c_str(), ios::in );
- string model;
- getline( i, model );
- vector<string> strs;
- boost::split( strs, model, boost::is_any_of(" ") );
- int j = 0;
- for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
- lambdas.set_value(fmap1[j], atof( it->c_str() ) );
- j++;
- }
- i.close();
- unlink( tmp_data_fn.c_str() );
- unlink( tmp_model_fn.c_str() );
- }
-
- private:
- string tmp_data_fn;
- string tmp_model_fn;
- map<int,int> fmap;
- map<int,int> fmap1;
-};
-
-
-} // namespace
-
-#endif
-
diff --git a/dtrain/util.cc b/dtrain/util.cc
deleted file mode 100644
index 7b3bbe3d..00000000
--- a/dtrain/util.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "util.h"
-
-
-namespace dtrain
-{
-
-
-/*
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
- vector<string>::const_iterator it;
- for ( it = strs.begin(); it < strs.end(); it++ ) {
- ids.push_back( TD::Convert( *it ) );
- }
-}
-
-
-/*
- * print_FD
- *
- */
-void
-print_FD()
-{
- for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
-} // namespace
-
diff --git a/dtrain/util.h b/dtrain/util.h
deleted file mode 100644
index 6a548519..00000000
--- a/dtrain/util.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _DTRAIN_UTIL_H_
-#define _DTRAIN_UTIL_H_
-
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "fdict.h"
-#include "tdict.h"
-#include "wordid.h"
-
-using namespace std;
-
-
-namespace dtrain
-{
-
-
-void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
-void print_FD();
-
-
-} // namespace
-
-
-#endif
-