begin refactoring

author: Patrick Simianer <p@simianer.de> 2011-09-23 20:53:15 +0200
committer: Patrick Simianer <p@simianer.de> 2011-09-23 20:53:15 +0200
commit: 9bde56ed23b4b97f8193f9f8f582f18086ff17c1 (patch)
tree: 83bd5687f2069405537f7f8fbdfbe208a634ca54 /dtrain
parent: 4433886ac335e6db7ded081b5ef673490ee27718 (diff)
24 files changed, 38 insertions, 595 deletions
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index c08cd1ea..9b5df8bf 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,11 +1,8 @@
 # TODO I'm sure I can leave something out.
-bin_PROGRAMS = dtrain dtest
+bin_PROGRAMS = dtrain
 
-dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc
+dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc
 dtrain_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
 
-dtest_SOURCES = dtest.cc score.cc util.cc
-dtest_LDADD   = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
 
diff --git a/dtrain/README b/dtrain/README
index b3f513be..0cc52acc 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -31,6 +31,7 @@ TODO
  use separate TEST SET
 
 KNOWN BUGS PROBLEMS
+ if size of candidate < N => 0 score
  cdec kbest vs 1best (no -k param), rescoring? => ok(?)
  no sparse vector in decoder => ok
  ? ok
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
deleted file mode 100644
index 36c880a3..00000000
--- a/dtrain/dtest.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-
-
-/*
- * init
- *
- */
-bool
-init(int argc, char** argv, po::variables_map* conf)
-{
-  int N;
-  po::options_description opts( "Command Line Options" );
-  opts.add_options()
-    ( "decoder-config,c", po::value<string>(),                              "configuration file for cdec" )
-    ( "weights,w",        po::value<string>(),                                             "weights file" )
-    ( "ngrams,n",         po::value<int>(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" );
-  po::options_description cmdline_options;
-  cmdline_options.add(opts);
-  po::store( parse_command_line(argc, argv, cmdline_options), *conf );
-  po::notify( *conf );
-  if ( ! (conf->count("decoder-config") || conf->count("weights")) ) {
-    cerr << cmdline_options << endl;
-    return false;
-  }
-  return true;
-}
-
-
-/*
- * main
- *
- */
-int
-main(int argc, char** argv)
-{
-  SetSilent( true );
-  po::variables_map conf;
-  if ( !init(argc, argv, &conf) ) return 1;
-  register_feature_functions();
-  size_t k = 1;
-  ReadFile ini_rf( conf["decoder-config"].as<string>() );
-  Decoder decoder( ini_rf.stream() );
-  KBestGetter observer( k, "no" );
-  size_t N = conf["ngrams"].as<int>();
-
-  Weights weights;
-  if ( conf.count("weights") ) weights.InitFromFile( conf["weights"].as<string>() );
-  vector<double> w;
-  weights.InitVector( &w );
-  decoder.SetWeights( w );
- 
-  vector<string> in_split, ref_strs;
-  vector<WordID> ref_ids;
-  string in, psg;
-  size_t sn = 0;
-  double overall  = 0.0;
-  double overall1 = 0.0;
-  double overall2 = 0.0;
-  while( getline(cin, in) ) {
-    in_split.clear();
-    boost::split( in_split, in, boost::is_any_of("\t") );
-    // grammar
-    psg = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ); psg += "\n";
-    decoder.SetSentenceGrammarFromString( psg );
-    decoder.Decode( in_split[1], &observer );
-    KBestList* kb = observer.GetKBest();
-    // reference
-    ref_strs.clear(); ref_ids.clear();
-    boost::split( ref_strs, in_split[2], boost::is_any_of(" ") );
-    register_and_convert( ref_strs, ref_ids );
-    // scoring kbest
-    double score  = 0.0;
-    double score1 = 0.0;
-    double score2 = 0.0;
-    NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], N );
-    score =  smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
-    score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
-    score2 =        bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
-    cout << TD::GetString( kb->sents[0] ) << endl;
-    overall += score;
-    overall1 += score1;
-    overall2 += score2;
-    sn += 1;
-  }
-  cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl;
-  cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl;
-  cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl;
-  cerr << endl;
-
-  return 0;
-}
-
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index f005008e..01821b30 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,24 +1,19 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-#include "sample.h"
+#include "dtrain.h"
 
-#include "ksampler.h"
 
-// boost compression
-#include <boost/iostreams/device/file.hpp> 
-#include <boost/iostreams/filtering_stream.hpp>
-#include <boost/iostreams/filter/gzip.hpp>
-//#include <boost/iostreams/filter/zlib.hpp>
-//#include <boost/iostreams/filter/bzip2.hpp>
-using namespace boost::iostreams;
 
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/lexical_cast.hpp>
-
-#ifdef DTRAIN_DEBUG
-#include "tests.h"
-#endif
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+  vector<string>::const_iterator it;
+  for ( it = strs.begin(); it < strs.end(); it++ ) {
+    ids.push_back( TD::Convert( *it ) );
+  }
+}
 
 
 /*
@@ -49,12 +44,7 @@ init(int argc, char** argv, po::variables_map* cfg)
   clo.add_options()
     ( "config,c",         po::value<string>(),              "dtrain config file" )
     ( "quiet,q",          po::value<bool>()->zero_tokens(),           "be quiet" )
-    ( "verbose,v",        po::value<bool>()->zero_tokens(),         "be verbose" )
-#ifndef DTRAIN_DEBUG
-    ;
-#else
-    ( "test", "run tests and exit");
-#endif
+    ( "verbose,v",        po::value<bool>()->zero_tokens(),         "be verbose" );
   po::options_description config_options, cmdline_options;
 
   config_options.add(conff);
@@ -149,9 +139,9 @@ main( int argc, char** argv )
   if ( !quiet )
     cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
   Decoder decoder( ini_rf.stream() );
-  //KBestGetter observer( k, filter_type );
+  KBestGetter observer( k, filter_type );
   MT19937 rng;
-  KSampler observer( k, &rng );
+  //KSampler observer( k, &rng );
 
   // scoring metric/scorer
   string scorer_str = cfg["scorer"].as<string>();
@@ -433,7 +423,7 @@ main( int argc, char** argv )
     }
 
     ++sid;
-    cerr << "reporter:counter:dtrain,sent," << sid << endl;
+    //cerr << "reporter:counter:dtrain,sent," << sid << endl;
 
   } // input loop
 
diff --git a/dtrain/common.h b/dtrain/dtrain.h
index 49dc85b7..3d319233 100644
--- a/dtrain/common.h
+++ b/dtrain/dtrain.h
@@ -30,10 +30,27 @@
 #define DTRAIN_DEFAULT_T 1                  // iterations
 #define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer
 #define DTRAIN_DOTS 100                     // when to display a '.'
-#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local"               // put this on a SSD?
+#define DTRAIN_TMP_DIR "/tmp"               // put this on a SSD?
 #define DTRAIN_GRAMMAR_DELIM "########EOS########"
 
 
+#include "kbestget.h"
+#include "pairsampling.h"
+
+#include "ksampler.h"
+
+// boost compression
+#include <boost/iostreams/device/file.hpp> 
+#include <boost/iostreams/filtering_stream.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+//#include <boost/iostreams/filter/zlib.hpp>
+//#include <boost/iostreams/filter/bzip2.hpp>
+using namespace boost::iostreams;
+
+#include <boost/algorithm/string/predicate.hpp>
+#include <boost/lexical_cast.hpp>
+
+
 using namespace std;
 using namespace dtrain;
 namespace po = boost::program_options;
diff --git a/dtrain/avgweights.rb b/dtrain/hstreaming/avgweights.rb
index d5cfaa4d..d5cfaa4d 100755
--- a/dtrain/avgweights.rb
+++ b/dtrain/hstreaming/avgweights.rb
diff --git a/dtrain/job/cdec.ini b/dtrain/hstreaming/cdec.ini
index 0d32f0b7..0d32f0b7 100644
--- a/dtrain/job/cdec.ini
+++ b/dtrain/hstreaming/cdec.ini
diff --git a/dtrain/job/dtrain.ini b/dtrain/hstreaming/dtrain.ini
index 079d7d69..079d7d69 100644
--- a/dtrain/job/dtrain.ini
+++ b/dtrain/hstreaming/dtrain.ini
diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
index 2cf3f50a..2cf3f50a 100755
--- a/dtrain/job/hadoop-streaming-job.sh
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb
deleted file mode 100755
index e635aab4..00000000
--- a/dtrain/job/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
-  key, val = line.split /\t/
-  w[key] += val.to_f
-  #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 104.0
-
-w.each_key { |k|
-  #if k == shard_count_key then next end
-  #if k == "__bias" then next end
-  puts "#{k}\t#{w[k]/num_map}"
-  #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb
deleted file mode 100755
index 31048f16..00000000
--- a/dtrain/job2/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
-  key, val = line.split /\t/
-  w[key] += val.to_f
-  #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 2107.0
-
-w.each_key { |k|
-  #if k == shard_count_key then next end
-  #if k == "__bias" then next end
-  puts "#{k}\t#{w[k]/num_map}"
-  #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini
deleted file mode 100644
index 0d32f0b7..00000000
--- a/dtrain/job2/cdec.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-feature_function=WordPenalty
-cubepruning_pop_limit=30
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
-scfg_max_span_limit=15
-
diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini
deleted file mode 100644
index ec005e46..00000000
--- a/dtrain/job2/dtrain.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-decoder_config=cdec.ini
-kbest=100
-ngrams=3
-epochs=10
-input=-
-scorer=stupid_bleu
-output=-
-#stop_after=100
-#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-
diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job2/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh
deleted file mode 100755
index 9ee70a33..00000000
--- a/dtrain/job2/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
-IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain
-OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50
-
-$HSTREAMING \
-    -mapper "dtrain.sh" \
-    -reducer "avgweights.rb" \
-    -input $IN \
-    -output $OUT \
-    -file avgweights.rb \
-    -file dtrain.sh \
-    -file dtrain \
-    -file dtrain.ini \
-    -file cdec.ini \
-    -file nc-wmt11.en.srilm.3.gz \
-    -jobconf mapred.reduce.tasks=1 \
-    -jobconf mapred.max.map.failures.percent=100
-
diff --git a/dtrain/sample.h b/dtrain/pairsampling.h
index 502901af..502901af 100644
--- a/dtrain/sample.h
+++ b/dtrain/pairsampling.h
diff --git a/dtrain/run.sh b/dtrain/run.sh
deleted file mode 100755
index 72e56f3e..00000000
--- a/dtrain/run.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh
-
-#INI=test/blunsom08.dtrain.ini
-#INI=test/nc-wmt11/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ruleids.ini
-INI=test/toy.dtrain.ini
-#INI=test/EXAMPLE/dtrain.cdecrid.ini
-
-#rm /tmp/dtrain-*
-./dtrain -c $INI $1 $2 $3 $4 
-
diff --git a/dtrain/test-reducer b/dtrain/test-reducer
deleted file mode 100644
index b86e7894..00000000
--- a/dtrain/test-reducer
+++ /dev/null
@@ -1,7 +0,0 @@
-a	1
-b	2
-c	3.5
-a	1
-b	2
-c	3.5
-__SHARD_COUNT__	2
diff --git a/dtrain/tests.cc b/dtrain/tests.cc
deleted file mode 100644
index 997eafbb..00000000
--- a/dtrain/tests.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-#include "tests.h"
-
-
-namespace dtrain
-{
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
-  const double EPSILON = 1E-5;
-  if ( x == 0 ) return fabs( y ) <= EPSILON;
-  if ( y == 0 ) return fabs( x ) <= EPSILON;
-  return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
-  cout << "Testing ngrams..." << endl << endl;
-  size_t N = 5;
-  cout << "N = " << N << endl;
-  vector<int> a; // hyp
-  vector<int> b; // ref
-  cout << "a ";
-  for (size_t i = 1; i <= 8; i++) {
-    cout << i << " ";
-    a.push_back(i);
-  }
-  cout << endl << "b ";
-  for (size_t i = 1; i <= 4; i++) {
-    cout << i << " ";
-    b.push_back(i);
-  }
-  cout << endl << endl;
-  NgramCounts c = make_ngram_counts( a, b, N );
-  assert( c.clipped[N-1] == 0 );
-  assert( c.sum[N-1] == 4 );
-  c.print();
-  c += c;
-  cout << endl;
-  c.print();
-  cout << endl;
-}
-
-
-/*
- * test_metrics
- *
- */
-void
-test_metrics()
-{
-  cout << "Testing metrics..." << endl << endl;
-  using namespace boost::assign;
-  vector<string> a, b;
-  vector<double> expect_vanilla, expect_smooth, expect_stupid;
-  a +=              "a a a a", "a a a a", "a",   "a", "b",        "a a a a", "a a",  "a a a", "a b a"; // hyp
-  b +=              "b b b b", "a a a a", "a",   "b", "b b b b",  "a",       "a a",  "a a a", "a b b"; // ref
-  expect_vanilla += 0,         1,         1,      0,  0,          .25,       1,      1,       0;
-  expect_smooth  += 0,          .9375,     .0625, 0,   .00311169, .0441942,   .1875,  .4375,   .161587;
-  expect_stupid  += 0,         1,         1,      0,   .0497871,  .25,       1,      1,        .605707;
-  vector<string> aa, bb;
-  vector<WordID> aai, bbi;
-  double vanilla, smooth, stupid;
-  size_t N = 4;
-  cout << "N = " << N << endl << endl;
-  for ( size_t i = 0; i < a.size(); i++ ) {
-    cout << " hyp: " << a[i] << endl;
-    cout << " ref: " << b[i] << endl;
-    aa.clear(); bb.clear(); aai.clear(); bbi.clear();
-    boost::split( aa, a[i], boost::is_any_of(" ") );
-    boost::split( bb, b[i], boost::is_any_of(" ") );
-    register_and_convert( aa, aai );
-    register_and_convert( bb, bbi );
-    NgramCounts counts = make_ngram_counts( aai, bbi, N );
-    vanilla =        bleu( counts, aa.size(), bb.size(), N);
-    smooth  = smooth_bleu( counts, aa.size(), bb.size(), N);
-    stupid  = stupid_bleu( counts, aa.size(), bb.size(), N);
-    assert( approx_equal(vanilla, expect_vanilla[i]) );
-    assert( approx_equal(smooth, expect_smooth[i]) );
-    assert( approx_equal(stupid, expect_stupid[i]) );
-    cout << setw(14) << "bleu = "      << vanilla << endl;
-    cout << setw(14) << "smooth bleu = " << smooth << endl;
-    cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
-  }
-  cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
-  cout << "Testing Weights::SetWeight..." << endl << endl;
-  Weights weights;
-  SparseVector<double> lambdas;
-  weights.InitSparseVector( &lambdas );
-  weights.SetWeight( &lambdas, "test", 0 );
-  weights.SetWeight( &lambdas, "test1", 1 );
-  WordID fid = FD::Convert( "test2" );
-  weights.SetWeight( &lambdas, fid, 2 );
-  string fn = "weights-test";
-  cout << "FD::NumFeats() " << FD::NumFeats() << endl;
-  assert( FD::NumFeats() == 4 );
-  weights.WriteToFile( fn, true );
-  cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
-  cout << endl;
-  test_ngrams();
-  cout << endl;
-  test_metrics();
-  cout << endl;
-  test_SetWeights();
-  exit(0);
-}
-
-
-} // namespace
-
diff --git a/dtrain/tests.h b/dtrain/tests.h
deleted file mode 100644
index 9853e3c3..00000000
--- a/dtrain/tests.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _DTRAIN_TESTS_H_
-#define _DTRAIN_TESTS_H_
-
-#include <iomanip>
-#include <boost/assign/std/vector.hpp>
-
-#include "common.h"
-#include "util.h"
-
-
-namespace dtrain
-{
-
-
-double approx_equal( double x, double y );
-void test_ngrams();
-void test_metrics();
-void test_SetWeights();
-void run_tests();
-
-
-} // namespace
-
-
-#endif
-
diff --git a/dtrain/updater.h b/dtrain/updater.h
deleted file mode 100644
index b54c25de..00000000
--- a/dtrain/updater.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef _DTRAIN_LEARNER_H_
-#define _DTRAIN_LEARNER_H_
-
-#include <string>
-#include <vector>
-#include <map>
-
-#include "sparse_vector.h"
-#include "score.h"
-
-
-namespace dtrain
-{
-
-
-class Updater
-{
-  public:
-    virtual void Init( const vector<SparseVector<double> >& kbest, const Scores& scores,
-                       const bool invert_score = false ) {};
-    virtual void Update( SparseVector<double>& lambdas ) {};
-};
-
-
-class SofiaUpdater : public Updater
-{
-  public:
-    void
-    Init( const size_t sid, const vector<SparseVector<double> >& kbest, /*const FIXME operator[]*/ Scores& scores,
-          const bool invert_score = false )
-    {
-      assert( kbest.size() == scores.size() );
-      ofstream o;
-      char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-data-XXXXXX";
-      mkstemp( tmp );
-      tmp_data_fn = tmp;
-      o.open( tmp_data_fn.c_str(), ios::trunc );
-      int fid = 0;
-      map<int,int>::iterator ff;
-      double score;
-      for ( size_t k = 0; k < kbest.size(); ++k ) {
-        map<int,double> m;
-        SparseVector<double>::const_iterator it = kbest[k].begin();
-        score = scores[k].GetScore();
-        if ( invert_score ) score = -score;
-        o << score;
-        for ( ; it != kbest[k].end(); ++it ) {
-          ff = fmap.find( it->first );
-          if ( ff == fmap.end() ) {
-            fmap.insert( pair<int,int>(it->first, fid) );
-            fmap1.insert( pair<int,int>(fid, it->first) );
-            fid++;
-          }
-          m.insert( pair<int,double>(fmap[it->first], it->second) );
-        }
-        map<int,double>::iterator ti = m.begin();
-        for ( ; ti != m.end(); ++ti ) {
-          o << " " << ti->first << ":" << ti->second;
-        }
-        o << endl;
-      }
-      o.close();
-    }
-
-    void
-    Update(SparseVector<double>& lambdas)
-    {
-      char tmp[] = DTRAIN_TMP_DIR"/dtrain-sofia-model-XXXXXX";
-      mkstemp(tmp);
-      tmp_model_fn = tmp;
-      //--random_seed 123456789010
-      string call = "./sofia-ml  --training_file " + tmp_data_fn;
-      call += " --model_out " + tmp_model_fn;
-      call += " --loop_type rank --lambda 100 --eta_type constant --dimensionality ";
-      std::stringstream out;
-      out << fmap.size();
-      call += out.str();
-      call += " &>/dev/null";
-      system ( call.c_str() );
-      ifstream i;
-      i.open( tmp_model_fn.c_str(), ios::in );
-      string model;
-      getline( i, model );
-      vector<string> strs;
-      boost::split( strs, model, boost::is_any_of(" ") );
-      int j = 0;
-      for ( vector<string>::iterator it = strs.begin(); it != strs.end(); ++it ) {
-        lambdas.set_value(fmap1[j], atof( it->c_str() ) );
-        j++;
-      }
-      i.close();
-      unlink( tmp_data_fn.c_str() );
-      unlink( tmp_model_fn.c_str() );
-    }
-
-  private:
-    string tmp_data_fn;
-    string tmp_model_fn;
-    map<int,int> fmap;
-    map<int,int> fmap1;
-};
-
-
-} // namespace
-
-#endif
-
diff --git a/dtrain/util.cc b/dtrain/util.cc
deleted file mode 100644
index 7b3bbe3d..00000000
--- a/dtrain/util.cc
+++ /dev/null
@@ -1,34 +0,0 @@
-#include "util.h"
-
-
-namespace dtrain
-{
-
-
-/*
- * register_and_convert
- *
- */
-void
-register_and_convert(const vector<string>& strs, vector<WordID>& ids)
-{
-  vector<string>::const_iterator it;
-  for ( it = strs.begin(); it < strs.end(); it++ ) {
-    ids.push_back( TD::Convert( *it ) );
-  }
-}
-
-
-/*
- * print_FD
- *
- */
-void
-print_FD()
-{
-  for ( size_t i = 0; i < FD::NumFeats(); i++ ) cout << FD::Convert(i)<< endl;
-}
-
-
-} // namespace
-
diff --git a/dtrain/util.h b/dtrain/util.h
deleted file mode 100644
index 6a548519..00000000
--- a/dtrain/util.h
+++ /dev/null
@@ -1,28 +0,0 @@
-#ifndef _DTRAIN_UTIL_H_
-#define _DTRAIN_UTIL_H_
-
-
-#include <iostream>
-#include <string>
-#include <vector>
-
-#include "fdict.h"
-#include "tdict.h"
-#include "wordid.h"
-
-using namespace std;
-
-
-namespace dtrain
-{
-
-
-void register_and_convert(const vector<string>& strs, vector<WordID>& ids);
-void print_FD();
-
-
-} // namespace
-
-
-#endif
-
author	Patrick Simianer <p@simianer.de>	2011-09-23 20:53:15 +0200
committer	Patrick Simianer <p@simianer.de>	2011-09-23 20:53:15 +0200
commit	9bde56ed23b4b97f8193f9f8f582f18086ff17c1 (patch)
tree	83bd5687f2069405537f7f8fbdfbe208a634ca54 /dtrain
parent	4433886ac335e6db7ded081b5ef673490ee27718 (diff)