From dc9fd7a3adc863510d79a718e919b6833a86729c Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 23 Sep 2011 20:53:15 +0200
Subject: begin refactoring

Merge common.h into a new dtrain.h and remove the dtest binary, the test
suite, updater.h and util.*. Rename sample.h to pairsampling.h and replace
the job/ and job2/ directories with a single hstreaming/ setup.
---
.gitignore | 1 +
dtrain/Makefile.am | 7 +-
dtrain/README | 1 +
dtrain/avgweights.rb | 27 ------
dtrain/common.h | 43 ---------
dtrain/dtest.cc | 94 --------------------
dtrain/dtrain.cc | 44 ++++------
dtrain/dtrain.h | 60 +++++++++++++
dtrain/hstreaming/avgweights.rb | 27 ++++++
dtrain/hstreaming/cdec.ini | 8 ++
dtrain/hstreaming/dtrain.ini | 10 +++
dtrain/hstreaming/hadoop-streaming-job.sh | 23 +++++
dtrain/job/avgweights.rb | 30 -------
dtrain/job/cdec.ini | 8 --
dtrain/job/dtrain.ini | 10 ---
dtrain/job/dtrain.sh | 6 --
dtrain/job/hadoop-streaming-job.sh | 23 -----
dtrain/job2/avgweights.rb | 30 -------
dtrain/job2/cdec.ini | 8 --
dtrain/job2/dtrain.ini | 10 ---
dtrain/job2/dtrain.sh | 6 --
dtrain/job2/hadoop-streaming-job.sh | 23 -----
dtrain/pairsampling.h | 64 ++++++++++++++
dtrain/run.sh | 12 ---
dtrain/sample.h | 64 --------------
dtrain/test-reducer | 7 --
dtrain/tests.cc | 141 ------------------------------
dtrain/tests.h | 26 ------
dtrain/updater.h | 107 -----------------------
dtrain/util.cc | 34 -------
dtrain/util.h | 28 ------
31 files changed, 213 insertions(+), 769 deletions(-)
delete mode 100755 dtrain/avgweights.rb
delete mode 100644 dtrain/common.h
delete mode 100644 dtrain/dtest.cc
create mode 100644 dtrain/dtrain.h
create mode 100755 dtrain/hstreaming/avgweights.rb
create mode 100644 dtrain/hstreaming/cdec.ini
create mode 100644 dtrain/hstreaming/dtrain.ini
create mode 100755 dtrain/hstreaming/hadoop-streaming-job.sh
delete mode 100755 dtrain/job/avgweights.rb
delete mode 100644 dtrain/job/cdec.ini
delete mode 100644 dtrain/job/dtrain.ini
delete mode 100755 dtrain/job/dtrain.sh
delete mode 100755 dtrain/job/hadoop-streaming-job.sh
delete mode 100755 dtrain/job2/avgweights.rb
delete mode 100644 dtrain/job2/cdec.ini
delete mode 100644 dtrain/job2/dtrain.ini
delete mode 100755 dtrain/job2/dtrain.sh
delete mode 100755 dtrain/job2/hadoop-streaming-job.sh
create mode 100644 dtrain/pairsampling.h
delete mode 100755 dtrain/run.sh
delete mode 100644 dtrain/sample.h
delete mode 100644 dtrain/test-reducer
delete mode 100644 dtrain/tests.cc
delete mode 100644 dtrain/tests.h
delete mode 100644 dtrain/updater.h
delete mode 100644 dtrain/util.cc
delete mode 100644 dtrain/util.h
diff --git a/.gitignore b/.gitignore
index 95262a09..2a5979cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -130,3 +130,4 @@ training/mpi_em_optimize
training/test_ngram
utils/ts
training/compute_cllh
+dtrain/dtrain
diff --git a/dtrain/Makefile.am b/dtrain/Makefile.am
index c08cd1ea..9b5df8bf 100644
--- a/dtrain/Makefile.am
+++ b/dtrain/Makefile.am
@@ -1,11 +1,8 @@
# TODO I'm sure I can leave something out.
-bin_PROGRAMS = dtrain dtest
+bin_PROGRAMS = dtrain
-dtrain_SOURCES = dtrain.cc score.cc tests.cc util.cc sample_hg.cc
+dtrain_SOURCES = dtrain.cc score.cc sample_hg.cc
dtrain_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz -lboost_filesystem -lboost_iostreams
-dtest_SOURCES = dtest.cc score.cc util.cc
-dtest_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz
-
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval
diff --git a/dtrain/README b/dtrain/README
index b3f513be..0cc52acc 100644
--- a/dtrain/README
+++ b/dtrain/README
@@ -31,6 +31,7 @@ TODO
use separate TEST SET
KNOWN BUGS PROBLEMS
+ if size of candidate < N => 0 score
cdec kbest vs 1best (no -k param), rescoring? => ok(?)
no sparse vector in decoder => ok
? ok
diff --git a/dtrain/avgweights.rb b/dtrain/avgweights.rb
deleted file mode 100755
index d5cfaa4d..00000000
--- a/dtrain/avgweights.rb
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-c = {}
-w.default = 0
-c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/common.h b/dtrain/common.h
deleted file mode 100644
index 49dc85b7..00000000
--- a/dtrain/common.h
+++ /dev/null
@@ -1,43 +0,0 @@
-#ifndef _DTRAIN_COMMON_H_
-#define _DTRAIN_COMMON_H_
-
-
-#include
-#include
-#include
-#include
-#include
-#include
-
-// cdec includes
-#include "sentence_metadata.h"
-#include "verbose.h"
-#include "viterbi.h"
-#include "kbest.h"
-#include "ff_register.h"
-#include "decoder.h"
-#include "weights.h"
-
-// boost includes
-#include
-#include
-
-// own headers
-#include "score.h"
-
-#define DTRAIN_DEFAULT_K 100 // k for kbest lists
-#define DTRAIN_DEFAULT_N 4 // N for ngrams (e.g. BLEU)
-#define DTRAIN_DEFAULT_T 1 // iterations
-#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer
-#define DTRAIN_DOTS 100 // when to display a '.'
-#define DTRAIN_TMP_DIR "/var/hadoop/mapred/local" // put this on a SSD?
-#define DTRAIN_GRAMMAR_DELIM "########EOS########"
-
-
-using namespace std;
-using namespace dtrain;
-namespace po = boost::program_options;
-
-
-#endif
-
diff --git a/dtrain/dtest.cc b/dtrain/dtest.cc
deleted file mode 100644
index 36c880a3..00000000
--- a/dtrain/dtest.cc
+++ /dev/null
@@ -1,94 +0,0 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-
-
-/*
- * init
- *
- */
-bool
-init(int argc, char** argv, po::variables_map* conf)
-{
- int N;
- po::options_description opts( "Command Line Options" );
- opts.add_options()
- ( "decoder-config,c", po::value(), "configuration file for cdec" )
- ( "weights,w", po::value(), "weights file" )
- ( "ngrams,n", po::value(&N)->default_value(DTRAIN_DEFAULT_N), "N for Ngrams (default 5)" );
- po::options_description cmdline_options;
- cmdline_options.add(opts);
- po::store( parse_command_line(argc, argv, cmdline_options), *conf );
- po::notify( *conf );
- if ( ! (conf->count("decoder-config") || conf->count("weights")) ) {
- cerr << cmdline_options << endl;
- return false;
- }
- return true;
-}
-
-
-/*
- * main
- *
- */
-int
-main(int argc, char** argv)
-{
- SetSilent( true );
- po::variables_map conf;
- if ( !init(argc, argv, &conf) ) return 1;
- register_feature_functions();
- size_t k = 1;
- ReadFile ini_rf( conf["decoder-config"].as<string>() );
- Decoder decoder( ini_rf.stream() );
- KBestGetter observer( k, "no" );
- size_t N = conf["ngrams"].as();
-
- Weights weights;
- if ( conf.count("weights") ) weights.InitFromFile( conf["weights"].as() );
- vector w;
- weights.InitVector( &w );
- decoder.SetWeights( w );
-
- vector in_split, ref_strs;
- vector ref_ids;
- string in, psg;
- size_t sn = 0;
- double overall = 0.0;
- double overall1 = 0.0;
- double overall2 = 0.0;
- while( getline(cin, in) ) {
- in_split.clear();
- boost::split( in_split, in, boost::is_any_of("\t") );
- // grammar
- psg = boost::replace_all_copy( in_split[3], " __NEXT__RULE__ ", "\n" ); psg += "\n";
- decoder.SetSentenceGrammarFromString( psg );
- decoder.Decode( in_split[1], &observer );
- KBestList* kb = observer.GetKBest();
- // reference
- ref_strs.clear(); ref_ids.clear();
- boost::split( ref_strs, in_split[2], boost::is_any_of(" ") );
- register_and_convert( ref_strs, ref_ids );
- // scoring kbest
- double score = 0.0;
- double score1 = 0.0;
- double score2 = 0.0;
- NgramCounts counts = make_ngram_counts( ref_ids, kb->sents[0], N );
- score = smooth_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- score1 = stupid_bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- score2 = bleu( counts, ref_ids.size(), kb->sents[0].size(), N );
- cout << TD::GetString( kb->sents[0] ) << endl;
- overall += score;
- overall1 += score1;
- overall2 += score2;
- sn += 1;
- }
- cerr << "Average score (smooth) : " << overall/(double)(sn+1) << endl;
- cerr << "Average score (stupid) : " << overall1/(double)(sn+1) << endl;
- cerr << "Average score (vanilla): " << overall2/(double)(sn+1) << endl;
- cerr << endl;
-
- return 0;
-}
-
diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc
index f005008e..01821b30 100644
--- a/dtrain/dtrain.cc
+++ b/dtrain/dtrain.cc
@@ -1,24 +1,19 @@
-#include "common.h"
-#include "kbestget.h"
-#include "util.h"
-#include "sample.h"
+#include "dtrain.h"
-#include "ksampler.h"
-// boost compression
-#include
-#include
-#include
-//#include
-//#include
-using namespace boost::iostreams;
-#include
-#include
-
-#ifdef DTRAIN_DEBUG
-#include "tests.h"
-#endif
+/*
+ * register_and_convert
+ *
+ */
+void
+register_and_convert(const vector<string>& strs, vector<WordID>& ids)
+{
+ vector<string>::const_iterator it;
+ for ( it = strs.begin(); it < strs.end(); it++ ) {
+ ids.push_back( TD::Convert( *it ) );
+ }
+}
/*
@@ -49,12 +44,7 @@ init(int argc, char** argv, po::variables_map* cfg)
clo.add_options()
( "config,c", po::value(), "dtrain config file" )
( "quiet,q", po::value()->zero_tokens(), "be quiet" )
- ( "verbose,v", po::value()->zero_tokens(), "be verbose" )
-#ifndef DTRAIN_DEBUG
- ;
-#else
- ( "test", "run tests and exit");
-#endif
+ ( "verbose,v", po::value()->zero_tokens(), "be verbose" );
po::options_description config_options, cmdline_options;
config_options.add(conff);
@@ -149,9 +139,9 @@ main( int argc, char** argv )
if ( !quiet )
cout << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl;
Decoder decoder( ini_rf.stream() );
- //KBestGetter observer( k, filter_type );
+ KBestGetter observer( k, filter_type );
MT19937 rng;
- KSampler observer( k, &rng );
+ //KSampler observer( k, &rng );
// scoring metric/scorer
string scorer_str = cfg["scorer"].as<string>();
@@ -433,7 +423,7 @@ main( int argc, char** argv )
}
++sid;
- cerr << "reporter:counter:dtrain,sent," << sid << endl;
+ //cerr << "reporter:counter:dtrain,sent," << sid << endl;
} // input loop
diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h
new file mode 100644
index 00000000..3d319233
--- /dev/null
+++ b/dtrain/dtrain.h
@@ -0,0 +1,60 @@
+#ifndef _DTRAIN_COMMON_H_
+#define _DTRAIN_COMMON_H_
+
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+// cdec includes
+#include "sentence_metadata.h"
+#include "verbose.h"
+#include "viterbi.h"
+#include "kbest.h"
+#include "ff_register.h"
+#include "decoder.h"
+#include "weights.h"
+
+// boost includes
+#include
+#include
+
+// own headers
+#include "score.h"
+
+#define DTRAIN_DEFAULT_K 100 // k for kbest lists
+#define DTRAIN_DEFAULT_N 4 // N for ngrams (e.g. BLEU)
+#define DTRAIN_DEFAULT_T 1 // iterations
+#define DTRAIN_DEFAULT_SCORER "stupid_bleu" // scorer
+#define DTRAIN_DOTS 100 // when to display a '.'
+#define DTRAIN_TMP_DIR "/tmp" // put this on a SSD?
+#define DTRAIN_GRAMMAR_DELIM "########EOS########"
+
+
+#include "kbestget.h"
+#include "pairsampling.h"
+
+#include "ksampler.h"
+
+// boost compression
+#include
+#include
+#include
+//#include
+//#include
+using namespace boost::iostreams;
+
+#include
+#include
+
+
+using namespace std;
+using namespace dtrain;
+namespace po = boost::program_options;
+
+
+#endif
+
diff --git a/dtrain/hstreaming/avgweights.rb b/dtrain/hstreaming/avgweights.rb
new file mode 100755
index 00000000..d5cfaa4d
--- /dev/null
+++ b/dtrain/hstreaming/avgweights.rb
@@ -0,0 +1,27 @@
+#!/usr/bin/env ruby1.9.1
+
+
+STDIN.set_encoding 'utf-8'
+
+#shard_count_key = "__SHARD_COUNT__"
+
+w = {}
+c = {}
+w.default = 0
+c.default = 0
+while line = STDIN.gets
+ key, val = line.split /\t/
+ w[key] += val.to_f
+ c[key] += 1.0
+end
+
+#shard_count = w["__SHARD_COUNT__"]
+
+w.each_key { |k|
+ #if k == shard_count_key then next end
+ #if k == "__bias" then next end
+ puts "#{k}\t#{w[k]/c[k]}" #{w[k]/shard_count}"
+}
+
+#puts "#{shard_count_key}\t#{w[shard_count_key]}"
+
diff --git a/dtrain/hstreaming/cdec.ini b/dtrain/hstreaming/cdec.ini
new file mode 100644
index 00000000..0d32f0b7
--- /dev/null
+++ b/dtrain/hstreaming/cdec.ini
@@ -0,0 +1,8 @@
+formalism=scfg
+add_pass_through_rules=true
+feature_function=WordPenalty
+cubepruning_pop_limit=30
+feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
+feature_function=RuleIdentityFeatures
+scfg_max_span_limit=15
+
diff --git a/dtrain/hstreaming/dtrain.ini b/dtrain/hstreaming/dtrain.ini
new file mode 100644
index 00000000..079d7d69
--- /dev/null
+++ b/dtrain/hstreaming/dtrain.ini
@@ -0,0 +1,10 @@
+decoder_config=cdec.ini
+kbest=100
+ngrams=4
+epochs=10
+input=-
+scorer=stupid_bleu
+output=-
+#stop_after=100
+#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
+
diff --git a/dtrain/hstreaming/hadoop-streaming-job.sh b/dtrain/hstreaming/hadoop-streaming-job.sh
new file mode 100755
index 00000000..2cf3f50a
--- /dev/null
+++ b/dtrain/hstreaming/hadoop-streaming-job.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+HADOOP_HOME=/usr/lib/hadoop-0.20
+JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
+HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
+
+IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m
+OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
+
+$HSTREAMING \
+ -mapper "dtrain.sh" \
+ -reducer "avgweights.rb" \
+ -input $IN \
+ -output $OUT \
+ -file avgweights.rb \
+ -file dtrain.sh \
+ -file dtrain \
+ -file dtrain.ini \
+ -file cdec.ini \
+ -file nc-wmt11.en.srilm.3.gz \
+ -jobconf mapred.reduce.tasks=1 \
+ -jobconf mapred.max.map.failures.percent=100
+
diff --git a/dtrain/job/avgweights.rb b/dtrain/job/avgweights.rb
deleted file mode 100755
index e635aab4..00000000
--- a/dtrain/job/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 104.0
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/num_map}"
- #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job/cdec.ini b/dtrain/job/cdec.ini
deleted file mode 100644
index 0d32f0b7..00000000
--- a/dtrain/job/cdec.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-feature_function=WordPenalty
-cubepruning_pop_limit=30
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
-scfg_max_span_limit=15
-
diff --git a/dtrain/job/dtrain.ini b/dtrain/job/dtrain.ini
deleted file mode 100644
index 079d7d69..00000000
--- a/dtrain/job/dtrain.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-decoder_config=cdec.ini
-kbest=100
-ngrams=4
-epochs=10
-input=-
-scorer=stupid_bleu
-output=-
-#stop_after=100
-#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-
diff --git a/dtrain/job/dtrain.sh b/dtrain/job/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job/hadoop-streaming-job.sh b/dtrain/job/hadoop-streaming-job.sh
deleted file mode 100755
index 2cf3f50a..00000000
--- a/dtrain/job/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
-IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain.1400m
-OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights-1400m
-
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "avgweights.rb" \
- -input $IN \
- -output $OUT \
- -file avgweights.rb \
- -file dtrain.sh \
- -file dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file nc-wmt11.en.srilm.3.gz \
- -jobconf mapred.reduce.tasks=1 \
- -jobconf mapred.max.map.failures.percent=100
-
diff --git a/dtrain/job2/avgweights.rb b/dtrain/job2/avgweights.rb
deleted file mode 100755
index 31048f16..00000000
--- a/dtrain/job2/avgweights.rb
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env ruby1.9.1
-
-
-STDIN.set_encoding 'utf-8'
-
-#shard_count_key = "__SHARD_COUNT__"
-
-w = {}
-#c = {}
-w.default = 0
-#c.default = 0
-while line = STDIN.gets
- key, val = line.split /\t/
- w[key] += val.to_f
- #c[key] += 1.0
-end
-
-#shard_count = w["__SHARD_COUNT__"]
-
-num_map = 2107.0
-
-w.each_key { |k|
- #if k == shard_count_key then next end
- #if k == "__bias" then next end
- puts "#{k}\t#{w[k]/num_map}"
- #/c[k]}" #{w[k]/shard_count}"
-}
-
-#puts "#{shard_count_key}\t#{w[shard_count_key]}"
-
diff --git a/dtrain/job2/cdec.ini b/dtrain/job2/cdec.ini
deleted file mode 100644
index 0d32f0b7..00000000
--- a/dtrain/job2/cdec.ini
+++ /dev/null
@@ -1,8 +0,0 @@
-formalism=scfg
-add_pass_through_rules=true
-feature_function=WordPenalty
-cubepruning_pop_limit=30
-feature_function=KLanguageModel nc-wmt11.en.srilm.3.gz
-feature_function=RuleIdentityFeatures
-scfg_max_span_limit=15
-
diff --git a/dtrain/job2/dtrain.ini b/dtrain/job2/dtrain.ini
deleted file mode 100644
index ec005e46..00000000
--- a/dtrain/job2/dtrain.ini
+++ /dev/null
@@ -1,10 +0,0 @@
-decoder_config=cdec.ini
-kbest=100
-ngrams=3
-epochs=10
-input=-
-scorer=stupid_bleu
-output=-
-#stop_after=100
-#wprint=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough
-
diff --git a/dtrain/job2/dtrain.sh b/dtrain/job2/dtrain.sh
deleted file mode 100755
index 75ec29ea..00000000
--- a/dtrain/job2/dtrain.sh
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/sh
-
-./dtrain -q -c dtrain.ini
-
-exit 0
-
diff --git a/dtrain/job2/hadoop-streaming-job.sh b/dtrain/job2/hadoop-streaming-job.sh
deleted file mode 100755
index 9ee70a33..00000000
--- a/dtrain/job2/hadoop-streaming-job.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-HADOOP_HOME=/usr/lib/hadoop-0.20
-JAR=contrib/streaming/hadoop-streaming-0.20.2-cdh3u1.jar
-HSTREAMING="$HADOOP_HOME/bin/hadoop jar $HADOOP_HOME/$JAR"
-
-IN=in/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain
-OUT=out/nc-wmt11-de-en-dyer-cs-joshua.tok.lc.fixamp1.loo.psg.dtrain-weights.50
-
-$HSTREAMING \
- -mapper "dtrain.sh" \
- -reducer "avgweights.rb" \
- -input $IN \
- -output $OUT \
- -file avgweights.rb \
- -file dtrain.sh \
- -file dtrain \
- -file dtrain.ini \
- -file cdec.ini \
- -file nc-wmt11.en.srilm.3.gz \
- -jobconf mapred.reduce.tasks=1 \
- -jobconf mapred.max.map.failures.percent=100
-
diff --git a/dtrain/pairsampling.h b/dtrain/pairsampling.h
new file mode 100644
index 00000000..502901af
--- /dev/null
+++ b/dtrain/pairsampling.h
@@ -0,0 +1,64 @@
+#ifndef _DTRAIN_SAMPLE_H_
+#define _DTRAIN_SAMPLE_H_
+
+
+#include "kbestget.h"
+
+
+namespace dtrain
+{
+
+
+struct TPair
+{
+ SparseVector<double> first, second;
+ size_t first_rank, second_rank;
+ double first_score, second_score;
+};
+
+typedef vector<TPair> TrainingInstances;
+
+
+void
+sample_all( KBestList* kb, TrainingInstances &training )
+{
+ for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
+ for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+ TPair p;
+ p.first = kb->feats[i];
+ p.second = kb->feats[j];
+ p.first_rank = i;
+ p.second_rank = j;
+ p.first_score = kb->scores[i];
+ p.second_score = kb->scores[j];
+ training.push_back( p );
+ }
+ }
+}
+
+void
+sample_rand( KBestList* kb, TrainingInstances &training )
+{
+ srand( time(NULL) );
+ for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
+ for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
+ if ( rand() % 2 ) {
+ TPair p;
+ p.first = kb->feats[i];
+ p.second = kb->feats[j];
+ p.first_rank = i;
+ p.second_rank = j;
+ p.first_score = kb->scores[i];
+ p.second_score = kb->scores[j];
+ training.push_back( p );
+ }
+ }
+ }
+}
+
+
+} // namespace
+
+
+#endif
+
diff --git a/dtrain/run.sh b/dtrain/run.sh
deleted file mode 100755
index 72e56f3e..00000000
--- a/dtrain/run.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh
-
-#INI=test/blunsom08.dtrain.ini
-#INI=test/nc-wmt11/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ini
-#INI=test/EXAMPLE/dtrain.ruleids.ini
-INI=test/toy.dtrain.ini
-#INI=test/EXAMPLE/dtrain.cdecrid.ini
-
-#rm /tmp/dtrain-*
-./dtrain -c $INI $1 $2 $3 $4
-
diff --git a/dtrain/sample.h b/dtrain/sample.h
deleted file mode 100644
index 502901af..00000000
--- a/dtrain/sample.h
+++ /dev/null
@@ -1,64 +0,0 @@
-#ifndef _DTRAIN_SAMPLE_H_
-#define _DTRAIN_SAMPLE_H_
-
-
-#include "kbestget.h"
-
-
-namespace dtrain
-{
-
-
-struct TPair
-{
- SparseVector<double> first, second;
- size_t first_rank, second_rank;
- double first_score, second_score;
-};
-
-typedef vector<TPair> TrainingInstances;
-
-
-void
-sample_all( KBestList* kb, TrainingInstances &training )
-{
- for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
- for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
- TPair p;
- p.first = kb->feats[i];
- p.second = kb->feats[j];
- p.first_rank = i;
- p.second_rank = j;
- p.first_score = kb->scores[i];
- p.second_score = kb->scores[j];
- training.push_back( p );
- }
- }
-}
-
-void
-sample_rand( KBestList* kb, TrainingInstances &training )
-{
- srand( time(NULL) );
- for ( size_t i = 0; i < kb->GetSize()-1; i++ ) {
- for ( size_t j = i+1; j < kb->GetSize(); j++ ) {
- if ( rand() % 2 ) {
- TPair p;
- p.first = kb->feats[i];
- p.second = kb->feats[j];
- p.first_rank = i;
- p.second_rank = j;
- p.first_score = kb->scores[i];
- p.second_score = kb->scores[j];
- training.push_back( p );
- }
- }
- }
-}
-
-
-} // namespace
-
-
-#endif
-
diff --git a/dtrain/test-reducer b/dtrain/test-reducer
deleted file mode 100644
index b86e7894..00000000
--- a/dtrain/test-reducer
+++ /dev/null
@@ -1,7 +0,0 @@
-a 1
-b 2
-c 3.5
-a 1
-b 2
-c 3.5
-__SHARD_COUNT__ 2
diff --git a/dtrain/tests.cc b/dtrain/tests.cc
deleted file mode 100644
index 997eafbb..00000000
--- a/dtrain/tests.cc
+++ /dev/null
@@ -1,141 +0,0 @@
-#include "tests.h"
-
-
-namespace dtrain
-{
-
-
-/*
- * approx_equal
- *
- */
-double
-approx_equal( double x, double y )
-{
- const double EPSILON = 1E-5;
- if ( x == 0 ) return fabs( y ) <= EPSILON;
- if ( y == 0 ) return fabs( x ) <= EPSILON;
- return fabs( x - y ) / max( fabs(x), fabs(y) ) <= EPSILON;
-}
-
-
-/*
- * test_ngrams
- *
- */
-void
-test_ngrams()
-{
- cout << "Testing ngrams..." << endl << endl;
- size_t N = 5;
- cout << "N = " << N << endl;
- vector<WordID> a; // hyp
- vector<WordID> b; // ref
- cout << "a ";
- for (size_t i = 1; i <= 8; i++) {
- cout << i << " ";
- a.push_back(i);
- }
- cout << endl << "b ";
- for (size_t i = 1; i <= 4; i++) {
- cout << i << " ";
- b.push_back(i);
- }
- cout << endl << endl;
- NgramCounts c = make_ngram_counts( a, b, N );
- assert( c.clipped[N-1] == 0 );
- assert( c.sum[N-1] == 4 );
- c.print();
- c += c;
- cout << endl;
- c.print();
- cout << endl;
-}
-
-
-/*
- * test_metrics
- *
- */
-void
-test_metrics()
-{
- cout << "Testing metrics..." << endl << endl;
- using namespace boost::assign;
- vector<string> a, b;
- vector<double> expect_vanilla, expect_smooth, expect_stupid;
- a += "a a a a", "a a a a", "a", "a", "b", "a a a a", "a a", "a a a", "a b a"; // hyp
- b += "b b b b", "a a a a", "a", "b", "b b b b", "a", "a a", "a a a", "a b b"; // ref
- expect_vanilla += 0, 1, 1, 0, 0, .25, 1, 1, 0;
- expect_smooth += 0, .9375, .0625, 0, .00311169, .0441942, .1875, .4375, .161587;
- expect_stupid += 0, 1, 1, 0, .0497871, .25, 1, 1, .605707;
- vector<string> aa, bb;
- vector<WordID> aai, bbi;
- double vanilla, smooth, stupid;
- size_t N = 4;
- cout << "N = " << N << endl << endl;
- for ( size_t i = 0; i < a.size(); i++ ) {
- cout << " hyp: " << a[i] << endl;
- cout << " ref: " << b[i] << endl;
- aa.clear(); bb.clear(); aai.clear(); bbi.clear();
- boost::split( aa, a[i], boost::is_any_of(" ") );
- boost::split( bb, b[i], boost::is_any_of(" ") );
- register_and_convert( aa, aai );
- register_and_convert( bb, bbi );
- NgramCounts counts = make_ngram_counts( aai, bbi, N );
- vanilla = bleu( counts, aa.size(), bb.size(), N);
- smooth = smooth_bleu( counts, aa.size(), bb.size(), N);
- stupid = stupid_bleu( counts, aa.size(), bb.size(), N);
- assert( approx_equal(vanilla, expect_vanilla[i]) );
- assert( approx_equal(smooth, expect_smooth[i]) );
- assert( approx_equal(stupid, expect_stupid[i]) );
- cout << setw(14) << "bleu = " << vanilla << endl;
- cout << setw(14) << "smooth bleu = " << smooth << endl;
- cout << setw(14) << "stupid bleu = " << stupid << endl << endl;
- }
- cout << endl;
-}
-
-
-/*
- * test_SetWeights
- *
- */
-void
-test_SetWeights()
-{
- cout << "Testing Weights::SetWeight..." << endl << endl;
- Weights weights;
- SparseVector<double> lambdas;
- weights.InitSparseVector( &lambdas );
- weights.SetWeight( &lambdas, "test", 0 );
- weights.SetWeight( &lambdas, "test1", 1 );
- WordID fid = FD::Convert( "test2" );
- weights.SetWeight( &lambdas, fid, 2 );
- string fn = "weights-test";
- cout << "FD::NumFeats() " << FD::NumFeats() << endl;
- assert( FD::NumFeats() == 4 );
- weights.WriteToFile( fn, true );
- cout << endl;
-}
-
-
-/*
- * run_tests
- *
- */
-void
-run_tests()
-{
- cout << endl;
- test_ngrams();
- cout << endl;
- test_metrics();
- cout << endl;
- test_SetWeights();
- exit(0);
-}
-
-
-} // namespace
-
diff --git a/dtrain/tests.h b/dtrain/tests.h
deleted file mode 100644
index 9853e3c3..00000000
--- a/dtrain/tests.h
+++ /dev/null
@@ -1,26 +0,0 @@
-#ifndef _DTRAIN_TESTS_H_
-#define _DTRAIN_TESTS_H_
-
-#include
-#include
-
-#include "common.h"
-#include "util.h"
-
-
-namespace dtrain
-{
-
-
-double approx_equal( double x, double y );
-void test_ngrams();
-void test_metrics();
-void test_SetWeights();
-void run_tests();
-
-
-} // namespace
-
-
-#endif
-
diff --git a/dtrain/updater.h b/dtrain/updater.h
deleted file mode 100644
index b54c25de..00000000
--- a/dtrain/updater.h
+++ /dev/null
@@ -1,107 +0,0 @@
-#ifndef _DTRAIN_LEARNER_H_
-#define _DTRAIN_LEARNER_H_
-
-#include
-#include
-#include