Migrate vest to the new metric API; clean up unsupported/non-functional code

author: Chris Dyer <cdyer@cs.cmu.edu> 2012-01-27 13:19:27 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-01-27 13:19:27 -0500
commit: 3d17bf9ae1ba67cd091794839d4d5f4c393a0e2c (patch)
tree: 8e85fa3da0759c6df13ca306f95c026743989eba
parent: 3c1c98b5aec7aec34432ddc37385df06d301bdd5 (diff)
6 files changed, 84 insertions, 475 deletions
diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc
index 64a6a8bf..b5e4750c 100644
--- a/mteval/mbr_kbest.cc
+++ b/mteval/mbr_kbest.cc
@@ -5,7 +5,7 @@
 
 #include "prob.h"
 #include "tdict.h"
-#include "scorer.h"
+#include "ns.h"
 #include "filelib.h"
 #include "stringlib.h"
 
@@ -17,7 +17,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)")
-        ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function")
+        ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric")
         ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from")
         ("output_list,L", "Show reranked list as output")
         ("help,h", "Help");
@@ -75,13 +75,14 @@ bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, pro
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const string metric = conf["loss_function"].as<string>();
+  const string smetric = conf["evaluation_metric"].as<string>();
+  EvaluationMetric* metric = EvaluationMetric::Instance(smetric);
+  const bool is_loss = (UppercaseString(smetric) == "TER");
   const bool output_list = conf.count("output_list") > 0;
   const string file = conf["input"].as<string>();
   const double mbr_scale = conf["scale"].as<double>();
   cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl;
 
-  ScoreType type = ScoreTypeFromString(metric);
   vector<pair<vector<WordID>, prob_t> > list;
   ReadFile rf(file);
   string sent_id;
@@ -99,15 +100,15 @@ int main(int argc, char** argv) {
     vector<double> mbr_scores(output_list ? list.size() : 0);
     double mbr_loss = numeric_limits<double>::max();
     for (int i = 0 ; i < list.size(); ++i) {
-      vector<vector<WordID> > refs(1, list[i].first);
-      //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl;
-      ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs);
+      const vector<vector<WordID> > refs(1, list[i].first);
+
       double wl_acc = 0;
       for (int j = 0; j < list.size(); ++j) {
         if (i != j) {
-          ScoreP s = scorer->ScoreCandidate(list[j].first);
-          double loss = 1.0 - s->ComputeScore();
-          if (type == TER || type == AER) loss = 1.0 - loss;
+          SufficientStats ss;
+          metric->ComputeSufficientStatistics(list[j].first, refs, &ss);
+          double loss = 1.0 - metric->ComputeScore(ss);
+          if (is_loss) loss = 1.0 - loss;
           double weighted_loss = loss * (joints[j] / marginal).as_float();
           wl_acc += weighted_loss;
           if ((!output_list) && wl_acc > mbr_loss) break;
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 1301581a..17fa47bf 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -178,6 +178,12 @@ class FastSparseVector {
   T l2norm() const {
     return sqrt(l2norm_sq());
   }
+  T pnorm(const double p) const {  // p-norm over the stored entries: (sum_i |v_i|^p)^(1/p)
+    T total = T();
+    const_iterator cur = begin(), stop = end();
+    for (; cur != stop; ++cur) total += pow(fabs(cur->second), p);
+    return pow(total, 1.0 / p);
+  }
   // if values are binary, gives |A intersect B|/|A union B|
   template<typename S>
   S tanimoto_coef(const FastSparseVector<S> &vec) const {
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 8cde748b..1ec8c6b1 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -65,8 +65,6 @@ my $oraclen=0;
 my $oracleb=20;
 my $bleu_weight=1;
 my $use_make = 1;  # use make to parallelize line search
-my $dirargs='';
-my $density_prune;
 my $useqsub;
 my $pass_suffix = '';
 my $cpbin=1;
@@ -75,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev");
 if (GetOptions(
 	"decoder=s" => \$decoderOpt,
 	"jobs=i" => \$jobs,
-	"density-prune=f" => \$density_prune,
 	"dont-clean" => \$disable_clean,
 	"pass-suffix=s" => \$pass_suffix,
 	"dry-run" => \$dryrun,
@@ -87,15 +84,7 @@ if (GetOptions(
 	"normalize=s" => \$normalize,
 	"pmem=s" => \$pmem,
         "cpbin!" => \$cpbin,
-	"rand-directions=i" => \$rand_directions,
-	"random_directions=i" => \$rand_directions,
-        "bleu_weight=s" => \$bleu_weight,
-        "no-primary!" => \$noprimary,
-        "max-similarity=s" => \$maxsim,
-        "oracle-directions=i" => \$oraclen,
-        "n-oracle=i" => \$oraclen,
-        "oracle-batch=i" => \$oracleb,
-        "directions-args=s" => \$dirargs,
+	"random-directions=i" => \$rand_directions,
 	"ref-files=s" => \$refFiles,
 	"metric=s" => \$metric,
 	"source-file=s" => \$srcFile,
@@ -107,10 +96,6 @@ if (GetOptions(
 	exit;
 }
 
-if (defined $density_prune) {
-  die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
-}
-
 if ($useqsub) {
   $use_make = 0;
   die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
@@ -328,10 +313,7 @@ while (1){
 		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
 		print STDERR unchecked_output("date");
 		$icc++;
-		my $nop=$noprimary?"--no_primary":"";
-		my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
-		my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
-		$cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
+		$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
 		print STDERR "COMMAND:\n$cmd\n";
 		check_call($cmd);
 		check_call("mkdir -p $dir/splag.$im1");
diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc
deleted file mode 100644
index 2867b36b..00000000
--- a/vest/mbr_kbest.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-#include <iostream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-
-#include "prob.h"
-#include "tdict.h"
-#include "scorer.h"
-#include "filelib.h"
-#include "stringlib.h"
-
-using namespace std;
-
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)")
-        ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function")
-        ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from")
-        ("output_list,L", "Show reranked list as output")
-        ("help,h", "Help");
-  po::options_description dcmdline_options;
-  dcmdline_options.add(opts);
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  bool flag = false;
-  if (flag || conf->count("help")) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-struct LossComparer {
-  bool operator()(const pair<vector<WordID>, double>& a, const pair<vector<WordID>, double>& b) const {
-    return a.second < b.second;
-  }
-};
-
-bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) {
-  static string cache_id;
-  static pair<vector<WordID>, prob_t> cache_pair;
-  list->clear();
-  string cur_id;
-  if (cache_pair.first.size() > 0) {
-    list->push_back(cache_pair);
-    cur_id = cache_id;
-    cache_pair.first.clear();
-  }
-  string line;
-  string tstr;
-  while(*in) {
-    getline(*in, line);
-    if (line.empty()) continue;
-    size_t p1 = line.find(" ||| ");
-    if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
-    size_t p2 = line.find(" ||| ", p1 + 4);
-    if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
-    size_t p3 = line.rfind(" ||| ");
-    cache_id = line.substr(0, p1);
-    tstr = line.substr(p1 + 5, p2 - p1 - 5);
-    double val = strtod(line.substr(p3 + 5).c_str(), NULL);
-    TD::ConvertSentence(tstr, &cache_pair.first);
-    cache_pair.second.logeq(val);
-    if (cur_id.empty()) cur_id = cache_id;
-    if (cur_id == cache_id) {
-      list->push_back(cache_pair);
-      *sent_id = cur_id;
-      cache_pair.first.clear();
-    } else { break; }
-  }
-  return !list->empty();
-}
-
-int main(int argc, char** argv) {
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const string metric = conf["loss_function"].as<string>();
-  const bool output_list = conf.count("output_list") > 0;
-  const string file = conf["input"].as<string>();
-  const double mbr_scale = conf["scale"].as<double>();
-  cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl;
-
-  ScoreType type = ScoreTypeFromString(metric);
-  vector<pair<vector<WordID>, prob_t> > list;
-  ReadFile rf(file);
-  string sent_id;
-  while(ReadKBestList(rf.stream(), &sent_id, &list)) {
-    vector<prob_t> joints(list.size());
-    const prob_t max_score = pow(list.front().second, mbr_scale);
-    prob_t marginal = prob_t::Zero();
-    for (int i = 0 ; i < list.size(); ++i) {
-      const prob_t joint = pow(list[i].second, mbr_scale) / max_score;
-      joints[i] = joint;
-      // cerr << "list[" << i << "] joint=" << log(joint) << endl;
-      marginal += joint;
-    }
-    int mbr_idx = -1;
-    vector<double> mbr_scores(output_list ? list.size() : 0);
-    double mbr_loss = numeric_limits<double>::max();
-    for (int i = 0 ; i < list.size(); ++i) {
-      vector<vector<WordID> > refs(1, list[i].first);
-      //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl;
-      ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs);
-      double wl_acc = 0;
-      for (int j = 0; j < list.size(); ++j) {
-        if (i != j) {
-          ScoreP s = scorer->ScoreCandidate(list[j].first);
-          double loss = 1.0 - s->ComputeScore();
-          if (type == TER || type == AER) loss = 1.0 - loss;
-          double weighted_loss = loss * (joints[j] / marginal);
-          wl_acc += weighted_loss;
-          if ((!output_list) && wl_acc > mbr_loss) break;
-        }
-      }
-      if (output_list) mbr_scores[i] = wl_acc;
-      if (wl_acc < mbr_loss) {
-        mbr_loss = wl_acc;
-        mbr_idx = i;
-      }
-    }
-    // cerr << "ML translation: " << TD::GetString(list[0].first) << endl;
-    cerr << "MBR Best idx: " << mbr_idx << endl;
-    if (output_list) {
-      for (int i = 0; i < list.size(); ++i)
-        list[i].second.logeq(mbr_scores[i]);
-      sort(list.begin(), list.end(), LossComparer());
-      for (int i = 0; i < list.size(); ++i)
-        cout << sent_id << " ||| "
-             << TD::GetString(list[i].first) << " ||| "
-             << log(list[i].second) << endl;
-    } else {
-      cout << TD::GetString(list[mbr_idx].first) << endl;
-    }
-  }
-  return 0;
-}
-
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index 0c094fd5..59d4f24f 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -1,320 +1,78 @@
-//TODO: debug segfault when references supplied, null shared_ptr when oracle
 #include <iostream>
 #include <vector>
-#include <sstream>
 
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
-#include "sampler.h"
 #include "filelib.h"
 #include "weights.h"
 #include "line_optimizer.h"
-#include "hg.h"
-#include "hg_io.h"
-#include "scorer.h"
-#include "oracle_bleu.h"
-#include "ff_bleu.h"
-
-const bool DEBUG_ORACLE=true;
-
-//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain?  weights (origin) is passed to oracle_bleu.h:ComputeOracle
-//void register_feature_functions();
-//FFRegistry ff_registry;
-namespace {
-void init_bleumodel() {
-  ff_registry.clear();
-  ff_registry.Register(new FFFactory<BLEUModel>);
-}
-
-struct init_ff {
-  init_ff() {
-    init_bleumodel();
-  }
-};
-//init_ff reg; // order of initialization?  ff_registry may not be init yet.  call in Run() instead.
-}
 
 using namespace std;
 namespace po = boost::program_options;
 
-typedef SparseVector<double> Dir;
-typedef Dir Point;
-
-void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) {
-  //  return; //TODO: debug
-  if (min_dist<=0) return;
-  double max_s=1.-min_dist;
-  if (log&&verbose) *log<<"max allowed S="<<max_s<<endl;
-  unsigned N=dirs.size();
-  for (int i=0;i<N;++i) {
-    for (int j=i+1;j<N;++j) {
-      double s=dirs[i].tanimoto_coef(dirs[j]);
-      if (log&&verbose) *log<<"S["<<i<<","<<j<<"]="<<s<<' ';
-      if (s>max_s) {
-        if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<").  dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"<<endl;
-        if (avg) {
-          dirs[i]+=dirs[j];
-          dirs[i]/=2.;
-          if (log) *log<<" averaged="<<dirs[i];
-        }
-        if (log) *log<<endl;
-        swap(dirs[j],dirs[--N]);
-      }
-    }
-    if (log&&verbose) *log<<endl;
-
-  }
-  dirs.resize(N);
-}
-
-struct oracle_directions {
-  MT19937 rng;
-  OracleBleu oracle;
-  vector<Dir> directions;
-
-  bool start_random;
-  bool include_primary;
-  bool old_to_hope;
-  bool fear_to_hope;
-  unsigned n_random;
-  void AddPrimaryAndRandomDirections() {
-    LineOptimizer::CreateOptimizationDirections(
-      fids,n_random,&rng,&directions,include_primary);
-  }
-
-  void Print() {
-    for (int i = 0; i < dev_set_size; ++i)
-      for (int j = 0; j < directions.size(); ++j) {
-        cout << forest_file(i) <<" " << i<<" ";
-        print(cout,origin,"=",";");
-        cout<<" ";
-        print(cout,directions[j],"=",";");
-        cout<<"\n";
-      }
-  }
-
-  void AddOptions(po::options_description *opts) {
-    oracle.AddOptions(opts);
-    opts->add_options()
-      ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)")
-      ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository")
-      ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file")
-      ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
-      ("random_directions,d",po::value<unsigned>(&n_random)->default_value(10),"Number of random directions to run the line optimizer in")
-      ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
-      ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
-      ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it")
-      ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
-      ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)).  0 means don't filter, 1 means only 1 direction allowed?")
-      ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)")
-      ("no_old_to_hope","don't emit the usual old -> hope oracle")
-      ("decoder_translations",po::value<string>(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU")
-      ;
-  }
-  void InitCommandLine(int argc, char *argv[], po::variables_map *conf) {
-    po::options_description opts("Configuration options");
-    AddOptions(&opts);
-    opts.add_options()("help,h", "Help");
-
-    po::options_description dcmdline_options;
-    dcmdline_options.add(opts);
-    po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-    po::notify(*conf);
-    if (conf->count("dev_set_size") == 0) {
-      cerr << "Please specify the size of the development set using -s N\n";
-      goto bad_cmdline;
-    }
-    if (conf->count("weights") == 0) {
-      cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
-      goto bad_cmdline;
-    }
-    if (conf->count("forest_repository") == 0) {
-      cerr << "Please specify the forest repository location using -r <DIR>\n";
-      goto bad_cmdline;
-    }
-    if (n_oracle && oracle.refs.empty()) {
-      cerr<<"Specify references when using oracle directions\n";
-      goto bad_cmdline;
-    }
-    if (conf->count("help")) {
-      cout << dcmdline_options << endl;
-      exit(0);
-    }
-
-    return;
-    bad_cmdline:
-      cerr << dcmdline_options << endl;
-      exit(1);
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  // fills *conf from argv; prints usage and exits with status 1 on --help or a missing [REQD] option
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)")
+        ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+        ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+        ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+        ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+        ("help,h", "Help");
+  po::options_description dcmdline_options;
+  dcmdline_options.add(opts);
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  bool flag = false;  // set to true as soon as any required option is found missing
+  if (conf->count("dev_set_size") == 0) {
+    cerr << "Please specify the size of the development set using -s N\n";  // dev_set_size is -s; -d is random_directions
+    flag = true;
   }
-
-  int main(int argc, char *argv[]) {
-    po::variables_map conf;
-    InitCommandLine(argc,argv,&conf);
-    init_bleumodel();
-    UseConf(conf);
-    Run();
-    return 0;
+  if (conf->count("weights") == 0) {
+    cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+    flag = true;
   }
-  bool verbose() const { return oracle.verbose; }
-  void Run() {
-//    register_feature_functions();
-    AddPrimaryAndRandomDirections();
-    AddOracleDirections();
-    compress_similar(directions,max_similarity,&cerr,true,verbose());
-    Print();
+  if (conf->count("forest_repository") == 0) {
+    cerr << "Please specify the forest repository location using -r <DIR>\n";
+    flag = true;
   }
-
-
-  Point origin; // old weights that gave model 1best.
-  vector<string> optimize_features;
-  void UseConf(po::variables_map const& conf) {
-    oracle.UseConf(conf);
-    include_primary=!conf.count("no_primary");
-    old_to_hope=!conf.count("no_old_to_hope");
-
-    if (conf.count("optimize_feature") > 0)
-      optimize_features=conf["optimize_feature"].as<vector<string> >();
-    Init();
+  if (flag || conf->count("help")) {  // usage is written to stderr and the exit status is 1, even for an explicit --help
+    cerr << dcmdline_options << endl;
+    exit(1);
   }
+}
 
-  string weights_file;
-  double max_similarity;
-  unsigned n_oracle, oracle_batch;
-  string forest_repository;
-  unsigned dev_set_size;
-  vector<Oracle> oracles;
-  vector<int> fids;
-  string forest_file(unsigned i) const {
-    ostringstream o;
-    o << forest_repository << '/' << i << ".json.gz";
-    return o.str();
-  }
-
-  oracle_directions() { }
-
-  Sentences model_hyps;
-
-  vector<ScoreP> model_scores;
-  bool have_doc;
-  void Init() {
-    have_doc=!decoder_translations_file.empty();
-    if (have_doc) {
-      model_hyps.Load(decoder_translations_file);
-      if (verbose()) model_hyps.Print(cerr,5);
-      model_scores.resize(model_hyps.size());
-      if (dev_set_size!=model_hyps.size()) {
-        cerr<<"You supplied decoder_translations with a different number of lines ("<<model_hyps.size()<<") than dev_set_size ("<<dev_set_size<<")"<<endl;
-        abort();
-      }
-      cerr << "Scoring model translations " << model_hyps << endl;
-      for (int i=0;i<model_hyps.size();++i) {
-        //TODO: what is scoreCcand? without clipping? do without for consistency w/ oracle
-        model_scores[i]=oracle.ds[i]->ScoreCandidate(model_hyps[i]);
-        assert(model_scores[i]);
-        if (verbose()) cerr<<"Before model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-        if (verbose()) cerr<<"model["<<i<<"]: "<<model_scores[i]->ScoreDetails()<<endl;
-        oracle.doc_score->PlusEquals(*model_scores[i]);
-        if (verbose()) cerr<<"After model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-      }
-      //TODO: compute doc bleu stats for each sentence, then when getting oracle temporarily exclude stats for that sentence (skip regular score updating)
-    }
-    start_random=false;
-    cerr << "Forest repo: " << forest_repository << endl;
-    assert(DirectoryExists(forest_repository));
-    vector<string> features;
-    vector<weight_t> dorigin;
-    Weights::InitFromFile(weights_file, &dorigin, &features);
-    if (optimize_features.size())
-      features=optimize_features;
-    Weights::InitSparseVector(dorigin, &origin);
-    fids.clear();
-    AddFeatureIds(features);
-    oracles.resize(dev_set_size);
-  }
-
-  void AddFeatureIds(vector<string> const& features) {
-    int i = fids.size();
-    fids.resize(fids.size()+features.size());
-    for (; i < features.size(); ++i)
-      fids[i] = FD::Convert(features[i]);
- }
-
-
-  std::string decoder_translations_file; // one per line
-  //TODO: is it worthwhile to get a complete document bleu first?  would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive).  translations are in run.raw.N.gz - new arg
-  void adjust_doc(unsigned i,double scale=1.) {
-    oracle.doc_score->PlusEquals(*model_scores[i],scale);
-  }
-
-  Score &ds() {
-    return *oracle.doc_score;
-  }
-
-  Oracle const& ComputeOracle(unsigned i) {
-    Oracle &o=oracles[i];
-    if (o.is_null()) {
-      if (have_doc) {
-        if (verbose()) cerr<<"Before removing i="<<i<<" "<<ds().ScoreDetails()<<"\n";
-        adjust_doc(i,-1);
-      }
-      ReadFile rf(forest_file(i));
-      Hypergraph hg;
-      {
-        Timer t("Loading forest from JSON "+forest_file(i));
-        HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-      }
-      if (verbose()) cerr<<"Before oracle["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-      o=oracle.ComputeOracle(oracle.MakeMetadata(hg,i),&hg,origin);
-      if (verbose()) {
-        cerr << o;
-        ScoreP hopesc=oracle.GetScore(o.hope.sentence,i);
-        oracle.doc_score->PlusEquals(*hopesc,1);
-        cerr<<"With hope: "<<ds().ScoreDetails()<<endl;
-        oracle.doc_score->PlusEquals(*hopesc,-1);
-        cerr<<"Without hope: "<<ds().ScoreDetails()<<endl;
-        cerr<<" oracle="<<oracle.GetScore(o.hope.sentence,i)->ScoreDetails()<<endl
-            <<" model="<<oracle.GetScore(o.model.sentence,i)->ScoreDetails()<<endl;
-        if (have_doc)
-          cerr<<" doc (should = model): "<<model_scores[i]->ScoreDetails()<<endl;
-      }
-      if (have_doc) {
-        adjust_doc(i,1);
-      } else
-        oracle.IncludeLastScore();
-    }
-    return o;
-  }
-
-  // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random.  oracle vectors are summed
-  void AddOracleDirections() {
-    MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1);
-    unsigned b=0;
-    for(unsigned i=0;i<n_oracle;++i) {
-      Dir o2hope;
-      Dir fear2hope;
-      for (unsigned j=0;j<oracle_batch;++j,++b) {
-        Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? rsg() : b);
-
-        if (old_to_hope)
-          o2hope+=o.ModelHopeGradient();
-        if (fear_to_hope)
-          fear2hope+=o.FearHopeGradient();
-      }
-      double N=(double)oracle_batch;
-      if (old_to_hope) {
-        o2hope/=N;
-        directions.push_back(o2hope);
-      }
-      if (fear_to_hope) {
-        fear2hope/=N;
-        directions.push_back(fear2hope);
-      }
+int main(int argc, char** argv) {  // writes one "<forest-file> <sent-id> <origin> <direction>" line to stdout per (sentence, direction) pair
+  RandomNumberGenerator<boost::mt19937> rng;  // handed to CreateOptimizationDirections below
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+  vector<string> features;
+  SparseVector<weight_t> origin;
+  vector<weight_t> w;
+  Weights::InitFromFile(conf["weights"].as<string>(), &w, &features);  // NOTE(review): presumably fills w (weights) and features (names) from the file - confirm in weights.h
+  Weights::InitSparseVector(w, &origin);  // NOTE(review): presumably origin = w in sparse form - confirm in weights.h
+  const string forest_repository = conf["forest_repository"].as<string>();
+  assert(DirectoryExists(forest_repository));  // NOTE(review): assert is compiled out under NDEBUG, so this check is debug-only
+  if (conf.count("optimize_feature") > 0)  // an explicit -o list replaces the feature names read from the weights file
+    features=conf["optimize_feature"].as<vector<string> >();
+  vector<SparseVector<weight_t> > directions;
+  vector<int> fids(features.size());
+  for (int i = 0; i < features.size(); ++i)
+    fids[i] = FD::Convert(features[i]);  // convert each feature name with FD::Convert (presumably name -> feature id)
+  LineOptimizer::CreateOptimizationDirections(  // NOTE(review): presumably fills directions; second argument is the --random_directions count
+     fids,
+     conf["random_directions"].as<unsigned int>(),
+     &rng,
+     &directions);
+  unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();
+  for (unsigned i = 0; i < dev_set_size; ++i) {  // every sentence index in [0, dev_set_size) is paired with every direction
+    for (unsigned j = 0; j < directions.size(); ++j) {
+      cout << forest_repository << '/' << i << ".json.gz " << i << ' ';  // forest path is <repo>/<i>.json.gz, followed by the sentence index
+      print(cout, origin, "=", ";");
+      cout << ' ';
+      print(cout, directions[j], "=", ";");
+      cout << endl;  // endl flushes stdout after each record
     }
   }
-};
-
-int main(int argc, char** argv) {
-  oracle_directions od;
-  return od.main(argc,argv);
+  return 0;
 }
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
index 8f6e085d..7d9625bc 100644
--- a/vest/mr_vest_map.cc
+++ b/vest/mr_vest_map.cc
@@ -82,20 +82,20 @@ int main(int argc, char** argv) {
     if (line.empty()) continue;
     istringstream is(line);
     int sent_id;
-    string file, s_origin, s_axis;
+    string file, s_origin, s_direction;
     // path-to-file (JSON) sent_ed starting-point search-direction
-    is >> file >> sent_id >> s_origin >> s_axis;
+    is >> file >> sent_id >> s_origin >> s_direction;
     SparseVector<double> origin;
-    assert(ReadSparseVectorString(s_origin, &origin));
-    SparseVector<double> axis;
-    assert(ReadSparseVectorString(s_axis, &axis));
-    // cerr << "File: " << file << "\nAxis: " << axis << "\n   X: " << origin << endl;
+    ReadSparseVectorString(s_origin, &origin);
+    SparseVector<double> direction;
+    ReadSparseVectorString(s_direction, &direction);
+    // cerr << "File: " << file << "\nDir: " << direction << "\n   X: " << origin << endl;
     if (last_file != file) {
       last_file = file;
       ReadFile rf(file);
       HypergraphIO::ReadFromJSON(rf.stream(), &hg);
     }
-    ViterbiEnvelopeWeightFunction wf(origin, axis);
+    ViterbiEnvelopeWeightFunction wf(origin, direction);
     ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
     ErrorSurface es;
 
@@ -104,7 +104,7 @@ int main(int argc, char** argv) {
     // cerr << "Error surface has " << es.size() << " segments\n";
     string val;
     es.Serialize(&val);
-    cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t';
+    cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t';
     B64::b64encode(val.c_str(), val.size(), &cout);
     cout << endl << flush;
   }
author	Chris Dyer <cdyer@cs.cmu.edu>	2012-01-27 13:19:27 -0500
committer	Chris Dyer <cdyer@cs.cmu.edu>	2012-01-27 13:19:27 -0500
commit	3d17bf9ae1ba67cd091794839d4d5f4c393a0e2c (patch)
tree	8e85fa3da0759c6df13ca306f95c026743989eba
parent	3c1c98b5aec7aec34432ddc37385df06d301bdd5 (diff)