1 files changed, 0 insertions, 320 deletions
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
deleted file mode 100644
index 0c094fd5..00000000
--- a/vest/mr_vest_generate_mapper_input.cc
+++ /dev/null
@@ -1,320 +0,0 @@
-//TODO: debug segfault when references supplied, null shared_ptr when oracle
-#include <iostream>
-#include <vector>
-#include <sstream>
-
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "sampler.h"
-#include "filelib.h"
-#include "weights.h"
-#include "line_optimizer.h"
-#include "hg.h"
-#include "hg_io.h"
-#include "scorer.h"
-#include "oracle_bleu.h"
-#include "ff_bleu.h"
-
-const bool DEBUG_ORACLE=true;
-
-//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain?  weights (origin) is passed to oracle_bleu.h:ComputeOracle
-//void register_feature_functions();
-//FFRegistry ff_registry;
-namespace {
-void init_bleumodel() {
-  ff_registry.clear();
-  ff_registry.Register(new FFFactory<BLEUModel>);
-}
-
-struct init_ff {
-  init_ff() {
-    init_bleumodel();
-  }
-};
-//init_ff reg; // order of initialization?  ff_registry may not be init yet.  call in Run() instead.
-}
-
-using namespace std;
-namespace po = boost::program_options;
-
-typedef SparseVector<double> Dir;
-typedef Dir Point;
-
-void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) {
-  //  return; //TODO: debug
-  if (min_dist<=0) return;
-  double max_s=1.-min_dist;
-  if (log&&verbose) *log<<"max allowed S="<<max_s<<endl;
-  unsigned N=dirs.size();
-  for (int i=0;i<N;++i) {
-    for (int j=i+1;j<N;++j) {
-      double s=dirs[i].tanimoto_coef(dirs[j]);
-      if (log&&verbose) *log<<"S["<<i<<","<<j<<"]="<<s<<' ';
-      if (s>max_s) {
-        if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<").  dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"<<endl;
-        if (avg) {
-          dirs[i]+=dirs[j];
-          dirs[i]/=2.;
-          if (log) *log<<" averaged="<<dirs[i];
-        }
-        if (log) *log<<endl;
-        swap(dirs[j],dirs[--N]);
-      }
-    }
-    if (log&&verbose) *log<<endl;
-
-  }
-  dirs.resize(N);
-}
-
-struct oracle_directions {
-  MT19937 rng;
-  OracleBleu oracle;
-  vector<Dir> directions;
-
-  bool start_random;
-  bool include_primary;
-  bool old_to_hope;
-  bool fear_to_hope;
-  unsigned n_random;
-  void AddPrimaryAndRandomDirections() {
-    LineOptimizer::CreateOptimizationDirections(
-      fids,n_random,&rng,&directions,include_primary);
-  }
-
-  void Print() {
-    for (int i = 0; i < dev_set_size; ++i)
-      for (int j = 0; j < directions.size(); ++j) {
-        cout << forest_file(i) <<" " << i<<" ";
-        print(cout,origin,"=",";");
-        cout<<" ";
-        print(cout,directions[j],"=",";");
-        cout<<"\n";
-      }
-  }
-
-  void AddOptions(po::options_description *opts) {
-    oracle.AddOptions(opts);
-    opts->add_options()
-      ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)")
-      ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository")
-      ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file")
-      ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
-      ("random_directions,d",po::value<unsigned>(&n_random)->default_value(10),"Number of random directions to run the line optimizer in")
-      ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
-      ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
-      ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it")
-      ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
-      ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)).  0 means don't filter, 1 means only 1 direction allowed?")
-      ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)")
-      ("no_old_to_hope","don't emit the usual old -> hope oracle")
-      ("decoder_translations",po::value<string>(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU")
-      ;
-  }
-  void InitCommandLine(int argc, char *argv[], po::variables_map *conf) {
-    po::options_description opts("Configuration options");
-    AddOptions(&opts);
-    opts.add_options()("help,h", "Help");
-
-    po::options_description dcmdline_options;
-    dcmdline_options.add(opts);
-    po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-    po::notify(*conf);
-    if (conf->count("dev_set_size") == 0) {
-      cerr << "Please specify the size of the development set using -s N\n";
-      goto bad_cmdline;
-    }
-    if (conf->count("weights") == 0) {
-      cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
-      goto bad_cmdline;
-    }
-    if (conf->count("forest_repository") == 0) {
-      cerr << "Please specify the forest repository location using -r <DIR>\n";
-      goto bad_cmdline;
-    }
-    if (n_oracle && oracle.refs.empty()) {
-      cerr<<"Specify references when using oracle directions\n";
-      goto bad_cmdline;
-    }
-    if (conf->count("help")) {
-      cout << dcmdline_options << endl;
-      exit(0);
-    }
-
-    return;
-    bad_cmdline:
-      cerr << dcmdline_options << endl;
-      exit(1);
-  }
-
-  int main(int argc, char *argv[]) {
-    po::variables_map conf;
-    InitCommandLine(argc,argv,&conf);
-    init_bleumodel();
-    UseConf(conf);
-    Run();
-    return 0;
-  }
-  bool verbose() const { return oracle.verbose; }
-  void Run() {
-//    register_feature_functions();
-    AddPrimaryAndRandomDirections();
-    AddOracleDirections();
-    compress_similar(directions,max_similarity,&cerr,true,verbose());
-    Print();
-  }
-
-
-  Point origin; // old weights that gave model 1best.
-  vector<string> optimize_features;
-  void UseConf(po::variables_map const& conf) {
-    oracle.UseConf(conf);
-    include_primary=!conf.count("no_primary");
-    old_to_hope=!conf.count("no_old_to_hope");
-
-    if (conf.count("optimize_feature") > 0)
-      optimize_features=conf["optimize_feature"].as<vector<string> >();
-    Init();
-  }
-
-  string weights_file;
-  double max_similarity;
-  unsigned n_oracle, oracle_batch;
-  string forest_repository;
-  unsigned dev_set_size;
-  vector<Oracle> oracles;
-  vector<int> fids;
-  string forest_file(unsigned i) const {
-    ostringstream o;
-    o << forest_repository << '/' << i << ".json.gz";
-    return o.str();
-  }
-
-  oracle_directions() { }
-
-  Sentences model_hyps;
-
-  vector<ScoreP> model_scores;
-  bool have_doc;
-  void Init() {
-    have_doc=!decoder_translations_file.empty();
-    if (have_doc) {
-      model_hyps.Load(decoder_translations_file);
-      if (verbose()) model_hyps.Print(cerr,5);
-      model_scores.resize(model_hyps.size());
-      if (dev_set_size!=model_hyps.size()) {
-        cerr<<"You supplied decoder_translations with a different number of lines ("<<model_hyps.size()<<") than dev_set_size ("<<dev_set_size<<")"<<endl;
-        abort();
-      }
-      cerr << "Scoring model translations " << model_hyps << endl;
-      for (int i=0;i<model_hyps.size();++i) {
-        //TODO: what is scoreCcand? without clipping? do without for consistency w/ oracle
-        model_scores[i]=oracle.ds[i]->ScoreCandidate(model_hyps[i]);
-        assert(model_scores[i]);
-        if (verbose()) cerr<<"Before model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-        if (verbose()) cerr<<"model["<<i<<"]: "<<model_scores[i]->ScoreDetails()<<endl;
-        oracle.doc_score->PlusEquals(*model_scores[i]);
-        if (verbose()) cerr<<"After model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-      }
-      //TODO: compute doc bleu stats for each sentence, then when getting oracle temporarily exclude stats for that sentence (skip regular score updating)
-    }
-    start_random=false;
-    cerr << "Forest repo: " << forest_repository << endl;
-    assert(DirectoryExists(forest_repository));
-    vector<string> features;
-    vector<weight_t> dorigin;
-    Weights::InitFromFile(weights_file, &dorigin, &features);
-    if (optimize_features.size())
-      features=optimize_features;
-    Weights::InitSparseVector(dorigin, &origin);
-    fids.clear();
-    AddFeatureIds(features);
-    oracles.resize(dev_set_size);
-  }
-
-  void AddFeatureIds(vector<string> const& features) {
-    int i = fids.size();
-    fids.resize(fids.size()+features.size());
-    for (; i < features.size(); ++i)
-      fids[i] = FD::Convert(features[i]);
- }
-
-
-  std::string decoder_translations_file; // one per line
-  //TODO: is it worthwhile to get a complete document bleu first?  would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive).  translations are in run.raw.N.gz - new arg
-  void adjust_doc(unsigned i,double scale=1.) {
-    oracle.doc_score->PlusEquals(*model_scores[i],scale);
-  }
-
-  Score &ds() {
-    return *oracle.doc_score;
-  }
-
-  Oracle const& ComputeOracle(unsigned i) {
-    Oracle &o=oracles[i];
-    if (o.is_null()) {
-      if (have_doc) {
-        if (verbose()) cerr<<"Before removing i="<<i<<" "<<ds().ScoreDetails()<<"\n";
-        adjust_doc(i,-1);
-      }
-      ReadFile rf(forest_file(i));
-      Hypergraph hg;
-      {
-        Timer t("Loading forest from JSON "+forest_file(i));
-        HypergraphIO::ReadFromJSON(rf.stream(), &hg);
-      }
-      if (verbose()) cerr<<"Before oracle["<<i<<"]: "<<ds().ScoreDetails()<<endl;
-      o=oracle.ComputeOracle(oracle.MakeMetadata(hg,i),&hg,origin);
-      if (verbose()) {
-        cerr << o;
-        ScoreP hopesc=oracle.GetScore(o.hope.sentence,i);
-        oracle.doc_score->PlusEquals(*hopesc,1);
-        cerr<<"With hope: "<<ds().ScoreDetails()<<endl;
-        oracle.doc_score->PlusEquals(*hopesc,-1);
-        cerr<<"Without hope: "<<ds().ScoreDetails()<<endl;
-        cerr<<" oracle="<<oracle.GetScore(o.hope.sentence,i)->ScoreDetails()<<endl
-            <<" model="<<oracle.GetScore(o.model.sentence,i)->ScoreDetails()<<endl;
-        if (have_doc)
-          cerr<<" doc (should = model): "<<model_scores[i]->ScoreDetails()<<endl;
-      }
-      if (have_doc) {
-        adjust_doc(i,1);
-      } else
-        oracle.IncludeLastScore();
-    }
-    return o;
-  }
-
-  // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random.  oracle vectors are summed
-  void AddOracleDirections() {
-    MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1);
-    unsigned b=0;
-    for(unsigned i=0;i<n_oracle;++i) {
-      Dir o2hope;
-      Dir fear2hope;
-      for (unsigned j=0;j<oracle_batch;++j,++b) {
-        Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? rsg() : b);
-
-        if (old_to_hope)
-          o2hope+=o.ModelHopeGradient();
-        if (fear_to_hope)
-          fear2hope+=o.FearHopeGradient();
-      }
-      double N=(double)oracle_batch;
-      if (old_to_hope) {
-        o2hope/=N;
-        directions.push_back(o2hope);
-      }
-      if (fear_to_hope) {
-        fear2hope/=N;
-        directions.push_back(fear2hope);
-      }
-    }
-  }
-};
-
-int main(int argc, char** argv) {
-  oracle_directions od;
-  return od.main(argc,argv);
-}