summaryrefslogtreecommitdiff
path: root/vest/mr_vest_generate_mapper_input.cc
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-01-27 13:19:27 -0500
committerChris Dyer <cdyer@cs.cmu.edu>2012-01-27 13:19:27 -0500
commit203c3c3357b9ed8cfe44932c2bf5ea19eba6238c (patch)
treec446f8e8afbe194ef656b33cfc643f83633cf18c /vest/mr_vest_generate_mapper_input.cc
parent481a120564fdb73c8c6833e2102acb533683261c (diff)
migration to new metric api for vest, clean up of unsupported/not functional code
Diffstat (limited to 'vest/mr_vest_generate_mapper_input.cc')
-rw-r--r--vest/mr_vest_generate_mapper_input.cc356
1 files changed, 57 insertions, 299 deletions
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index 0c094fd5..59d4f24f 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -1,320 +1,78 @@
-//TODO: debug segfault when references supplied, null shared_ptr when oracle
#include <iostream>
#include <vector>
-#include <sstream>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
-#include "sampler.h"
#include "filelib.h"
#include "weights.h"
#include "line_optimizer.h"
-#include "hg.h"
-#include "hg_io.h"
-#include "scorer.h"
-#include "oracle_bleu.h"
-#include "ff_bleu.h"
-
-const bool DEBUG_ORACLE=true;
-
-//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle
-//void register_feature_functions();
-//FFRegistry ff_registry;
-namespace {
-void init_bleumodel() {
- ff_registry.clear();
- ff_registry.Register(new FFFactory<BLEUModel>);
-}
-
-struct init_ff {
- init_ff() {
- init_bleumodel();
- }
-};
-//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead.
-}
using namespace std;
namespace po = boost::program_options;
-typedef SparseVector<double> Dir;
-typedef Dir Point;
-
-void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) {
- // return; //TODO: debug
- if (min_dist<=0) return;
- double max_s=1.-min_dist;
- if (log&&verbose) *log<<"max allowed S="<<max_s<<endl;
- unsigned N=dirs.size();
- for (int i=0;i<N;++i) {
- for (int j=i+1;j<N;++j) {
- double s=dirs[i].tanimoto_coef(dirs[j]);
- if (log&&verbose) *log<<"S["<<i<<","<<j<<"]="<<s<<' ';
- if (s>max_s) {
- if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"<<endl;
- if (avg) {
- dirs[i]+=dirs[j];
- dirs[i]/=2.;
- if (log) *log<<" averaged="<<dirs[i];
- }
- if (log) *log<<endl;
- swap(dirs[j],dirs[--N]);
- }
- }
- if (log&&verbose) *log<<endl;
-
- }
- dirs.resize(N);
-}
-
-struct oracle_directions {
- MT19937 rng;
- OracleBleu oracle;
- vector<Dir> directions;
-
- bool start_random;
- bool include_primary;
- bool old_to_hope;
- bool fear_to_hope;
- unsigned n_random;
- void AddPrimaryAndRandomDirections() {
- LineOptimizer::CreateOptimizationDirections(
- fids,n_random,&rng,&directions,include_primary);
- }
-
- void Print() {
- for (int i = 0; i < dev_set_size; ++i)
- for (int j = 0; j < directions.size(); ++j) {
- cout << forest_file(i) <<" " << i<<" ";
- print(cout,origin,"=",";");
- cout<<" ";
- print(cout,directions[j],"=",";");
- cout<<"\n";
- }
- }
-
- void AddOptions(po::options_description *opts) {
- oracle.AddOptions(opts);
- opts->add_options()
- ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)")
- ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository")
- ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file")
- ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
- ("random_directions,d",po::value<unsigned>(&n_random)->default_value(10),"Number of random directions to run the line optimizer in")
- ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
- ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
- ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it")
- ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
- ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?")
- ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)")
- ("no_old_to_hope","don't emit the usual old -> hope oracle")
- ("decoder_translations",po::value<string>(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU")
- ;
- }
- void InitCommandLine(int argc, char *argv[], po::variables_map *conf) {
- po::options_description opts("Configuration options");
- AddOptions(&opts);
- opts.add_options()("help,h", "Help");
-
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
- if (conf->count("dev_set_size") == 0) {
- cerr << "Please specify the size of the development set using -s N\n";
- goto bad_cmdline;
- }
- if (conf->count("weights") == 0) {
- cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
- goto bad_cmdline;
- }
- if (conf->count("forest_repository") == 0) {
- cerr << "Please specify the forest repository location using -r <DIR>\n";
- goto bad_cmdline;
- }
- if (n_oracle && oracle.refs.empty()) {
- cerr<<"Specify references when using oracle directions\n";
- goto bad_cmdline;
- }
- if (conf->count("help")) {
- cout << dcmdline_options << endl;
- exit(0);
- }
-
- return;
- bad_cmdline:
- cerr << dcmdline_options << endl;
- exit(1);
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)")
+ ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+ ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+ ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+ ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (conf->count("dev_set_size") == 0) {
+ cerr << "Please specify the size of the development set using -d N\n";
+ flag = true;
}
-
- int main(int argc, char *argv[]) {
- po::variables_map conf;
- InitCommandLine(argc,argv,&conf);
- init_bleumodel();
- UseConf(conf);
- Run();
- return 0;
+ if (conf->count("weights") == 0) {
+ cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+ flag = true;
}
- bool verbose() const { return oracle.verbose; }
- void Run() {
-// register_feature_functions();
- AddPrimaryAndRandomDirections();
- AddOracleDirections();
- compress_similar(directions,max_similarity,&cerr,true,verbose());
- Print();
+ if (conf->count("forest_repository") == 0) {
+ cerr << "Please specify the forest repository location using -r <DIR>\n";
+ flag = true;
}
-
-
- Point origin; // old weights that gave model 1best.
- vector<string> optimize_features;
- void UseConf(po::variables_map const& conf) {
- oracle.UseConf(conf);
- include_primary=!conf.count("no_primary");
- old_to_hope=!conf.count("no_old_to_hope");
-
- if (conf.count("optimize_feature") > 0)
- optimize_features=conf["optimize_feature"].as<vector<string> >();
- Init();
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
}
+}
- string weights_file;
- double max_similarity;
- unsigned n_oracle, oracle_batch;
- string forest_repository;
- unsigned dev_set_size;
- vector<Oracle> oracles;
- vector<int> fids;
- string forest_file(unsigned i) const {
- ostringstream o;
- o << forest_repository << '/' << i << ".json.gz";
- return o.str();
- }
-
- oracle_directions() { }
-
- Sentences model_hyps;
-
- vector<ScoreP> model_scores;
- bool have_doc;
- void Init() {
- have_doc=!decoder_translations_file.empty();
- if (have_doc) {
- model_hyps.Load(decoder_translations_file);
- if (verbose()) model_hyps.Print(cerr,5);
- model_scores.resize(model_hyps.size());
- if (dev_set_size!=model_hyps.size()) {
- cerr<<"You supplied decoder_translations with a different number of lines ("<<model_hyps.size()<<") than dev_set_size ("<<dev_set_size<<")"<<endl;
- abort();
- }
- cerr << "Scoring model translations " << model_hyps << endl;
- for (int i=0;i<model_hyps.size();++i) {
- //TODO: what is scoreCcand? without clipping? do without for consistency w/ oracle
- model_scores[i]=oracle.ds[i]->ScoreCandidate(model_hyps[i]);
- assert(model_scores[i]);
- if (verbose()) cerr<<"Before model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
- if (verbose()) cerr<<"model["<<i<<"]: "<<model_scores[i]->ScoreDetails()<<endl;
- oracle.doc_score->PlusEquals(*model_scores[i]);
- if (verbose()) cerr<<"After model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
- }
- //TODO: compute doc bleu stats for each sentence, then when getting oracle temporarily exclude stats for that sentence (skip regular score updating)
- }
- start_random=false;
- cerr << "Forest repo: " << forest_repository << endl;
- assert(DirectoryExists(forest_repository));
- vector<string> features;
- vector<weight_t> dorigin;
- Weights::InitFromFile(weights_file, &dorigin, &features);
- if (optimize_features.size())
- features=optimize_features;
- Weights::InitSparseVector(dorigin, &origin);
- fids.clear();
- AddFeatureIds(features);
- oracles.resize(dev_set_size);
- }
-
- void AddFeatureIds(vector<string> const& features) {
- int i = fids.size();
- fids.resize(fids.size()+features.size());
- for (; i < features.size(); ++i)
- fids[i] = FD::Convert(features[i]);
- }
-
-
- std::string decoder_translations_file; // one per line
- //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg
- void adjust_doc(unsigned i,double scale=1.) {
- oracle.doc_score->PlusEquals(*model_scores[i],scale);
- }
-
- Score &ds() {
- return *oracle.doc_score;
- }
-
- Oracle const& ComputeOracle(unsigned i) {
- Oracle &o=oracles[i];
- if (o.is_null()) {
- if (have_doc) {
- if (verbose()) cerr<<"Before removing i="<<i<<" "<<ds().ScoreDetails()<<"\n";
- adjust_doc(i,-1);
- }
- ReadFile rf(forest_file(i));
- Hypergraph hg;
- {
- Timer t("Loading forest from JSON "+forest_file(i));
- HypergraphIO::ReadFromJSON(rf.stream(), &hg);
- }
- if (verbose()) cerr<<"Before oracle["<<i<<"]: "<<ds().ScoreDetails()<<endl;
- o=oracle.ComputeOracle(oracle.MakeMetadata(hg,i),&hg,origin);
- if (verbose()) {
- cerr << o;
- ScoreP hopesc=oracle.GetScore(o.hope.sentence,i);
- oracle.doc_score->PlusEquals(*hopesc,1);
- cerr<<"With hope: "<<ds().ScoreDetails()<<endl;
- oracle.doc_score->PlusEquals(*hopesc,-1);
- cerr<<"Without hope: "<<ds().ScoreDetails()<<endl;
- cerr<<" oracle="<<oracle.GetScore(o.hope.sentence,i)->ScoreDetails()<<endl
- <<" model="<<oracle.GetScore(o.model.sentence,i)->ScoreDetails()<<endl;
- if (have_doc)
- cerr<<" doc (should = model): "<<model_scores[i]->ScoreDetails()<<endl;
- }
- if (have_doc) {
- adjust_doc(i,1);
- } else
- oracle.IncludeLastScore();
- }
- return o;
- }
-
- // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed
- void AddOracleDirections() {
- MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1);
- unsigned b=0;
- for(unsigned i=0;i<n_oracle;++i) {
- Dir o2hope;
- Dir fear2hope;
- for (unsigned j=0;j<oracle_batch;++j,++b) {
- Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? rsg() : b);
-
- if (old_to_hope)
- o2hope+=o.ModelHopeGradient();
- if (fear_to_hope)
- fear2hope+=o.FearHopeGradient();
- }
- double N=(double)oracle_batch;
- if (old_to_hope) {
- o2hope/=N;
- directions.push_back(o2hope);
- }
- if (fear_to_hope) {
- fear2hope/=N;
- directions.push_back(fear2hope);
- }
+int main(int argc, char** argv) {
+ RandomNumberGenerator<boost::mt19937> rng;
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ vector<string> features;
+ SparseVector<weight_t> origin;
+ vector<weight_t> w;
+ Weights::InitFromFile(conf["weights"].as<string>(), &w, &features);
+ Weights::InitSparseVector(w, &origin);
+ const string forest_repository = conf["forest_repository"].as<string>();
+ assert(DirectoryExists(forest_repository));
+ if (conf.count("optimize_feature") > 0)
+ features=conf["optimize_feature"].as<vector<string> >();
+ vector<SparseVector<weight_t> > directions;
+ vector<int> fids(features.size());
+ for (int i = 0; i < features.size(); ++i)
+ fids[i] = FD::Convert(features[i]);
+ LineOptimizer::CreateOptimizationDirections(
+ fids,
+ conf["random_directions"].as<unsigned int>(),
+ &rng,
+ &directions);
+ unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();
+ for (unsigned i = 0; i < dev_set_size; ++i) {
+ for (unsigned j = 0; j < directions.size(); ++j) {
+ cout << forest_repository << '/' << i << ".json.gz " << i << ' ';
+ print(cout, origin, "=", ";");
+ cout << ' ';
+ print(cout, directions[j], "=", ";");
+ cout << endl;
}
}
-};
-
-int main(int argc, char** argv) {
- oracle_directions od;
- return od.main(argc,argv);
+ return 0;
}