diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-01-27 13:19:27 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-01-27 13:19:27 -0500 |
commit | 203c3c3357b9ed8cfe44932c2bf5ea19eba6238c (patch) | |
tree | c446f8e8afbe194ef656b33cfc643f83633cf18c /vest/mr_vest_generate_mapper_input.cc | |
parent | 481a120564fdb73c8c6833e2102acb533683261c (diff) |
migration to new metric api for vest, clean up of unsupported/not functional code
Diffstat (limited to 'vest/mr_vest_generate_mapper_input.cc')
-rw-r--r-- | vest/mr_vest_generate_mapper_input.cc | 356 |
1 files changed, 57 insertions, 299 deletions
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index 0c094fd5..59d4f24f 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -1,320 +1,78 @@ -//TODO: debug segfault when references supplied, null shared_ptr when oracle #include <iostream> #include <vector> -#include <sstream> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> -#include "sampler.h" #include "filelib.h" #include "weights.h" #include "line_optimizer.h" -#include "hg.h" -#include "hg_io.h" -#include "scorer.h" -#include "oracle_bleu.h" -#include "ff_bleu.h" - -const bool DEBUG_ORACLE=true; - -//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle -//void register_feature_functions(); -//FFRegistry ff_registry; -namespace { -void init_bleumodel() { - ff_registry.clear(); - ff_registry.Register(new FFFactory<BLEUModel>); -} - -struct init_ff { - init_ff() { - init_bleumodel(); - } -}; -//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead. -} using namespace std; namespace po = boost::program_options; -typedef SparseVector<double> Dir; -typedef Dir Point; - -void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) { - // return; //TODO: debug - if (min_dist<=0) return; - double max_s=1.-min_dist; - if (log&&verbose) *log<<"max allowed S="<<max_s<<endl; - unsigned N=dirs.size(); - for (int i=0;i<N;++i) { - for (int j=i+1;j<N;++j) { - double s=dirs[i].tanimoto_coef(dirs[j]); - if (log&&verbose) *log<<"S["<<i<<","<<j<<"]="<<s<<' '; - if (s>max_s) { - if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"<<endl; - if (avg) { - dirs[i]+=dirs[j]; - dirs[i]/=2.; - if (log) *log<<" averaged="<<dirs[i]; - } - if (log) *log<<endl; - swap(dirs[j],dirs[--N]); - } - } - if (log&&verbose) *log<<endl; - - } - dirs.resize(N); -} - -struct oracle_directions { - MT19937 rng; - OracleBleu oracle; - vector<Dir> directions; - - bool start_random; - bool include_primary; - bool old_to_hope; - bool fear_to_hope; - unsigned n_random; - void AddPrimaryAndRandomDirections() { - LineOptimizer::CreateOptimizationDirections( - fids,n_random,&rng,&directions,include_primary); - } - - void Print() { - for (int i = 0; i < dev_set_size; ++i) - for (int j = 0; j < directions.size(); ++j) { - cout << forest_file(i) <<" " << i<<" "; - print(cout,origin,"=",";"); - cout<<" "; - print(cout,directions[j],"=",";"); - cout<<"\n"; - } - } - - void AddOptions(po::options_description *opts) { - oracle.AddOptions(opts); - opts->add_options() - ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository") - ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value<unsigned>(&n_random)->default_value(10),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") - ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") - ("no_old_to_hope","don't emit the usual old -> hope oracle") - ("decoder_translations",po::value<string>(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU") - ; - } - void InitCommandLine(int argc, char *argv[], po::variables_map *conf) { - po::options_description opts("Configuration options"); - AddOptions(&opts); - opts.add_options()("help,h", "Help"); - - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -s N\n"; - goto bad_cmdline; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; - goto bad_cmdline; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r <DIR>\n"; - goto bad_cmdline; - } - if (n_oracle && oracle.refs.empty()) { - cerr<<"Specify references when using oracle directions\n"; - goto bad_cmdline; - } - if (conf->count("help")) { - cout << dcmdline_options << endl; - exit(0); - } - - return; - bad_cmdline: - cerr << dcmdline_options << endl; - exit(1); +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; } - - int main(int argc, char *argv[]) { - po::variables_map conf; - InitCommandLine(argc,argv,&conf); - init_bleumodel(); - UseConf(conf); - Run(); - return 0; + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; + flag = true; } - bool verbose() const { return oracle.verbose; } - void Run() { -// register_feature_functions(); - AddPrimaryAndRandomDirections(); - AddOracleDirections(); - compress_similar(directions,max_similarity,&cerr,true,verbose()); - Print(); + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r <DIR>\n"; + flag = true; } - - - Point origin; // old weights that gave model 1best. - vector<string> optimize_features; - void UseConf(po::variables_map const& conf) { - oracle.UseConf(conf); - include_primary=!conf.count("no_primary"); - old_to_hope=!conf.count("no_old_to_hope"); - - if (conf.count("optimize_feature") > 0) - optimize_features=conf["optimize_feature"].as<vector<string> >(); - Init(); + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); } +} - string weights_file; - double max_similarity; - unsigned n_oracle, oracle_batch; - string forest_repository; - unsigned dev_set_size; - vector<Oracle> oracles; - vector<int> fids; - string forest_file(unsigned i) const { - ostringstream o; - o << forest_repository << '/' << i << ".json.gz"; - return o.str(); - } - - oracle_directions() { } - - Sentences model_hyps; - - vector<ScoreP> model_scores; - bool have_doc; - void Init() { - have_doc=!decoder_translations_file.empty(); - if (have_doc) { - model_hyps.Load(decoder_translations_file); - if (verbose()) model_hyps.Print(cerr,5); - model_scores.resize(model_hyps.size()); - if (dev_set_size!=model_hyps.size()) { - cerr<<"You supplied decoder_translations with a different number of lines ("<<model_hyps.size()<<") than dev_set_size ("<<dev_set_size<<")"<<endl; - abort(); - } - cerr << "Scoring model translations " << model_hyps << endl; - for (int i=0;i<model_hyps.size();++i) { - //TODO: what is scoreCcand? without clipping? do without for consistency w/ oracle - model_scores[i]=oracle.ds[i]->ScoreCandidate(model_hyps[i]); - assert(model_scores[i]); - if (verbose()) cerr<<"Before model["<<i<<"]: "<<ds().ScoreDetails()<<endl; - if (verbose()) cerr<<"model["<<i<<"]: "<<model_scores[i]->ScoreDetails()<<endl; - oracle.doc_score->PlusEquals(*model_scores[i]); - if (verbose()) cerr<<"After model["<<i<<"]: "<<ds().ScoreDetails()<<endl; - } - //TODO: compute doc bleu stats for each sentence, then when getting oracle temporarily exclude stats for that sentence (skip regular score updating) - } - start_random=false; - cerr << "Forest repo: " << forest_repository << endl; - assert(DirectoryExists(forest_repository)); - vector<string> features; - vector<weight_t> dorigin; - Weights::InitFromFile(weights_file, &dorigin, &features); - if (optimize_features.size()) - features=optimize_features; - Weights::InitSparseVector(dorigin, &origin); - fids.clear(); - AddFeatureIds(features); - oracles.resize(dev_set_size); - } - - void AddFeatureIds(vector<string> const& features) { - int i = fids.size(); - fids.resize(fids.size()+features.size()); - for (; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - } - - - std::string decoder_translations_file; // one per line - //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg - void adjust_doc(unsigned i,double scale=1.) { - oracle.doc_score->PlusEquals(*model_scores[i],scale); - } - - Score &ds() { - return *oracle.doc_score; - } - - Oracle const& ComputeOracle(unsigned i) { - Oracle &o=oracles[i]; - if (o.is_null()) { - if (have_doc) { - if (verbose()) cerr<<"Before removing i="<<i<<" "<<ds().ScoreDetails()<<"\n"; - adjust_doc(i,-1); - } - ReadFile rf(forest_file(i)); - Hypergraph hg; - { - Timer t("Loading forest from JSON "+forest_file(i)); - HypergraphIO::ReadFromJSON(rf.stream(), &hg); - } - if (verbose()) cerr<<"Before oracle["<<i<<"]: "<<ds().ScoreDetails()<<endl; - o=oracle.ComputeOracle(oracle.MakeMetadata(hg,i),&hg,origin); - if (verbose()) { - cerr << o; - ScoreP hopesc=oracle.GetScore(o.hope.sentence,i); - oracle.doc_score->PlusEquals(*hopesc,1); - cerr<<"With hope: "<<ds().ScoreDetails()<<endl; - oracle.doc_score->PlusEquals(*hopesc,-1); - cerr<<"Without hope: "<<ds().ScoreDetails()<<endl; - cerr<<" oracle="<<oracle.GetScore(o.hope.sentence,i)->ScoreDetails()<<endl - <<" model="<<oracle.GetScore(o.model.sentence,i)->ScoreDetails()<<endl; - if (have_doc) - cerr<<" doc (should = model): "<<model_scores[i]->ScoreDetails()<<endl; - } - if (have_doc) { - adjust_doc(i,1); - } else - oracle.IncludeLastScore(); - } - return o; - } - - // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed - void AddOracleDirections() { - MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1); - unsigned b=0; - for(unsigned i=0;i<n_oracle;++i) { - Dir o2hope; - Dir fear2hope; - for (unsigned j=0;j<oracle_batch;++j,++b) { - Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? rsg() : b); - - if (old_to_hope) - o2hope+=o.ModelHopeGradient(); - if (fear_to_hope) - fear2hope+=o.FearHopeGradient(); - } - double N=(double)oracle_batch; - if (old_to_hope) { - o2hope/=N; - directions.push_back(o2hope); - } - if (fear_to_hope) { - fear2hope/=N; - directions.push_back(fear2hope); - } +int main(int argc, char** argv) { + RandomNumberGenerator<boost::mt19937> rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector<string> features; + SparseVector<weight_t> origin; + vector<weight_t> w; + Weights::InitFromFile(conf["weights"].as<string>(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as<string>(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as<vector<string> >(); + vector<SparseVector<weight_t> > directions; + vector<int> fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as<unsigned int>(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as<unsigned>(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; } } -}; - -int main(int argc, char** argv) { - oracle_directions od; - return od.main(argc,argv); + return 0; } |