From 203c3c3357b9ed8cfe44932c2bf5ea19eba6238c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 27 Jan 2012 13:19:27 -0500 Subject: migration to new metric api for vest, clean up of unsupported/not functional code --- vest/mr_vest_generate_mapper_input.cc | 356 ++++++---------------------------- 1 file changed, 57 insertions(+), 299 deletions(-) (limited to 'vest/mr_vest_generate_mapper_input.cc') diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index 0c094fd5..59d4f24f 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -1,320 +1,78 @@ -//TODO: debug segfault when references supplied, null shared_ptr when oracle #include #include -#include #include #include -#include "sampler.h" #include "filelib.h" #include "weights.h" #include "line_optimizer.h" -#include "hg.h" -#include "hg_io.h" -#include "scorer.h" -#include "oracle_bleu.h" -#include "ff_bleu.h" - -const bool DEBUG_ORACLE=true; - -//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle -//void register_feature_functions(); -//FFRegistry ff_registry; -namespace { -void init_bleumodel() { - ff_registry.clear(); - ff_registry.Register(new FFFactory); -} - -struct init_ff { - init_ff() { - init_bleumodel(); - } -}; -//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead. -} using namespace std; namespace po = boost::program_options; -typedef SparseVector Dir; -typedef Dir Point; - -void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) { - // return; //TODO: debug - if (min_dist<=0) return; - double max_s=1.-min_dist; - if (log&&verbose) *log<<"max allowed S="< "<add_options() - ("dev_set_size,s",po::value(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value(&forest_repository),"[REQD] Path to forest repository") - ("weights,w",po::value(&weights_file),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value(&n_random)->default_value(10),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") - ("oracle_batch,b",po::value(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") - ("no_old_to_hope","don't emit the usual old -> hope oracle") - ("decoder_translations",po::value(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU") - ; - } - void InitCommandLine(int argc, char *argv[], po::variables_map *conf) { - po::options_description opts("Configuration options"); - AddOptions(&opts); - opts.add_options()("help,h", "Help"); - - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - po::notify(*conf); - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -s N\n"; - goto bad_cmdline; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w \n"; - goto bad_cmdline; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r \n"; - goto bad_cmdline; - } - if (n_oracle && oracle.refs.empty()) { - cerr<<"Specify references when using oracle directions\n"; - goto bad_cmdline; - } - if (conf->count("help")) { - cout << dcmdline_options << endl; - exit(0); - } - - return; - bad_cmdline: - cerr << dcmdline_options << endl; - exit(1); +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("dev_set_size,s",po::value(),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; } - - int main(int argc, char *argv[]) { - po::variables_map conf; - InitCommandLine(argc,argv,&conf); - init_bleumodel(); - UseConf(conf); - Run(); - return 0; + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; } - bool verbose() const { return oracle.verbose; } - void Run() { -// register_feature_functions(); - AddPrimaryAndRandomDirections(); - AddOracleDirections(); - compress_similar(directions,max_similarity,&cerr,true,verbose()); - Print(); + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; } - - - Point origin; // old weights that gave model 1best. - vector optimize_features; - void UseConf(po::variables_map const& conf) { - oracle.UseConf(conf); - include_primary=!conf.count("no_primary"); - old_to_hope=!conf.count("no_old_to_hope"); - - if (conf.count("optimize_feature") > 0) - optimize_features=conf["optimize_feature"].as >(); - Init(); + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); } +} - string weights_file; - double max_similarity; - unsigned n_oracle, oracle_batch; - string forest_repository; - unsigned dev_set_size; - vector oracles; - vector fids; - string forest_file(unsigned i) const { - ostringstream o; - o << forest_repository << '/' << i << ".json.gz"; - return o.str(); - } - - oracle_directions() { } - - Sentences model_hyps; - - vector model_scores; - bool have_doc; - void Init() { - have_doc=!decoder_translations_file.empty(); - if (have_doc) { - model_hyps.Load(decoder_translations_file); - if (verbose()) model_hyps.Print(cerr,5); - model_scores.resize(model_hyps.size()); - if (dev_set_size!=model_hyps.size()) { - cerr<<"You supplied decoder_translations with a different number of lines ("<ScoreCandidate(model_hyps[i]); - assert(model_scores[i]); - if (verbose()) cerr<<"Before model["<ScoreDetails()<PlusEquals(*model_scores[i]); - if (verbose()) cerr<<"After model["< features; - vector dorigin; - Weights::InitFromFile(weights_file, &dorigin, &features); - if (optimize_features.size()) - features=optimize_features; - Weights::InitSparseVector(dorigin, &origin); - fids.clear(); - AddFeatureIds(features); - oracles.resize(dev_set_size); - } - - void AddFeatureIds(vector const& features) { - int i = fids.size(); - fids.resize(fids.size()+features.size()); - for (; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - } - - - std::string decoder_translations_file; // one per line - //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg - void adjust_doc(unsigned i,double scale=1.) { - oracle.doc_score->PlusEquals(*model_scores[i],scale); - } - - Score &ds() { - return *oracle.doc_score; - } - - Oracle const& ComputeOracle(unsigned i) { - Oracle &o=oracles[i]; - if (o.is_null()) { - if (have_doc) { - if (verbose()) cerr<<"Before removing i="<PlusEquals(*hopesc,1); - cerr<<"With hope: "<PlusEquals(*hopesc,-1); - cerr<<"Without hope: "<ScoreDetails()<=dev_set_size) ? rsg() : b); - - if (old_to_hope) - o2hope+=o.ModelHopeGradient(); - if (fear_to_hope) - fear2hope+=o.FearHopeGradient(); - } - double N=(double)oracle_batch; - if (old_to_hope) { - o2hope/=N; - directions.push_back(o2hope); - } - if (fear_to_hope) { - fear2hope/=N; - directions.push_back(fear2hope); - } +int main(int argc, char** argv) { + RandomNumberGenerator rng; + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + vector features; + SparseVector origin; + vector w; + Weights::InitFromFile(conf["weights"].as(), &w, &features); + Weights::InitSparseVector(w, &origin); + const string forest_repository = conf["forest_repository"].as(); + assert(DirectoryExists(forest_repository)); + if (conf.count("optimize_feature") > 0) + features=conf["optimize_feature"].as >(); + vector > directions; + vector fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + LineOptimizer::CreateOptimizationDirections( + fids, + conf["random_directions"].as(), + &rng, + &directions); + unsigned dev_set_size = conf["dev_set_size"].as(); + for (unsigned i = 0; i < dev_set_size; ++i) { + for (unsigned j = 0; j < directions.size(); ++j) { + cout << forest_repository << '/' << i << ".json.gz " << i << ' '; + print(cout, origin, "=", ";"); + cout << ' '; + print(cout, directions[j], "=", ";"); + cout << endl; } } -}; - -int main(int argc, char** argv) { - oracle_directions od; - return od.main(argc,argv); + return 0; } -- cgit v1.2.3