From 1671e12ef0b069a5d2ae1c2d4fea20b9b1087af3 Mon Sep 17 00:00:00 2001 From: graehl Date: Fri, 16 Jul 2010 01:56:17 +0000 Subject: refactor vest mapper input; --optimize_feature (s) should now limit non-oracle directions git-svn-id: https://ws10smt.googlecode.com/svn/trunk@273 ec762483-ff6d-05da-a07a-a48fb63a330f --- vest/mr_vest_generate_mapper_input.cc | 270 ++++++++++++++++++++-------------- 1 file changed, 163 insertions(+), 107 deletions(-) (limited to 'vest') diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index c0f80d0c..e9a5650b 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -13,15 +13,149 @@ #include "hg_io.h" #include "scorer.h" #include "oracle_bleu.h" +#include "ff_bleu.h" + +boost::shared_ptr global_ff_registry; +namespace { +struct init_ff { + init_ff() { + global_ff_registry.reset(new FFRegistry); + global_ff_registry->Register(new FFFactory); + } +}; +init_ff reg; +} using namespace std; namespace po = boost::program_options; typedef SparseVector Dir; +typedef Dir Point; -MT19937 rng; + +void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { + if (min_dist<=0) return; + double max_s=1.-min_dist; + unsigned N=dirs.size(); + for (int i=0;imax_s) { + if (log) *log << "Collapsing similar directions (T="< "<(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") + ("oracle_directions,O",po::value()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") + ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") + ("oracle_batch,b",po::value()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") + ("max_similarity,m",po::value()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") + ("fear_to_hope,f","for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } + } + + int main(int argc, char *argv[]) { + po::variables_map conf; + InitCommandLine(argc,argv,&conf); + UseConf(conf); + Run(); + return 0; + } + + void Run() { + AddPrimaryAndRandomDirections(); + AddOracleDirections(); + compress_similar(directions,max_similarity); + Print(); + } + + + Point origin; // old weights that gave model 1best. + vector optimize_features; + void UseConf(po::variables_map const& conf) { + oracle.UseConf(conf); + + include_primary=!conf.count("no_primary"); + if (conf.count("optimize_feature") > 0) + optimize_features=conf["optimize_feature"].as >(); + fear_to_hope=conf.count("fear_to_hope"); + n_random=conf["random_directions"].as(); + forest_repository=conf["forest_repository"].as(); +// dev_set_size=conf["dev_set_size"].as(); + n_oracle=conf["oracle_directions"].as(); + oracle_batch=conf["oracle_batch"].as(); + max_similarity=conf["max_similarity"].as(); + weights_file=conf["weights"].as(); + + Init(); + } + + string weights_file; + double max_similarity; + unsigned n_oracle, oracle_batch; string forest_repository; unsigned dev_set_size; vector dirs; //best_to_hope_dirs @@ -32,14 +166,28 @@ struct oracle_directions { return o.str(); } - void set_dev_set_size(int i) { - dev_set_size=i; - dirs.resize(dev_set_size); + oracle_directions() { } + + void Init() { + start_random=false; + assert(DirectoryExists(forest_repository)); + vector features; + weights.InitFromFile(weights_file, &features); + if (optimize_features.size()) + features=optimize_features; + weights.InitSparseVector(&origin); + fids.clear(); + AddFeatureIds(features); } - oracle_directions(string forest_repository="",unsigned dev_set_sz=0,vector const& fids=vector()): forest_repository(forest_repository),fids(fids) { - set_dev_set_size(dev_set_sz); - } + Weights weights; + void AddFeatureIds(vector const& features) { + int i = fids.size(); + fids.resize(fids.size()+features.size()); + for (; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + } + Dir const& operator[](unsigned i) { Dir &dir=dirs[i]; @@ -52,112 +200,20 @@ struct oracle_directions { return dir; } // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed - void add_directions(vector &dirs,unsigned n,unsigned batchsz=20,bool start_random=false) { + void AddOracleDirections() { MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1); unsigned b=0; - for(unsigned i=0;i=dev_set_size)?rsg():b]; - d/=(double)batchsz; + d/=(double)oracle_batch; } } - }; -void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { - if (min_dist<=0) return; - double max_s=1.-min_dist; - unsigned N=dirs.size(); - for (int i=0;imax_s) { - if (log) *log << "Collapsing similar directions (T="< "<(),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value(),"[REQD] Path to forest repository") - ("weights,w",po::value(),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_batch,b",po::value()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -d N\n"; - flag = true; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w \n"; - flag = true; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r \n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - Weights weights; - vector features; - weights.InitFromFile(conf["weights"].as(), &features); - vector fids(features.size()); - for (int i = 0; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - - oracle_directions od(conf["forest_repository"].as() - , conf["dev_set_size"].as() - , fids - ); -; - assert(DirectoryExists(od.forest_repository)); - SparseVector origin; - weights.InitSparseVector(&origin); - if (conf.count("optimize_feature") > 0) - features=conf["optimize_feature"].as >(); - vector > axes; - LineOptimizer::CreateOptimizationDirections( - fids, - conf["random_directions"].as(), - &rng, - &axes, - !conf.count("no_primary") - ); - od.add_directions(axes,conf["oracle_directions"].as(),conf["oracle_batch"].as()); - compress_similar(axes,conf["max_similarity"].as()); - for (int i = 0; i < od.dev_set_size; ++i) - for (int j = 0; j < axes.size(); ++j) - cout << od.forest_file(i) <<" " << i << ' ' << origin << ' ' << axes[j] << endl; - return 0; + oracle_directions od; + return od.main(argc,argv); } -- cgit v1.2.3