From 1671e12ef0b069a5d2ae1c2d4fea20b9b1087af3 Mon Sep 17 00:00:00 2001 From: graehl Date: Fri, 16 Jul 2010 01:56:17 +0000 Subject: refactor vest mapper input; --optimize_feature (s) should now limit non-oracle directions git-svn-id: https://ws10smt.googlecode.com/svn/trunk@273 ec762483-ff6d-05da-a07a-a48fb63a330f --- decoder/Makefile.am | 3 +- decoder/cdec_ff.cc | 2 +- decoder/ff_factory.h | 2 +- decoder/oracle_bleu.h | 1 - vest/mr_vest_generate_mapper_input.cc | 270 ++++++++++++++++++++-------------- 5 files changed, 167 insertions(+), 111 deletions(-) diff --git a/decoder/Makefile.am b/decoder/Makefile.am index e7b6abd8..a34aba1a 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -13,7 +13,7 @@ noinst_PROGRAMS = \ small_vector_test endif -cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc +cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc timing_stats.cc small_vector_test_SOURCES = small_vector_test.cc small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a parser_test_SOURCES = parser_test.cc @@ -75,6 +75,7 @@ libcdec_a_SOURCES = \ ff_csplit.cc \ ff_tagger.cc \ ff_bleu.cc \ + ff_factory.cc \ ../vest/scorer.cc \ ../vest/ter.cc \ ../vest/aer_scorer.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index c91780e2..069e07f1 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -21,7 +21,7 @@ void register_feature_functions() { global_ff_registry->Register(new FFFactory); global_ff_registry->Register(new FFFactory); global_ff_registry->Register(new FFFactory); - global_ff_registry->Register("BLEUModel", new FFFactory); + global_ff_registry->Register(new FFFactory); global_ff_registry->Register("RuleShape", new FFFactory); global_ff_registry->Register("RelativeSentencePosition", new FFFactory); global_ff_registry->Register("Model2BinaryFeatures", new FFFactory); diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h index 75911f38..6f86f2f9 100644 --- a/decoder/ff_factory.h +++ b/decoder/ff_factory.h @@ -21,8 +21,8 @@ class FFRegistry { void DisplayList() const; void Register(const std::string& ffname, FFFactoryBase* factory); void Register(FFFactoryBase* factory); - private: FFRegistry() {} + private: std::map > reg_; }; diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h index 32525466..5fef53fd 100755 --- a/decoder/oracle_bleu.h +++ b/decoder/oracle_bleu.h @@ -195,7 +195,6 @@ struct OracleBleu { std::ostringstream kbest_string_stream; kbest_string_stream << conf["forest_output"].as() << "/kbest_"< global_ff_registry; +namespace { +struct init_ff { + init_ff() { + global_ff_registry.reset(new FFRegistry); + global_ff_registry->Register(new FFFactory); + } +}; +init_ff reg; +} using namespace std; namespace po = boost::program_options; typedef SparseVector Dir; +typedef Dir Point; -MT19937 rng; + +void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { + if (min_dist<=0) return; + double max_s=1.-min_dist; + unsigned N=dirs.size(); + for (int i=0;imax_s) { + if (log) *log << "Collapsing similar directions (T="< "<(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value(),"[REQD] Path to forest repository") + ("weights,w",po::value(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") + ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") + ("oracle_directions,O",po::value()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") + ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") + ("oracle_batch,b",po::value()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") + ("max_similarity,m",po::value()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") + ("fear_to_hope,f","for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w \n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r \n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } + } + + int main(int argc, char *argv[]) { + po::variables_map conf; + InitCommandLine(argc,argv,&conf); + UseConf(conf); + Run(); + return 0; + } + + void Run() { + AddPrimaryAndRandomDirections(); + AddOracleDirections(); + compress_similar(directions,max_similarity); + Print(); + } + + + Point origin; // old weights that gave model 1best. + vector optimize_features; + void UseConf(po::variables_map const& conf) { + oracle.UseConf(conf); + + include_primary=!conf.count("no_primary"); + if (conf.count("optimize_feature") > 0) + optimize_features=conf["optimize_feature"].as >(); + fear_to_hope=conf.count("fear_to_hope"); + n_random=conf["random_directions"].as(); + forest_repository=conf["forest_repository"].as(); +// dev_set_size=conf["dev_set_size"].as(); + n_oracle=conf["oracle_directions"].as(); + oracle_batch=conf["oracle_batch"].as(); + max_similarity=conf["max_similarity"].as(); + weights_file=conf["weights"].as(); + + Init(); + } + + string weights_file; + double max_similarity; + unsigned n_oracle, oracle_batch; string forest_repository; unsigned dev_set_size; vector dirs; //best_to_hope_dirs @@ -32,14 +166,28 @@ struct oracle_directions { return o.str(); } - void set_dev_set_size(int i) { - dev_set_size=i; - dirs.resize(dev_set_size); + oracle_directions() { } + + void Init() { + start_random=false; + assert(DirectoryExists(forest_repository)); + vector features; + weights.InitFromFile(weights_file, &features); + if (optimize_features.size()) + features=optimize_features; + weights.InitSparseVector(&origin); + fids.clear(); + AddFeatureIds(features); } - oracle_directions(string forest_repository="",unsigned dev_set_sz=0,vector const& fids=vector()): forest_repository(forest_repository),fids(fids) { - set_dev_set_size(dev_set_sz); - } + Weights weights; + void AddFeatureIds(vector const& features) { + int i = fids.size(); + fids.resize(fids.size()+features.size()); + for (; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + } + Dir const& operator[](unsigned i) { Dir &dir=dirs[i]; @@ -52,112 +200,20 @@ struct oracle_directions { return dir; } // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed - void add_directions(vector &dirs,unsigned n,unsigned batchsz=20,bool start_random=false) { + void AddOracleDirections() { MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1); unsigned b=0; - for(unsigned i=0;i=dev_set_size)?rsg():b]; - d/=(double)batchsz; + d/=(double)oracle_batch; } } - }; -void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { - if (min_dist<=0) return; - double max_s=1.-min_dist; - unsigned N=dirs.size(); - for (int i=0;imax_s) { - if (log) *log << "Collapsing similar directions (T="< "<(),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value(),"[REQD] Path to forest repository") - ("weights,w",po::value(),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value()->default_value(20),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_batch,b",po::value()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -d N\n"; - flag = true; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w \n"; - flag = true; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r \n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - Weights weights; - vector features; - weights.InitFromFile(conf["weights"].as(), &features); - vector fids(features.size()); - for (int i = 0; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - - oracle_directions od(conf["forest_repository"].as() - , conf["dev_set_size"].as() - , fids - ); -; - assert(DirectoryExists(od.forest_repository)); - SparseVector origin; - weights.InitSparseVector(&origin); - if (conf.count("optimize_feature") > 0) - features=conf["optimize_feature"].as >(); - vector > axes; - LineOptimizer::CreateOptimizationDirections( - fids, - conf["random_directions"].as(), - &rng, - &axes, - !conf.count("no_primary") - ); - od.add_directions(axes,conf["oracle_directions"].as(),conf["oracle_batch"].as()); - compress_similar(axes,conf["max_similarity"].as()); - for (int i = 0; i < od.dev_set_size; ++i) - for (int j = 0; j < axes.size(); ++j) - cout << od.forest_file(i) <<" " << i << ' ' << origin << ' ' << axes[j] << endl; - return 0; + oracle_directions od; + return od.main(argc,argv); } -- cgit v1.2.3