diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-16 01:56:17 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-16 01:56:17 +0000 |
commit | 06a90130833d695b588170495e20c3443431c1e7 (patch) | |
tree | c779d2edfd4ec964aed1e814c6404d5c450653f3 | |
parent | 000093dc417088be9a99278dc59203a30f976289 (diff) |
refactor vest mapper input; --optimize_feature (s) should now limit non-oracle directions
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@273 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/Makefile.am | 3 | ||||
-rw-r--r-- | decoder/cdec_ff.cc | 2 | ||||
-rw-r--r-- | decoder/ff_factory.h | 2 | ||||
-rwxr-xr-x | decoder/oracle_bleu.h | 1 | ||||
-rw-r--r-- | vest/mr_vest_generate_mapper_input.cc | 270 |
5 files changed, 167 insertions, 111 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am index e7b6abd8..a34aba1a 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -13,7 +13,7 @@ noinst_PROGRAMS = \ small_vector_test endif -cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc +cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc timing_stats.cc small_vector_test_SOURCES = small_vector_test.cc small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a parser_test_SOURCES = parser_test.cc @@ -75,6 +75,7 @@ libcdec_a_SOURCES = \ ff_csplit.cc \ ff_tagger.cc \ ff_bleu.cc \ + ff_factory.cc \ ../vest/scorer.cc \ ../vest/ter.cc \ ../vest/aer_scorer.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index c91780e2..069e07f1 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -21,7 +21,7 @@ void register_feature_functions() { global_ff_registry->Register(new FFFactory<WordPenalty>); global_ff_registry->Register(new FFFactory<SourceWordPenalty>); global_ff_registry->Register(new FFFactory<ArityPenalty>); - global_ff_registry->Register("BLEUModel", new FFFactory<BLEUModel>); + global_ff_registry->Register(new FFFactory<BLEUModel>); global_ff_registry->Register("RuleShape", new FFFactory<RuleShapeFeatures>); global_ff_registry->Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>); global_ff_registry->Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>); diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h index 75911f38..6f86f2f9 100644 --- a/decoder/ff_factory.h +++ b/decoder/ff_factory.h @@ -21,8 +21,8 @@ class FFRegistry { void DisplayList() const; void Register(const std::string& ffname, FFFactoryBase* factory); void Register(FFFactoryBase* factory); - private: FFRegistry() {} + private: std::map<std::string, boost::shared_ptr<FFFactoryBase> > reg_; }; diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h index 32525466..5fef53fd 100755 --- a/decoder/oracle_bleu.h +++ b/decoder/oracle_bleu.h @@ -195,7 +195,6 @@ struct OracleBleu { std::ostringstream kbest_string_stream; kbest_string_stream << conf["forest_output"].as<std::string>() << "/kbest_"<<suffix<< "." << sent_id; DumpKBest(sent_id, forest, k, unique, kbest_string_stream.str()); - } }; diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index c0f80d0c..e9a5650b 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -13,15 +13,149 @@ #include "hg_io.h" #include "scorer.h" #include "oracle_bleu.h" +#include "ff_bleu.h" + +boost::shared_ptr<FFRegistry> global_ff_registry; +namespace { +struct init_ff { + init_ff() { + global_ff_registry.reset(new FFRegistry); + global_ff_registry->Register(new FFFactory<BLEUModel>); + } +}; +init_ff reg; +} using namespace std; namespace po = boost::program_options; typedef SparseVector<double> Dir; +typedef Dir Point; -MT19937 rng; + +void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { + if (min_dist<=0) return; + double max_s=1.-min_dist; + unsigned N=dirs.size(); + for (int i=0;i<N;++i) { + for (int j=i+1;j<N;++j) { + double s=dirs[i].tanimoto_coef(dirs[j]); + if (s>max_s) { + if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"; + if (avg) { + dirs[i]+=dirs[j]; + dirs[i]/=2.; + if (log) *log<<" averaged="<<dirs[i]; + } + if (log) *log<<endl; + swap(dirs[j],dirs[--N]); + } + } + } + dirs.resize(N); +} struct oracle_directions { + MT19937 rng; + OracleBleu oracle; + vector<Dir> directions; + + bool start_random; + bool include_primary; + bool fear_to_hope; + unsigned n_random; + void AddPrimaryAndRandomDirections() { + LineOptimizer::CreateOptimizationDirections( + fids,n_random,&rng,&directions,include_primary); + } + + void Print() { + for (int i = 0; i < dev_set_size; ++i) + for (int j = 0; j < directions.size(); ++j) + cout << forest_file(i) <<" " << i << ' ' << origin << ' ' << directions[j] << endl; + } + + void AddOptions(po::options_description *opts) { + oracle.AddOptions(opts); + } + + void InitCommandLine(int argc, char *argv[], po::variables_map *conf) { + po::options_description opts("Configuration options"); + OracleBleu::AddOptions(&opts); + opts.add_options() + ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") + ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") + ("weights,w",po::value<string>(),"[REQD] Current feature weights file") + ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") + ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") + ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") + ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") + ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") + ("max_similarity,m",po::value<double>()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") + ("fear_to_hope,f","for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + bool flag = false; + if (conf->count("dev_set_size") == 0) { + cerr << "Please specify the size of the development set using -d N\n"; + flag = true; + } + if (conf->count("weights") == 0) { + cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; + flag = true; + } + if (conf->count("forest_repository") == 0) { + cerr << "Please specify the forest repository location using -r <DIR>\n"; + flag = true; + } + if (flag || conf->count("help")) { + cerr << dcmdline_options << endl; + exit(1); + } + } + + int main(int argc, char *argv[]) { + po::variables_map conf; + InitCommandLine(argc,argv,&conf); + UseConf(conf); + Run(); + return 0; + } + + void Run() { + AddPrimaryAndRandomDirections(); + AddOracleDirections(); + compress_similar(directions,max_similarity); + Print(); + } + + + Point origin; // old weights that gave model 1best. + vector<string> optimize_features; + void UseConf(po::variables_map const& conf) { + oracle.UseConf(conf); + + include_primary=!conf.count("no_primary"); + if (conf.count("optimize_feature") > 0) + optimize_features=conf["optimize_feature"].as<vector<string> >(); + fear_to_hope=conf.count("fear_to_hope"); + n_random=conf["random_directions"].as<unsigned int>(); + forest_repository=conf["forest_repository"].as<string>(); +// dev_set_size=conf["dev_set_size"].as<unsigned int>(); + n_oracle=conf["oracle_directions"].as<unsigned>(); + oracle_batch=conf["oracle_batch"].as<unsigned>(); + max_similarity=conf["max_similarity"].as<double>(); + weights_file=conf["weights"].as<string>(); + + Init(); + } + + string weights_file; + double max_similarity; + unsigned n_oracle, oracle_batch; string forest_repository; unsigned dev_set_size; vector<Dir> dirs; //best_to_hope_dirs @@ -32,14 +166,28 @@ struct oracle_directions { return o.str(); } - void set_dev_set_size(int i) { - dev_set_size=i; - dirs.resize(dev_set_size); + oracle_directions() { } + + void Init() { + start_random=false; + assert(DirectoryExists(forest_repository)); + vector<string> features; + weights.InitFromFile(weights_file, &features); + if (optimize_features.size()) + features=optimize_features; + weights.InitSparseVector(&origin); + fids.clear(); + AddFeatureIds(features); } - oracle_directions(string forest_repository="",unsigned dev_set_sz=0,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),fids(fids) { - set_dev_set_size(dev_set_sz); - } + Weights weights; + void AddFeatureIds(vector<string> const& features) { + int i = fids.size(); + fids.resize(fids.size()+features.size()); + for (; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + } + Dir const& operator[](unsigned i) { Dir &dir=dirs[i]; @@ -52,112 +200,20 @@ struct oracle_directions { return dir; } // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed - void add_directions(vector<Dir> &dirs,unsigned n,unsigned batchsz=20,bool start_random=false) { + void AddOracleDirections() { MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1); unsigned b=0; - for(unsigned i=0;i<n;++i) { - dirs.push_back(Dir()); - Dir &d=dirs.back(); - for (unsigned j=0;j<batchsz;++j,++b) + for(unsigned i=0;i<n_oracle;++i) { + directions.push_back(Dir()); + Dir &d=directions.back(); + for (unsigned j=0;j<oracle_batch;++j,++b) d+=(*this)[(start_random || b>=dev_set_size)?rsg():b]; - d/=(double)batchsz; + d/=(double)oracle_batch; } } - }; -void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { - if (min_dist<=0) return; - double max_s=1.-min_dist; - unsigned N=dirs.size(); - for (int i=0;i<N;++i) { - for (int j=i+1;j<N;++j) { - double s=dirs[i].tanimoto_coef(dirs[j]); - if (s>max_s) { - if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"; - if (avg) { - dirs[i]+=dirs[j]; - dirs[i]/=2.; - if (log) *log<<" averaged="<<dirs[i]; - } - if (log) *log<<endl; - swap(dirs[j],dirs[--N]); - } - } - } - dirs.resize(N); -} - - - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - OracleBleu::AddOptions(&opts); - opts.add_options() - ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)") - ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository") - ("weights,w",po::value<string>(),"[REQD] Current feature weights file") - ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") - ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") - ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") - ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") - ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") - ("max_similarity,m",po::value<double>()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - bool flag = false; - if (conf->count("dev_set_size") == 0) { - cerr << "Please specify the size of the development set using -d N\n"; - flag = true; - } - if (conf->count("weights") == 0) { - cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n"; - flag = true; - } - if (conf->count("forest_repository") == 0) { - cerr << "Please specify the forest repository location using -r <DIR>\n"; - flag = true; - } - if (flag || conf->count("help")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - Weights weights; - vector<string> features; - weights.InitFromFile(conf["weights"].as<string>(), &features); - vector<int> fids(features.size()); - for (int i = 0; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); - - oracle_directions od(conf["forest_repository"].as<string>() - , conf["dev_set_size"].as<unsigned int>() - , fids - ); -; - assert(DirectoryExists(od.forest_repository)); - SparseVector<double> origin; - weights.InitSparseVector(&origin); - if (conf.count("optimize_feature") > 0) - features=conf["optimize_feature"].as<vector<string> >(); - vector<SparseVector<double> > axes; - LineOptimizer::CreateOptimizationDirections( - fids, - conf["random_directions"].as<unsigned int>(), - &rng, - &axes, - !conf.count("no_primary") - ); - od.add_directions(axes,conf["oracle_directions"].as<unsigned>(),conf["oracle_batch"].as<unsigned>()); - compress_similar(axes,conf["max_similarity"].as<double>()); - for (int i = 0; i < od.dev_set_size; ++i) - for (int j = 0; j < axes.size(); ++j) - cout << od.forest_file(i) <<" " << i << ' ' << origin << ' ' << axes[j] << endl; - return 0; + oracle_directions od; + return od.main(argc,argv); } |