// Overview of what is visible on the two physical lines below:
//  * A file-local static object (init_ff reg) whose constructor resets global_ff_registry to a new
//    FFRegistry and registers one FFFactory with it.
//  * compress_similar(dirs,min_dist,log,avg): returns immediately when min_dist<=0; otherwise it derives
//    max_s=1-min_dist and, when `log` is non-null, reports "Collapsing similar directions" for a pair that
//    trips the max_s test. The loop body itself is not visible here.
//  * The command-line option table and the tail of InitCommandLine: parsed options are stored into *conf;
//    if dev_set_size, weights or forest_repository is absent, or help is requested, the option summary is
//    written to cerr and the process exits with status 1.
//  * Member main(argc,argv): InitCommandLine, then UseConf, then Run; returns 0.
//  * Run(): AddPrimaryAndRandomDirections, AddOracleDirections, compress_similar(directions,max_similarity),
//    then Print.
//
// NOTE(review): everything that stood between a '<' and the following '>' appears to be missing from this
//   text (include targets, template arguments, the loops of compress_similar, the struct header and the
//   start of InitCommandLine) - restore from the pristine file before relying on any of it.
// NOTE(review): no po::notify(*conf) call is visible after po::store(...). In Boost.Program_options the
//   po::value(&var) / po::bool_switch(&var) targets are written by notify, so the bound members may never
//   receive their values - confirm.
// NOTE(review): the missing-dev_set_size message suggests "-d N", but the only option visibly declared with
//   short name 'd' is random_directions; the short name of dev_set_size is not visible here - confirm.
#include #include #include #include #include #include "sampler.h" #include "filelib.h" #include "weights.h" #include "line_optimizer.h" #include "hg.h" #include "hg_io.h" #include "scorer.h" #include "oracle_bleu.h" #include "ff_bleu.h" boost::shared_ptr global_ff_registry; namespace { struct init_ff { init_ff() { global_ff_registry.reset(new FFRegistry); global_ff_registry->Register(new FFFactory); } }; init_ff reg; } using namespace std; namespace po = boost::program_options; typedef SparseVector Dir; typedef Dir Point; void compress_similar(vector &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { if (min_dist<=0) return; double max_s=1.-min_dist; unsigned N=dirs.size(); for (int i=0;imax_s) { if (log) *log << "Collapsing similar directions (T="< "<(&dev_set_size),"[REQD] Development set size (# of parallel sentences)") ("forest_repository,r",po::value(&forest_repository),"[REQD] Path to forest repository") ("weights,w",po::value(&weights_file),"[REQD] Current feature weights file") ("optimize_feature,o",po::value >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") ("random_directions,d",po::value(&random_directions)->default_value(10),"Number of random directions to run the line optimizer in") ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") ("oracle_directions,O",po::value(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it") ("oracle_batch,b",po::value(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") ("max_similarity,m",po::value(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 
0 means don't filter, 1 means only 1 direction allowed?") ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)") ("decoder_translations",po::value(&decoder_translations)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); bool flag = false; if (conf->count("dev_set_size") == 0) { cerr << "Please specify the size of the development set using -d N\n"; flag = true; } if (conf->count("weights") == 0) { cerr << "Please specify the starting-point weights using -w \n"; flag = true; } if (conf->count("forest_repository") == 0) { cerr << "Please specify the forest repository location using -r \n"; flag = true; } if (flag || conf->count("help")) { cerr << dcmdline_options << endl; exit(1); } } int main(int argc, char *argv[]) { po::variables_map conf; InitCommandLine(argc,argv,&conf); UseConf(conf); Run(); return 0; } void Run() { AddPrimaryAndRandomDirections(); AddOracleDirections(); compress_similar(directions,max_similarity); Print(); } Point origin; // old weights that gave model 1best. 
vector optimize_features; void UseConf(po::variables_map const& conf) { oracle.UseConf(conf); include_primary=!conf.count("no_primary"); old_to_hope=!conf.count("no_old_to_hope"); if (conf.count("optimize_feature") > 0) optimize_features=conf["optimize_feature"].as >(); // po::value(&var) takes care of below: // fear_to_hope=conf.count("fear_to_hope"); // n_random=conf["random_directions"].as(); // forest_repository=conf["forest_repository"].as(); // dev_set_size=conf["dev_set_size"].as(); // n_oracle=conf["oracle_directions"].as(); // oracle_batch=conf["oracle_batch"].as(); // max_similarity=conf["max_similarity"].as(); // weights_file=conf["weights"].as(); Init(); } string weights_file; double max_similarity; unsigned n_oracle, oracle_batch; string forest_repository; unsigned dev_set_size; vector oracles; vector fids; string forest_file(unsigned i) const { ostringstream o; o << forest_repository << '/' << i << ".json.gz"; return o.str(); } oracle_directions() { } void Init() { start_random=false; assert(DirectoryExists(forest_repository)); vector features; weights.InitFromFile(weights_file, &features); if (optimize_features.size()) features=optimize_features; weights.InitSparseVector(&origin); fids.clear(); AddFeatureIds(features); oracles.resize(dev_set_size); } Weights weights; void AddFeatureIds(vector const& features) { int i = fids.size(); fids.resize(fids.size()+features.size()); for (; i < features.size(); ++i) fids[i] = FD::Convert(features[i]); } std::string decoder_translations_file; // one per line //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). 
// NOTE(review): the first words of the next physical line ("translations are in run.raw.N.gz - new arg")
//   read as the tail of the TODO comment that precedes it rather than as code - confirm against the
//   pristine file.
//
// ComputeOracle(i): returns a reference to oracles[i]. When that entry reports is_null(), it first opens
//   forest_file(i), reads a Hypergraph from it with HypergraphIO::ReadFromJSON (inside a scope that holds
//   a Timer labelled with the file name), and stores the result of
//   oracle.ComputeOracles(MakeMetadata(hg,i),&hg,origin,&cerr) into the entry.
//
// AddOracleDirections(): for each batch it sums o.ModelHopeGradient() into o2hope and, when fear_to_hope
//   is set, o.FearHopeGradient() into fear2hope; each sum is divided by oracle_batch and pushed onto
//   `directions`. The sentence index is either rsg() or the running counter b.
// NOTE(review): the text between "for(unsigned i=0;i" and "=dev_set_size)" appears to be missing, so the
//   loop headers and the full index-selection condition are not visible here.
// NOTE(review): dev_set_size is unsigned, so dev_set_size-1 wraps to the largest unsigned value when
//   dev_set_size is 0; what rng.inclusive does with that range is not visible - confirm that 0 is
//   rejected earlier.
//
// The trailing free function main constructs an oracle_directions and returns od.main(argc,argv).
translations are in run.raw.N.gz - new arg Oracle const& ComputeOracle(unsigned i) { Oracle &o=oracles[i]; if (o.is_null()) { ReadFile rf(forest_file(i)); Hypergraph hg; { Timer t("Loading forest from JSON "+forest_file(i)); HypergraphIO::ReadFromJSON(rf.stream(), &hg); } o=oracle.ComputeOracles(MakeMetadata(hg,i),&hg,origin,&cerr); } return o; } // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed void AddOracleDirections() { MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1); unsigned b=0; for(unsigned i=0;i=dev_set_size) ? rsg() : b); o2hope+=o.ModelHopeGradient(); if (fear_to_hope) fear2hope+=o.FearHopeGradient(); } double N=(double)oracle_batch; o2hope/=N; directions.push_back(o2hope); if (fear_to_hope) { fear2hope/=N; directions.push_back(fear2hope); } } } }; int main(int argc, char** argv) { oracle_directions od; return od.main(argc,argv); }