diff options
| author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-13 02:01:07 +0000 | 
|---|---|---|
| committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-13 02:01:07 +0000 | 
| commit | 2d6a5896df99904b0c492e77168489636d545869 (patch) | |
| tree | bebf9b97cddbee40a95c026d1608c0cccf6bb49f /vest | |
| parent | 77c25d9f30f95ccb7843f9dce71a4f4e018cc727 (diff) | |
vest: combine over-similar search directions, exclude primary directions, skeleton for oracle directions
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@227 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'vest')
| -rwxr-xr-x | vest/dist-vest.pl | 13 | ||||
| -rw-r--r-- | vest/line_optimizer.cc | 29 | ||||
| -rw-r--r-- | vest/line_optimizer.h | 4 | ||||
| -rw-r--r-- | vest/mr_vest_generate_mapper_input.cc | 91 | 
4 files changed, 114 insertions, 23 deletions
| diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index f8f79ee5..32a8edbd 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -49,6 +49,11 @@ my $iniFile;  my $weights;  my $initialWeights;  my $decoderOpt; +my $noprimary; +my $maxsim=0; +my $oraclen=0; +my $oracleb=20; +my $dirargs='';  # Process command-line options  Getopt::Long::Configure("no_auto_abbrev"); @@ -66,6 +71,11 @@ if (GetOptions(  	"normalize=s" => \$normalize,  	"pmem=s" => \$pmem,  	"rand-directions=i" => \$rand_directions, +        "no-primary!" => \$noprimary, +        "max-similarity=s" => \$maxsim, +        "oracle-directions=i" => \$oraclen, +        "oracle-batch=i" => \$oracleb, +        "directions-args=s" => \$dirargs,  	"ref-files=s" => \$refFiles,  	"metric=s" => \$metric,  	"source-file=s" => \$srcFile, @@ -234,7 +244,8 @@ while (1){  		print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";  		print STDERR `date`;  		$icc++; -		$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; +        my $nop=$noprimary?"--no_primary":""; +		$cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $dirargs > $dir/agenda.$im1-$opt_iter";  		print STDERR "COMMAND:\n$cmd\n";  		$result = system($cmd);  		unless ($result == 0){ diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc index b6410c35..e8b40237 100644 --- a/vest/line_optimizer.cc +++ b/vest/line_optimizer.cc @@ -15,7 +15,7 @@ struct IntervalComp {    bool operator() (const ErrorIter& a, const ErrorIter& b) const {      return a->x < b->x;    } -};	 +};  double LineOptimizer::LineOptimize(      const vector<ErrorSurface>& surfaces, @@ -89,15 +89,22 @@ void LineOptimizer::CreateOptimizationDirections(       const vector<int>& features_to_optimize,       int additional_random_directions,       RandomNumberGenerator<boost::mt19937>* rng, -     vector<SparseVector<double> >* dirs) { +     vector<SparseVector<double> >* dirs +     , bool include_orthogonal +  ) {    const int num_directions = features_to_optimize.size() + additional_random_directions; -  dirs->resize(num_directions); -  for (int i = 0; i < num_directions; ++i) { -    SparseVector<double>& axis = (*dirs)[i]; -    if (i < features_to_optimize.size()) -      axis.set_value(features_to_optimize[i], 1.0); -    else -      RandomUnitVector(features_to_optimize, &axis, rng); -  } -  cerr << "Generated " << num_directions << " total axes to optimize along.\n"; +  dirs->clear(); +  typedef SparseVector<double> Dir; +  vector<Dir> &out=*dirs; +  int i=0; +  if (include_orthogonal) +    for (;i<features_to_optimize.size();++i) { +      Dir d; +      d.set_value(features_to_optimize[i],1.); +      out.push_back(d); +    } +  out.resize(i+additional_random_directions); +  for (;i<out.size();++i) +     RandomUnitVector(features_to_optimize, &out[i], rng); +  cerr << "Generated " << out.size() << " total axes to optimize along.\n";  } diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h index 43164360..1b5382cd 100644 --- a/vest/line_optimizer.h +++ b/vest/line_optimizer.h @@ -37,7 +37,9 @@ struct LineOptimizer {       const std::vector<int>& primary,       int additional_random_directions,       RandomNumberGenerator<boost::mt19937>* rng, -     std::vector<SparseVector<double> >* dirs); +     std::vector<SparseVector<double> >* dirs +     , bool include_primary=true +    );  }; diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index c96a61e4..9e702e2f 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -1,5 +1,6 @@  #include <iostream>  #include <vector> +#include <sstream>  #include <boost/program_options.hpp>  #include <boost/program_options/variables_map.hpp> @@ -7,10 +8,70 @@  #include "filelib.h"  #include "weights.h"  #include "line_optimizer.h" +#include "hg.h" +#include "hg_io.h"  using namespace std;  namespace po = boost::program_options; +typedef SparseVector<double> Dir; + +typedef RandomNumberGenerator<boost::mt19937> RNG; +RNG rng; + +struct oracle_directions { +  string forest_repository; +  unsigned dev_set_size; +  vector<Dir> dirs; //best_to_hope_dirs +  vector<int> fids; +  string forest_file(unsigned i) const { +    ostringstream o; +    o << forest_repository << '/' << i << ".json.gz"; +    return o.str(); +  } + +  oracle_directions(string forest_repository,unsigned dev_set_size,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),dev_set_size(dev_set_size),fids(fids) { +    dirs.resize(dev_set_size); +  } +  Dir const& operator[](unsigned i) { +    Dir &dir=dirs[i]; +    if (dir.empty()) { +      ReadFile rf(forest_file(i)); +      Hypergraph hg; +      HypergraphIO::ReadFromJSON(rf.stream(), &hg); +      cerr<<"oracle: forest["<<i<<"] loaded: "<<hg.stats()<<endl; +      //TODO: get hope/oracle from vlad.  random for now. +      LineOptimizer::RandomUnitVector(fids,&dir,&rng); +    } +    return dir; +  } + +}; + +void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { +  if (min_dist<=0) return; +  double max_s=1.-min_dist; +  unsigned N=dirs.size(); +  for (int i=0;i<N;++i) { +    for (int j=i+1;j<N;++j) { +      double s=dirs[i].tanimoto_coef(dirs[j]); +      if (s>max_s) { +        if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<").  dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"; +        if (avg) { +          dirs[i]+=dirs[j]; +          dirs[i]/=2.; +          if (log) *log<<" averaged="<<dirs[i]; +        } +        if (log) *log<<endl; +        swap(dirs[j],dirs[--N]); +      } +    } +  } +  dirs.resize(N); +} + + +  void InitCommandLine(int argc, char** argv, po::variables_map* conf) {    po::options_description opts("Configuration options");    opts.add_options() @@ -19,6 +80,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {          ("weights,w",po::value<string>(),"[REQD] Current feature weights file")          ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")          ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") +    ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") +    ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") +    ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") +    ("max_similarity,m",po::value<double>()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)).  0 means don't filter, 1 means only 1 direction allowed?")          ("help,h", "Help");    po::options_description dcmdline_options;    dcmdline_options.add(opts); @@ -43,30 +108,36 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {  }  int main(int argc, char** argv) { -  RandomNumberGenerator<boost::mt19937> rng;    po::variables_map conf;    InitCommandLine(argc, argv, &conf);    Weights weights;    vector<string> features;    weights.InitFromFile(conf["weights"].as<string>(), &features); -  const string forest_repository = conf["forest_repository"].as<string>(); -  assert(DirectoryExists(forest_repository)); +  vector<int> fids(features.size()); +  for (int i = 0; i < features.size(); ++i) +    fids[i] = FD::Convert(features[i]); + +  oracle_directions od(conf["forest_repository"].as<string>() +                       , conf["dev_set_size"].as<unsigned int>() +                       , fids +    ); +; +  assert(DirectoryExists(od.forest_repository));    SparseVector<double> origin;    weights.InitSparseVector(&origin);    if (conf.count("optimize_feature") > 0)      features=conf["optimize_feature"].as<vector<string> >();    vector<SparseVector<double> > axes; -  vector<int> fids(features.size()); -  for (int i = 0; i < features.size(); ++i) -    fids[i] = FD::Convert(features[i]);    LineOptimizer::CreateOptimizationDirections(       fids,       conf["random_directions"].as<unsigned int>(),       &rng, -     &axes); -  int dev_set_size = conf["dev_set_size"].as<unsigned int>(); -  for (int i = 0; i < dev_set_size; ++i) +     &axes, +     !conf.count("no_primary") +    ); +  compress_similar(axes,conf["max_similarity"].as<double>()); +  for (int i = 0; i < od.dev_set_size; ++i)      for (int j = 0; j < axes.size(); ++j) -      cout << forest_repository << '/' << i << ".json.gz " << i << ' ' << origin << ' ' << axes[j] << endl; +      cout << od.forest_file(i) <<" " << i << ' ' << origin << ' ' << axes[j] << endl;    return 0;  } | 
