diff options
author | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-13 02:01:07 +0000 |
---|---|---|
committer | graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-13 02:01:07 +0000 |
commit | 2d6a5896df99904b0c492e77168489636d545869 (patch) | |
tree | bebf9b97cddbee40a95c026d1608c0cccf6bb49f | |
parent | 77c25d9f30f95ccb7843f9dce71a4f4e018cc727 (diff) |
vest: combine over-similar search directions, exclude primary directions, skeleton for oracle directions
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@227 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- | decoder/grammar.cc | 9 |
-rw-r--r-- | decoder/sparse_vector.h | 32 |
-rwxr-xr-x | vest/dist-vest.pl | 13 |
-rw-r--r-- | vest/line_optimizer.cc | 29 |
-rw-r--r-- | vest/line_optimizer.h | 4 |
-rw-r--r-- | vest/mr_vest_generate_mapper_input.cc | 91 |
6 files changed, 142 insertions, 36 deletions
diff --git a/decoder/grammar.cc b/decoder/grammar.cc index 5eb7887d..499e79fe 100644 --- a/decoder/grammar.cc +++ b/decoder/grammar.cc @@ -71,7 +71,7 @@ struct TGImpl { }; TextGrammar::TextGrammar() : max_span_(10), pimpl_(new TGImpl) {} -TextGrammar::TextGrammar(const string& file) : +TextGrammar::TextGrammar(const string& file) : max_span_(10), pimpl_(new TGImpl) { ReadFromFile(file); @@ -104,7 +104,7 @@ void TextGrammar::ReadFromFile(const string& filename) { RuleLexer::ReadRules(in.stream(), &AddRuleHelper, this); } -bool TextGrammar::HasRuleForSpan(int i, int j, int distance) const { +bool TextGrammar::HasRuleForSpan(int /* i */, int /* j */, int distance) const { return (max_span_ >= distance); } @@ -121,8 +121,7 @@ GlueGrammar::GlueGrammar(const string& goal_nt, const string& default_nt) { //cerr << "GLUE: " << glue->AsString() << endl; } -bool GlueGrammar::HasRuleForSpan(int i, int j, int distance) const { - (void) j; +bool GlueGrammar::HasRuleForSpan(int i, int /* j */, int /* distance */) const { return (i == 0); } @@ -141,7 +140,7 @@ PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat) } } -bool PassThroughGrammar::HasRuleForSpan(int i, int j, int distance) const { +bool PassThroughGrammar::HasRuleForSpan(int i, int j, int /* distance */) const { const set<int>& hr = has_rule_[i]; if (i == j) { return !hr.empty(); } return (hr.find(j) != hr.end()); diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h index be91f324..bfdeebcc 100644 --- a/decoder/sparse_vector.h +++ b/decoder/sparse_vector.h @@ -58,7 +58,7 @@ public: } int max_index() const { - if (values_.empty()) return 0; + if (empty()) return 0; typename MapType::const_iterator found =values_.end(); --found; return found->first; @@ -75,6 +75,18 @@ public: } template<typename S> + S cosine_sim(const SparseVector<S> &vec) const { + return dot(vec)/(l2norm()*vec.l2norm()); + } + + // if values are binary, gives |A intersect B|/|A union B| + template<typename S> + S 
tanimoto_coef(const SparseVector<S> &vec) const { + S dp=dot(vec); + return dp/(l2norm_sq()*vec.l2norm_sq()-dp); + } + + template<typename S> S dot(const SparseVector<S> &vec) const { S sum = 0; for (typename MapType::const_iterator @@ -119,12 +131,16 @@ public: return sum; } - T l2norm() const { + T l2norm_sq() const { T sum = 0; for (typename MapType::const_iterator it = values_.begin(); it != values_.end(); ++it) sum += it->second * it->second; - return sqrt(sum); + return sum; + } + + T l2norm() const { + return sqrt(l2norm_sq()); } SparseVector<T> &operator+=(const SparseVector<T> &other) { @@ -149,14 +165,14 @@ public: return *this; } - SparseVector<T> &operator-=(const double &x) { + SparseVector<T> &operator-=(T const& x) { for (typename MapType::iterator it = values_.begin(); it != values_.end(); ++it) it->second -= x; return *this; } - SparseVector<T> &operator+=(const double &x) { + SparseVector<T> &operator+=(T const& x) { for (typename MapType::iterator it = values_.begin(); it != values_.end(); ++it) it->second += x; @@ -177,17 +193,17 @@ public: return *this; } - SparseVector<T> operator+(const double &x) const { + SparseVector<T> operator+(T const& x) const { SparseVector<T> result = *this; return result += x; } - SparseVector<T> operator-(const double &x) const { + SparseVector<T> operator-(T const& x) const { SparseVector<T> result = *this; return result -= x; } - SparseVector<T> operator/(const double &x) const { + SparseVector<T> operator/(T const& x) const { SparseVector<T> result = *this; return result /= x; } diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl index f8f79ee5..32a8edbd 100755 --- a/vest/dist-vest.pl +++ b/vest/dist-vest.pl @@ -49,6 +49,11 @@ my $iniFile; my $weights; my $initialWeights; my $decoderOpt; +my $noprimary; +my $maxsim=0; +my $oraclen=0; +my $oracleb=20; +my $dirargs=''; # Process command-line options Getopt::Long::Configure("no_auto_abbrev"); @@ -66,6 +71,11 @@ if (GetOptions( "normalize=s" => \$normalize, "pmem=s" 
=> \$pmem, "rand-directions=i" => \$rand_directions, + "no-primary!" => \$noprimary, + "max-similarity=s" => \$maxsim, + "oracle-directions=i" => \$oraclen, + "oracle-batch=i" => \$oracleb, + "directions-args=s" => \$dirargs, "ref-files=s" => \$refFiles, "metric=s" => \$metric, "source-file=s" => \$srcFile, @@ -234,7 +244,8 @@ while (1){ print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n"; print STDERR `date`; $icc++; - $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter"; + my $nop=$noprimary?"--no_primary":""; + $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $dirargs > $dir/agenda.$im1-$opt_iter"; print STDERR "COMMAND:\n$cmd\n"; $result = system($cmd); unless ($result == 0){ diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc index b6410c35..e8b40237 100644 --- a/vest/line_optimizer.cc +++ b/vest/line_optimizer.cc @@ -15,7 +15,7 @@ struct IntervalComp { bool operator() (const ErrorIter& a, const ErrorIter& b) const { return a->x < b->x; } -}; +}; double LineOptimizer::LineOptimize( const vector<ErrorSurface>& surfaces, @@ -89,15 +89,22 @@ void LineOptimizer::CreateOptimizationDirections( const vector<int>& features_to_optimize, int additional_random_directions, RandomNumberGenerator<boost::mt19937>* rng, - vector<SparseVector<double> >* dirs) { + vector<SparseVector<double> >* dirs + , bool include_orthogonal + ) { const int num_directions = features_to_optimize.size() + additional_random_directions; - dirs->resize(num_directions); - for (int i = 0; i < num_directions; ++i) { - SparseVector<double>& axis = (*dirs)[i]; - if (i < features_to_optimize.size()) - axis.set_value(features_to_optimize[i], 1.0); - else - RandomUnitVector(features_to_optimize, &axis, rng); - } - cerr << "Generated " << num_directions << " total axes to optimize along.\n"; + 
dirs->clear(); + typedef SparseVector<double> Dir; + vector<Dir> &out=*dirs; + int i=0; + if (include_orthogonal) + for (;i<features_to_optimize.size();++i) { + Dir d; + d.set_value(features_to_optimize[i],1.); + out.push_back(d); + } + out.resize(i+additional_random_directions); + for (;i<out.size();++i) + RandomUnitVector(features_to_optimize, &out[i], rng); + cerr << "Generated " << out.size() << " total axes to optimize along.\n"; } diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h index 43164360..1b5382cd 100644 --- a/vest/line_optimizer.h +++ b/vest/line_optimizer.h @@ -37,7 +37,9 @@ struct LineOptimizer { const std::vector<int>& primary, int additional_random_directions, RandomNumberGenerator<boost::mt19937>* rng, - std::vector<SparseVector<double> >* dirs); + std::vector<SparseVector<double> >* dirs + , bool include_primary=true + ); }; diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc index c96a61e4..9e702e2f 100644 --- a/vest/mr_vest_generate_mapper_input.cc +++ b/vest/mr_vest_generate_mapper_input.cc @@ -1,5 +1,6 @@ #include <iostream> #include <vector> +#include <sstream> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> @@ -7,10 +8,70 @@ #include "filelib.h" #include "weights.h" #include "line_optimizer.h" +#include "hg.h" +#include "hg_io.h" using namespace std; namespace po = boost::program_options; +typedef SparseVector<double> Dir; + +typedef RandomNumberGenerator<boost::mt19937> RNG; +RNG rng; + +struct oracle_directions { + string forest_repository; + unsigned dev_set_size; + vector<Dir> dirs; //best_to_hope_dirs + vector<int> fids; + string forest_file(unsigned i) const { + ostringstream o; + o << forest_repository << '/' << i << ".json.gz"; + return o.str(); + } + + oracle_directions(string forest_repository,unsigned dev_set_size,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),dev_set_size(dev_set_size),fids(fids) { + 
dirs.resize(dev_set_size); + } + Dir const& operator[](unsigned i) { + Dir &dir=dirs[i]; + if (dir.empty()) { + ReadFile rf(forest_file(i)); + Hypergraph hg; + HypergraphIO::ReadFromJSON(rf.stream(), &hg); + cerr<<"oracle: forest["<<i<<"] loaded: "<<hg.stats()<<endl; + //TODO: get hope/oracle from vlad. random for now. + LineOptimizer::RandomUnitVector(fids,&dir,&rng); + } + return dir; + } + +}; + +void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true) { + if (min_dist<=0) return; + double max_s=1.-min_dist; + unsigned N=dirs.size(); + for (int i=0;i<N;++i) { + for (int j=i+1;j<N;++j) { + double s=dirs[i].tanimoto_coef(dirs[j]); + if (s>max_s) { + if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"; + if (avg) { + dirs[i]+=dirs[j]; + dirs[i]/=2.; + if (log) *log<<" averaged="<<dirs[i]; + } + if (log) *log<<endl; + swap(dirs[j],dirs[--N]); + } + } + } + dirs.resize(N); +} + + + void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() @@ -19,6 +80,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("weights,w",po::value<string>(),"[REQD] Current feature weights file") ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)") ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in") + ("no_primary,n","don't use the primary (orthogonal each feature alone) directions") + ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.") + ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences") + 
("max_similarity,m",po::value<double>()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?") ("help,h", "Help"); po::options_description dcmdline_options; dcmdline_options.add(opts); @@ -43,30 +108,36 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } int main(int argc, char** argv) { - RandomNumberGenerator<boost::mt19937> rng; po::variables_map conf; InitCommandLine(argc, argv, &conf); Weights weights; vector<string> features; weights.InitFromFile(conf["weights"].as<string>(), &features); - const string forest_repository = conf["forest_repository"].as<string>(); - assert(DirectoryExists(forest_repository)); + vector<int> fids(features.size()); + for (int i = 0; i < features.size(); ++i) + fids[i] = FD::Convert(features[i]); + + oracle_directions od(conf["forest_repository"].as<string>() + , conf["dev_set_size"].as<unsigned int>() + , fids + ); +; + assert(DirectoryExists(od.forest_repository)); SparseVector<double> origin; weights.InitSparseVector(&origin); if (conf.count("optimize_feature") > 0) features=conf["optimize_feature"].as<vector<string> >(); vector<SparseVector<double> > axes; - vector<int> fids(features.size()); - for (int i = 0; i < features.size(); ++i) - fids[i] = FD::Convert(features[i]); LineOptimizer::CreateOptimizationDirections( fids, conf["random_directions"].as<unsigned int>(), &rng, - &axes); - int dev_set_size = conf["dev_set_size"].as<unsigned int>(); - for (int i = 0; i < dev_set_size; ++i) + &axes, + !conf.count("no_primary") + ); + compress_similar(axes,conf["max_similarity"].as<double>()); + for (int i = 0; i < od.dev_set_size; ++i) for (int j = 0; j < axes.size(); ++j) - cout << forest_repository << '/' << i << ".json.gz " << i << ' ' << origin << ' ' << axes[j] << endl; + cout << od.forest_file(i) <<" " << i << ' ' << origin << ' ' << axes[j] << endl; return 0; } |