summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 02:01:07 +0000
committer graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 02:01:07 +0000
commit 2d6a5896df99904b0c492e77168489636d545869 (patch)
tree bebf9b97cddbee40a95c026d1608c0cccf6bb49f
parent 77c25d9f30f95ccb7843f9dce71a4f4e018cc727 (diff)
vest: combine over-similar search directions, exclude primary directions, skeleton for oracle directions
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@227 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- decoder/grammar.cc 9
-rw-r--r-- decoder/sparse_vector.h 32
-rwxr-xr-x vest/dist-vest.pl 13
-rw-r--r-- vest/line_optimizer.cc 29
-rw-r--r-- vest/line_optimizer.h 4
-rw-r--r-- vest/mr_vest_generate_mapper_input.cc 91
6 files changed, 142 insertions, 36 deletions
diff --git a/decoder/grammar.cc b/decoder/grammar.cc
index 5eb7887d..499e79fe 100644
--- a/decoder/grammar.cc
+++ b/decoder/grammar.cc
@@ -71,7 +71,7 @@ struct TGImpl {
};
TextGrammar::TextGrammar() : max_span_(10), pimpl_(new TGImpl) {}
-TextGrammar::TextGrammar(const string& file) :
+TextGrammar::TextGrammar(const string& file) :
max_span_(10),
pimpl_(new TGImpl) {
ReadFromFile(file);
@@ -104,7 +104,7 @@ void TextGrammar::ReadFromFile(const string& filename) {
RuleLexer::ReadRules(in.stream(), &AddRuleHelper, this);
}
-bool TextGrammar::HasRuleForSpan(int i, int j, int distance) const {
+bool TextGrammar::HasRuleForSpan(int /* i */, int /* j */, int distance) const {
return (max_span_ >= distance);
}
@@ -121,8 +121,7 @@ GlueGrammar::GlueGrammar(const string& goal_nt, const string& default_nt) {
//cerr << "GLUE: " << glue->AsString() << endl;
}
-bool GlueGrammar::HasRuleForSpan(int i, int j, int distance) const {
- (void) j;
+bool GlueGrammar::HasRuleForSpan(int i, int /* j */, int /* distance */) const {
return (i == 0);
}
@@ -141,7 +140,7 @@ PassThroughGrammar::PassThroughGrammar(const Lattice& input, const string& cat)
}
}
-bool PassThroughGrammar::HasRuleForSpan(int i, int j, int distance) const {
+bool PassThroughGrammar::HasRuleForSpan(int i, int j, int /* distance */) const {
const set<int>& hr = has_rule_[i];
if (i == j) { return !hr.empty(); }
return (hr.find(j) != hr.end());
diff --git a/decoder/sparse_vector.h b/decoder/sparse_vector.h
index be91f324..bfdeebcc 100644
--- a/decoder/sparse_vector.h
+++ b/decoder/sparse_vector.h
@@ -58,7 +58,7 @@ public:
}
int max_index() const {
- if (values_.empty()) return 0;
+ if (empty()) return 0;
typename MapType::const_iterator found =values_.end();
--found;
return found->first;
@@ -75,6 +75,18 @@ public:
}
template<typename S>
+ S cosine_sim(const SparseVector<S> &vec) const { // dot product of *this and vec divided by the product of their L2 norms
+ return dot(vec)/(l2norm()*vec.l2norm()); // NOTE(review): the divisor is zero when either norm is zero -- confirm callers never pass an all-zero vector
+ }
+
+  // Tanimoto coefficient of *this and vec: A.B / (|A|^2 + |B|^2 - A.B).
+  // if values are binary, gives |A intersect B|/|A union B|
+  template<typename S>
+  S tanimoto_coef(const SparseVector<S> &vec) const {
+    S dp=dot(vec);
+    // The squared norms are summed, not multiplied: for binary vectors |A|^2 and |B|^2 are the
+    // set sizes, so the denominator is |A|+|B|-|A intersect B| = |A union B|.
+    return dp/(l2norm_sq()+vec.l2norm_sq()-dp);
+  }
+
+ template<typename S>
S dot(const SparseVector<S> &vec) const {
S sum = 0;
for (typename MapType::const_iterator
@@ -119,12 +131,16 @@ public:
return sum;
}
- T l2norm() const {
+ T l2norm_sq() const {
T sum = 0;
for (typename MapType::const_iterator
it = values_.begin(); it != values_.end(); ++it)
sum += it->second * it->second;
- return sqrt(sum);
+ return sum;
+ }
+
+ T l2norm() const {
+ return sqrt(l2norm_sq());
}
SparseVector<T> &operator+=(const SparseVector<T> &other) {
@@ -149,14 +165,14 @@ public:
return *this;
}
- SparseVector<T> &operator-=(const double &x) {
+ SparseVector<T> &operator-=(T const& x) {
for (typename MapType::iterator
it = values_.begin(); it != values_.end(); ++it)
it->second -= x;
return *this;
}
- SparseVector<T> &operator+=(const double &x) {
+ SparseVector<T> &operator+=(T const& x) {
for (typename MapType::iterator
it = values_.begin(); it != values_.end(); ++it)
it->second += x;
@@ -177,17 +193,17 @@ public:
return *this;
}
- SparseVector<T> operator+(const double &x) const {
+ SparseVector<T> operator+(T const& x) const {
SparseVector<T> result = *this;
return result += x;
}
- SparseVector<T> operator-(const double &x) const {
+ SparseVector<T> operator-(T const& x) const {
SparseVector<T> result = *this;
return result -= x;
}
- SparseVector<T> operator/(const double &x) const {
+ SparseVector<T> operator/(T const& x) const {
SparseVector<T> result = *this;
return result /= x;
}
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index f8f79ee5..32a8edbd 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -49,6 +49,11 @@ my $iniFile;
my $weights;
my $initialWeights;
my $decoderOpt;
+my $noprimary;
+my $maxsim=0;
+my $oraclen=0;
+my $oracleb=20;
+my $dirargs='';
# Process command-line options
Getopt::Long::Configure("no_auto_abbrev");
@@ -66,6 +71,11 @@ if (GetOptions(
"normalize=s" => \$normalize,
"pmem=s" => \$pmem,
"rand-directions=i" => \$rand_directions,
+ "no-primary!" => \$noprimary,
+ "max-similarity=s" => \$maxsim,
+ "oracle-directions=i" => \$oraclen,
+ "oracle-batch=i" => \$oracleb,
+ "directions-args=s" => \$dirargs,
"ref-files=s" => \$refFiles,
"metric=s" => \$metric,
"source-file=s" => \$srcFile,
@@ -234,7 +244,8 @@ while (1){
print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
print STDERR `date`;
$icc++;
- $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
+    # Forward --no-primary to the mapper-input generator as --no_primary; $nop is empty when the
+    # flag was not given, so the command is unchanged in that case apart from an extra space.
+    my $nop=$noprimary?"--no_primary":"";
+    $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions $nop --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $dirargs > $dir/agenda.$im1-$opt_iter";
print STDERR "COMMAND:\n$cmd\n";
$result = system($cmd);
unless ($result == 0){
diff --git a/vest/line_optimizer.cc b/vest/line_optimizer.cc
index b6410c35..e8b40237 100644
--- a/vest/line_optimizer.cc
+++ b/vest/line_optimizer.cc
@@ -15,7 +15,7 @@ struct IntervalComp {
bool operator() (const ErrorIter& a, const ErrorIter& b) const {
return a->x < b->x;
}
-};
+};
double LineOptimizer::LineOptimize(
const vector<ErrorSurface>& surfaces,
@@ -89,15 +89,22 @@ void LineOptimizer::CreateOptimizationDirections(
const vector<int>& features_to_optimize,
int additional_random_directions,
RandomNumberGenerator<boost::mt19937>* rng,
- vector<SparseVector<double> >* dirs) {
+ vector<SparseVector<double> >* dirs
+ , bool include_orthogonal
+ ) {
const int num_directions = features_to_optimize.size() + additional_random_directions;
- dirs->resize(num_directions);
- for (int i = 0; i < num_directions; ++i) {
- SparseVector<double>& axis = (*dirs)[i];
- if (i < features_to_optimize.size())
- axis.set_value(features_to_optimize[i], 1.0);
- else
- RandomUnitVector(features_to_optimize, &axis, rng);
- }
- cerr << "Generated " << num_directions << " total axes to optimize along.\n";
+ dirs->clear();
+ typedef SparseVector<double> Dir;
+ vector<Dir> &out=*dirs;
+ int i=0;
+ if (include_orthogonal)
+ for (;i<features_to_optimize.size();++i) {
+ Dir d;
+ d.set_value(features_to_optimize[i],1.);
+ out.push_back(d);
+ }
+ out.resize(i+additional_random_directions);
+ for (;i<out.size();++i)
+ RandomUnitVector(features_to_optimize, &out[i], rng);
+ cerr << "Generated " << out.size() << " total axes to optimize along.\n";
}
diff --git a/vest/line_optimizer.h b/vest/line_optimizer.h
index 43164360..1b5382cd 100644
--- a/vest/line_optimizer.h
+++ b/vest/line_optimizer.h
@@ -37,7 +37,9 @@ struct LineOptimizer {
const std::vector<int>& primary,
int additional_random_directions,
RandomNumberGenerator<boost::mt19937>* rng,
- std::vector<SparseVector<double> >* dirs);
+ std::vector<SparseVector<double> >* dirs
+ , bool include_primary=true
+ );
};
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index c96a61e4..9e702e2f 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -1,5 +1,6 @@
#include <iostream>
#include <vector>
+#include <sstream>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
@@ -7,10 +8,70 @@
#include "filelib.h"
#include "weights.h"
#include "line_optimizer.h"
+#include "hg.h"
+#include "hg_io.h"
using namespace std;
namespace po = boost::program_options;
+typedef SparseVector<double> Dir;
+
+typedef RandomNumberGenerator<boost::mt19937> RNG;
+RNG rng;
+
+// Table of search directions indexed 0..dev_set_size-1, each filled in on first access.
+// Skeleton: the forest is loaded but the direction itself is still a random unit vector.
+struct oracle_directions {
+  string forest_repository;
+  unsigned dev_set_size;
+  vector<Dir> dirs; //best_to_hope_dirs
+  vector<int> fids;
+
+  oracle_directions(string forest_repository,unsigned dev_set_size,vector<int> const& fids=vector<int>())
+    : forest_repository(forest_repository)
+    , dev_set_size(dev_set_size)
+    , dirs(dev_set_size) // one initially-empty slot per index
+    , fids(fids) {}
+
+  // Path of forest i: <forest_repository>/<i>.json.gz
+  string forest_file(unsigned i) const {
+    ostringstream path;
+    path << forest_repository << '/' << i << ".json.gz";
+    return path.str();
+  }
+
+  // Direction for index i; an empty slot triggers the forest load and fills dirs[i].
+  Dir const& operator[](unsigned i) {
+    Dir &cached=dirs[i];
+    if (!cached.empty()) return cached;
+    ReadFile forest(forest_file(i));
+    Hypergraph hg;
+    HypergraphIO::ReadFromJSON(forest.stream(), &hg);
+    cerr<<"oracle: forest["<<i<<"] loaded: "<<hg.stats()<<endl;
+    //TODO: get hope/oracle from vlad. random for now.
+    LineOptimizer::RandomUnitVector(fids,&cached,&rng);
+    return cached;
+  }
+};
+
+// Collapse pairs of directions whose Tanimoto coefficient exceeds 1-min_dist.
+//   dirs     - edited in place; for each collapsed pair (i,j) the later entry j is dropped.
+//   min_dist - values <= 0 disable the filter entirely.
+//   log      - optional stream that receives one line per collapsed pair (may be null).
+//   avg      - when true, dirs[i] is replaced by the mean of the pair before dirs[j] is dropped.
+void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true) {
+  if (min_dist<=0) return;
+  double max_s=1.-min_dist;
+  unsigned N=dirs.size(); // dirs[0..N) are the live entries; dropped ones are swapped to [N..size)
+  for (unsigned i=0;i<N;++i) {
+    for (unsigned j=i+1;j<N;) { // j advances only when dirs[j] is kept
+      double s=dirs[i].tanimoto_coef(dirs[j]);
+      if (!(s>max_s)) { ++j; continue; }
+      if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]";
+      if (avg) {
+        dirs[i]+=dirs[j];
+        dirs[i]/=2.;
+        if (log) *log<<" averaged="<<dirs[i];
+      }
+      if (log) *log<<endl;
+      // The entry swapped into slot j has not been compared with dirs[i] yet, so j must not
+      // advance here (the previous ++j skipped it).
+      swap(dirs[j],dirs[--N]);
+    }
+  }
+  dirs.resize(N);
+}
+
+
+
void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
@@ -19,6 +80,10 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("weights,w",po::value<string>(),"[REQD] Current feature weights file")
("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+ ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
+ ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
+ ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
+    ("max_similarity,m",po::value<double>()->default_value(0),"collapse directions that are too similar: a pair is merged when its Tanimoto coeff. is greater than (1-this). 0 means don't filter, 1 means any pair with a positive coeff. is merged")
("help,h", "Help");
po::options_description dcmdline_options;
dcmdline_options.add(opts);
@@ -43,30 +108,36 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
int main(int argc, char** argv) {
-  RandomNumberGenerator<boost::mt19937> rng;
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
   Weights weights;
   vector<string> features;
   weights.InitFromFile(conf["weights"].as<string>(), &features);
-  const string forest_repository = conf["forest_repository"].as<string>();
-  assert(DirectoryExists(forest_repository));
+  // --optimize_feature has to replace the feature list before fids is derived from it;
+  // applied afterwards (as this hunk first had it) the option no longer reaches
+  // CreateOptimizationDirections.
+  if (conf.count("optimize_feature") > 0)
+    features=conf["optimize_feature"].as<vector<string> >();
+  vector<int> fids(features.size());
+  for (unsigned i = 0; i < features.size(); ++i)
+    fids[i] = FD::Convert(features[i]);
+
+  oracle_directions od(conf["forest_repository"].as<string>()
+                       , conf["dev_set_size"].as<unsigned int>()
+                       , fids
+                       );
+  assert(DirectoryExists(od.forest_repository));
   SparseVector<double> origin;
   weights.InitSparseVector(&origin);
-  if (conf.count("optimize_feature") > 0)
-    features=conf["optimize_feature"].as<vector<string> >();
   vector<SparseVector<double> > axes;
-  vector<int> fids(features.size());
-  for (int i = 0; i < features.size(); ++i)
-    fids[i] = FD::Convert(features[i]);
   LineOptimizer::CreateOptimizationDirections(
     fids,
     conf["random_directions"].as<unsigned int>(),
     &rng,
-    &axes);
-  int dev_set_size = conf["dev_set_size"].as<unsigned int>();
-  for (int i = 0; i < dev_set_size; ++i)
+    &axes,
+    !conf.count("no_primary")
+    );
+  // Drop near-duplicate axes before emitting one agenda line per (forest, axis) pair.
+  compress_similar(axes,conf["max_similarity"].as<double>());
+  for (unsigned i = 0; i < od.dev_set_size; ++i)
     for (int j = 0; j < axes.size(); ++j)
-      cout << forest_repository << '/' << i << ".json.gz " << i << ' ' << origin << ' ' << axes[j] << endl;
+      cout << od.forest_file(i) <<" " << i << ' ' << origin << ' ' << axes[j] << endl;
   return 0;
}