summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-16 01:56:17 +0000
committer: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-16 01:56:17 +0000
commit: 1671e12ef0b069a5d2ae1c2d4fea20b9b1087af3 (patch)
tree: 1518f8d42839d0039a9546cf3b06e1e976c8468a
parent: 4037e35c511aec96f780276aa4e3c1493e19eba1 (diff)
refactor vest mapper input; --optimize_feature (s) should now limit non-oracle directions
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@273 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r-- decoder/Makefile.am 3
-rw-r--r-- decoder/cdec_ff.cc 2
-rw-r--r-- decoder/ff_factory.h 2
-rwxr-xr-x decoder/oracle_bleu.h 1
-rw-r--r-- vest/mr_vest_generate_mapper_input.cc 270
5 files changed, 167 insertions, 111 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index e7b6abd8..a34aba1a 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -13,7 +13,7 @@ noinst_PROGRAMS = \
small_vector_test
endif
-cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc ff_factory.cc timing_stats.cc
+cdec_SOURCES = cdec.cc forest_writer.cc maxtrans_blunsom.cc cdec_ff.cc timing_stats.cc
small_vector_test_SOURCES = small_vector_test.cc
small_vector_test_LDADD = $(GTEST_LDFLAGS) $(GTEST_LIBS) libcdec.a
parser_test_SOURCES = parser_test.cc
@@ -75,6 +75,7 @@ libcdec_a_SOURCES = \
ff_csplit.cc \
ff_tagger.cc \
ff_bleu.cc \
+ ff_factory.cc \
../vest/scorer.cc \
../vest/ter.cc \
../vest/aer_scorer.cc \
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index c91780e2..069e07f1 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -21,7 +21,7 @@ void register_feature_functions() {
global_ff_registry->Register(new FFFactory<WordPenalty>);
global_ff_registry->Register(new FFFactory<SourceWordPenalty>);
global_ff_registry->Register(new FFFactory<ArityPenalty>);
- global_ff_registry->Register("BLEUModel", new FFFactory<BLEUModel>);
+ global_ff_registry->Register(new FFFactory<BLEUModel>);
global_ff_registry->Register("RuleShape", new FFFactory<RuleShapeFeatures>);
global_ff_registry->Register("RelativeSentencePosition", new FFFactory<RelativeSentencePosition>);
global_ff_registry->Register("Model2BinaryFeatures", new FFFactory<Model2BinaryFeatures>);
diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h
index 75911f38..6f86f2f9 100644
--- a/decoder/ff_factory.h
+++ b/decoder/ff_factory.h
@@ -21,8 +21,8 @@ class FFRegistry {
void DisplayList() const;
void Register(const std::string& ffname, FFFactoryBase* factory);
void Register(FFFactoryBase* factory);
- private:
FFRegistry() {}
+ private:
std::map<std::string, boost::shared_ptr<FFFactoryBase> > reg_;
};
diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h
index 32525466..5fef53fd 100755
--- a/decoder/oracle_bleu.h
+++ b/decoder/oracle_bleu.h
@@ -195,7 +195,6 @@ struct OracleBleu {
std::ostringstream kbest_string_stream;
kbest_string_stream << conf["forest_output"].as<std::string>() << "/kbest_"<<suffix<< "." << sent_id;
DumpKBest(sent_id, forest, k, unique, kbest_string_stream.str());
-
}
};
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index c0f80d0c..e9a5650b 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -13,15 +13,149 @@
#include "hg_io.h"
#include "scorer.h"
#include "oracle_bleu.h"
+#include "ff_bleu.h"
+
+boost::shared_ptr<FFRegistry> global_ff_registry;
+namespace {
+struct init_ff {
+ init_ff() {
+ global_ff_registry.reset(new FFRegistry);
+ global_ff_registry->Register(new FFFactory<BLEUModel>);
+ }
+};
+init_ff reg;
+}
using namespace std;
namespace po = boost::program_options;
typedef SparseVector<double> Dir;
+typedef Dir Point;
-MT19937 rng;
+
+void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true) {
+ if (min_dist<=0) return;
+ double max_s=1.-min_dist;
+ unsigned N=dirs.size();
+ for (int i=0;i<N;++i) {
+ for (int j=i+1;j<N;++j) {
+ double s=dirs[i].tanimoto_coef(dirs[j]);
+ if (s>max_s) {
+ if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]";
+ if (avg) {
+ dirs[i]+=dirs[j];
+ dirs[i]/=2.;
+ if (log) *log<<" averaged="<<dirs[i];
+ }
+ if (log) *log<<endl;
+ swap(dirs[j],dirs[--N]);
+ }
+ }
+ }
+ dirs.resize(N);
+}
struct oracle_directions {
+ MT19937 rng;
+ OracleBleu oracle;
+ vector<Dir> directions;
+
+ bool start_random;
+ bool include_primary;
+ bool fear_to_hope;
+ unsigned n_random;
+ void AddPrimaryAndRandomDirections() {
+ LineOptimizer::CreateOptimizationDirections(
+ fids,n_random,&rng,&directions,include_primary);
+ }
+
+ void Print() {
+ for (int i = 0; i < dev_set_size; ++i)
+ for (int j = 0; j < directions.size(); ++j)
+ cout << forest_file(i) <<" " << i << ' ' << origin << ' ' << directions[j] << endl;
+ }
+
+ void AddOptions(po::options_description *opts) {
+ oracle.AddOptions(opts);
+ }
+
+ void InitCommandLine(int argc, char *argv[], po::variables_map *conf) {
+ po::options_description opts("Configuration options");
+ OracleBleu::AddOptions(&opts);
+ opts.add_options()
+ ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)")
+ ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+ ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+ ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+ ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+ ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
+ ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
+ ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it")
+ ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
+ ("max_similarity,m",po::value<double>()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?")
+ ("fear_to_hope,f","for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (conf->count("dev_set_size") == 0) {
+ cerr << "Please specify the size of the development set using -d N\n";
+ flag = true;
+ }
+ if (conf->count("weights") == 0) {
+ cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+ flag = true;
+ }
+ if (conf->count("forest_repository") == 0) {
+ cerr << "Please specify the forest repository location using -r <DIR>\n";
+ flag = true;
+ }
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
+ }
+ }
+
+ int main(int argc, char *argv[]) {
+ po::variables_map conf;
+ InitCommandLine(argc,argv,&conf);
+ UseConf(conf);
+ Run();
+ return 0;
+ }
+
+ void Run() {
+ AddPrimaryAndRandomDirections();
+ AddOracleDirections();
+ compress_similar(directions,max_similarity);
+ Print();
+ }
+
+
+ Point origin; // old weights that gave model 1best.
+ vector<string> optimize_features;
+ void UseConf(po::variables_map const& conf) {
+ oracle.UseConf(conf);
+
+ include_primary=!conf.count("no_primary");
+ if (conf.count("optimize_feature") > 0)
+ optimize_features=conf["optimize_feature"].as<vector<string> >();
+ fear_to_hope=conf.count("fear_to_hope");
+ n_random=conf["random_directions"].as<unsigned int>();
+ forest_repository=conf["forest_repository"].as<string>();
+// dev_set_size=conf["dev_set_size"].as<unsigned int>();
+ n_oracle=conf["oracle_directions"].as<unsigned>();
+ oracle_batch=conf["oracle_batch"].as<unsigned>();
+ max_similarity=conf["max_similarity"].as<double>();
+ weights_file=conf["weights"].as<string>();
+
+ Init();
+ }
+
+ string weights_file;
+ double max_similarity;
+ unsigned n_oracle, oracle_batch;
string forest_repository;
unsigned dev_set_size;
vector<Dir> dirs; //best_to_hope_dirs
@@ -32,14 +166,28 @@ struct oracle_directions {
return o.str();
}
- void set_dev_set_size(int i) {
- dev_set_size=i;
- dirs.resize(dev_set_size);
+ oracle_directions() { }
+
+ void Init() {
+ start_random=false;
+ assert(DirectoryExists(forest_repository));
+ vector<string> features;
+ weights.InitFromFile(weights_file, &features);
+ if (optimize_features.size())
+ features=optimize_features;
+ weights.InitSparseVector(&origin);
+ fids.clear();
+ AddFeatureIds(features);
}
- oracle_directions(string forest_repository="",unsigned dev_set_sz=0,vector<int> const& fids=vector<int>()): forest_repository(forest_repository),fids(fids) {
- set_dev_set_size(dev_set_sz);
- }
+ Weights weights;
+ void AddFeatureIds(vector<string> const& features) {
+ int i = fids.size();
+ fids.resize(fids.size()+features.size());
+ for (; i < features.size(); ++i)
+ fids[i] = FD::Convert(features[i]);
+ }
+
Dir const& operator[](unsigned i) {
Dir &dir=dirs[i];
@@ -52,112 +200,20 @@ struct oracle_directions {
return dir;
}
// if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed
- void add_directions(vector<Dir> &dirs,unsigned n,unsigned batchsz=20,bool start_random=false) {
+ void AddOracleDirections() {
MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1);
unsigned b=0;
- for(unsigned i=0;i<n;++i) {
- dirs.push_back(Dir());
- Dir &d=dirs.back();
- for (unsigned j=0;j<batchsz;++j,++b)
+ for(unsigned i=0;i<n_oracle;++i) {
+ directions.push_back(Dir());
+ Dir &d=directions.back();
+ for (unsigned j=0;j<oracle_batch;++j,++b)
d+=(*this)[(start_random || b>=dev_set_size)?rsg():b];
- d/=(double)batchsz;
+ d/=(double)oracle_batch;
}
}
-
};
-void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true) {
- if (min_dist<=0) return;
- double max_s=1.-min_dist;
- unsigned N=dirs.size();
- for (int i=0;i<N;++i) {
- for (int j=i+1;j<N;++j) {
- double s=dirs[i].tanimoto_coef(dirs[j]);
- if (s>max_s) {
- if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]";
- if (avg) {
- dirs[i]+=dirs[j];
- dirs[i]/=2.;
- if (log) *log<<" averaged="<<dirs[i];
- }
- if (log) *log<<endl;
- swap(dirs[j],dirs[--N]);
- }
- }
- }
- dirs.resize(N);
-}
-
-
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- OracleBleu::AddOptions(&opts);
- opts.add_options()
- ("dev_set_size,s",po::value<unsigned int>(),"[REQD] Development set size (# of parallel sentences)")
- ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
- ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
- ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
- ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
- ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
- ("oracle_directions,O",po::value<unsigned>()->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
- ("oracle_batch,b",po::value<unsigned>()->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
- ("max_similarity,m",po::value<double>()->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?")
- ("help,h", "Help");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- bool flag = false;
- if (conf->count("dev_set_size") == 0) {
- cerr << "Please specify the size of the development set using -d N\n";
- flag = true;
- }
- if (conf->count("weights") == 0) {
- cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
- flag = true;
- }
- if (conf->count("forest_repository") == 0) {
- cerr << "Please specify the forest repository location using -r <DIR>\n";
- flag = true;
- }
- if (flag || conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- Weights weights;
- vector<string> features;
- weights.InitFromFile(conf["weights"].as<string>(), &features);
- vector<int> fids(features.size());
- for (int i = 0; i < features.size(); ++i)
- fids[i] = FD::Convert(features[i]);
-
- oracle_directions od(conf["forest_repository"].as<string>()
- , conf["dev_set_size"].as<unsigned int>()
- , fids
- );
-;
- assert(DirectoryExists(od.forest_repository));
- SparseVector<double> origin;
- weights.InitSparseVector(&origin);
- if (conf.count("optimize_feature") > 0)
- features=conf["optimize_feature"].as<vector<string> >();
- vector<SparseVector<double> > axes;
- LineOptimizer::CreateOptimizationDirections(
- fids,
- conf["random_directions"].as<unsigned int>(),
- &rng,
- &axes,
- !conf.count("no_primary")
- );
- od.add_directions(axes,conf["oracle_directions"].as<unsigned>(),conf["oracle_batch"].as<unsigned>());
- compress_similar(axes,conf["max_similarity"].as<double>());
- for (int i = 0; i < od.dev_set_size; ++i)
- for (int j = 0; j < axes.size(); ++j)
- cout << od.forest_file(i) <<" " << i << ' ' << origin << ' ' << axes[j] << endl;
- return 0;
+ oracle_directions od;
+ return od.main(argc,argv);
}