summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2012-01-27 13:19:27 -0500
committer Chris Dyer <cdyer@cs.cmu.edu> 2012-01-27 13:19:27 -0500
commit 203c3c3357b9ed8cfe44932c2bf5ea19eba6238c (patch)
tree c446f8e8afbe194ef656b33cfc643f83633cf18c
parent 481a120564fdb73c8c6833e2102acb533683261c (diff)
migration to new metric api for vest, clean up of unsupported/not functional code
-rw-r--r-- mteval/mbr_kbest.cc 21
-rw-r--r-- utils/fast_sparse_vector.h 6
-rwxr-xr-x vest/dist-vest.pl 22
-rw-r--r-- vest/mbr_kbest.cc 138
-rw-r--r-- vest/mr_vest_generate_mapper_input.cc 356
-rw-r--r-- vest/mr_vest_map.cc 16
6 files changed, 84 insertions, 475 deletions
diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc
index 64a6a8bf..b5e4750c 100644
--- a/mteval/mbr_kbest.cc
+++ b/mteval/mbr_kbest.cc
@@ -5,7 +5,7 @@
#include "prob.h"
#include "tdict.h"
-#include "scorer.h"
+#include "ns.h"
#include "filelib.h"
#include "stringlib.h"
@@ -17,7 +17,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)")
- ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function")
+ ("evaluation_metric,m",po::value<string>()->default_value("ibm_bleu"), "Evaluation metric")
("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from")
("output_list,L", "Show reranked list as output")
("help,h", "Help");
@@ -75,13 +75,14 @@ bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, pro
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
- const string metric = conf["loss_function"].as<string>();
+ const string smetric = conf["evaluation_metric"].as<string>();
+ EvaluationMetric* metric = EvaluationMetric::Instance(smetric);
+ const bool is_loss = (UppercaseString(smetric) == "TER");
const bool output_list = conf.count("output_list") > 0;
const string file = conf["input"].as<string>();
const double mbr_scale = conf["scale"].as<double>();
cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl;
- ScoreType type = ScoreTypeFromString(metric);
vector<pair<vector<WordID>, prob_t> > list;
ReadFile rf(file);
string sent_id;
@@ -99,15 +100,15 @@ int main(int argc, char** argv) {
vector<double> mbr_scores(output_list ? list.size() : 0);
double mbr_loss = numeric_limits<double>::max();
for (int i = 0 ; i < list.size(); ++i) {
- vector<vector<WordID> > refs(1, list[i].first);
- //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl;
- ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs);
+ const vector<vector<WordID> > refs(1, list[i].first);
+
double wl_acc = 0;
for (int j = 0; j < list.size(); ++j) {
if (i != j) {
- ScoreP s = scorer->ScoreCandidate(list[j].first);
- double loss = 1.0 - s->ComputeScore();
- if (type == TER || type == AER) loss = 1.0 - loss;
+ SufficientStats ss;
+ metric->ComputeSufficientStatistics(list[j].first, refs, &ss);
+ double loss = 1.0 - metric->ComputeScore(ss);
+ if (is_loss) loss = 1.0 - loss;
double weighted_loss = loss * (joints[j] / marginal).as_float();
wl_acc += weighted_loss;
if ((!output_list) && wl_acc > mbr_loss) break;
diff --git a/utils/fast_sparse_vector.h b/utils/fast_sparse_vector.h
index 1301581a..17fa47bf 100644
--- a/utils/fast_sparse_vector.h
+++ b/utils/fast_sparse_vector.h
@@ -178,6 +178,12 @@ class FastSparseVector {
T l2norm() const {
return sqrt(l2norm_sq());
}
+ T pnorm(const double p) const {
+ T sum = T();
+ for (const_iterator it = begin(), e = end(); it != e; ++it)
+ sum += pow(fabs(it->second), p);
+ return pow(sum, 1.0 / p);
+ }
// if values are binary, gives |A intersect B|/|A union B|
template<typename S>
S tanimoto_coef(const FastSparseVector<S> &vec) const {
diff --git a/vest/dist-vest.pl b/vest/dist-vest.pl
index 8cde748b..1ec8c6b1 100755
--- a/vest/dist-vest.pl
+++ b/vest/dist-vest.pl
@@ -65,8 +65,6 @@ my $oraclen=0;
my $oracleb=20;
my $bleu_weight=1;
my $use_make = 1; # use make to parallelize line search
-my $dirargs='';
-my $density_prune;
my $useqsub;
my $pass_suffix = '';
my $cpbin=1;
@@ -75,7 +73,6 @@ Getopt::Long::Configure("no_auto_abbrev");
if (GetOptions(
"decoder=s" => \$decoderOpt,
"jobs=i" => \$jobs,
- "density-prune=f" => \$density_prune,
"dont-clean" => \$disable_clean,
"pass-suffix=s" => \$pass_suffix,
"dry-run" => \$dryrun,
@@ -87,15 +84,7 @@ if (GetOptions(
"normalize=s" => \$normalize,
"pmem=s" => \$pmem,
"cpbin!" => \$cpbin,
- "rand-directions=i" => \$rand_directions,
- "random_directions=i" => \$rand_directions,
- "bleu_weight=s" => \$bleu_weight,
- "no-primary!" => \$noprimary,
- "max-similarity=s" => \$maxsim,
- "oracle-directions=i" => \$oraclen,
- "n-oracle=i" => \$oraclen,
- "oracle-batch=i" => \$oracleb,
- "directions-args=s" => \$dirargs,
+ "random-directions=i" => \$rand_directions,
"ref-files=s" => \$refFiles,
"metric=s" => \$metric,
"source-file=s" => \$srcFile,
@@ -107,10 +96,6 @@ if (GetOptions(
exit;
}
-if (defined $density_prune) {
- die "--density_prune n: n must be greater than 1.0\n" unless $density_prune > 1.0;
-}
-
if ($useqsub) {
$use_make = 0;
die "LocalEnvironment.pm does not have qsub configuration for this host. Cannot run with --qsub!\n" unless has_qsub();
@@ -328,10 +313,7 @@ while (1){
print STDERR "\nGENERATE OPTIMIZATION STRATEGY (OPT-ITERATION $opt_iter/$optimization_iters)\n";
print STDERR unchecked_output("date");
$icc++;
- my $nop=$noprimary?"--no_primary":"";
- my $targs=$oraclen ? "--decoder_translations='$runFile.gz' ".get_comma_sep_refs('-references',$refFiles):"";
- my $bwargs=$bleu_weight!=1 ? "--bleu_weight=$bleu_weight":"";
- $cmd="$MAPINPUT -w $inweights -r $dir/hgs $bwargs -s $devSize -d $rand_directions --max_similarity=$maxsim --oracle_directions=$oraclen --oracle_batch=$oracleb $targs $dirargs > $dir/agenda.$im1-$opt_iter";
+ $cmd="$MAPINPUT -w $inweights -r $dir/hgs -s $devSize -d $rand_directions > $dir/agenda.$im1-$opt_iter";
print STDERR "COMMAND:\n$cmd\n";
check_call($cmd);
check_call("mkdir -p $dir/splag.$im1");
diff --git a/vest/mbr_kbest.cc b/vest/mbr_kbest.cc
deleted file mode 100644
index 2867b36b..00000000
--- a/vest/mbr_kbest.cc
+++ /dev/null
@@ -1,138 +0,0 @@
-#include <iostream>
-#include <vector>
-
-#include <boost/program_options.hpp>
-
-#include "prob.h"
-#include "tdict.h"
-#include "scorer.h"
-#include "filelib.h"
-#include "stringlib.h"
-
-using namespace std;
-
-namespace po = boost::program_options;
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("scale,a",po::value<double>()->default_value(1.0), "Posterior scaling factor (alpha)")
- ("loss_function,l",po::value<string>()->default_value("bleu"), "Loss function")
- ("input,i",po::value<string>()->default_value("-"), "File to read k-best lists from")
- ("output_list,L", "Show reranked list as output")
- ("help,h", "Help");
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- bool flag = false;
- if (flag || conf->count("help")) {
- cerr << dcmdline_options << endl;
- exit(1);
- }
-}
-
-struct LossComparer {
- bool operator()(const pair<vector<WordID>, double>& a, const pair<vector<WordID>, double>& b) const {
- return a.second < b.second;
- }
-};
-
-bool ReadKBestList(istream* in, string* sent_id, vector<pair<vector<WordID>, prob_t> >* list) {
- static string cache_id;
- static pair<vector<WordID>, prob_t> cache_pair;
- list->clear();
- string cur_id;
- if (cache_pair.first.size() > 0) {
- list->push_back(cache_pair);
- cur_id = cache_id;
- cache_pair.first.clear();
- }
- string line;
- string tstr;
- while(*in) {
- getline(*in, line);
- if (line.empty()) continue;
- size_t p1 = line.find(" ||| ");
- if (p1 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
- size_t p2 = line.find(" ||| ", p1 + 4);
- if (p2 == string::npos) { cerr << "Bad format: " << line << endl; abort(); }
- size_t p3 = line.rfind(" ||| ");
- cache_id = line.substr(0, p1);
- tstr = line.substr(p1 + 5, p2 - p1 - 5);
- double val = strtod(line.substr(p3 + 5).c_str(), NULL);
- TD::ConvertSentence(tstr, &cache_pair.first);
- cache_pair.second.logeq(val);
- if (cur_id.empty()) cur_id = cache_id;
- if (cur_id == cache_id) {
- list->push_back(cache_pair);
- *sent_id = cur_id;
- cache_pair.first.clear();
- } else { break; }
- }
- return !list->empty();
-}
-
-int main(int argc, char** argv) {
- po::variables_map conf;
- InitCommandLine(argc, argv, &conf);
- const string metric = conf["loss_function"].as<string>();
- const bool output_list = conf.count("output_list") > 0;
- const string file = conf["input"].as<string>();
- const double mbr_scale = conf["scale"].as<double>();
- cerr << "Posterior scaling factor (alpha) = " << mbr_scale << endl;
-
- ScoreType type = ScoreTypeFromString(metric);
- vector<pair<vector<WordID>, prob_t> > list;
- ReadFile rf(file);
- string sent_id;
- while(ReadKBestList(rf.stream(), &sent_id, &list)) {
- vector<prob_t> joints(list.size());
- const prob_t max_score = pow(list.front().second, mbr_scale);
- prob_t marginal = prob_t::Zero();
- for (int i = 0 ; i < list.size(); ++i) {
- const prob_t joint = pow(list[i].second, mbr_scale) / max_score;
- joints[i] = joint;
- // cerr << "list[" << i << "] joint=" << log(joint) << endl;
- marginal += joint;
- }
- int mbr_idx = -1;
- vector<double> mbr_scores(output_list ? list.size() : 0);
- double mbr_loss = numeric_limits<double>::max();
- for (int i = 0 ; i < list.size(); ++i) {
- vector<vector<WordID> > refs(1, list[i].first);
- //cerr << i << ": " << list[i].second <<"\t" << TD::GetString(list[i].first) << endl;
- ScorerP scorer = SentenceScorer::CreateSentenceScorer(type, refs);
- double wl_acc = 0;
- for (int j = 0; j < list.size(); ++j) {
- if (i != j) {
- ScoreP s = scorer->ScoreCandidate(list[j].first);
- double loss = 1.0 - s->ComputeScore();
- if (type == TER || type == AER) loss = 1.0 - loss;
- double weighted_loss = loss * (joints[j] / marginal);
- wl_acc += weighted_loss;
- if ((!output_list) && wl_acc > mbr_loss) break;
- }
- }
- if (output_list) mbr_scores[i] = wl_acc;
- if (wl_acc < mbr_loss) {
- mbr_loss = wl_acc;
- mbr_idx = i;
- }
- }
- // cerr << "ML translation: " << TD::GetString(list[0].first) << endl;
- cerr << "MBR Best idx: " << mbr_idx << endl;
- if (output_list) {
- for (int i = 0; i < list.size(); ++i)
- list[i].second.logeq(mbr_scores[i]);
- sort(list.begin(), list.end(), LossComparer());
- for (int i = 0; i < list.size(); ++i)
- cout << sent_id << " ||| "
- << TD::GetString(list[i].first) << " ||| "
- << log(list[i].second) << endl;
- } else {
- cout << TD::GetString(list[mbr_idx].first) << endl;
- }
- }
- return 0;
-}
-
diff --git a/vest/mr_vest_generate_mapper_input.cc b/vest/mr_vest_generate_mapper_input.cc
index 0c094fd5..59d4f24f 100644
--- a/vest/mr_vest_generate_mapper_input.cc
+++ b/vest/mr_vest_generate_mapper_input.cc
@@ -1,320 +1,78 @@
-//TODO: debug segfault when references supplied, null shared_ptr when oracle
#include <iostream>
#include <vector>
-#include <sstream>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
-#include "sampler.h"
#include "filelib.h"
#include "weights.h"
#include "line_optimizer.h"
-#include "hg.h"
-#include "hg_io.h"
-#include "scorer.h"
-#include "oracle_bleu.h"
-#include "ff_bleu.h"
-
-const bool DEBUG_ORACLE=true;
-
-//TODO: decide on cdec_ff ffs, or just bleumodel - if just bleumodel, then do existing features on serialized hypergraphs remain? weights (origin) is passed to oracle_bleu.h:ComputeOracle
-//void register_feature_functions();
-//FFRegistry ff_registry;
-namespace {
-void init_bleumodel() {
- ff_registry.clear();
- ff_registry.Register(new FFFactory<BLEUModel>);
-}
-
-struct init_ff {
- init_ff() {
- init_bleumodel();
- }
-};
-//init_ff reg; // order of initialization? ff_registry may not be init yet. call in Run() instead.
-}
using namespace std;
namespace po = boost::program_options;
-typedef SparseVector<double> Dir;
-typedef Dir Point;
-
-void compress_similar(vector<Dir> &dirs,double min_dist,ostream *log=&cerr,bool avg=true,bool verbose=true) {
- // return; //TODO: debug
- if (min_dist<=0) return;
- double max_s=1.-min_dist;
- if (log&&verbose) *log<<"max allowed S="<<max_s<<endl;
- unsigned N=dirs.size();
- for (int i=0;i<N;++i) {
- for (int j=i+1;j<N;++j) {
- double s=dirs[i].tanimoto_coef(dirs[j]);
- if (log&&verbose) *log<<"S["<<i<<","<<j<<"]="<<s<<' ';
- if (s>max_s) {
- if (log) *log << "Collapsing similar directions (T="<<s<<" > "<<max_s<<"). dirs["<<i<<"]="<<dirs[i]<<" dirs["<<j<<"]"<<endl;
- if (avg) {
- dirs[i]+=dirs[j];
- dirs[i]/=2.;
- if (log) *log<<" averaged="<<dirs[i];
- }
- if (log) *log<<endl;
- swap(dirs[j],dirs[--N]);
- }
- }
- if (log&&verbose) *log<<endl;
-
- }
- dirs.resize(N);
-}
-
-struct oracle_directions {
- MT19937 rng;
- OracleBleu oracle;
- vector<Dir> directions;
-
- bool start_random;
- bool include_primary;
- bool old_to_hope;
- bool fear_to_hope;
- unsigned n_random;
- void AddPrimaryAndRandomDirections() {
- LineOptimizer::CreateOptimizationDirections(
- fids,n_random,&rng,&directions,include_primary);
- }
-
- void Print() {
- for (int i = 0; i < dev_set_size; ++i)
- for (int j = 0; j < directions.size(); ++j) {
- cout << forest_file(i) <<" " << i<<" ";
- print(cout,origin,"=",";");
- cout<<" ";
- print(cout,directions[j],"=",";");
- cout<<"\n";
- }
- }
-
- void AddOptions(po::options_description *opts) {
- oracle.AddOptions(opts);
- opts->add_options()
- ("dev_set_size,s",po::value<unsigned>(&dev_set_size),"[REQD] Development set size (# of parallel sentences)")
- ("forest_repository,r",po::value<string>(&forest_repository),"[REQD] Path to forest repository")
- ("weights,w",po::value<string>(&weights_file),"[REQD] Current feature weights file")
- ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
- ("random_directions,d",po::value<unsigned>(&n_random)->default_value(10),"Number of random directions to run the line optimizer in")
- ("no_primary,n","don't use the primary (orthogonal each feature alone) directions")
- ("oracle_directions,O",po::value<unsigned>(&n_oracle)->default_value(0),"read the forests and choose this many directions based on heading toward a hope max (bleu+modelscore) translation.")
- ("oracle_start_random",po::bool_switch(&start_random),"sample random subsets of dev set for ALL oracle directions, not just those after a sequential run through it")
- ("oracle_batch,b",po::value<unsigned>(&oracle_batch)->default_value(10),"to produce each oracle direction, sum the 'gradient' over this many sentences")
- ("max_similarity,m",po::value<double>(&max_similarity)->default_value(0),"remove directions that are too similar (Tanimoto coeff. less than (1-this)). 0 means don't filter, 1 means only 1 direction allowed?")
- ("fear_to_hope,f",po::bool_switch(&fear_to_hope),"for each of the oracle_directions, also include a direction from fear to hope (as well as origin to hope)")
- ("no_old_to_hope","don't emit the usual old -> hope oracle")
- ("decoder_translations",po::value<string>(&decoder_translations_file)->default_value(""),"one per line decoder 1best translations for computing document BLEU vs. sentences-seen-so-far BLEU")
- ;
- }
- void InitCommandLine(int argc, char *argv[], po::variables_map *conf) {
- po::options_description opts("Configuration options");
- AddOptions(&opts);
- opts.add_options()("help,h", "Help");
-
- po::options_description dcmdline_options;
- dcmdline_options.add(opts);
- po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
- po::notify(*conf);
- if (conf->count("dev_set_size") == 0) {
- cerr << "Please specify the size of the development set using -s N\n";
- goto bad_cmdline;
- }
- if (conf->count("weights") == 0) {
- cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
- goto bad_cmdline;
- }
- if (conf->count("forest_repository") == 0) {
- cerr << "Please specify the forest repository location using -r <DIR>\n";
- goto bad_cmdline;
- }
- if (n_oracle && oracle.refs.empty()) {
- cerr<<"Specify references when using oracle directions\n";
- goto bad_cmdline;
- }
- if (conf->count("help")) {
- cout << dcmdline_options << endl;
- exit(0);
- }
-
- return;
- bad_cmdline:
- cerr << dcmdline_options << endl;
- exit(1);
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+ po::options_description opts("Configuration options");
+ opts.add_options()
+ ("dev_set_size,s",po::value<unsigned>(),"[REQD] Development set size (# of parallel sentences)")
+ ("forest_repository,r",po::value<string>(),"[REQD] Path to forest repository")
+ ("weights,w",po::value<string>(),"[REQD] Current feature weights file")
+ ("optimize_feature,o",po::value<vector<string> >(), "Feature to optimize (if none specified, all weights listed in the weights file will be optimized)")
+ ("random_directions,d",po::value<unsigned int>()->default_value(20),"Number of random directions to run the line optimizer in")
+ ("help,h", "Help");
+ po::options_description dcmdline_options;
+ dcmdline_options.add(opts);
+ po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+ bool flag = false;
+ if (conf->count("dev_set_size") == 0) {
+ cerr << "Please specify the size of the development set using -d N\n";
+ flag = true;
}
-
- int main(int argc, char *argv[]) {
- po::variables_map conf;
- InitCommandLine(argc,argv,&conf);
- init_bleumodel();
- UseConf(conf);
- Run();
- return 0;
+ if (conf->count("weights") == 0) {
+ cerr << "Please specify the starting-point weights using -w <weightfile.txt>\n";
+ flag = true;
}
- bool verbose() const { return oracle.verbose; }
- void Run() {
-// register_feature_functions();
- AddPrimaryAndRandomDirections();
- AddOracleDirections();
- compress_similar(directions,max_similarity,&cerr,true,verbose());
- Print();
+ if (conf->count("forest_repository") == 0) {
+ cerr << "Please specify the forest repository location using -r <DIR>\n";
+ flag = true;
}
-
-
- Point origin; // old weights that gave model 1best.
- vector<string> optimize_features;
- void UseConf(po::variables_map const& conf) {
- oracle.UseConf(conf);
- include_primary=!conf.count("no_primary");
- old_to_hope=!conf.count("no_old_to_hope");
-
- if (conf.count("optimize_feature") > 0)
- optimize_features=conf["optimize_feature"].as<vector<string> >();
- Init();
+ if (flag || conf->count("help")) {
+ cerr << dcmdline_options << endl;
+ exit(1);
}
+}
- string weights_file;
- double max_similarity;
- unsigned n_oracle, oracle_batch;
- string forest_repository;
- unsigned dev_set_size;
- vector<Oracle> oracles;
- vector<int> fids;
- string forest_file(unsigned i) const {
- ostringstream o;
- o << forest_repository << '/' << i << ".json.gz";
- return o.str();
- }
-
- oracle_directions() { }
-
- Sentences model_hyps;
-
- vector<ScoreP> model_scores;
- bool have_doc;
- void Init() {
- have_doc=!decoder_translations_file.empty();
- if (have_doc) {
- model_hyps.Load(decoder_translations_file);
- if (verbose()) model_hyps.Print(cerr,5);
- model_scores.resize(model_hyps.size());
- if (dev_set_size!=model_hyps.size()) {
- cerr<<"You supplied decoder_translations with a different number of lines ("<<model_hyps.size()<<") than dev_set_size ("<<dev_set_size<<")"<<endl;
- abort();
- }
- cerr << "Scoring model translations " << model_hyps << endl;
- for (int i=0;i<model_hyps.size();++i) {
- //TODO: what is scoreCcand? without clipping? do without for consistency w/ oracle
- model_scores[i]=oracle.ds[i]->ScoreCandidate(model_hyps[i]);
- assert(model_scores[i]);
- if (verbose()) cerr<<"Before model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
- if (verbose()) cerr<<"model["<<i<<"]: "<<model_scores[i]->ScoreDetails()<<endl;
- oracle.doc_score->PlusEquals(*model_scores[i]);
- if (verbose()) cerr<<"After model["<<i<<"]: "<<ds().ScoreDetails()<<endl;
- }
- //TODO: compute doc bleu stats for each sentence, then when getting oracle temporarily exclude stats for that sentence (skip regular score updating)
- }
- start_random=false;
- cerr << "Forest repo: " << forest_repository << endl;
- assert(DirectoryExists(forest_repository));
- vector<string> features;
- vector<weight_t> dorigin;
- Weights::InitFromFile(weights_file, &dorigin, &features);
- if (optimize_features.size())
- features=optimize_features;
- Weights::InitSparseVector(dorigin, &origin);
- fids.clear();
- AddFeatureIds(features);
- oracles.resize(dev_set_size);
- }
-
- void AddFeatureIds(vector<string> const& features) {
- int i = fids.size();
- fids.resize(fids.size()+features.size());
- for (; i < features.size(); ++i)
- fids[i] = FD::Convert(features[i]);
- }
-
-
- std::string decoder_translations_file; // one per line
- //TODO: is it worthwhile to get a complete document bleu first? would take a list of 1best translations one per line from the decoders, rather than loading all the forests (expensive). translations are in run.raw.N.gz - new arg
- void adjust_doc(unsigned i,double scale=1.) {
- oracle.doc_score->PlusEquals(*model_scores[i],scale);
- }
-
- Score &ds() {
- return *oracle.doc_score;
- }
-
- Oracle const& ComputeOracle(unsigned i) {
- Oracle &o=oracles[i];
- if (o.is_null()) {
- if (have_doc) {
- if (verbose()) cerr<<"Before removing i="<<i<<" "<<ds().ScoreDetails()<<"\n";
- adjust_doc(i,-1);
- }
- ReadFile rf(forest_file(i));
- Hypergraph hg;
- {
- Timer t("Loading forest from JSON "+forest_file(i));
- HypergraphIO::ReadFromJSON(rf.stream(), &hg);
- }
- if (verbose()) cerr<<"Before oracle["<<i<<"]: "<<ds().ScoreDetails()<<endl;
- o=oracle.ComputeOracle(oracle.MakeMetadata(hg,i),&hg,origin);
- if (verbose()) {
- cerr << o;
- ScoreP hopesc=oracle.GetScore(o.hope.sentence,i);
- oracle.doc_score->PlusEquals(*hopesc,1);
- cerr<<"With hope: "<<ds().ScoreDetails()<<endl;
- oracle.doc_score->PlusEquals(*hopesc,-1);
- cerr<<"Without hope: "<<ds().ScoreDetails()<<endl;
- cerr<<" oracle="<<oracle.GetScore(o.hope.sentence,i)->ScoreDetails()<<endl
- <<" model="<<oracle.GetScore(o.model.sentence,i)->ScoreDetails()<<endl;
- if (have_doc)
- cerr<<" doc (should = model): "<<model_scores[i]->ScoreDetails()<<endl;
- }
- if (have_doc) {
- adjust_doc(i,1);
- } else
- oracle.IncludeLastScore();
- }
- return o;
- }
-
- // if start_random is true, immediately sample w/ replacement from src sentences; otherwise, consume them sequentially until exhausted, then random. oracle vectors are summed
- void AddOracleDirections() {
- MT19937::IntRNG rsg=rng.inclusive(0,dev_set_size-1);
- unsigned b=0;
- for(unsigned i=0;i<n_oracle;++i) {
- Dir o2hope;
- Dir fear2hope;
- for (unsigned j=0;j<oracle_batch;++j,++b) {
- Oracle const& o=ComputeOracle((start_random||b>=dev_set_size) ? rsg() : b);
-
- if (old_to_hope)
- o2hope+=o.ModelHopeGradient();
- if (fear_to_hope)
- fear2hope+=o.FearHopeGradient();
- }
- double N=(double)oracle_batch;
- if (old_to_hope) {
- o2hope/=N;
- directions.push_back(o2hope);
- }
- if (fear_to_hope) {
- fear2hope/=N;
- directions.push_back(fear2hope);
- }
+int main(int argc, char** argv) {
+ RandomNumberGenerator<boost::mt19937> rng;
+ po::variables_map conf;
+ InitCommandLine(argc, argv, &conf);
+ vector<string> features;
+ SparseVector<weight_t> origin;
+ vector<weight_t> w;
+ Weights::InitFromFile(conf["weights"].as<string>(), &w, &features);
+ Weights::InitSparseVector(w, &origin);
+ const string forest_repository = conf["forest_repository"].as<string>();
+ assert(DirectoryExists(forest_repository));
+ if (conf.count("optimize_feature") > 0)
+ features=conf["optimize_feature"].as<vector<string> >();
+ vector<SparseVector<weight_t> > directions;
+ vector<int> fids(features.size());
+ for (int i = 0; i < features.size(); ++i)
+ fids[i] = FD::Convert(features[i]);
+ LineOptimizer::CreateOptimizationDirections(
+ fids,
+ conf["random_directions"].as<unsigned int>(),
+ &rng,
+ &directions);
+ unsigned dev_set_size = conf["dev_set_size"].as<unsigned>();
+ for (unsigned i = 0; i < dev_set_size; ++i) {
+ for (unsigned j = 0; j < directions.size(); ++j) {
+ cout << forest_repository << '/' << i << ".json.gz " << i << ' ';
+ print(cout, origin, "=", ";");
+ cout << ' ';
+ print(cout, directions[j], "=", ";");
+ cout << endl;
}
}
-};
-
-int main(int argc, char** argv) {
- oracle_directions od;
- return od.main(argc,argv);
+ return 0;
}
diff --git a/vest/mr_vest_map.cc b/vest/mr_vest_map.cc
index 8f6e085d..7d9625bc 100644
--- a/vest/mr_vest_map.cc
+++ b/vest/mr_vest_map.cc
@@ -82,20 +82,20 @@ int main(int argc, char** argv) {
if (line.empty()) continue;
istringstream is(line);
int sent_id;
- string file, s_origin, s_axis;
+ string file, s_origin, s_direction;
// path-to-file (JSON) sent_ed starting-point search-direction
- is >> file >> sent_id >> s_origin >> s_axis;
+ is >> file >> sent_id >> s_origin >> s_direction;
SparseVector<double> origin;
- assert(ReadSparseVectorString(s_origin, &origin));
- SparseVector<double> axis;
- assert(ReadSparseVectorString(s_axis, &axis));
- // cerr << "File: " << file << "\nAxis: " << axis << "\n X: " << origin << endl;
+ ReadSparseVectorString(s_origin, &origin);
+ SparseVector<double> direction;
+ ReadSparseVectorString(s_direction, &direction);
+ // cerr << "File: " << file << "\nDir: " << direction << "\n X: " << origin << endl;
if (last_file != file) {
last_file = file;
ReadFile rf(file);
HypergraphIO::ReadFromJSON(rf.stream(), &hg);
}
- ViterbiEnvelopeWeightFunction wf(origin, axis);
+ ViterbiEnvelopeWeightFunction wf(origin, direction);
ViterbiEnvelope ve = Inside<ViterbiEnvelope, ViterbiEnvelopeWeightFunction>(hg, NULL, wf);
ErrorSurface es;
@@ -104,7 +104,7 @@ int main(int argc, char** argv) {
// cerr << "Error surface has " << es.size() << " segments\n";
string val;
es.Serialize(&val);
- cout << 'M' << ' ' << s_origin << ' ' << s_axis << '\t';
+ cout << 'M' << ' ' << s_origin << ' ' << s_direction << '\t';
B64::b64encode(val.c_str(), val.size(), &cout);
cout << endl << flush;
}