summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2011-07-12 22:34:34 -0400
committerChris Dyer <cdyer@cs.cmu.edu>2011-07-12 22:34:34 -0400
commitc87835f5f94b3aa954682133c40117b3f8e26585 (patch)
treedfe5e8cffbdbf3b911d7ef6fc9d7eb8508a28d89
parent9ab32f74dd821f08cb5863faf88d40ca60301688 (diff)
debugged pro trainer
-rwxr-xr-xpro-train/dist-pro.pl9
-rw-r--r--pro-train/mr_pro_map.cc244
-rw-r--r--pro-train/mr_pro_reduce.cc57
-rw-r--r--utils/filelib.cc12
-rw-r--r--utils/filelib.h1
5 files changed, 253 insertions, 70 deletions
diff --git a/pro-train/dist-pro.pl b/pro-train/dist-pro.pl
index 55d7f1fa..c42e3876 100755
--- a/pro-train/dist-pro.pl
+++ b/pro-train/dist-pro.pl
@@ -66,6 +66,7 @@ my $bleu_weight=1;
my $use_make; # use make to parallelize line search
my $dirargs='';
my $usefork;
+my $initial_weights;
my $pass_suffix = '';
my $cpbin=1;
# Process command-line options
@@ -79,6 +80,7 @@ if (GetOptions(
"dry-run" => \$dryrun,
"epsilon=s" => \$epsilon,
"help" => \$help,
+ "weights=s" => \$initial_weights,
"interval" => \$interval,
"iteration=i" => \$iteration,
"local" => \$run_local,
@@ -212,7 +214,7 @@ if ($dryrun){
close CMD;
print STDERR $cline;
chmod(0755,$cmdfile);
- check_call("touch $dir/weights.0");
+ check_call("cp $initial_weights $dir/weights.0");
die "Can't find weights.0" unless (-e "$dir/weights.0");
}
write_config(*STDERR);
@@ -239,7 +241,6 @@ my $random_seed = int(time / 1000);
my $lastWeightsFile;
my $lastPScore = 0;
# main optimization loop
-my @mapoutputs = (); # aggregate map outputs over all iters
while (1){
print STDERR "\n\nITERATION $iteration\n==========\n";
@@ -262,6 +263,7 @@ while (1){
my $im1 = $iteration - 1;
my $weightsFile="$dir/weights.$im1";
push @allweights, "-w $dir/weights.$im1";
+ `rm -f $dir/hgs/*.gz`;
my $decoder_cmd = "$decoder -c $iniFile --weights$pass_suffix $weightsFile -O $dir/hgs";
my $pcmd;
if ($run_local) {
@@ -333,6 +335,7 @@ while (1){
print $mkfile "all: $dir/splag.$im1/map.done\n\n";
}
my @mkouts = (); # only used with makefiles
+ my @mapoutputs = ();
for my $shard (@shards) {
my $mapoutput = $shard;
my $client_name = $shard;
@@ -341,7 +344,7 @@ while (1){
$mapoutput =~ s/mapinput/mapoutput/;
push @mapoutputs, "$dir/splag.$im1/$mapoutput";
$o2i{"$dir/splag.$im1/$mapoutput"} = "$dir/splag.$im1/$shard";
- my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep @allweights < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
+ my $script = "$MAPPER -s $srcFile -l $metric $refs_comma_sep -w $inweights -K $dir/kbest < $dir/splag.$im1/$shard > $dir/splag.$im1/$mapoutput";
if ($run_local) {
print STDERR "COMMAND:\n$script\n";
check_bash_call($script);
diff --git a/pro-train/mr_pro_map.cc b/pro-train/mr_pro_map.cc
index 128d93ce..4324e8de 100644
--- a/pro-train/mr_pro_map.cc
+++ b/pro-train/mr_pro_map.cc
@@ -2,7 +2,9 @@
#include <iostream>
#include <fstream>
#include <vector>
+#include <tr1/unordered_map>
+#include <boost/functional/hash.hpp>
#include <boost/shared_ptr.hpp>
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
@@ -22,16 +24,63 @@
using namespace std;
namespace po = boost::program_options;
+// Hash functor for SparseVector<double> that only looks at the high-order 32
+// bits of each (rounded) value, so that vectors ApproxVectorEquals considers
+// equal always produce the same hash.
+// NOTE(review): assumes size_t and double are both 64 bits wide (the seed
+// constant and the 32-bit shift below do not fit a 32-bit size_t) -- confirm
+// for the platforms this is built on.
+struct ApproxVectorHasher {
+  // Selects the low-order 32 bits of a value's bit pattern.
+  static const size_t MASK = 0xFFFFFFFFull;
+  // Gives access to the raw bit pattern of a double.
+  union UType {
+    double f;
+    size_t i;
+  };
+  // Rounds the bit pattern of x to the nearest multiple of 2^32 (ties round
+  // up) and returns it as a double; the low-order 32 bits of the result are
+  // always zero.
+  static inline double round(const double x) {
+    UType t;
+    t.f = x;
+    const size_t r = t.i & MASK;
+    if ((r << 1) > MASK)
+      t.i += MASK - r + 1;  // round up: carries into the high-order half
+    else
+      t.i &= ~MASK;         // round down: clear the whole low-order half
+    return t.f;
+  }
+  // Combines, for every entry whose rounded value is non-zero, the feature id
+  // and the high-order 32 bits of the rounded value.
+  size_t operator()(const SparseVector<double>& x) const {
+    size_t h = 0x573915839;
+    for (SparseVector<double>::const_iterator it = x.begin(); it != x.end(); ++it) {
+      UType t;
+      // Hash the same rounded value that equality compares; hashing the
+      // unrounded bits would let two equal vectors hash differently whenever
+      // rounding carries into the high-order half or rounds a value to zero.
+      t.f = round(it->second);
+      if (t.f) {
+        const size_t z = (t.i >> 32);
+        boost::hash_combine(h, it->first);
+        boost::hash_combine(h, z);
+      }
+    }
+    return h;
+  }
+};
+
+// Equality functor for SparseVector<double>: two vectors are equal when they
+// hold the same feature ids in the same iteration order and every pair of
+// values is equal after ApproxVectorHasher::round.
+struct ApproxVectorEquals {
+  bool operator()(const SparseVector<double>& a, const SparseVector<double>& b) const {
+    SparseVector<double>::const_iterator lhs = a.begin();
+    SparseVector<double>::const_iterator rhs = b.begin();
+    // Walk both vectors in lockstep; any mismatch ends the comparison.
+    while (lhs != a.end()) {
+      if (rhs == b.end()) return false;            // b has fewer entries
+      if (lhs->first != rhs->first) return false;  // different feature id
+      const double lv = ApproxVectorHasher::round(lhs->second);
+      const double rv = ApproxVectorHasher::round(rhs->second);
+      if (lv != rv) return false;
+      ++lhs;
+      ++rhs;
+    }
+    // Equal only if b has no entries left over.
+    return rhs == b.end();
+  }
+};
+
boost::shared_ptr<MT19937> rng;
void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("reference,r",po::value<vector<string> >(), "[REQD] Reference translation (tokenized text)")
+ ("weights,w",po::value<string>(), "[REQD] Weights files from current iterations")
+ ("kbest_repository,K",po::value<string>()->default_value("./kbest"),"K-best list repository (directory)")
+ ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
("source,s",po::value<string>()->default_value(""), "Source file (ignored, except for AER)")
("loss_function,l",po::value<string>()->default_value("ibm_bleu"), "Loss function being optimized")
- ("input,i",po::value<string>()->default_value("-"), "Input file to map (- is STDIN)")
- ("weights,w",po::value<vector<string> >(), "[REQD] Weights files from previous and current iterations")
("kbest_size,k",po::value<unsigned>()->default_value(1500u), "Top k-hypotheses to extract")
("candidate_pairs,G", po::value<unsigned>()->default_value(5000u), "Number of pairs to sample per hypothesis (Gamma)")
("best_pairs,X", po::value<unsigned>()->default_value(50u), "Number of pairs, ranked by magnitude of objective delta, to retain (Xi)")
@@ -46,7 +95,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
flag = true;
}
if (!conf->count("weights")) {
- cerr << "Please specify one or more weights using -w <WEIGHTS.TXT>\n";
+ cerr << "Please specify weights using -w <WEIGHTS.TXT>\n";
flag = true;
}
if (flag || conf->count("help")) {
@@ -56,6 +105,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
struct HypInfo {
+ HypInfo() : g_(-100.0) {}
HypInfo(const vector<WordID>& h, const SparseVector<double>& feats) : hyp(h), g_(-100.0), x(feats) {}
// lazy evaluation
@@ -66,10 +116,92 @@ struct HypInfo {
}
vector<WordID> hyp;
mutable double g_;
- public:
SparseVector<double> x;
};
+// Equality functor for HypInfo: same word sequence and approximately equal
+// feature vectors (as decided by ApproxVectorEquals).
+struct HypInfoCompare {
+  bool operator()(const HypInfo& a, const HypInfo& b) const {
+    // Compare the yields first; the feature vectors are only examined when
+    // the yields match.
+    if (!(a.hyp == b.hyp)) return false;
+    return ApproxVectorEquals()(a.x, b.x);
+  }
+};
+
+// Hash functor for HypInfo: hashes the word sequence with boost::hash and
+// mixes in the ApproxVectorHasher hash of the feature vector.
+struct HypInfoHasher {
+  size_t operator()(const HypInfo& x) const {
+    size_t seed = boost::hash<vector<WordID> >()(x.hyp);
+    const size_t feature_hash = ApproxVectorHasher()(x.x);
+    boost::hash_combine(seed, feature_hash);
+    return seed;
+  }
+};
+
+// Writes the hypotheses in `kbest` to `file`, two lines per hypothesis: the
+// string TD::GetString returns for the yield, then the feature vector as
+// printed by its stream operator. Values are written with precision 10.
+void WriteKBest(const string& file, const vector<HypInfo>& kbest) {
+  WriteFile wf(file);
+  ostream& out = *wf.stream();
+  out.precision(10);
+  for (vector<HypInfo>::const_iterator h = kbest.begin(); h != kbest.end(); ++h) {
+    out << TD::GetString(h->hyp) << endl;
+    out << h->x << endl;
+  }
+}
+
+// Parses space-separated "name=value" tokens from `line`, starting at offset
+// `cur`, and stores each value in *out under the id FD::Convert returns for
+// the name. The last '=' inside a token separates name from value.
+//
+// `line` is modified: the space that ends each token is overwritten with a
+// NUL so strtod stops at the end of the value.
+// A malformed token (empty token, or no '=' in it) prints an error to stderr
+// and terminates the process with exit(1). An empty remainder (cur at or past
+// the end of `line`) leaves *out untouched.
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  // Nothing to parse: an empty feature list is an empty vector, not an error.
+  if (cur >= line.size()) return;
+  size_t last_start = cur;              // offset of the current token's first character
+  size_t last_comma = string::npos;     // offset of the most recent '=' in the current token
+  while (cur <= line.size()) {
+    // Test for end-of-line first so line[cur] is never read at line.size().
+    if (cur == line.size() || line[cur] == ' ') {
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      // c_str() guarantees a NUL-terminated buffer for the final token too.
+      const double val = strtod(line.c_str() + last_comma + 1, NULL);
+      x.set_value(fid, val);
+
+      last_comma = string::npos;
+      last_start = cur + 1;
+    } else if (line[cur] == '=') {
+      last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
+// Reads hypotheses from `file` and appends them to *kbest. The file is read
+// as pairs of lines: a candidate sentence followed by its feature vector in
+// the format ParseSparseVector accepts. Progress is reported on stderr.
+// A candidate line without a following feature line prints an error and
+// terminates the process with exit(1).
+void ReadKBest(const string& file, vector<HypInfo>* kbest) {
+  cerr << "Reading from " << file << endl;
+  ReadFile rf(file);
+  istream& in = *rf.stream();
+  string cand;
+  string feats;
+  while (getline(in, cand)) {
+    // Checked explicitly rather than with assert so that a truncated file is
+    // still detected when assertions are compiled out.
+    if (!getline(in, feats)) {
+      cerr << "[ERROR] " << file << ": missing feature line for hypothesis: " << cand << endl;
+      exit(1);
+    }
+    kbest->push_back(HypInfo());
+    TD::ConvertSentence(cand, &kbest->back().hyp);
+    ParseSparseVector(feats, 0, &kbest->back().x);
+  }
+  cerr << " read " << kbest->size() << " hypotheses\n";
+}
+
+// Removes duplicate hypotheses from *h, where "duplicate" is defined by
+// HypInfoHasher / HypInfoCompare. Elements are moved into a hash set from the
+// back of the vector and then copied back in the set's iteration order, so
+// the original ordering of *h is not preserved. Sizes before and after are
+// logged to stderr.
+void Dedup(vector<HypInfo>* h) {
+  typedef tr1::unordered_set<HypInfo, HypInfoHasher, HypInfoCompare> HypSet;
+  cerr << "Dedup in=" << h->size();
+  HypSet unique;
+  // Drain the vector back-to-front, dropping each element once it is in the set.
+  for (; !h->empty(); h->pop_back())
+    unique.insert(h->back());
+  // Refill the vector, releasing each set entry as soon as it has been copied.
+  for (HypSet::iterator it = unique.begin(); it != unique.end(); it = unique.erase(it))
+    h->push_back(*it);
+  cerr << " out=" << h->size() << endl;
+}
+
struct ThresholdAlpha {
explicit ThresholdAlpha(double t = 0.05) : threshold(t) {}
double operator()(double mag) const {
@@ -81,6 +213,7 @@ struct ThresholdAlpha {
struct TrainingInstance {
TrainingInstance(const SparseVector<double>& feats, bool positive, double diff) : x(feats), y(positive), gdiff(diff) {}
SparseVector<double> x;
+#undef DEBUGGING_PRO
#ifdef DEBUGGING_PRO
vector<WordID> a;
vector<WordID> b;
@@ -88,6 +221,11 @@ struct TrainingInstance {
bool y;
double gdiff;
};
+#ifdef DEBUGGING_PRO
+// Debug printer for a TrainingInstance: score difference, label, the two
+// hypotheses the pair was built from, and the feature-difference vector.
+// Only compiled when DEBUGGING_PRO is defined.
+ostream& operator<<(ostream& os, const TrainingInstance& d) {
+  os << d.gdiff << " y=" << d.y;
+  os << "\tA:" << TD::GetString(d.a);
+  os << "\n\tB: " << TD::GetString(d.b);
+  os << "\n\tX: " << d.x;
+  return os;
+}
+#endif
struct DiffOrder {
bool operator()(const TrainingInstance& a, const TrainingInstance& b) const {
@@ -95,36 +233,51 @@ struct DiffOrder {
}
};
-template<typename Alpha>
-void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const Alpha& alpha_i, bool invert_score, vector<TrainingInstance>* pv) {
- vector<TrainingInstance> v;
+void Sample(const unsigned gamma, const unsigned xi, const vector<HypInfo>& J_i, const SentenceScorer& scorer, const bool invert_score, vector<TrainingInstance>* pv) {
+ vector<TrainingInstance> v1, v2;
+ double avg_diff = 0;
for (unsigned i = 0; i < gamma; ++i) {
- size_t a = rng->inclusive(0, J_i.size() - 1)();
- size_t b = rng->inclusive(0, J_i.size() - 1)();
+ const size_t a = rng->inclusive(0, J_i.size() - 1)();
+ const size_t b = rng->inclusive(0, J_i.size() - 1)();
if (a == b) continue;
double ga = J_i[a].g(scorer);
double gb = J_i[b].g(scorer);
- bool positive = ga < gb;
+ bool positive = gb < ga;
if (invert_score) positive = !positive;
- double gdiff = fabs(ga - gb);
+ const double gdiff = fabs(ga - gb);
if (!gdiff) continue;
- if (rng->next() < alpha_i(gdiff)) {
- v.push_back(TrainingInstance((J_i[a].x - J_i[b].x).erase_zeros(), positive, gdiff));
+ avg_diff += gdiff;
+ SparseVector<double> xdiff = (J_i[a].x - J_i[b].x).erase_zeros();
+ if (xdiff.empty()) {
+ cerr << "Empty diff:\n " << TD::GetString(J_i[a].hyp) << endl << "x=" << J_i[a].x << endl;
+ cerr << " " << TD::GetString(J_i[b].hyp) << endl << "x=" << J_i[b].x << endl;
+ continue;
+ }
+ v1.push_back(TrainingInstance(xdiff, positive, gdiff));
#ifdef DEBUGGING_PRO
- v.back().a = J_i[a].hyp;
- v.back().b = J_i[b].hyp;
+ v1.back().a = J_i[a].hyp;
+ v1.back().b = J_i[b].hyp;
+ cerr << "N: " << v1.back() << endl;
#endif
- }
}
- vector<TrainingInstance>::iterator mid = v.begin() + xi;
- if (xi > v.size()) mid = v.end();
- partial_sort(v.begin(), mid, v.end(), DiffOrder());
- copy(v.begin(), mid, back_inserter(*pv));
+ avg_diff /= v1.size();
+
+ for (unsigned i = 0; i < v1.size(); ++i) {
+ double p = 1.0 / (1.0 + exp(-avg_diff - v1[i].gdiff));
+ // cerr << "avg_diff=" << avg_diff << " gdiff=" << v1[i].gdiff << " p=" << p << endl;
+ if (rng->next() < p) v2.push_back(v1[i]);
+ }
+ vector<TrainingInstance>::iterator mid = v2.begin() + xi;
+ if (xi > v2.size()) mid = v2.end();
+ partial_sort(v2.begin(), mid, v2.end(), DiffOrder());
+ copy(v2.begin(), mid, back_inserter(*pv));
#ifdef DEBUGGING_PRO
- if (v.size() >= 5)
- for (int i =0; i < 5; ++i) {
- cerr << v[i].gdiff << " y=" << v[i].y << "\tA:" << TD::GetString(v[i].a) << "\n\tB: " << TD::GetString(v[i].b) << endl;
+ if (v2.size() >= 5) {
+ for (int i =0; i < (mid - v2.begin()); ++i) {
+ cerr << v2[i] << endl;
}
+ cerr << pv->back() << endl;
+ }
#endif
}
@@ -136,6 +289,7 @@ int main(int argc, char** argv) {
else
rng.reset(new MT19937);
const string loss_function = conf["loss_function"].as<string>();
+
ScoreType type = ScoreTypeFromString(loss_function);
DocScorer ds(type, conf["reference"].as<vector<string> >(), conf["source"].as<string>());
cerr << "Loaded " << ds.size() << " references for scoring with " << loss_function << endl;
@@ -146,13 +300,15 @@ int main(int argc, char** argv) {
const unsigned kbest_size = conf["kbest_size"].as<unsigned>();
const unsigned gamma = conf["candidate_pairs"].as<unsigned>();
const unsigned xi = conf["best_pairs"].as<unsigned>();
- vector<string> weights_files = conf["weights"].as<vector<string> >();
- vector<vector<double> > weights(weights_files.size());
- for (int i = 0; i < weights.size(); ++i) {
+ string weightsf = conf["weights"].as<string>();
+ vector<double> weights;
+ {
Weights w;
- w.InitFromFile(weights_files[i]);
- w.InitVector(&weights[i]);
+ w.InitFromFile(weightsf);
+ w.InitVector(&weights);
}
+ string kbest_repo = conf["kbest_repository"].as<string>();
+ MkDirP(kbest_repo);
while(in) {
vector<TrainingInstance> v;
string line;
@@ -164,24 +320,26 @@ int main(int argc, char** argv) {
// path-to-file (JSON) sent_id
is >> file >> sent_id;
ReadFile rf(file);
- HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ ostringstream os;
vector<HypInfo> J_i;
- int start = weights.size();
- start -= 4;
- if (start < 0) start = 0;
- for (int i = start; i < weights.size(); ++i) {
- hg.Reweight(weights[i]);
- KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
-
- for (int i = 0; i < kbest_size; ++i) {
- const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
- kbest.LazyKthBest(hg.nodes_.size() - 1, i);
- if (!d) break;
- J_i.push_back(HypInfo(d->yield, d->feature_values));
- }
+ os << kbest_repo << "/kbest." << sent_id << ".txt.gz";
+ const string kbest_file = os.str();
+ if (FileExists(kbest_file))
+ ReadKBest(kbest_file, &J_i);
+ HypergraphIO::ReadFromJSON(rf.stream(), &hg);
+ hg.Reweight(weights);
+ KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(hg, kbest_size);
+
+ for (int i = 0; i < kbest_size; ++i) {
+ const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
+ kbest.LazyKthBest(hg.nodes_.size() - 1, i);
+ if (!d) break;
+ J_i.push_back(HypInfo(d->yield, d->feature_values));
}
+ Dedup(&J_i);
+ WriteKBest(kbest_file, J_i);
- Sample(gamma, xi, J_i, *ds[sent_id], ThresholdAlpha(0.05), (type == TER), &v);
+ Sample(gamma, xi, J_i, *ds[sent_id], (type == TER), &v);
for (unsigned i = 0; i < v.size(); ++i) {
const TrainingInstance& vi = v[i];
cout << vi.y << "\t" << vi.x << endl;
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 2b9c5ce7..e1a7db8a 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -24,7 +24,7 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation")
("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
- ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
+ ("sigma_squared,s",po::value<double>()->default_value(1.0), "Sigma squared for Gaussian prior")
("help,h", "Help");
po::options_description dcmdline_options;
dcmdline_options.add(opts);
@@ -35,6 +35,31 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
}
}
+// Parses space-separated "name=value" tokens from `line`, starting at offset
+// `cur`, and stores each value in *out under the id FD::Convert returns for
+// the name. The last '=' inside a token separates name from value.
+//
+// `line` is modified: the space that ends each token is overwritten with a
+// NUL so strtod stops at the end of the value.
+// A malformed token (empty token, or no '=' in it) prints an error to stderr
+// and terminates the process with exit(1). An empty remainder (cur at or past
+// the end of `line`) leaves *out untouched.
+void ParseSparseVector(string& line, size_t cur, SparseVector<double>* out) {
+  SparseVector<double>& x = *out;
+  // Nothing to parse: an empty feature list is an empty vector, not an error.
+  if (cur >= line.size()) return;
+  size_t last_start = cur;              // offset of the current token's first character
+  size_t last_comma = string::npos;     // offset of the most recent '=' in the current token
+  while (cur <= line.size()) {
+    // Test for end-of-line first so line[cur] is never read at line.size().
+    if (cur == line.size() || line[cur] == ' ') {
+      if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+        cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
+        exit(1);
+      }
+      const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+      if (cur < line.size()) line[cur] = 0;
+      // c_str() guarantees a NUL-terminated buffer for the final token too.
+      const double val = strtod(line.c_str() + last_comma + 1, NULL);
+      x.set_value(fid, val);
+
+      last_comma = string::npos;
+      last_start = cur + 1;
+    } else if (line[cur] == '=') {
+      last_comma = cur;
+    }
+    ++cur;
+  }
+}
+
int main(int argc, char** argv) {
po::variables_map conf;
InitCommandLine(argc, argv, &conf);
@@ -60,28 +85,7 @@ int main(int argc, char** argv) {
assert(ks == 1);
const bool y = line[0] == '1';
SparseVector<double> x;
- size_t last_start = ks + 1;
- size_t last_comma = string::npos;
- size_t cur = last_start;
- while(cur <= line.size()) {
- if (line[cur] == ' ' || cur == line.size()) {
- if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
- cerr << "[ERROR] " << line << endl << " position = " << cur << endl;
- exit(1);
- }
- const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
- if (cur < line.size()) line[cur] = 0;
- const double val = strtod(&line[last_comma + 1], NULL);
- x.set_value(fid, val);
-
- last_comma = string::npos;
- last_start = cur+1;
- } else {
- if (line[cur] == '=')
- last_comma = cur;
- }
- ++cur;
- }
+ ParseSparseVector(line, ks + 1, &x);
training.push_back(make_pair(y, x));
}
if (flag) cerr << endl;
@@ -95,6 +99,7 @@ int main(int argc, char** argv) {
SparseVector<double> g;
bool converged = false;
LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+ double ppl = 0;
while(!converged) {
double cll = 0;
double dbias = 0;
@@ -114,14 +119,18 @@ int main(int argc, char** argv) {
lp_false*=-1;
if (training[i].first) { // true label
cll -= lp_true;
+ ppl += lp_true / log(2);
g -= training[i].second * exp(lp_false);
dbias -= exp(lp_false);
} else { // false label
cll -= lp_false;
+ ppl += lp_false / log(2);
g += training[i].second * exp(lp_true);
dbias += exp(lp_true);
}
}
+ ppl /= training.size();
+ ppl = pow(2.0, - ppl);
vg.clear();
g.init_vector(&vg);
vg[0] = dbias;
@@ -139,7 +148,7 @@ int main(int argc, char** argv) {
double reg = 0;
#endif
cll += reg;
- cerr << cll << " (REG=" << reg << ")\t";
+ cerr << cll << " (REG=" << reg << ")\tPPL=" << ppl << "\t";
bool failed = false;
try {
opt.Optimize(cll, vg, &x);
diff --git a/utils/filelib.cc b/utils/filelib.cc
index 79ad2847..a0969b1a 100644
--- a/utils/filelib.cc
+++ b/utils/filelib.cc
@@ -20,3 +20,15 @@ bool DirectoryExists(const string& dir) {
return false;
}
+// Ensures that directory `dir` exists. If it is missing it is created with a
+// single mkdir call (so, despite the name, missing parent directories are not
+// created) and its mode is then set to 07777 with chmod.
+// Any failure is reported with perror and the process is aborted.
+// NOTE(review): 07777 also sets the setuid, setgid and sticky bits -- confirm
+// that this, rather than 0777, is intended.
+void MkDirP(const string& dir) {
+  if (DirectoryExists(dir)) return;
+  if (mkdir(dir.c_str(), 0777)) {
+    // mkdir also fails when another process created the directory between
+    // the check above and this call; that is success, not an error. The
+    // creating process is the one that applies the chmod below.
+    if (DirectoryExists(dir)) return;
+    // NOTE(review): perror reports errno as left by the last failing call; if
+    // DirectoryExists touches errno the message may describe the re-check
+    // rather than mkdir.
+    perror(dir.c_str());
+    abort();
+  }
+  if (chmod(dir.c_str(), 07777)) {
+    perror(dir.c_str());
+    abort();
+  }
+}
+
diff --git a/utils/filelib.h b/utils/filelib.h
index dda98671..a8622246 100644
--- a/utils/filelib.h
+++ b/utils/filelib.h
@@ -12,6 +12,7 @@
bool FileExists(const std::string& file_name);
bool DirectoryExists(const std::string& dir_name);
+void MkDirP(const std::string& dir_name);
// reads from standard in if filename is -
// uncompresses if file ends with .gz