Diffstat (limited to 'pro-train/mr_pro_reduce.cc')
-rw-r--r--  pro-train/mr_pro_reduce.cc  167
1 file changed, 125 insertions(+), 42 deletions(-)
diff --git a/pro-train/mr_pro_reduce.cc b/pro-train/mr_pro_reduce.cc
index 3df52020..2b9c5ce7 100644
--- a/pro-train/mr_pro_reduce.cc
+++ b/pro-train/mr_pro_reduce.cc
@@ -1,3 +1,4 @@
+#include <cstdlib>
 #include <sstream>
 #include <iostream>
 #include <fstream>
@@ -6,24 +7,29 @@
 #include <boost/program_options.hpp>
 #include <boost/program_options/variables_map.hpp>
 
+#include "weights.h"
 #include "sparse_vector.h"
-#include "error_surface.h"
-#include "line_optimizer.h"
-#include "b64tools.h"
+#include "optimize.h"
 
 using namespace std;
 namespace po = boost::program_options;
 
+// since this is a ranking model, there should be equal numbers of
+// positive and negative examples so the bias should be 0
+static const double MAX_BIAS = 1e-10;
+
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("loss_function,l",po::value<string>(), "Loss function being optimized")
+        ("weights,w", po::value<string>(), "Weights from previous iteration (used as initialization and interpolation)")
+        ("interpolation,p",po::value<double>()->default_value(0.9), "Output weights are p*w + (1-p)*w_prev")
+        ("memory_buffers,m",po::value<unsigned>()->default_value(200), "Number of memory buffers (LBFGS)")
+        ("sigma_squared,s",po::value<double>()->default_value(0.5), "Sigma squared for Gaussian prior")
         ("help,h", "Help");
   po::options_description dcmdline_options;
   dcmdline_options.add(opts);
   po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  bool flag = conf->count("loss_function") == 0;
-  if (flag || conf->count("help")) {
+  if (conf->count("help")) {
     cerr << dcmdline_options << endl;
     exit(1);
   }
@@ -32,50 +38,127 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
 int main(int argc, char** argv) {
   po::variables_map conf;
   InitCommandLine(argc, argv, &conf);
-  const string loss_function = conf["loss_function"].as<string>();
-  ScoreType type = ScoreTypeFromString(loss_function);
-  LineOptimizer::ScoreType opt_type = LineOptimizer::MAXIMIZE_SCORE;
-  if (type == TER || type == AER) {
-    opt_type = LineOptimizer::MINIMIZE_SCORE;
+  string line;
+  vector<pair<bool, SparseVector<double> > > training;
+  int lc = 0;
+  bool flag = false;
+  SparseVector<double> old_weights;
+  const double psi = conf["interpolation"].as<double>();
+  if (psi < 0.0 || psi > 1.0) { cerr << "Invalid interpolation weight: " << psi << endl; }
+  if (conf.count("weights")) {
+    Weights w;
+    w.InitFromFile(conf["weights"].as<string>());
+    w.InitSparseVector(&old_weights);
   }
-  string last_key;
-  vector<ErrorSurface> esv;
-  while(cin) {
-    string line;
-    getline(cin, line);
+  while(getline(cin, line)) {
+    ++lc;
+    if (lc % 1000 == 0) { cerr << '.'; flag = true; }
+    if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; }
     if (line.empty()) continue;
-    size_t ks = line.find("\t");
+    const size_t ks = line.find("\t");
     assert(string::npos != ks);
-    assert(ks > 2);
-    string key = line.substr(2, ks - 2);
-    string val = line.substr(ks + 1);
-    if (key != last_key) {
-      if (!last_key.empty()) {
-	float score;
-        double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-	cout << last_key << "|" << x << "|" << score << endl;
+    assert(ks == 1);
+    const bool y = line[0] == '1';
+    SparseVector<double> x;
+    size_t last_start = ks + 1;
+    size_t last_comma = string::npos;
+    size_t cur = last_start;
+    while(cur <= line.size()) {
+      if (line[cur] == ' ' || cur == line.size()) {
+        if (!(cur > last_start && last_comma != string::npos && cur > last_comma)) {
+          cerr << "[ERROR] " << line << endl << "  position = " << cur << endl;
+          exit(1);
+        }
+        const int fid = FD::Convert(line.substr(last_start, last_comma - last_start));
+        if (cur < line.size()) line[cur] = 0;
+        const double val = strtod(&line[last_comma + 1], NULL);
+        x.set_value(fid, val);
+
+        last_comma = string::npos;
+        last_start = cur+1;
+      } else {
+        if (line[cur] == '=')
+          last_comma = cur;
+      }
+      ++cur;
+    }
+    training.push_back(make_pair(y, x));
+  }
+  if (flag) cerr << endl;
+
+  cerr << "Number of features: " << FD::NumFeats() << endl;
+  vector<double> x(FD::NumFeats(), 0.0);  // x[0] is bias
+  for (SparseVector<double>::const_iterator it = old_weights.begin();
+       it != old_weights.end(); ++it)
+    x[it->first] = it->second;
+  vector<double> vg(FD::NumFeats(), 0.0);
+  SparseVector<double> g;
+  bool converged = false;
+  LBFGSOptimizer opt(FD::NumFeats(), conf["memory_buffers"].as<unsigned>());
+  while(!converged) {
+    double cll = 0;
+    double dbias = 0;
+    g.clear();
+    for (int i = 0; i < training.size(); ++i) {
+      const double dotprod = training[i].second.dot(x) + x[0]; // x[0] is bias
+      double lp_false = dotprod;
+      double lp_true = -dotprod;
+      if (0 < lp_true) {
+        lp_true += log1p(exp(-lp_true));
+        lp_false = log1p(exp(lp_false));
+      } else {
+        lp_true = log1p(exp(lp_true));
+        lp_false += log1p(exp(-lp_false));
+      }
+      lp_true*=-1;
+      lp_false*=-1;
+      if (training[i].first) {  // true label
+        cll -= lp_true;
+        g -= training[i].second * exp(lp_false);
+        dbias -= exp(lp_false);
+      } else {                  // false label
+        cll -= lp_false;
+        g += training[i].second * exp(lp_true);
+        dbias += exp(lp_true);
       }
-      last_key = key;
-      esv.clear();
     }
-    if (val.size() % 4 != 0) {
-      cerr << "B64 encoding error 1! Skipping.\n";
-      continue;
+    vg.clear();
+    g.init_vector(&vg);
+    vg[0] = dbias;
+#if 1
+    const double sigsq = conf["sigma_squared"].as<double>();
+    double norm = 0;
+    for (int i = 1; i < x.size(); ++i) {
+      const double mean_i = 0.0;
+      const double param = (x[i] - mean_i);
+      norm += param * param;
+      vg[i] += param / sigsq;
+    }
+    const double reg = norm / (2.0 * sigsq);
+#else
+    double reg = 0;
+#endif
+    cll += reg;
+    cerr << cll << " (REG=" << reg << ")\t";
+    bool failed = false;
+    try {
+      opt.Optimize(cll, vg, &x);
+    } catch (...) {
+      cerr << "Exception caught, assuming convergence is close enough...\n";
+      failed = true;
     }
-    string encoded(val.size() / 4 * 3, '\0');
-    if (!B64::b64decode(reinterpret_cast<const unsigned char*>(&val[0]), val.size(), &encoded[0], encoded.size())) {
-      cerr << "B64 encoding error 2! Skipping.\n";
-      continue;
+    if (fabs(x[0]) > MAX_BIAS) {
+      cerr << "Biased model learned. Are your training instances wrong?\n";
+      cerr << "  BIAS: " << x[0] << endl;
     }
-    esv.push_back(ErrorSurface());
-    esv.back().Deserialize(type, encoded);
+    converged = failed || opt.HasConverged();
   }
-  if (!esv.empty()) {
-    // cerr << "ESV=" << esv.size() << endl;
-    // for (int i = 0; i < esv.size(); ++i) { cerr << esv[i].size() << endl; }
-    float score;
-    double x = LineOptimizer::LineOptimize(esv, opt_type, &score);
-    cout << last_key << "|" << x << "|" << score << endl;
+  Weights w;
+  if (conf.count("weights")) {
+    for (int i = 1; i < x.size(); ++i)
+      x[i] = (x[i] * psi) + old_weights.get(i) * (1.0 - psi);
   }
+  w.InitFromVector(x);
+  w.WriteToFile("-");
   return 0;
 }
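
Note on the training loop in this diff: lp_true and lp_false are the log-probabilities of a logistic regression, computed with log1p and a branch on the sign of the score so that exp() is never called on a large positive argument. A minimal standalone sketch of the same trick (the LogSigmoid helper and the sample score are illustrative, not part of the commit):

#include <cmath>
#include <cstdio>

// log(1 / (1 + exp(-z))): branch on the sign of z so exp() only ever
// sees a non-positive argument and cannot overflow
double LogSigmoid(double z) {
  if (z > 0) return -log1p(exp(-z));
  return z - log1p(exp(z));
}

int main() {
  const double dotprod = -1000.0;                // hypothetical w.x + bias
  const double lp_true  = LogSigmoid(dotprod);   // log p(y = 1 | x) = -1000
  const double lp_false = LogSigmoid(-dotprod);  // log p(y = 0 | x) ~= 0
  // the naive form log(1.0 / (1.0 + exp(1000.0))) fails here: exp(1000)
  // overflows to +inf, the quotient becomes 0, and the log returns -inf
  printf("lp_true = %g, lp_false = %g\n", lp_true, lp_false);
  return 0;
}

After the branch and the final *= -1 in the diff, lp_true holds log sigmoid(dotprod) and lp_false holds log sigmoid(-dotprod), which is exactly what LogSigmoid returns here.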
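
The "#if 1" block is a zero-mean Gaussian prior on the weights (the --sigma_squared option): it adds norm / (2 * sigsq) to the objective and x[i] / sigsq to each gradient component, starting at i = 1 so the bias stays unregularized; MAX_BIAS then only warns if the learned bias drifts away from zero. The final loop blends the new weights with the previous iteration's. A toy sketch of that interpolation step, with hypothetical weight values (psi is the --interpolation option, default 0.9):

#include <cstdio>
#include <vector>

int main() {
  const double psi = 0.9;                        // --interpolation default
  std::vector<double> x      = {0.0, 1.2, -0.4}; // freshly learned weights
  std::vector<double> w_prev = {0.0, 1.0, -0.5}; // weights from the -w file
  for (size_t i = 1; i < x.size(); ++i)          // i = 0 (bias) is skipped
    x[i] = (x[i] * psi) + w_prev[i] * (1.0 - psi);
  printf("x[1] = %g, x[2] = %g\n", x[1], x[2]);  // 1.18 and -0.41
  return 0;
}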
