diff options
Diffstat (limited to 'training/creg.cc')
-rw-r--r-- | training/creg.cc | 334 |
1 files changed, 0 insertions, 334 deletions
diff --git a/training/creg.cc b/training/creg.cc deleted file mode 100644 index 58adea00..00000000 --- a/training/creg.cc +++ /dev/null @@ -1,334 +0,0 @@ -#include <cstdlib> -#include <iostream> -#include <vector> -#include <tr1/unordered_map> - -#include <boost/program_options.hpp> -#include <boost/program_options/variables_map.hpp> - -#include "json_feature_map_lexer.h" -#include "prob.h" -#include "filelib.h" -#include "weights.h" -#include "sparse_vector.h" -#include "liblbfgs/lbfgs++.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("training_features,x", po::value<string>(), "File containing training instance features (ARKRegression format)") - ("training_responses,y", po::value<string>(), "File containing training response features (ARKRegression format)") - ("linear,n", "Linear (rather than logistic) regression") - ("l1",po::value<double>()->default_value(0.0), "l_1 regularization strength") - ("l2",po::value<double>()->default_value(0.0), "l_2 regularization strength") - ("weights,w", po::value<string>(), "Initial weights") - ("epsilon,e", po::value<double>()->default_value(1e-4), "Epsilon for convergence test. Terminates when ||g|| < epsilon * max(1, ||x||)") - ("memory_buffers,m",po::value<unsigned>()->default_value(40), "Number of memory buffers for LBFGS") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("help") || !conf->count("training_features") || !conf->count("training_responses")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct TrainingInstance { - SparseVector<float> x; - union { - unsigned label; // for categorical predictions - float value; // for continuous predictions - } y; -}; - -struct ReaderHelper { - explicit ReaderHelper(vector<TrainingInstance>* xyp) : xy_pairs(xyp), lc(), flag() {} - unordered_map<string, unsigned> id2ind; - vector<TrainingInstance>* xy_pairs; - int lc; - bool flag; -}; - -void ReaderCB(const string& id, const SparseVector<float>& fmap, void* extra) { - ReaderHelper& rh = *reinterpret_cast<ReaderHelper*>(extra); - ++rh.lc; - if (rh.lc % 1000 == 0) { cerr << '.'; rh.flag = true; } - if (rh.lc % 40000 == 0) { cerr << " [" << rh.lc << "]\n"; rh.flag = false; } - const unordered_map<string, unsigned>::iterator it = rh.id2ind.find(id); - if (it == rh.id2ind.end()) { - cerr << "Unlabeled example in line " << rh.lc << endl; - abort(); - } - (*rh.xy_pairs)[it->second - 1].x = fmap; -} - -void ReadLabeledInstances(const string& ffeats, - const string& fresp, - const bool is_continuous, - vector<TrainingInstance>* xy_pairs, - vector<string>* labels) { - bool flag = false; - xy_pairs->clear(); - int lc = 0; - ReaderHelper rh(xy_pairs); - unordered_map<string, unsigned> label2id; - cerr << "Reading training responses from " << fresp << " ..." << endl; - ReadFile fr(fresp); - for (unsigned i = 0; i < labels->size(); ++i) - label2id[(*labels)[i]] = i; - istream& in = *fr.stream(); - string line; - while(getline(in, line)) { - ++lc; - if (lc % 1000 == 0) { cerr << '.'; flag = true; } - if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } - if (line.size() == 0) continue; - if (line[0] == '#') continue; - unsigned p = 0; - while (p < line.size() && line[p] != ' ' && line[p] != '\t') { ++p; } - unsigned& ind = rh.id2ind[line.substr(0, p)]; - if (ind != 0) { cerr << "ID " << line.substr(0, p) << " duplicated in line " << lc << endl; abort(); } - while (p < line.size() && (line[p] == ' ' || line[p] == '\t')) { ++p; } - assert(p < line.size()); - xy_pairs->push_back(TrainingInstance()); - ind = xy_pairs->size(); - if (is_continuous) { - xy_pairs->back().y.value = strtof(&line[p], 0); - } else { // categorical predictions - unordered_map<string, unsigned>::iterator it = label2id.find(line.substr(p)); - if (it == label2id.end()) { - const string label = line.substr(p); - it = label2id.insert(make_pair(label, labels->size())).first; - labels->push_back(label); - } - xy_pairs->back().y.label = it->second; // label id - } - } - if (flag) cerr << endl; - if (!is_continuous) { - cerr << "LABELS:"; - for (unsigned j = 0; j < labels->size(); ++j) - cerr << " " << (*labels)[j]; - cerr << endl; - } - cerr << "Reading training features from " << ffeats << " ..." << endl; - ReadFile ff(ffeats); - JSONFeatureMapLexer::ReadRules(ff.stream(), ReaderCB, &rh); - if (rh.flag) cerr << endl; -} - -// helper base class (not polymorphic- just a container and some helper functions) for loss functions -// real loss functions should implement double operator()(const vector<double>& x, double* g), -// which should evaluate f(x) and g = f'(x) -struct BaseLoss { - // dimp1 = number of categorial outputs possible for logistic regression - // for linear regression, it should be 1 more than the dimension of the response variable - BaseLoss( - const vector<TrainingInstance>& tr, - unsigned dimp1, - unsigned numfeats, - unsigned ll2) : training(tr), K(dimp1), p(numfeats), l2(ll2) {} - - // weight vector layout for K classes, with p features - // w[0 : K-1] = bias weights - // w[y*p + K : y*p + K + p - 1] = feature weights for y^th class - // this representation is used in ComputeDotProducts and GradAdd - void ComputeDotProducts(const SparseVector<float>& fx, // feature vector of x - const vector<double>& w, // full weight vector - vector<double>* pdotprods) const { - vector<double>& dotprods = *pdotprods; - const unsigned km1 = K - 1; - dotprods.resize(km1); - for (unsigned y = 0; y < km1; ++y) - dotprods[y] = w[y]; // bias terms - for (SparseVector<float>::const_iterator it = fx.begin(); it != fx.end(); ++it) { - const float fval = it->second; - const unsigned fid = it->first; - for (unsigned y = 0; y < km1; ++y) - dotprods[y] += w[fid + y * p + km1] * fval; - } - } - - double ApplyRegularizationTerms(const vector<double>& weights, - double* g) const { - double reg = 0; - for (size_t i = K - 1; i < weights.size(); ++i) { - const double& w_i = weights[i]; - reg += l2 * w_i * w_i; - g[i] += 2 * l2 * w_i; - } - return reg; - } - - void GradAdd(const SparseVector<float>& fx, - const unsigned y, - const double scale, - double* acc) const { - acc[y] += scale; // class bias - for (SparseVector<float>::const_iterator it = fx.begin(); - it != fx.end(); ++it) - acc[it->first + y * p + K - 1] += it->second * scale; - } - - const vector<TrainingInstance>& training; - const unsigned K, p; - const double l2; -}; - -struct UnivariateSquaredLoss : public BaseLoss { - UnivariateSquaredLoss( - const vector<TrainingInstance>& tr, - unsigned numfeats, - const double l2) : BaseLoss(tr, 2, numfeats, l2) {} - - // evaluate squared loss and gradient - double operator()(const vector<double>& x, double* g) const { - fill(g, g + x.size(), 0.0); - double cll = 0; - vector<double> dotprods(1); // univariate prediction - for (int i = 0; i < training.size(); ++i) { - const SparseVector<float>& fmapx = training[i].x; - const double refy = training[i].y.value; - ComputeDotProducts(fmapx, x, &dotprods); - double diff = dotprods[0] - refy; - cll += diff * diff; - - double scale = 2 * diff; - GradAdd(fmapx, 0, scale, g); - } - double reg = ApplyRegularizationTerms(x, g); - return cll + reg; - } -}; - -struct MulticlassLogLoss : public BaseLoss { - MulticlassLogLoss( - const vector<TrainingInstance>& tr, - unsigned k, - unsigned numfeats, - const double l2) : BaseLoss(tr, k, numfeats, l2) {} - - // evaluate log loss and gradient - double operator()(const vector<double>& x, double* g) const { - fill(g, g + x.size(), 0.0); - vector<double> dotprods(K - 1); // K-1 degrees of freedom - vector<prob_t> probs(K); - double cll = 0; - for (int i = 0; i < training.size(); ++i) { - const SparseVector<float>& fmapx = training[i].x; - const unsigned refy = training[i].y.label; - //cerr << "FMAP: " << fmapx << endl; - ComputeDotProducts(fmapx, x, &dotprods); - prob_t z; - for (unsigned j = 0; j < dotprods.size(); ++j) - z += (probs[j] = prob_t(dotprods[j], init_lnx())); - z += (probs.back() = prob_t::One()); - for (unsigned y = 0; y < probs.size(); ++y) { - probs[y] /= z; - //cerr << " p(y=" << y << ")=" << probs[y].as_float() << "\tz=" << z << endl; - } - cll -= log(probs[refy]); // log p(y | x) - - for (unsigned y = 0; y < dotprods.size(); ++y) { - double scale = probs[y].as_float(); - if (y == refy) { scale -= 1.0; } - GradAdd(fmapx, y, scale, g); - } - } - double reg = ApplyRegularizationTerms(x, g); - return cll + reg; - } -}; - -template <class LossFunction> -double LearnParameters(LossFunction& loss, - const double l1, - const unsigned l1_start, - const unsigned memory_buffers, - const double eps, - vector<double>* px) { - LBFGS<LossFunction> lbfgs(px, loss, memory_buffers, l1, l1_start, eps); - lbfgs.MinimizeFunction(); - return 0; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - string line; - vector<TrainingInstance> training; - const string xfile = conf["training_features"].as<string>(); - const string yfile = conf["training_responses"].as<string>(); - double l1 = conf["l1"].as<double>(); - double l2 = conf["l2"].as<double>(); - const unsigned memory_buffers = conf["memory_buffers"].as<unsigned>(); - const double epsilon = conf["epsilon"].as<double>(); - if (l1 < 0.0) { - cerr << "L1 strength must be >= 0\n"; - return 1; - } - if (l2 < 0.0) { - cerr << "L2 strength must be >= 0\n"; - return 2; - } - - const bool is_continuous = conf.count("linear"); - vector<string> labels; // only populated for non-continuous models - ReadLabeledInstances(xfile, yfile, is_continuous, &training, &labels); - - if (conf.count("weights")) { - cerr << "Initial weights are not implemented, please implement." << endl; - // TODO read weights for categorical and continuous predictions - // can't use normal cdec weight framework - abort(); - } - - cerr << " Number of features: " << FD::NumFeats() << endl; - cerr << "Number of training examples: " << training.size() << endl; - const unsigned p = FD::NumFeats(); - cout.precision(15); - - if (conf.count("linear")) { // linear regression - vector<double> weights(1 + FD::NumFeats(), 0.0); - cerr << " Number of parameters: " << weights.size() << endl; - UnivariateSquaredLoss loss(training, p, l2); - LearnParameters(loss, l1, 1, memory_buffers, epsilon, &weights); - cout << p << "\t***CONTINUOUS***" << endl; - cout << "***BIAS***\t" << weights[0] << endl; - for (unsigned f = 0; f < p; ++f) { - const double w = weights[1 + f]; - if (w) - cout << FD::Convert(f) << "\t" << w << endl; - } - } else { // logistic regression - vector<double> weights((1 + FD::NumFeats()) * (labels.size() - 1), 0.0); - cerr << " Number of parameters: " << weights.size() << endl; - cerr << " Number of labels: " << labels.size() << endl; - const unsigned K = labels.size(); - const unsigned km1 = K - 1; - MulticlassLogLoss loss(training, K, p, l2); - LearnParameters(loss, l1, km1, memory_buffers, epsilon, &weights); - - cout << p << "\t***CATEGORICAL***"; - for (unsigned y = 0; y < K; ++y) - cout << '\t' << labels[y]; - cout << endl; - for (unsigned y = 0; y < km1; ++y) - cout << labels[y] << "\t***BIAS***\t" << weights[y] << endl; - for (unsigned y = 0; y < km1; ++y) { - for (unsigned f = 0; f < p; ++f) { - const double w = weights[km1 + y * p + f]; - if (w) - cout << labels[y] << "\t" << FD::Convert(f) << "\t" << w << endl; - } - } - } - - return 0; -} - |