From b56da6f08c4f59b562a102671ac3deb135b0538a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 13 May 2012 16:18:43 -0700 Subject: fast creg training code for univariate linear and logistic regression --- training/Makefile.am | 4 + training/creg.cc | 334 +++++++++++++++++++++++++++++++++++++++++++ training/liblbfgs/lbfgs++.h | 29 ++-- training/liblbfgs/ll_test.cc | 4 +- 4 files changed, 358 insertions(+), 13 deletions(-) create mode 100644 training/creg.cc (limited to 'training') diff --git a/training/Makefile.am b/training/Makefile.am index 991ac210..4b69ea94 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,5 +1,6 @@ bin_PROGRAMS = \ model1 \ + creg \ lbl_model \ test_ngram \ mr_em_map_adapter \ @@ -23,6 +24,9 @@ noinst_PROGRAMS = \ TESTS = lbfgs_test optimize_test +creg_SOURCES = creg.cc +creg_LDADD = ./liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz + mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/creg.cc b/training/creg.cc new file mode 100644 index 00000000..58adea00 --- /dev/null +++ b/training/creg.cc @@ -0,0 +1,334 @@ +#include +#include +#include +#include + +#include +#include + +#include "json_feature_map_lexer.h" +#include "prob.h" +#include "filelib.h" +#include "weights.h" +#include "sparse_vector.h" +#include "liblbfgs/lbfgs++.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("training_features,x", po::value(), "File containing training instance features (ARKRegression format)") + ("training_responses,y", po::value(), "File containing training response features (ARKRegression format)") + ("linear,n", "Linear (rather than logistic) regression") + ("l1",po::value()->default_value(0.0), "l_1 regularization strength") + ("l2",po::value()->default_value(0.0), "l_2 regularization strength") + ("weights,w", po::value(), "Initial weights") + ("epsilon,e", po::value()->default_value(1e-4), "Epsilon for convergence test. Terminates when ||g|| < epsilon * max(1, ||x||)") + ("memory_buffers,m",po::value()->default_value(40), "Number of memory buffers for LBFGS") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("help") || !conf->count("training_features") || !conf->count("training_responses")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct TrainingInstance { + SparseVector x; + union { + unsigned label; // for categorical predictions + float value; // for continuous predictions + } y; +}; + +struct ReaderHelper { + explicit ReaderHelper(vector* xyp) : xy_pairs(xyp), lc(), flag() {} + unordered_map id2ind; + vector* xy_pairs; + int lc; + bool flag; +}; + +void ReaderCB(const string& id, const SparseVector& fmap, void* extra) { + ReaderHelper& rh = *reinterpret_cast(extra); + ++rh.lc; + if (rh.lc % 1000 == 0) { cerr << '.'; rh.flag = true; } + if (rh.lc % 40000 == 0) { cerr << " [" << rh.lc << "]\n"; rh.flag = false; } + const unordered_map::iterator it = rh.id2ind.find(id); + if (it == rh.id2ind.end()) { + cerr << "Unlabeled example in line " << rh.lc << endl; + abort(); + } + (*rh.xy_pairs)[it->second - 1].x = fmap; +} + +void ReadLabeledInstances(const string& ffeats, + const string& fresp, + const bool is_continuous, + vector* xy_pairs, + vector* labels) { + bool flag = false; + xy_pairs->clear(); + int lc = 0; + ReaderHelper rh(xy_pairs); + unordered_map label2id; + cerr << "Reading training responses from " << fresp << " ..." << endl; + ReadFile fr(fresp); + for (unsigned i = 0; i < labels->size(); ++i) + label2id[(*labels)[i]] = i; + istream& in = *fr.stream(); + string line; + while(getline(in, line)) { + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + if (line.size() == 0) continue; + if (line[0] == '#') continue; + unsigned p = 0; + while (p < line.size() && line[p] != ' ' && line[p] != '\t') { ++p; } + unsigned& ind = rh.id2ind[line.substr(0, p)]; + if (ind != 0) { cerr << "ID " << line.substr(0, p) << " duplicated in line " << lc << endl; abort(); } + while (p < line.size() && (line[p] == ' ' || line[p] == '\t')) { ++p; } + assert(p < line.size()); + xy_pairs->push_back(TrainingInstance()); + ind = xy_pairs->size(); + if (is_continuous) { + xy_pairs->back().y.value = strtof(&line[p], 0); + } else { // categorical predictions + unordered_map::iterator it = label2id.find(line.substr(p)); + if (it == label2id.end()) { + const string label = line.substr(p); + it = label2id.insert(make_pair(label, labels->size())).first; + labels->push_back(label); + } + xy_pairs->back().y.label = it->second; // label id + } + } + if (flag) cerr << endl; + if (!is_continuous) { + cerr << "LABELS:"; + for (unsigned j = 0; j < labels->size(); ++j) + cerr << " " << (*labels)[j]; + cerr << endl; + } + cerr << "Reading training features from " << ffeats << " ..." << endl; + ReadFile ff(ffeats); + JSONFeatureMapLexer::ReadRules(ff.stream(), ReaderCB, &rh); + if (rh.flag) cerr << endl; +} + +// helper base class (not polymorphic- just a container and some helper functions) for loss functions +// real loss functions should implement double operator()(const vector& x, double* g), +// which should evaluate f(x) and g = f'(x) +struct BaseLoss { + // dimp1 = number of categorial outputs possible for logistic regression + // for linear regression, it should be 1 more than the dimension of the response variable + BaseLoss( + const vector& tr, + unsigned dimp1, + unsigned numfeats, + unsigned ll2) : training(tr), K(dimp1), p(numfeats), l2(ll2) {} + + // weight vector layout for K classes, with p features + // w[0 : K-1] = bias weights + // w[y*p + K : y*p + K + p - 1] = feature weights for y^th class + // this representation is used in ComputeDotProducts and GradAdd + void ComputeDotProducts(const SparseVector& fx, // feature vector of x + const vector& w, // full weight vector + vector* pdotprods) const { + vector& dotprods = *pdotprods; + const unsigned km1 = K - 1; + dotprods.resize(km1); + for (unsigned y = 0; y < km1; ++y) + dotprods[y] = w[y]; // bias terms + for (SparseVector::const_iterator it = fx.begin(); it != fx.end(); ++it) { + const float fval = it->second; + const unsigned fid = it->first; + for (unsigned y = 0; y < km1; ++y) + dotprods[y] += w[fid + y * p + km1] * fval; + } + } + + double ApplyRegularizationTerms(const vector& weights, + double* g) const { + double reg = 0; + for (size_t i = K - 1; i < weights.size(); ++i) { + const double& w_i = weights[i]; + reg += l2 * w_i * w_i; + g[i] += 2 * l2 * w_i; + } + return reg; + } + + void GradAdd(const SparseVector& fx, + const unsigned y, + const double scale, + double* acc) const { + acc[y] += scale; // class bias + for (SparseVector::const_iterator it = fx.begin(); + it != fx.end(); ++it) + acc[it->first + y * p + K - 1] += it->second * scale; + } + + const vector& training; + const unsigned K, p; + const double l2; +}; + +struct UnivariateSquaredLoss : public BaseLoss { + UnivariateSquaredLoss( + const vector& tr, + unsigned numfeats, + const double l2) : BaseLoss(tr, 2, numfeats, l2) {} + + // evaluate squared loss and gradient + double operator()(const vector& x, double* g) const { + fill(g, g + x.size(), 0.0); + double cll = 0; + vector dotprods(1); // univariate prediction + for (int i = 0; i < training.size(); ++i) { + const SparseVector& fmapx = training[i].x; + const double refy = training[i].y.value; + ComputeDotProducts(fmapx, x, &dotprods); + double diff = dotprods[0] - refy; + cll += diff * diff; + + double scale = 2 * diff; + GradAdd(fmapx, 0, scale, g); + } + double reg = ApplyRegularizationTerms(x, g); + return cll + reg; + } +}; + +struct MulticlassLogLoss : public BaseLoss { + MulticlassLogLoss( + const vector& tr, + unsigned k, + unsigned numfeats, + const double l2) : BaseLoss(tr, k, numfeats, l2) {} + + // evaluate log loss and gradient + double operator()(const vector& x, double* g) const { + fill(g, g + x.size(), 0.0); + vector dotprods(K - 1); // K-1 degrees of freedom + vector probs(K); + double cll = 0; + for (int i = 0; i < training.size(); ++i) { + const SparseVector& fmapx = training[i].x; + const unsigned refy = training[i].y.label; + //cerr << "FMAP: " << fmapx << endl; + ComputeDotProducts(fmapx, x, &dotprods); + prob_t z; + for (unsigned j = 0; j < dotprods.size(); ++j) + z += (probs[j] = prob_t(dotprods[j], init_lnx())); + z += (probs.back() = prob_t::One()); + for (unsigned y = 0; y < probs.size(); ++y) { + probs[y] /= z; + //cerr << " p(y=" << y << ")=" << probs[y].as_float() << "\tz=" << z << endl; + } + cll -= log(probs[refy]); // log p(y | x) + + for (unsigned y = 0; y < dotprods.size(); ++y) { + double scale = probs[y].as_float(); + if (y == refy) { scale -= 1.0; } + GradAdd(fmapx, y, scale, g); + } + } + double reg = ApplyRegularizationTerms(x, g); + return cll + reg; + } +}; + +template +double LearnParameters(LossFunction& loss, + const double l1, + const unsigned l1_start, + const unsigned memory_buffers, + const double eps, + vector* px) { + LBFGS lbfgs(px, loss, memory_buffers, l1, l1_start, eps); + lbfgs.MinimizeFunction(); + return 0; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + string line; + vector training; + const string xfile = conf["training_features"].as(); + const string yfile = conf["training_responses"].as(); + double l1 = conf["l1"].as(); + double l2 = conf["l2"].as(); + const unsigned memory_buffers = conf["memory_buffers"].as(); + const double epsilon = conf["epsilon"].as(); + if (l1 < 0.0) { + cerr << "L1 strength must be >= 0\n"; + return 1; + } + if (l2 < 0.0) { + cerr << "L2 strength must be >= 0\n"; + return 2; + } + + const bool is_continuous = conf.count("linear"); + vector labels; // only populated for non-continuous models + ReadLabeledInstances(xfile, yfile, is_continuous, &training, &labels); + + if (conf.count("weights")) { + cerr << "Initial weights are not implemented, please implement." << endl; + // TODO read weights for categorical and continuous predictions + // can't use normal cdec weight framework + abort(); + } + + cerr << " Number of features: " << FD::NumFeats() << endl; + cerr << "Number of training examples: " << training.size() << endl; + const unsigned p = FD::NumFeats(); + cout.precision(15); + + if (conf.count("linear")) { // linear regression + vector weights(1 + FD::NumFeats(), 0.0); + cerr << " Number of parameters: " << weights.size() << endl; + UnivariateSquaredLoss loss(training, p, l2); + LearnParameters(loss, l1, 1, memory_buffers, epsilon, &weights); + cout << p << "\t***CONTINUOUS***" << endl; + cout << "***BIAS***\t" << weights[0] << endl; + for (unsigned f = 0; f < p; ++f) { + const double w = weights[1 + f]; + if (w) + cout << FD::Convert(f) << "\t" << w << endl; + } + } else { // logistic regression + vector weights((1 + FD::NumFeats()) * (labels.size() - 1), 0.0); + cerr << " Number of parameters: " << weights.size() << endl; + cerr << " Number of labels: " << labels.size() << endl; + const unsigned K = labels.size(); + const unsigned km1 = K - 1; + MulticlassLogLoss loss(training, K, p, l2); + LearnParameters(loss, l1, km1, memory_buffers, epsilon, &weights); + + cout << p << "\t***CATEGORICAL***"; + for (unsigned y = 0; y < K; ++y) + cout << '\t' << labels[y]; + cout << endl; + for (unsigned y = 0; y < km1; ++y) + cout << labels[y] << "\t***BIAS***\t" << weights[y] << endl; + for (unsigned y = 0; y < km1; ++y) { + for (unsigned f = 0; f < p; ++f) { + const double w = weights[km1 + y * p + f]; + if (w) + cout << labels[y] << "\t" << FD::Convert(f) << "\t" << w << endl; + } + } + } + + return 0; +} + diff --git a/training/liblbfgs/lbfgs++.h b/training/liblbfgs/lbfgs++.h index 342f9b0e..92ead955 100644 --- a/training/liblbfgs/lbfgs++.h +++ b/training/liblbfgs/lbfgs++.h @@ -16,28 +16,33 @@ template class LBFGS { public: - LBFGS(size_t n, // number of variables - const Function& f, // function to optimize - double l1_c = 0.0, // l1 penalty strength - size_t m = 10 // number of memory buffers - // TODO should use custom allocator here: + LBFGS(size_t n, // number of variables + const Function& f, // function to optimize + size_t m = 10, // number of memory buffers + double l1_c = 0.0, // l1 penalty strength + unsigned l1_start = 0, // l1 penalty starting index + double eps = 1e-5 // convergence epsilon + // TODO should use custom allocator here: ) : p_x(new std::vector(n, 0.0)), owned(true), m_x(*p_x), func(f) { - Init(m, l1_c); + Init(m, l1_c, l1_start, eps); } // constructor where external vector storage for variables is used LBFGS(std::vector* px, const Function& f, - double l1_c = 0.0, // l1 penalty strength - size_t m = 10 + size_t m = 10, // number of memory buffers + double l1_c = 0.0, // l1 penalty strength + unsigned l1_start = 0, // l1 penalty starting index + double eps = 1e-5 // convergence epsilon + // TODO should use custom allocator here: ) : p_x(px), owned(false), m_x(*p_x), func(f) { - Init(m, l1_c); + Init(m, l1_c, l1_start, eps); } ~LBFGS() { @@ -60,12 +65,14 @@ class LBFGS { } private: - void Init(size_t m, double l1_c) { + void Init(size_t m, double l1_c, unsigned l1_start, double eps) { lbfgs_parameter_init(¶m); param.m = m; + param.epsilon = eps; if (l1_c > 0.0) { param.linesearch = LBFGS_LINESEARCH_BACKTRACKING; - param.orthantwise_c = 1.0; + param.orthantwise_c = l1_c; + param.orthantwise_start = l1_start; } silence = false; } diff --git a/training/liblbfgs/ll_test.cc b/training/liblbfgs/ll_test.cc index 43c0f214..48bc0366 100644 --- a/training/liblbfgs/ll_test.cc +++ b/training/liblbfgs/ll_test.cc @@ -5,7 +5,7 @@ using namespace std; // Function must be lbfgsfloatval_t f(x.begin, x.end, g.begin) lbfgsfloatval_t func(const vector& x, lbfgsfloatval_t* g) { - int i; + unsigned i; lbfgsfloatval_t fx = 0.0; for (i = 0;i < x.size();i += 2) { @@ -24,7 +24,7 @@ void Opt(F& f) { lbfgs.MinimizeFunction(); } -int main(int argc, char** argv) { +int main() { Opt(func); return 0; } -- cgit v1.2.3 From 7001792f10cb17d88ed2d4c58364b6304bbd0816 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 13 May 2012 17:09:34 -0700 Subject: put creg in its own top-level folder --- creg/Makefile.am | 11 ++ creg/README | 7 + creg/creg.cc | 334 ++++++++++++++++++++++++++++++++++++++++ creg/json_feature_map_lexer.h | 15 ++ creg/json_feature_map_lexer.ll | 132 ++++++++++++++++ creg/test_data/iris.testfeat | 50 ++++++ creg/test_data/iris.testresp | 50 ++++++ creg/test_data/iris.trainfeat | 100 ++++++++++++ creg/test_data/iris.trainresp | 100 ++++++++++++ training/Makefile.am | 4 - training/creg.cc | 334 ---------------------------------------- training/liblbfgs/lbfgs++.h | 1 + utils/Makefile.am | 4 - utils/json_feature_map_lexer.h | 15 -- utils/json_feature_map_lexer.ll | 132 ---------------- 15 files changed, 800 insertions(+), 489 deletions(-) create mode 100644 creg/Makefile.am create mode 100644 creg/README create mode 100644 creg/creg.cc create mode 100644 creg/json_feature_map_lexer.h create mode 100644 creg/json_feature_map_lexer.ll create mode 100644 creg/test_data/iris.testfeat create mode 100644 creg/test_data/iris.testresp create mode 100644 creg/test_data/iris.trainfeat create mode 100644 creg/test_data/iris.trainresp delete mode 100644 training/creg.cc delete mode 100644 utils/json_feature_map_lexer.h delete mode 100644 utils/json_feature_map_lexer.ll (limited to 'training') diff --git a/creg/Makefile.am b/creg/Makefile.am new file mode 100644 index 00000000..9e25b838 --- /dev/null +++ b/creg/Makefile.am @@ -0,0 +1,11 @@ +bin_PROGRAMS = \ + creg + +creg_SOURCES = creg.cc json_feature_map_lexer.cc +creg_LDADD = $(top_srcdir)/training/liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz + +json_feature_map_lexer.cc: json_feature_map_lexer.ll + $(LEX) -s -8 -CF -o$@ $< + +AM_CPPFLAGS = -W -Wall -DNDEBUG -I$(top_srcdir)/utils -I$(top_srcdir)/training + diff --git a/creg/README b/creg/README new file mode 100644 index 00000000..2c04c83b --- /dev/null +++ b/creg/README @@ -0,0 +1,7 @@ +creg is a fast tool for training linear and logistic regression models with +l_1 and l_2 regularization. Its data (feature and response) format is compatible +with ARKRegression. + +Example invokation: +$ ./creg -x test_data/iris.trainfeat -y test_data/iris.trainresp --l2 100 + diff --git a/creg/creg.cc b/creg/creg.cc new file mode 100644 index 00000000..43f01bc4 --- /dev/null +++ b/creg/creg.cc @@ -0,0 +1,334 @@ +#include +#include +#include +#include + +#include +#include + +#include "json_feature_map_lexer.h" +#include "prob.h" +#include "filelib.h" +#include "weights.h" +#include "sparse_vector.h" +#include "liblbfgs/lbfgs++.h" + +using namespace std; +using namespace std::tr1; +namespace po = boost::program_options; + +void InitCommandLine(int argc, char** argv, po::variables_map* conf) { + po::options_description opts("Configuration options"); + opts.add_options() + ("training_features,x", po::value(), "File containing training instance features (ARKRegression format)") + ("training_responses,y", po::value(), "File containing training response features (ARKRegression format)") + ("linear,n", "Linear (rather than logistic) regression") + ("l1",po::value()->default_value(0.0), "l_1 regularization strength") + ("l2",po::value()->default_value(0.0), "l_2 regularization strength") + ("weights,w", po::value(), "Initial weights") + ("epsilon,e", po::value()->default_value(1e-4), "Epsilon for convergence test. Terminates when ||g|| < epsilon * max(1, ||w||)") + ("memory_buffers,m",po::value()->default_value(40), "Number of memory buffers for LBFGS") + ("help,h", "Help"); + po::options_description dcmdline_options; + dcmdline_options.add(opts); + po::store(parse_command_line(argc, argv, dcmdline_options), *conf); + if (conf->count("help") || !conf->count("training_features") || !conf->count("training_responses")) { + cerr << dcmdline_options << endl; + exit(1); + } +} + +struct TrainingInstance { + SparseVector x; + union { + unsigned label; // for categorical predictions + float value; // for continuous predictions + } y; +}; + +struct ReaderHelper { + explicit ReaderHelper(vector* xyp) : xy_pairs(xyp), lc(), flag() {} + unordered_map id2ind; + vector* xy_pairs; + int lc; + bool flag; +}; + +void ReaderCB(const string& id, const SparseVector& fmap, void* extra) { + ReaderHelper& rh = *reinterpret_cast(extra); + ++rh.lc; + if (rh.lc % 1000 == 0) { cerr << '.'; rh.flag = true; } + if (rh.lc % 40000 == 0) { cerr << " [" << rh.lc << "]\n"; rh.flag = false; } + const unordered_map::iterator it = rh.id2ind.find(id); + if (it == rh.id2ind.end()) { + cerr << "Unlabeled example in line " << rh.lc << endl; + abort(); + } + (*rh.xy_pairs)[it->second - 1].x = fmap; +} + +void ReadLabeledInstances(const string& ffeats, + const string& fresp, + const bool is_continuous, + vector* xy_pairs, + vector* labels) { + bool flag = false; + xy_pairs->clear(); + int lc = 0; + ReaderHelper rh(xy_pairs); + unordered_map label2id; + cerr << "Reading training responses from " << fresp << " ..." << endl; + ReadFile fr(fresp); + for (unsigned i = 0; i < labels->size(); ++i) + label2id[(*labels)[i]] = i; + istream& in = *fr.stream(); + string line; + while(getline(in, line)) { + ++lc; + if (lc % 1000 == 0) { cerr << '.'; flag = true; } + if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } + if (line.size() == 0) continue; + if (line[0] == '#') continue; + unsigned p = 0; + while (p < line.size() && line[p] != ' ' && line[p] != '\t') { ++p; } + unsigned& ind = rh.id2ind[line.substr(0, p)]; + if (ind != 0) { cerr << "ID " << line.substr(0, p) << " duplicated in line " << lc << endl; abort(); } + while (p < line.size() && (line[p] == ' ' || line[p] == '\t')) { ++p; } + assert(p < line.size()); + xy_pairs->push_back(TrainingInstance()); + ind = xy_pairs->size(); + if (is_continuous) { + xy_pairs->back().y.value = strtof(&line[p], 0); + } else { // categorical predictions + unordered_map::iterator it = label2id.find(line.substr(p)); + if (it == label2id.end()) { + const string label = line.substr(p); + it = label2id.insert(make_pair(label, labels->size())).first; + labels->push_back(label); + } + xy_pairs->back().y.label = it->second; // label id + } + } + if (flag) cerr << endl; + if (!is_continuous) { + cerr << "LABELS:"; + for (unsigned j = 0; j < labels->size(); ++j) + cerr << " " << (*labels)[j]; + cerr << endl; + } + cerr << "Reading training features from " << ffeats << " ..." << endl; + ReadFile ff(ffeats); + JSONFeatureMapLexer::ReadRules(ff.stream(), ReaderCB, &rh); + if (rh.flag) cerr << endl; +} + +// helper base class (not polymorphic- just a container and some helper functions) for loss functions +// real loss functions should implement double operator()(const vector& x, double* g), +// which should evaluate f(x) and g = f'(x) +struct BaseLoss { + // dimp1 = number of categorial outputs possible for logistic regression + // for linear regression, it should be 1 more than the dimension of the response variable + BaseLoss( + const vector& tr, + unsigned dimp1, + unsigned numfeats, + unsigned ll2) : training(tr), K(dimp1), p(numfeats), l2(ll2) {} + + // weight vector layout for K classes, with p features + // w[0 : K-1] = bias weights + // w[y*p + K : y*p + K + p - 1] = feature weights for y^th class + // this representation is used in ComputeDotProducts and GradAdd + void ComputeDotProducts(const SparseVector& fx, // feature vector of x + const vector& w, // full weight vector + vector* pdotprods) const { + vector& dotprods = *pdotprods; + const unsigned km1 = K - 1; + dotprods.resize(km1); + for (unsigned y = 0; y < km1; ++y) + dotprods[y] = w[y]; // bias terms + for (SparseVector::const_iterator it = fx.begin(); it != fx.end(); ++it) { + const float fval = it->second; + const unsigned fid = it->first; + for (unsigned y = 0; y < km1; ++y) + dotprods[y] += w[fid + y * p + km1] * fval; + } + } + + double ApplyRegularizationTerms(const vector& weights, + double* g) const { + double reg = 0; + for (size_t i = K - 1; i < weights.size(); ++i) { + const double& w_i = weights[i]; + reg += l2 * w_i * w_i; + g[i] += 2 * l2 * w_i; + } + return reg; + } + + void GradAdd(const SparseVector& fx, + const unsigned y, + const double scale, + double* acc) const { + acc[y] += scale; // class bias + for (SparseVector::const_iterator it = fx.begin(); + it != fx.end(); ++it) + acc[it->first + y * p + K - 1] += it->second * scale; + } + + const vector& training; + const unsigned K, p; + const double l2; +}; + +struct UnivariateSquaredLoss : public BaseLoss { + UnivariateSquaredLoss( + const vector& tr, + unsigned numfeats, + const double l2) : BaseLoss(tr, 2, numfeats, l2) {} + + // evaluate squared loss and gradient + double operator()(const vector& x, double* g) const { + fill(g, g + x.size(), 0.0); + double cll = 0; + vector dotprods(1); // univariate prediction + for (unsigned i = 0; i < training.size(); ++i) { + const SparseVector& fmapx = training[i].x; + const double refy = training[i].y.value; + ComputeDotProducts(fmapx, x, &dotprods); + double diff = dotprods[0] - refy; + cll += diff * diff; + + double scale = 2 * diff; + GradAdd(fmapx, 0, scale, g); + } + double reg = ApplyRegularizationTerms(x, g); + return cll + reg; + } +}; + +struct MulticlassLogLoss : public BaseLoss { + MulticlassLogLoss( + const vector& tr, + unsigned k, + unsigned numfeats, + const double l2) : BaseLoss(tr, k, numfeats, l2) {} + + // evaluate log loss and gradient + double operator()(const vector& x, double* g) const { + fill(g, g + x.size(), 0.0); + vector dotprods(K - 1); // K-1 degrees of freedom + vector probs(K); + double cll = 0; + for (unsigned i = 0; i < training.size(); ++i) { + const SparseVector& fmapx = training[i].x; + const unsigned refy = training[i].y.label; + //cerr << "FMAP: " << fmapx << endl; + ComputeDotProducts(fmapx, x, &dotprods); + prob_t z; + for (unsigned j = 0; j < dotprods.size(); ++j) + z += (probs[j] = prob_t(dotprods[j], init_lnx())); + z += (probs.back() = prob_t::One()); + for (unsigned y = 0; y < probs.size(); ++y) { + probs[y] /= z; + //cerr << " p(y=" << y << ")=" << probs[y].as_float() << "\tz=" << z << endl; + } + cll -= log(probs[refy]); // log p(y | x) + + for (unsigned y = 0; y < dotprods.size(); ++y) { + double scale = probs[y].as_float(); + if (y == refy) { scale -= 1.0; } + GradAdd(fmapx, y, scale, g); + } + } + double reg = ApplyRegularizationTerms(x, g); + return cll + reg; + } +}; + +template +double LearnParameters(LossFunction& loss, + const double l1, + const unsigned l1_start, + const unsigned memory_buffers, + const double eps, + vector* px) { + LBFGS lbfgs(px, loss, memory_buffers, l1, l1_start, eps); + lbfgs.MinimizeFunction(); + return 0; +} + +int main(int argc, char** argv) { + po::variables_map conf; + InitCommandLine(argc, argv, &conf); + string line; + vector training; + const string xfile = conf["training_features"].as(); + const string yfile = conf["training_responses"].as(); + double l1 = conf["l1"].as(); + double l2 = conf["l2"].as(); + const unsigned memory_buffers = conf["memory_buffers"].as(); + const double epsilon = conf["epsilon"].as(); + if (l1 < 0.0) { + cerr << "L1 strength must be >= 0\n"; + return 1; + } + if (l2 < 0.0) { + cerr << "L2 strength must be >= 0\n"; + return 2; + } + + const bool is_continuous = conf.count("linear"); + vector labels; // only populated for non-continuous models + ReadLabeledInstances(xfile, yfile, is_continuous, &training, &labels); + + if (conf.count("weights")) { + cerr << "Initial weights are not implemented, please implement." << endl; + // TODO read weights for categorical and continuous predictions + // can't use normal cdec weight framework + abort(); + } + + cerr << " Number of features: " << FD::NumFeats() << endl; + cerr << "Number of training examples: " << training.size() << endl; + const unsigned p = FD::NumFeats(); + cout.precision(15); + + if (conf.count("linear")) { // linear regression + vector weights(1 + FD::NumFeats(), 0.0); + cerr << " Number of parameters: " << weights.size() << endl; + UnivariateSquaredLoss loss(training, p, l2); + LearnParameters(loss, l1, 1, memory_buffers, epsilon, &weights); + cout << p << "\t***CONTINUOUS***" << endl; + cout << "***BIAS***\t" << weights[0] << endl; + for (unsigned f = 0; f < p; ++f) { + const double w = weights[1 + f]; + if (w) + cout << FD::Convert(f) << "\t" << w << endl; + } + } else { // logistic regression + vector weights((1 + FD::NumFeats()) * (labels.size() - 1), 0.0); + cerr << " Number of parameters: " << weights.size() << endl; + cerr << " Number of labels: " << labels.size() << endl; + const unsigned K = labels.size(); + const unsigned km1 = K - 1; + MulticlassLogLoss loss(training, K, p, l2); + LearnParameters(loss, l1, km1, memory_buffers, epsilon, &weights); + + cout << p << "\t***CATEGORICAL***"; + for (unsigned y = 0; y < K; ++y) + cout << '\t' << labels[y]; + cout << endl; + for (unsigned y = 0; y < km1; ++y) + cout << labels[y] << "\t***BIAS***\t" << weights[y] << endl; + for (unsigned y = 0; y < km1; ++y) { + for (unsigned f = 0; f < p; ++f) { + const double w = weights[km1 + y * p + f]; + if (w) + cout << labels[y] << "\t" << FD::Convert(f) << "\t" << w << endl; + } + } + } + + return 0; +} + diff --git a/creg/json_feature_map_lexer.h b/creg/json_feature_map_lexer.h new file mode 100644 index 00000000..3324aa29 --- /dev/null +++ b/creg/json_feature_map_lexer.h @@ -0,0 +1,15 @@ +#ifndef _RULE_LEXER_H_ +#define _RULE_LEXER_H_ + +#include +#include + +#include "sparse_vector.h" + +struct JSONFeatureMapLexer { + typedef void (*FeatureMapCallback)(const std::string& id, const SparseVector& fmap, void* extra); + static void ReadRules(std::istream* in, FeatureMapCallback func, void* extra); +}; + +#endif + diff --git a/creg/json_feature_map_lexer.ll b/creg/json_feature_map_lexer.ll new file mode 100644 index 00000000..372b52f5 --- /dev/null +++ b/creg/json_feature_map_lexer.ll @@ -0,0 +1,132 @@ +%option nounput +%{ + +#include "json_feature_map_lexer.h" +#include "fdict.h" +#include "fast_sparse_vector.h" + +#define YY_DECL int json_fmap_yylex (void) +#undef YY_INPUT +#define YY_INPUT(buf, result, max_size) (result = jfmap_stream->read(buf, max_size).gcount()) +#define YY_SKIP_YYWRAP 1 +int yywrap() { return 1; } + +JSONFeatureMapLexer::FeatureMapCallback json_fmap_callback = NULL; +void* json_fmap_callback_extra = NULL; +std::istream* jfmap_stream = NULL; +bool fl = true; +unsigned spos = 0; +char featname[16000]; +#define MAX_FEATS 20000 +std::pair featmap[MAX_FEATS]; +unsigned curfeat = 0; +std::string instid; + +inline unsigned unicode_escape_to_utf8(uint16_t w1, uint16_t w2, char* putf8) { + uint32_t cp; + if((w1 & 0xfc00) == 0xd800) { + if((w2 & 0xfc00) == 0xdc00) { + cp = 0x10000 + (((static_cast(w1) & 0x3ff) << 10) | (w2 & 0x3ff)); + } else { + abort(); + } + } else { + cp = w1; + } + + + if(cp < 0x80) { + putf8[0] = static_cast(cp); + return 1; + } else if(cp < 0x0800) { + putf8[0] = 0xc0 | ((cp >> 6) & 0x1f); + putf8[1] = 0x80 | (cp & 0x3f); + return 2; + } else if(cp < 0x10000) { + putf8[0] = 0xe0 | ((cp >> 6) & 0x0f); + putf8[1] = 0x80 | ((cp >> 6) & 0x3f); + putf8[2] = 0x80 | (cp & 0x3f); + return 3; + } else if(cp < 0x1fffff) { + putf8[0] = 0xf0 | ((cp >> 18) & 0x07); + putf8[1] = 0x80 | ((cp >> 12) & 0x3f); + putf8[2] = 0x80 | ((cp >> 6) & 0x3f); + putf8[3] = 0x80 | (cp & 0x3f); + return 4; + } else { + abort(); + } + return 0; +} + +%} + +ID [A-Za-z_0-9]+ +HEX_D [a-fA-F0-9] +INT [-]?[0-9]+ +DOUBLE {INT}((\.[0-9]+)?([eE][-+]?[0-9]+)?) +WS [ \t\r\n] +LCB [{] +RCB [}] +UNESCAPED_CH [^\"\\\b\n\r\f\t] + +%x JSON PREVAL STRING JSONVAL POSTVAL DOUBLE +%% + +{ID} { instid = yytext; BEGIN(JSON); } + +{WS}*{LCB}{WS}* { BEGIN(PREVAL); } + +\" { BEGIN(STRING); spos=0; } + +\" { featname[spos] = 0; + featmap[curfeat].first = FD::Convert(featname); + BEGIN(JSONVAL); + } +{UNESCAPED_CH} { featname[spos++] = yytext[0]; } +\\\" { featname[spos++] = '"'; } +\\\\ { featname[spos++] = '\\'; } +\\\/ { featname[spos++] = '/'; } +\\b { } +\\f { } +\\n { } +\\r { } +\\t { } +\\u{HEX_D}{HEX_D}{HEX_D}{HEX_D} { abort(); + } + +{WS}*:{WS}* { BEGIN(DOUBLE); } +{DOUBLE} { featmap[curfeat++].second = strtod(yytext, 0); + BEGIN(POSTVAL); } + +{WS}*,{WS}* { BEGIN(PREVAL); } +{WS}*{RCB}\n* { + const SparseVector x(&featmap[0], &featmap[curfeat]); + json_fmap_callback(instid, x, json_fmap_callback_extra); + curfeat = 0; + BEGIN(INITIAL); + } + +. { std::cerr << "bad input: " << yytext << std::endl; abort(); } + +%% + +void JSONFeatureMapLexer::ReadRules(std::istream* in, FeatureMapCallback func, void* extra) { + json_fmap_callback = func; + json_fmap_callback_extra = extra; + jfmap_stream = in; + json_fmap_yylex(); +} + +#if 0 +void cb(const std::string& id, const SparseVector& fmap, void* extra) { + (void) extra; + static int cc = 0; + cc++; +} + +int main() { + JSONFeatureMapLexer::ReadRules(&std::cin, cb, NULL); +} +#endif + diff --git a/creg/test_data/iris.testfeat b/creg/test_data/iris.testfeat new file mode 100644 index 00000000..f7528f81 --- /dev/null +++ b/creg/test_data/iris.testfeat @@ -0,0 +1,50 @@ +100 {"sepal-length": 4.9, "sepal-width": 2.5, "petal-length": 4.5, "petal-width": 1.7} +101 {"sepal-length": 6.5, "sepal-width": 3.0, "petal-length": 5.2, "petal-width": 2.0} +102 {"sepal-length": 4.4, "sepal-width": 3.0, "petal-length": 1.3, "petal-width": 0.2} +103 {"sepal-length": 5.0, "sepal-width": 3.4, "petal-length": 1.5, "petal-width": 0.2} +104 {"sepal-length": 5.0, "sepal-width": 3.0, "petal-length": 1.6, "petal-width": 0.2} +105 {"sepal-length": 5.1, "sepal-width": 3.4, "petal-length": 1.5, "petal-width": 0.2} +106 {"sepal-length": 5.5, "sepal-width": 2.3, "petal-length": 4.0, "petal-width": 1.3} +107 {"sepal-length": 5.5, "sepal-width": 2.6, "petal-length": 4.4, "petal-width": 1.2} +108 {"sepal-length": 5.4, "sepal-width": 3.4, "petal-length": 1.7, "petal-width": 0.2} +109 {"sepal-length": 5.5, "sepal-width": 2.4, "petal-length": 3.7, "petal-width": 1.0} +110 {"sepal-length": 6.7, "sepal-width": 3.0, "petal-length": 5.0, "petal-width": 1.7} +111 {"sepal-length": 6.4, "sepal-width": 2.8, "petal-length": 5.6, "petal-width": 2.2} +112 {"sepal-length": 5.5, "sepal-width": 4.2, "petal-length": 1.4, "petal-width": 0.2} +113 {"sepal-length": 5.9, "sepal-width": 3.0, "petal-length": 4.2, "petal-width": 1.5} +114 {"sepal-length": 4.9, "sepal-width": 3.1, "petal-length": 1.5, "petal-width": 0.1} +115 {"sepal-length": 7.7, "sepal-width": 2.6, "petal-length": 6.9, "petal-width": 2.3} +116 {"sepal-length": 5.0, "sepal-width": 3.6, "petal-length": 1.4, "petal-width": 0.2} +117 {"sepal-length": 6.3, "sepal-width": 2.3, "petal-length": 4.4, "petal-width": 1.3} +118 {"sepal-length": 6.7, "sepal-width": 3.3, "petal-length": 5.7, "petal-width": 2.1} +119 {"sepal-length": 5.8, "sepal-width": 2.7, "petal-length": 5.1, "petal-width": 1.9} +120 {"sepal-length": 5.2, "sepal-width": 2.7, "petal-length": 3.9, "petal-width": 1.4} +121 {"sepal-length": 5.0, "sepal-width": 3.5, "petal-length": 1.6, "petal-width": 0.6} +122 {"sepal-length": 5.0, "sepal-width": 3.2, "petal-length": 1.2, "petal-width": 0.2} +123 {"sepal-length": 6.7, "sepal-width": 3.0, "petal-length": 5.2, "petal-width": 2.3} +124 {"sepal-length": 5.5, "sepal-width": 2.5, "petal-length": 4.0, "petal-width": 1.3} +125 {"sepal-length": 5.6, "sepal-width": 3.0, "petal-length": 4.5, "petal-width": 1.5} +126 {"sepal-length": 6.6, "sepal-width": 3.0, "petal-length": 4.4, "petal-width": 1.4} +127 {"sepal-length": 5.1, "sepal-width": 3.8, "petal-length": 1.6, "petal-width": 0.2} +128 {"sepal-length": 5.9, "sepal-width": 3.0, "petal-length": 5.1, "petal-width": 1.8} +129 {"sepal-length": 6.2, "sepal-width": 3.4, "petal-length": 5.4, "petal-width": 2.3} +130 {"sepal-length": 5.6, "sepal-width": 2.8, "petal-length": 4.9, "petal-width": 2.0} +131 {"sepal-length": 5.7, "sepal-width": 2.9, "petal-length": 4.2, "petal-width": 1.3} +132 {"sepal-length": 6.2, "sepal-width": 2.9, "petal-length": 4.3, "petal-width": 1.3} +133 {"sepal-length": 6.0, "sepal-width": 3.4, "petal-length": 4.5, "petal-width": 1.6} +134 {"sepal-length": 5.4, "sepal-width": 3.9, "petal-length": 1.7, "petal-width": 0.4} +135 {"sepal-length": 6.3, "sepal-width": 3.3, "petal-length": 6.0, "petal-width": 2.5} +136 {"sepal-length": 6.5, "sepal-width": 3.2, "petal-length": 5.1, "petal-width": 2.0} +137 {"sepal-length": 5.1, "sepal-width": 2.5, "petal-length": 3.0, "petal-width": 1.1} +138 {"sepal-length": 4.3, "sepal-width": 3.0, "petal-length": 1.1, "petal-width": 0.1} +139 {"sepal-length": 5.7, "sepal-width": 2.5, "petal-length": 5.0, "petal-width": 2.0} +140 {"sepal-length": 6.0, "sepal-width": 2.2, "petal-length": 5.0, "petal-width": 1.5} +141 {"sepal-length": 6.4, "sepal-width": 3.2, "petal-length": 5.3, "petal-width": 2.3} +142 {"sepal-length": 6.5, "sepal-width": 2.8, "petal-length": 4.6, "petal-width": 1.5} +143 {"sepal-length": 5.5, "sepal-width": 3.5, "petal-length": 1.3, "petal-width": 0.2} +144 {"sepal-length": 4.7, "sepal-width": 3.2, "petal-length": 1.3, "petal-width": 0.2} +145 {"sepal-length": 4.6, "sepal-width": 3.4, "petal-length": 1.4, "petal-width": 0.3} +146 {"sepal-length": 5.7, "sepal-width": 2.6, "petal-length": 3.5, "petal-width": 1.0} +147 {"sepal-length": 5.8, "sepal-width": 2.8, "petal-length": 5.1, "petal-width": 2.4} +148 {"sepal-length": 7.7, "sepal-width": 2.8, "petal-length": 6.7, "petal-width": 2.0} +149 {"sepal-length": 6.3, "sepal-width": 2.9, "petal-length": 5.6, "petal-width": 1.8} diff --git a/creg/test_data/iris.testresp b/creg/test_data/iris.testresp new file mode 100644 index 00000000..0952e4da --- /dev/null +++ b/creg/test_data/iris.testresp @@ -0,0 +1,50 @@ +100 Iris-virginica +101 Iris-virginica +102 Iris-setosa +103 Iris-setosa +104 Iris-setosa +105 Iris-setosa +106 Iris-versicolor +107 Iris-versicolor +108 Iris-setosa +109 Iris-versicolor +110 Iris-versicolor +111 Iris-virginica +112 Iris-setosa +113 Iris-versicolor +114 Iris-setosa +115 Iris-virginica +116 Iris-setosa +117 Iris-versicolor +118 Iris-virginica +119 Iris-virginica +120 Iris-versicolor +121 Iris-setosa +122 Iris-setosa +123 Iris-virginica +124 Iris-versicolor +125 Iris-versicolor +126 Iris-versicolor +127 Iris-setosa +128 Iris-virginica +129 Iris-virginica +130 Iris-virginica +131 Iris-versicolor +132 Iris-versicolor +133 Iris-versicolor +134 Iris-setosa +135 Iris-virginica +136 Iris-virginica +137 Iris-versicolor +138 Iris-setosa +139 Iris-virginica +140 Iris-virginica +141 Iris-virginica +142 Iris-versicolor +143 Iris-setosa +144 Iris-setosa +145 Iris-setosa +146 Iris-versicolor +147 Iris-virginica +148 Iris-virginica +149 Iris-virginica diff --git a/creg/test_data/iris.trainfeat b/creg/test_data/iris.trainfeat new file mode 100644 index 00000000..a930a446 --- /dev/null +++ b/creg/test_data/iris.trainfeat @@ -0,0 +1,100 @@ +0 {"sepal-length": 5.4, "sepal-width": 3.0, "petal-length": 4.5, "petal-width": 1.5} +1 {"sepal-length": 5.0, "sepal-width": 3.4, "petal-length": 1.6, "petal-width": 0.4} +2 {"sepal-length": 5.0, "sepal-width": 3.3, "petal-length": 1.4, "petal-width": 0.2} +3 {"sepal-length": 5.7, "sepal-width": 2.8, "petal-length": 4.5, "petal-width": 1.3} +4 {"sepal-length": 6.4, "sepal-width": 3.1, "petal-length": 5.5, "petal-width": 1.8} +5 {"sepal-length": 7.9, "sepal-width": 3.8, "petal-length": 6.4, "petal-width": 2.0} +6 {"sepal-length": 5.9, "sepal-width": 3.2, "petal-length": 4.8, "petal-width": 1.8} +7 {"sepal-length": 6.7, "sepal-width": 2.5, "petal-length": 5.8, "petal-width": 1.8} +8 {"sepal-length": 6.7, "sepal-width": 3.1, "petal-length": 4.4, "petal-width": 1.4} +9 {"sepal-length": 6.3, "sepal-width": 2.5, "petal-length": 4.9, "petal-width": 1.5} +10 {"sepal-length": 6.1, "sepal-width": 2.9, "petal-length": 4.7, "petal-width": 1.4} +11 {"sepal-length": 6.3, "sepal-width": 3.3, "petal-length": 4.7, "petal-width": 1.6} +12 {"sepal-length": 6.7, "sepal-width": 3.1, "petal-length": 4.7, "petal-width": 1.5} +13 {"sepal-length": 6.2, "sepal-width": 2.8, "petal-length": 4.8, "petal-width": 1.8} +14 {"sepal-length": 5.0, "sepal-width": 3.5, "petal-length": 1.3, "petal-width": 0.3} +15 {"sepal-length": 5.4, "sepal-width": 3.9, "petal-length": 1.3, "petal-width": 0.4} +16 {"sepal-length": 7.4, "sepal-width": 2.8, "petal-length": 6.1, "petal-width": 1.9} +17 {"sepal-length": 7.2, "sepal-width": 3.2, "petal-length": 6.0, "petal-width": 1.8} +18 {"sepal-length": 5.7, "sepal-width": 3.8, "petal-length": 1.7, "petal-width": 0.3} +19 {"sepal-length": 4.5, "sepal-width": 2.3, "petal-length": 1.3, "petal-width": 0.3} +20 {"sepal-length": 5.6, "sepal-width": 3.0, "petal-length": 4.1, "petal-width": 1.3} +21 {"sepal-length": 6.8, "sepal-width": 3.0, "petal-length": 5.5, "petal-width": 2.1} +22 {"sepal-length": 6.5, "sepal-width": 3.0, "petal-length": 5.8, "petal-width": 2.2} +23 {"sepal-length": 4.4, "sepal-width": 3.2, "petal-length": 1.3, "petal-width": 0.2} +24 {"sepal-length": 6.3, "sepal-width": 2.5, "petal-length": 5.0, "petal-width": 1.9} +25 {"sepal-length": 4.4, "sepal-width": 2.9, "petal-length": 1.4, "petal-width": 0.2} +26 {"sepal-length": 4.9, "sepal-width": 3.0, "petal-length": 1.4, "petal-width": 0.2} +27 {"sepal-length": 5.4, "sepal-width": 3.4, "petal-length": 1.5, "petal-width": 0.4} +28 {"sepal-length": 5.8, "sepal-width": 2.7, "petal-length": 3.9, "petal-width": 1.2} +29 {"sepal-length": 5.6, "sepal-width": 2.5, "petal-length": 3.9, "petal-width": 1.1} +30 {"sepal-length": 5.1, "sepal-width": 3.5, "petal-length": 1.4, "petal-width": 0.3} +31 {"sepal-length": 5.6, "sepal-width": 2.7, "petal-length": 4.2, "petal-width": 1.3} +32 {"sepal-length": 5.1, "sepal-width": 3.5, "petal-length": 1.4, "petal-width": 0.2} +33 {"sepal-length": 6.4, "sepal-width": 2.7, "petal-length": 5.3, "petal-width": 1.9} +34 {"sepal-length": 5.8, "sepal-width": 4.0, "petal-length": 1.2, "petal-width": 0.2} +35 {"sepal-length": 5.2, "sepal-width": 3.4, "petal-length": 1.4, "petal-width": 0.2} +36 {"sepal-length": 7.6, "sepal-width": 3.0, "petal-length": 6.6, "petal-width": 2.1} +37 {"sepal-length": 5.8, "sepal-width": 2.7, "petal-length": 5.1, "petal-width": 1.9} +38 {"sepal-length": 6.0, "sepal-width": 2.2, "petal-length": 4.0, "petal-width": 1.0} +39 {"sepal-length": 7.7, "sepal-width": 3.0, "petal-length": 6.1, "petal-width": 2.3} +40 {"sepal-length": 5.1, "sepal-width": 3.7, "petal-length": 1.5, "petal-width": 0.4} +41 {"sepal-length": 6.1, "sepal-width": 2.6, "petal-length": 5.6, "petal-width": 1.4} +42 {"sepal-length": 6.7, "sepal-width": 3.1, "petal-length": 5.6, "petal-width": 2.4} +43 {"sepal-length": 7.7, "sepal-width": 3.8, "petal-length": 6.7, "petal-width": 2.2} +44 {"sepal-length": 5.1, "sepal-width": 3.3, "petal-length": 1.7, "petal-width": 0.5} +45 {"sepal-length": 6.3, "sepal-width": 2.8, "petal-length": 5.1, "petal-width": 1.5} +46 {"sepal-length": 5.0, "sepal-width": 2.0, "petal-length": 3.5, "petal-width": 1.0} +47 {"sepal-length": 5.1, "sepal-width": 3.8, "petal-length": 1.5, "petal-width": 0.3} +48 {"sepal-length": 4.9, "sepal-width": 3.1, "petal-length": 1.5, "petal-width": 0.1} +49 {"sepal-length": 6.1, "sepal-width": 3.0, "petal-length": 4.9, "petal-width": 1.8} +50 {"sepal-length": 6.4, "sepal-width": 2.8, "petal-length": 5.6, "petal-width": 2.1} +51 {"sepal-length": 6.5, "sepal-width": 3.0, "petal-length": 5.5, "petal-width": 1.8} +52 {"sepal-length": 6.1, "sepal-width": 2.8, "petal-length": 4.7, "petal-width": 1.2} +53 {"sepal-length": 6.1, "sepal-width": 2.8, "petal-length": 4.0, "petal-width": 1.3} +54 {"sepal-length": 4.9, "sepal-width": 3.1, "petal-length": 1.5, "petal-width": 0.1} +55 {"sepal-length": 6.8, "sepal-width": 2.8, "petal-length": 4.8, "petal-width": 1.4} +56 {"sepal-length": 6.3, "sepal-width": 2.7, "petal-length": 4.9, "petal-width": 1.8} +57 {"sepal-length": 4.6, "sepal-width": 3.2, "petal-length": 1.4, "petal-width": 0.2} +58 {"sepal-length": 6.3, "sepal-width": 3.4, "petal-length": 5.6, "petal-width": 2.4} +59 {"sepal-length": 5.7, "sepal-width": 4.4, "petal-length": 1.5, "petal-width": 0.4} +60 {"sepal-length": 6.4, "sepal-width": 2.9, "petal-length": 4.3, "petal-width": 1.3} +61 {"sepal-length": 7.2, "sepal-width": 3.6, "petal-length": 6.1, "petal-width": 2.5} +62 {"sepal-length": 5.8, "sepal-width": 2.7, "petal-length": 4.1, "petal-width": 1.0} +63 {"sepal-length": 6.0, "sepal-width": 3.0, "petal-length": 4.8, "petal-width": 1.8} +64 {"sepal-length": 4.7, "sepal-width": 3.2, "petal-length": 1.6, "petal-width": 0.2} +65 {"sepal-length": 6.9, "sepal-width": 3.2, "petal-length": 5.7, "petal-width": 2.3} +66 {"sepal-length": 6.4, "sepal-width": 3.2, "petal-length": 4.5, "petal-width": 1.5} +67 {"sepal-length": 6.9, "sepal-width": 3.1, "petal-length": 5.4, "petal-width": 2.1} +68 {"sepal-length": 5.2, "sepal-width": 3.5, "petal-length": 1.5, "petal-width": 0.2} +69 {"sepal-length": 5.3, "sepal-width": 3.7, "petal-length": 1.5, "petal-width": 0.2} +70 {"sepal-length": 5.5, "sepal-width": 2.4, "petal-length": 3.8, "petal-width": 1.1} +71 {"sepal-length": 4.8, "sepal-width": 3.4, "petal-length": 1.9, "petal-width": 0.2} +72 {"sepal-length": 5.7, "sepal-width": 2.8, "petal-length": 4.1, "petal-width": 1.3} +73 {"sepal-length": 4.9, "sepal-width": 2.4, "petal-length": 3.3, "petal-width": 1.0} +74 {"sepal-length": 6.2, "sepal-width": 2.2, "petal-length": 4.5, "petal-width": 1.5} +75 {"sepal-length": 6.7, "sepal-width": 3.3, "petal-length": 5.7, "petal-width": 2.5} +76 {"sepal-length": 6.1, "sepal-width": 3.0, "petal-length": 4.6, "petal-width": 1.4} +77 {"sepal-length": 4.6, "sepal-width": 3.6, "petal-length": 1.0, "petal-width": 0.2} +78 {"sepal-length": 7.0, "sepal-width": 3.2, "petal-length": 4.7, "petal-width": 1.4} +79 {"sepal-length": 6.6, "sepal-width": 2.9, "petal-length": 4.6, "petal-width": 1.3} +80 {"sepal-length": 5.4, "sepal-width": 3.7, "petal-length": 1.5, "petal-width": 0.2} +81 {"sepal-length": 4.8, "sepal-width": 3.0, "petal-length": 1.4, "petal-width": 0.3} +82 {"sepal-length": 7.2, "sepal-width": 3.0, "petal-length": 5.8, "petal-width": 1.6} +83 {"sepal-length": 7.1, "sepal-width": 3.0, "petal-length": 5.9, "petal-width": 2.1} +84 {"sepal-length": 6.9, "sepal-width": 3.1, "petal-length": 4.9, "petal-width": 1.5} +85 {"sepal-length": 4.8, "sepal-width": 3.0, "petal-length": 1.4, "petal-width": 0.1} +86 {"sepal-length": 7.3, "sepal-width": 2.9, "petal-length": 6.3, "petal-width": 1.8} +87 {"sepal-length": 6.0, "sepal-width": 2.7, "petal-length": 5.1, "petal-width": 1.6} +88 {"sepal-length": 6.8, "sepal-width": 3.2, "petal-length": 5.9, "petal-width": 2.3} +89 {"sepal-length": 4.6, "sepal-width": 3.1, "petal-length": 1.5, "petal-width": 0.2} +90 {"sepal-length": 4.8, "sepal-width": 3.1, "petal-length": 1.6, "petal-width": 0.2} +91 {"sepal-length": 5.0, "sepal-width": 2.3, "petal-length": 3.3, "petal-width": 1.0} +92 {"sepal-length": 6.9, "sepal-width": 3.1, "petal-length": 5.1, "petal-width": 2.3} +93 {"sepal-length": 5.7, "sepal-width": 3.0, "petal-length": 4.2, "petal-width": 1.2} +94 {"sepal-length": 5.1, "sepal-width": 3.8, "petal-length": 1.9, "petal-width": 0.4} +95 {"sepal-length": 6.0, "sepal-width": 2.9, "petal-length": 4.5, "petal-width": 1.5} +96 {"sepal-length": 4.8, "sepal-width": 3.4, "petal-length": 1.6, "petal-width": 0.2} +97 {"sepal-length": 5.2, "sepal-width": 4.1, "petal-length": 1.5, "petal-width": 0.1} +98 {"sepal-length": 5.6, "sepal-width": 2.9, "petal-length": 3.6, "petal-width": 1.3} +99 {"sepal-length": 5.8, "sepal-width": 2.6, "petal-length": 4.0, "petal-width": 1.2} diff --git a/creg/test_data/iris.trainresp b/creg/test_data/iris.trainresp new file mode 100644 index 00000000..d77bc6a2 --- /dev/null +++ b/creg/test_data/iris.trainresp @@ -0,0 +1,100 @@ +0 Iris-versicolor +1 Iris-setosa +2 Iris-setosa +3 Iris-versicolor +4 Iris-virginica +5 Iris-virginica +6 Iris-versicolor +7 Iris-virginica +8 Iris-versicolor +9 Iris-versicolor +10 Iris-versicolor +11 Iris-versicolor +12 Iris-versicolor +13 Iris-virginica +14 Iris-setosa +15 Iris-setosa +16 Iris-virginica +17 Iris-virginica +18 Iris-setosa +19 Iris-setosa +20 Iris-versicolor +21 Iris-virginica +22 Iris-virginica +23 Iris-setosa +24 Iris-virginica +25 Iris-setosa +26 Iris-setosa +27 Iris-setosa +28 Iris-versicolor +29 Iris-versicolor +30 Iris-setosa +31 Iris-versicolor +32 Iris-setosa +33 Iris-virginica +34 Iris-setosa +35 Iris-setosa +36 Iris-virginica +37 Iris-virginica +38 Iris-versicolor +39 Iris-virginica +40 Iris-setosa +41 Iris-virginica +42 Iris-virginica +43 Iris-virginica +44 Iris-setosa +45 Iris-virginica +46 Iris-versicolor +47 Iris-setosa +48 Iris-setosa +49 Iris-virginica +50 Iris-virginica +51 Iris-virginica +52 Iris-versicolor +53 Iris-versicolor +54 Iris-setosa +55 Iris-versicolor +56 Iris-virginica +57 Iris-setosa +58 Iris-virginica +59 Iris-setosa +60 Iris-versicolor +61 Iris-virginica +62 Iris-versicolor +63 Iris-virginica +64 Iris-setosa +65 Iris-virginica +66 Iris-versicolor +67 Iris-virginica +68 Iris-setosa +69 Iris-setosa +70 Iris-versicolor +71 Iris-setosa +72 Iris-versicolor +73 Iris-versicolor +74 Iris-versicolor +75 Iris-virginica +76 Iris-versicolor +77 Iris-setosa +78 Iris-versicolor +79 Iris-versicolor +80 Iris-setosa +81 Iris-setosa +82 Iris-virginica +83 Iris-virginica +84 Iris-versicolor +85 Iris-setosa +86 Iris-virginica +87 Iris-versicolor +88 Iris-virginica +89 Iris-setosa +90 Iris-setosa +91 Iris-versicolor +92 Iris-virginica +93 Iris-versicolor +94 Iris-setosa +95 Iris-versicolor +96 Iris-setosa +97 Iris-setosa +98 Iris-versicolor +99 Iris-versicolor diff --git a/training/Makefile.am b/training/Makefile.am index 4b69ea94..991ac210 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -1,6 +1,5 @@ bin_PROGRAMS = \ model1 \ - creg \ lbl_model \ test_ngram \ mr_em_map_adapter \ @@ -24,9 +23,6 @@ noinst_PROGRAMS = \ TESTS = lbfgs_test optimize_test -creg_SOURCES = creg.cc -creg_LDADD = ./liblbfgs/liblbfgs.a $(top_srcdir)/utils/libutils.a -lz - mpi_online_optimize_SOURCES = mpi_online_optimize.cc online_optimizer.cc mpi_online_optimize_LDADD = $(top_srcdir)/decoder/libcdec.a $(top_srcdir)/mteval/libmteval.a $(top_srcdir)/utils/libutils.a ../klm/lm/libklm.a ../klm/util/libklm_util.a -lz diff --git a/training/creg.cc b/training/creg.cc deleted file mode 100644 index 58adea00..00000000 --- a/training/creg.cc +++ /dev/null @@ -1,334 +0,0 @@ -#include -#include -#include -#include - -#include -#include - -#include "json_feature_map_lexer.h" -#include "prob.h" -#include "filelib.h" -#include "weights.h" -#include "sparse_vector.h" -#include "liblbfgs/lbfgs++.h" - -using namespace std; -using namespace std::tr1; -namespace po = boost::program_options; - -void InitCommandLine(int argc, char** argv, po::variables_map* conf) { - po::options_description opts("Configuration options"); - opts.add_options() - ("training_features,x", po::value(), "File containing training instance features (ARKRegression format)") - ("training_responses,y", po::value(), "File containing training response features (ARKRegression format)") - ("linear,n", "Linear (rather than logistic) regression") - ("l1",po::value()->default_value(0.0), "l_1 regularization strength") - ("l2",po::value()->default_value(0.0), "l_2 regularization strength") - ("weights,w", po::value(), "Initial weights") - ("epsilon,e", po::value()->default_value(1e-4), "Epsilon for convergence test. Terminates when ||g|| < epsilon * max(1, ||x||)") - ("memory_buffers,m",po::value()->default_value(40), "Number of memory buffers for LBFGS") - ("help,h", "Help"); - po::options_description dcmdline_options; - dcmdline_options.add(opts); - po::store(parse_command_line(argc, argv, dcmdline_options), *conf); - if (conf->count("help") || !conf->count("training_features") || !conf->count("training_responses")) { - cerr << dcmdline_options << endl; - exit(1); - } -} - -struct TrainingInstance { - SparseVector x; - union { - unsigned label; // for categorical predictions - float value; // for continuous predictions - } y; -}; - -struct ReaderHelper { - explicit ReaderHelper(vector* xyp) : xy_pairs(xyp), lc(), flag() {} - unordered_map id2ind; - vector* xy_pairs; - int lc; - bool flag; -}; - -void ReaderCB(const string& id, const SparseVector& fmap, void* extra) { - ReaderHelper& rh = *reinterpret_cast(extra); - ++rh.lc; - if (rh.lc % 1000 == 0) { cerr << '.'; rh.flag = true; } - if (rh.lc % 40000 == 0) { cerr << " [" << rh.lc << "]\n"; rh.flag = false; } - const unordered_map::iterator it = rh.id2ind.find(id); - if (it == rh.id2ind.end()) { - cerr << "Unlabeled example in line " << rh.lc << endl; - abort(); - } - (*rh.xy_pairs)[it->second - 1].x = fmap; -} - -void ReadLabeledInstances(const string& ffeats, - const string& fresp, - const bool is_continuous, - vector* xy_pairs, - vector* labels) { - bool flag = false; - xy_pairs->clear(); - int lc = 0; - ReaderHelper rh(xy_pairs); - unordered_map label2id; - cerr << "Reading training responses from " << fresp << " ..." << endl; - ReadFile fr(fresp); - for (unsigned i = 0; i < labels->size(); ++i) - label2id[(*labels)[i]] = i; - istream& in = *fr.stream(); - string line; - while(getline(in, line)) { - ++lc; - if (lc % 1000 == 0) { cerr << '.'; flag = true; } - if (lc % 40000 == 0) { cerr << " [" << lc << "]\n"; flag = false; } - if (line.size() == 0) continue; - if (line[0] == '#') continue; - unsigned p = 0; - while (p < line.size() && line[p] != ' ' && line[p] != '\t') { ++p; } - unsigned& ind = rh.id2ind[line.substr(0, p)]; - if (ind != 0) { cerr << "ID " << line.substr(0, p) << " duplicated in line " << lc << endl; abort(); } - while (p < line.size() && (line[p] == ' ' || line[p] == '\t')) { ++p; } - assert(p < line.size()); - xy_pairs->push_back(TrainingInstance()); - ind = xy_pairs->size(); - if (is_continuous) { - xy_pairs->back().y.value = strtof(&line[p], 0); - } else { // categorical predictions - unordered_map::iterator it = label2id.find(line.substr(p)); - if (it == label2id.end()) { - const string label = line.substr(p); - it = label2id.insert(make_pair(label, labels->size())).first; - labels->push_back(label); - } - xy_pairs->back().y.label = it->second; // label id - } - } - if (flag) cerr << endl; - if (!is_continuous) { - cerr << "LABELS:"; - for (unsigned j = 0; j < labels->size(); ++j) - cerr << " " << (*labels)[j]; - cerr << endl; - } - cerr << "Reading training features from " << ffeats << " ..." << endl; - ReadFile ff(ffeats); - JSONFeatureMapLexer::ReadRules(ff.stream(), ReaderCB, &rh); - if (rh.flag) cerr << endl; -} - -// helper base class (not polymorphic- just a container and some helper functions) for loss functions -// real loss functions should implement double operator()(const vector& x, double* g), -// which should evaluate f(x) and g = f'(x) -struct BaseLoss { - // dimp1 = number of categorial outputs possible for logistic regression - // for linear regression, it should be 1 more than the dimension of the response variable - BaseLoss( - const vector& tr, - unsigned dimp1, - unsigned numfeats, - unsigned ll2) : training(tr), K(dimp1), p(numfeats), l2(ll2) {} - - // weight vector layout for K classes, with p features - // w[0 : K-1] = bias weights - // w[y*p + K : y*p + K + p - 1] = feature weights for y^th class - // this representation is used in ComputeDotProducts and GradAdd - void ComputeDotProducts(const SparseVector& fx, // feature vector of x - const vector& w, // full weight vector - vector* pdotprods) const { - vector& dotprods = *pdotprods; - const unsigned km1 = K - 1; - dotprods.resize(km1); - for (unsigned y = 0; y < km1; ++y) - dotprods[y] = w[y]; // bias terms - for (SparseVector::const_iterator it = fx.begin(); it != fx.end(); ++it) { - const float fval = it->second; - const unsigned fid = it->first; - for (unsigned y = 0; y < km1; ++y) - dotprods[y] += w[fid + y * p + km1] * fval; - } - } - - double ApplyRegularizationTerms(const vector& weights, - double* g) const { - double reg = 0; - for (size_t i = K - 1; i < weights.size(); ++i) { - const double& w_i = weights[i]; - reg += l2 * w_i * w_i; - g[i] += 2 * l2 * w_i; - } - return reg; - } - - void GradAdd(const SparseVector& fx, - const unsigned y, - const double scale, - double* acc) const { - acc[y] += scale; // class bias - for (SparseVector::const_iterator it = fx.begin(); - it != fx.end(); ++it) - acc[it->first + y * p + K - 1] += it->second * scale; - } - - const vector& training; - const unsigned K, p; - const double l2; -}; - -struct UnivariateSquaredLoss : public BaseLoss { - UnivariateSquaredLoss( - const vector& tr, - unsigned numfeats, - const double l2) : BaseLoss(tr, 2, numfeats, l2) {} - - // evaluate squared loss and gradient - double operator()(const vector& x, double* g) const { - fill(g, g + x.size(), 0.0); - double cll = 0; - vector dotprods(1); // univariate prediction - for (int i = 0; i < training.size(); ++i) { - const SparseVector& fmapx = training[i].x; - const double refy = training[i].y.value; - ComputeDotProducts(fmapx, x, &dotprods); - double diff = dotprods[0] - refy; - cll += diff * diff; - - double scale = 2 * diff; - GradAdd(fmapx, 0, scale, g); - } - double reg = ApplyRegularizationTerms(x, g); - return cll + reg; - } -}; - -struct MulticlassLogLoss : public BaseLoss { - MulticlassLogLoss( - const vector& tr, - unsigned k, - unsigned numfeats, - const double l2) : BaseLoss(tr, k, numfeats, l2) {} - - // evaluate log loss and gradient - double operator()(const vector& x, double* g) const { - fill(g, g + x.size(), 0.0); - vector dotprods(K - 1); // K-1 degrees of freedom - vector probs(K); - double cll = 0; - for (int i = 0; i < training.size(); ++i) { - const SparseVector& fmapx = training[i].x; - const unsigned refy = training[i].y.label; - //cerr << "FMAP: " << fmapx << endl; - ComputeDotProducts(fmapx, x, &dotprods); - prob_t z; - for (unsigned j = 0; j < dotprods.size(); ++j) - z += (probs[j] = prob_t(dotprods[j], init_lnx())); - z += (probs.back() = prob_t::One()); - for (unsigned y = 0; y < probs.size(); ++y) { - probs[y] /= z; - //cerr << " p(y=" << y << ")=" << probs[y].as_float() << "\tz=" << z << endl; - } - cll -= log(probs[refy]); // log p(y | x) - - for (unsigned y = 0; y < dotprods.size(); ++y) { - double scale = probs[y].as_float(); - if (y == refy) { scale -= 1.0; } - GradAdd(fmapx, y, scale, g); - } - } - double reg = ApplyRegularizationTerms(x, g); - return cll + reg; - } -}; - -template -double LearnParameters(LossFunction& loss, - const double l1, - const unsigned l1_start, - const unsigned memory_buffers, - const double eps, - vector* px) { - LBFGS lbfgs(px, loss, memory_buffers, l1, l1_start, eps); - lbfgs.MinimizeFunction(); - return 0; -} - -int main(int argc, char** argv) { - po::variables_map conf; - InitCommandLine(argc, argv, &conf); - string line; - vector training; - const string xfile = conf["training_features"].as(); - const string yfile = conf["training_responses"].as(); - double l1 = conf["l1"].as(); - double l2 = conf["l2"].as(); - const unsigned memory_buffers = conf["memory_buffers"].as(); - const double epsilon = conf["epsilon"].as(); - if (l1 < 0.0) { - cerr << "L1 strength must be >= 0\n"; - return 1; - } - if (l2 < 0.0) { - cerr << "L2 strength must be >= 0\n"; - return 2; - } - - const bool is_continuous = conf.count("linear"); - vector labels; // only populated for non-continuous models - ReadLabeledInstances(xfile, yfile, is_continuous, &training, &labels); - - if (conf.count("weights")) { - cerr << "Initial weights are not implemented, please implement." << endl; - // TODO read weights for categorical and continuous predictions - // can't use normal cdec weight framework - abort(); - } - - cerr << " Number of features: " << FD::NumFeats() << endl; - cerr << "Number of training examples: " << training.size() << endl; - const unsigned p = FD::NumFeats(); - cout.precision(15); - - if (conf.count("linear")) { // linear regression - vector weights(1 + FD::NumFeats(), 0.0); - cerr << " Number of parameters: " << weights.size() << endl; - UnivariateSquaredLoss loss(training, p, l2); - LearnParameters(loss, l1, 1, memory_buffers, epsilon, &weights); - cout << p << "\t***CONTINUOUS***" << endl; - cout << "***BIAS***\t" << weights[0] << endl; - for (unsigned f = 0; f < p; ++f) { - const double w = weights[1 + f]; - if (w) - cout << FD::Convert(f) << "\t" << w << endl; - } - } else { // logistic regression - vector weights((1 + FD::NumFeats()) * (labels.size() - 1), 0.0); - cerr << " Number of parameters: " << weights.size() << endl; - cerr << " Number of labels: " << labels.size() << endl; - const unsigned K = labels.size(); - const unsigned km1 = K - 1; - MulticlassLogLoss loss(training, K, p, l2); - LearnParameters(loss, l1, km1, memory_buffers, epsilon, &weights); - - cout << p << "\t***CATEGORICAL***"; - for (unsigned y = 0; y < K; ++y) - cout << '\t' << labels[y]; - cout << endl; - for (unsigned y = 0; y < km1; ++y) - cout << labels[y] << "\t***BIAS***\t" << weights[y] << endl; - for (unsigned y = 0; y < km1; ++y) { - for (unsigned f = 0; f < p; ++f) { - const double w = weights[km1 + y * p + f]; - if (w) - cout << labels[y] << "\t" << FD::Convert(f) << "\t" << w << endl; - } - } - } - - return 0; -} - diff --git a/training/liblbfgs/lbfgs++.h b/training/liblbfgs/lbfgs++.h index 92ead955..2b40c19b 100644 --- a/training/liblbfgs/lbfgs++.h +++ b/training/liblbfgs/lbfgs++.h @@ -90,6 +90,7 @@ class LBFGS { lbfgsfloatval_t *g, const int n, const lbfgsfloatval_t step) { + (void) x; (void) n; (void) step; if (!silence) { ec++; std::cerr << '.'; } diff --git a/utils/Makefile.am b/utils/Makefile.am index b7da0f06..46650c75 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -33,16 +33,12 @@ libutils_a_SOURCES = \ sparse_vector.cc \ timing_stats.cc \ verbose.cc \ - json_feature_map_lexer.cc \ weights.cc if HAVE_CMPH libutils_a_SOURCES += perfect_hash.cc endif -json_feature_map_lexer.cc: json_feature_map_lexer.ll - $(LEX) -s -8 -CF -o$@ $< - phmt_SOURCES = phmt.cc ts_SOURCES = ts.cc m_test_SOURCES = m_test.cc diff --git a/utils/json_feature_map_lexer.h b/utils/json_feature_map_lexer.h deleted file mode 100644 index 3324aa29..00000000 --- a/utils/json_feature_map_lexer.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef _RULE_LEXER_H_ -#define _RULE_LEXER_H_ - -#include -#include - -#include "sparse_vector.h" - -struct JSONFeatureMapLexer { - typedef void (*FeatureMapCallback)(const std::string& id, const SparseVector& fmap, void* extra); - static void ReadRules(std::istream* in, FeatureMapCallback func, void* extra); -}; - -#endif - diff --git a/utils/json_feature_map_lexer.ll b/utils/json_feature_map_lexer.ll deleted file mode 100644 index 372b52f5..00000000 --- a/utils/json_feature_map_lexer.ll +++ /dev/null @@ -1,132 +0,0 @@ -%option nounput -%{ - -#include "json_feature_map_lexer.h" -#include "fdict.h" -#include "fast_sparse_vector.h" - -#define YY_DECL int json_fmap_yylex (void) -#undef YY_INPUT -#define YY_INPUT(buf, result, max_size) (result = jfmap_stream->read(buf, max_size).gcount()) -#define YY_SKIP_YYWRAP 1 -int yywrap() { return 1; } - -JSONFeatureMapLexer::FeatureMapCallback json_fmap_callback = NULL; -void* json_fmap_callback_extra = NULL; -std::istream* jfmap_stream = NULL; -bool fl = true; -unsigned spos = 0; -char featname[16000]; -#define MAX_FEATS 20000 -std::pair featmap[MAX_FEATS]; -unsigned curfeat = 0; -std::string instid; - -inline unsigned unicode_escape_to_utf8(uint16_t w1, uint16_t w2, char* putf8) { - uint32_t cp; - if((w1 & 0xfc00) == 0xd800) { - if((w2 & 0xfc00) == 0xdc00) { - cp = 0x10000 + (((static_cast(w1) & 0x3ff) << 10) | (w2 & 0x3ff)); - } else { - abort(); - } - } else { - cp = w1; - } - - - if(cp < 0x80) { - putf8[0] = static_cast(cp); - return 1; - } else if(cp < 0x0800) { - putf8[0] = 0xc0 | ((cp >> 6) & 0x1f); - putf8[1] = 0x80 | (cp & 0x3f); - return 2; - } else if(cp < 0x10000) { - putf8[0] = 0xe0 | ((cp >> 6) & 0x0f); - putf8[1] = 0x80 | ((cp >> 6) & 0x3f); - putf8[2] = 0x80 | (cp & 0x3f); - return 3; - } else if(cp < 0x1fffff) { - putf8[0] = 0xf0 | ((cp >> 18) & 0x07); - putf8[1] = 0x80 | ((cp >> 12) & 0x3f); - putf8[2] = 0x80 | ((cp >> 6) & 0x3f); - putf8[3] = 0x80 | (cp & 0x3f); - return 4; - } else { - abort(); - } - return 0; -} - -%} - -ID [A-Za-z_0-9]+ -HEX_D [a-fA-F0-9] -INT [-]?[0-9]+ -DOUBLE {INT}((\.[0-9]+)?([eE][-+]?[0-9]+)?) -WS [ \t\r\n] -LCB [{] -RCB [}] -UNESCAPED_CH [^\"\\\b\n\r\f\t] - -%x JSON PREVAL STRING JSONVAL POSTVAL DOUBLE -%% - -{ID} { instid = yytext; BEGIN(JSON); } - -{WS}*{LCB}{WS}* { BEGIN(PREVAL); } - -\" { BEGIN(STRING); spos=0; } - -\" { featname[spos] = 0; - featmap[curfeat].first = FD::Convert(featname); - BEGIN(JSONVAL); - } -{UNESCAPED_CH} { featname[spos++] = yytext[0]; } -\\\" { featname[spos++] = '"'; } -\\\\ { featname[spos++] = '\\'; } -\\\/ { featname[spos++] = '/'; } -\\b { } -\\f { } -\\n { } -\\r { } -\\t { } -\\u{HEX_D}{HEX_D}{HEX_D}{HEX_D} { abort(); - } - -{WS}*:{WS}* { BEGIN(DOUBLE); } -{DOUBLE} { featmap[curfeat++].second = strtod(yytext, 0); - BEGIN(POSTVAL); } - -{WS}*,{WS}* { BEGIN(PREVAL); } -{WS}*{RCB}\n* { - const SparseVector x(&featmap[0], &featmap[curfeat]); - json_fmap_callback(instid, x, json_fmap_callback_extra); - curfeat = 0; - BEGIN(INITIAL); - } - -. { std::cerr << "bad input: " << yytext << std::endl; abort(); } - -%% - -void JSONFeatureMapLexer::ReadRules(std::istream* in, FeatureMapCallback func, void* extra) { - json_fmap_callback = func; - json_fmap_callback_extra = extra; - jfmap_stream = in; - json_fmap_yylex(); -} - -#if 0 -void cb(const std::string& id, const SparseVector& fmap, void* extra) { - (void) extra; - static int cc = 0; - cc++; -} - -int main() { - JSONFeatureMapLexer::ReadRules(&std::cin, cb, NULL); -} -#endif - -- cgit v1.2.3