diff options
Diffstat (limited to 'utils/synutils/maxent-3.0/maxent.h')
-rw-r--r-- | utils/synutils/maxent-3.0/maxent.h | 402 |
1 files changed, 0 insertions, 402 deletions
diff --git a/utils/synutils/maxent-3.0/maxent.h b/utils/synutils/maxent-3.0/maxent.h deleted file mode 100644 index 88a04e25..00000000 --- a/utils/synutils/maxent-3.0/maxent.h +++ /dev/null @@ -1,402 +0,0 @@ -/* - * $Id: maxent.h,v 1.1.1.1 2007/05/15 08:30:35 kyoshida Exp $ - */ - -#ifndef __MAXENT_H_ -#define __MAXENT_H_ - -#include <string> -#include <vector> -#include <list> -#include <map> -#include <algorithm> -#include <iostream> -#include <string> -#include <cassert> -#include "mathvec.h" - -#define USE_HASH_MAP // if you encounter errors with hash, try commenting out - // this line. (the program will be a bit slower, though) -#ifdef USE_HASH_MAP -#include <ext/hash_map> -#endif - -// -// data format for each sample for training/testing -// -struct ME_Sample { - public: - ME_Sample() : label("") {}; - ME_Sample(const std::string& l) : label(l) {}; - void set_label(const std::string& l) { label = l; } - - // to add a binary feature - void add_feature(const std::string& f) { features.push_back(f); } - - // to add a real-valued feature - void add_feature(const std::string& s, const double d) { - rvfeatures.push_back(std::pair<std::string, double>(s, d)); - } - - public: - std::string label; - std::vector<std::string> features; - std::vector<std::pair<std::string, double> > rvfeatures; - - // obsolete - void add_feature(const std::pair<std::string, double>& f) { - rvfeatures.push_back(f); // real-valued features - } -}; - -// -// for those who want to use load_from_array() -// -typedef struct ME_Model_Data { - char* label; - char* feature; - double weight; -} ME_Model_Data; - -class ME_Model { - public: - void add_training_sample(const ME_Sample& s); - int train(); - std::vector<double> classify(ME_Sample& s) const; - bool load_from_file(const std::string& filename); - bool save_to_file(const std::string& filename, const double th = 0) const; - int num_classes() const { return _num_classes; } - std::string get_class_label(int i) const { return _label_bag.Str(i); } - int get_class_id(const std::string& s) const { return _label_bag.Id(s); } - void get_features( - std::list<std::pair<std::pair<std::string, std::string>, double> >& fl); - void set_heldout(const int h, const int n = 0) { - _nheldout = h; - _early_stopping_n = n; - }; - void use_l1_regularizer(const double v) { _l1reg = v; } - void use_l2_regularizer(const double v) { _l2reg = v; } - void use_SGD(int iter = 30, double eta0 = 1, double alpha = 0.85) { - _optimization_method = SGD; - SGD_ITER = iter; - SGD_ETA0 = eta0; - SGD_ALPHA = alpha; - } - bool load_from_array(const ME_Model_Data data[]); - void set_reference_model(const ME_Model& ref_model) { - _ref_modelp = &ref_model; - }; - void clear(); - - ME_Model() { - _l1reg = _l2reg = 0; - _nheldout = 0; - _early_stopping_n = 0; - _ref_modelp = NULL; - _optimization_method = LBFGS; - } - - public: - // obsolete. just for downward compatibility - int train(const std::vector<ME_Sample>& train); - - private: - enum OPTIMIZATION_METHOD { - LBFGS, - OWLQN, - SGD - } _optimization_method; - // OWLQN and SGD are available only for L1-regularization - - int SGD_ITER; - double SGD_ETA0; - double SGD_ALPHA; - - double _l1reg, _l2reg; - - struct Sample { - int label; - std::vector<int> positive_features; - std::vector<std::pair<int, double> > rvfeatures; - std::vector<double> ref_pd; // reference probability distribution - bool operator<(const Sample& x) const { - for (unsigned int i = 0; i < positive_features.size(); i++) { - if (i >= x.positive_features.size()) return false; - int v0 = positive_features[i]; - int v1 = x.positive_features[i]; - if (v0 < v1) return true; - if (v0 > v1) return false; - } - return false; - } - }; - - struct ME_Feature { - enum { - MAX_LABEL_TYPES = 255 - }; - - // ME_Feature(const int l, const int f) : _body((l << 24) + f) { - // assert(l >= 0 && l < 256); - // assert(f >= 0 && f <= 0xffffff); - // }; - // int label() const { return _body >> 24; } - // int feature() const { return _body & 0xffffff; } - ME_Feature(const int l, const int f) : _body((f << 8) + l) { - assert(l >= 0 && l <= MAX_LABEL_TYPES); - assert(f >= 0 && f <= 0xffffff); - }; - int label() const { return _body & 0xff; } - int feature() const { return _body >> 8; } - unsigned int body() const { return _body; } - - private: - unsigned int _body; - }; - - struct ME_FeatureBag { -#ifdef USE_HASH_MAP - typedef __gnu_cxx::hash_map<unsigned int, int> map_type; -#else - typedef std::map<unsigned int, int> map_type; -#endif - map_type mef2id; - std::vector<ME_Feature> id2mef; - int Put(const ME_Feature& i) { - map_type::const_iterator j = mef2id.find(i.body()); - if (j == mef2id.end()) { - int id = id2mef.size(); - id2mef.push_back(i); - mef2id[i.body()] = id; - return id; - } - return j->second; - } - int Id(const ME_Feature& i) const { - map_type::const_iterator j = mef2id.find(i.body()); - if (j == mef2id.end()) { - return -1; - } - return j->second; - } - ME_Feature Feature(int id) const { - assert(id >= 0 && id < (int)id2mef.size()); - return id2mef[id]; - } - int Size() const { return id2mef.size(); } - void Clear() { - mef2id.clear(); - id2mef.clear(); - } - }; - - struct hashfun_str { - size_t operator()(const std::string& s) const { - assert(sizeof(int) == 4 && sizeof(char) == 1); - const int* p = reinterpret_cast<const int*>(s.c_str()); - size_t v = 0; - int n = s.size() / 4; - for (int i = 0; i < n; i++, p++) { - // v ^= *p; - v ^= *p << (4 * (i % 2)); // note) 0 <= char < 128 - } - int m = s.size() % 4; - for (int i = 0; i < m; i++) { - v ^= s[4 * n + i] << (i * 8); - } - return v; - } - }; - - struct MiniStringBag { -#ifdef USE_HASH_MAP - typedef __gnu_cxx::hash_map<std::string, int, hashfun_str> map_type; -#else - typedef std::map<std::string, int> map_type; -#endif - int _size; - map_type str2id; - MiniStringBag() : _size(0) {} - int Put(const std::string& i) { - map_type::const_iterator j = str2id.find(i); - if (j == str2id.end()) { - int id = _size; - _size++; - str2id[i] = id; - return id; - } - return j->second; - } - int Id(const std::string& i) const { - map_type::const_iterator j = str2id.find(i); - if (j == str2id.end()) return -1; - return j->second; - } - int Size() const { return _size; } - void Clear() { - str2id.clear(); - _size = 0; - } - map_type::const_iterator begin() const { return str2id.begin(); } - map_type::const_iterator end() const { return str2id.end(); } - }; - - struct StringBag : public MiniStringBag { - std::vector<std::string> id2str; - int Put(const std::string& i) { - map_type::const_iterator j = str2id.find(i); - if (j == str2id.end()) { - int id = id2str.size(); - id2str.push_back(i); - str2id[i] = id; - return id; - } - return j->second; - } - std::string Str(const int id) const { - assert(id >= 0 && id < (int)id2str.size()); - return id2str[id]; - } - int Size() const { return id2str.size(); } - void Clear() { - str2id.clear(); - id2str.clear(); - } - }; - - std::vector<Sample> _vs; // vector of training_samples - StringBag _label_bag; - MiniStringBag _featurename_bag; - std::vector<double> _vl; // vector of lambda - ME_FeatureBag _fb; - int _num_classes; - std::vector<double> _vee; // empirical expectation - std::vector<double> _vme; // empirical expectation - std::vector<std::vector<int> > _feature2mef; - std::vector<Sample> _heldout; - double _train_error; // current error rate on the training data - double _heldout_error; // current error rate on the heldout data - int _nheldout; - int _early_stopping_n; - std::vector<double> _vhlogl; - const ME_Model* _ref_modelp; - - double heldout_likelihood(); - int conditional_probability(const Sample& nbs, - std::vector<double>& membp) const; - int make_feature_bag(const int cutoff); - int classify(const Sample& nbs, std::vector<double>& membp) const; - double update_model_expectation(); - int perform_QUASI_NEWTON(); - int perform_SGD(); - int perform_GIS(int C); - std::vector<double> perform_LBFGS(const std::vector<double>& x0); - std::vector<double> perform_OWLQN(const std::vector<double>& x0, - const double C); - double backtracking_line_search(const Vec& x0, const Vec& grad0, - const double f0, const Vec& dx, Vec& x, - Vec& grad1); - double regularized_func_grad(const double C, const Vec& x, Vec& grad); - double constrained_line_search(double C, const Vec& x0, const Vec& grad0, - const double f0, const Vec& dx, Vec& x, - Vec& grad1); - - void set_ref_dist(Sample& s) const; - void init_feature2mef(); - - double FunctionGradient(const std::vector<double>& x, - std::vector<double>& grad); - static double FunctionGradientWrapper(const std::vector<double>& x, - std::vector<double>& grad); -}; - -#endif - -/* - * $Log: maxent.h,v $ - * Revision 1.1.1.1 2007/05/15 08:30:35 kyoshida - * stepp tagger, by Okanohara and Tsuruoka - * - * Revision 1.24 2006/08/21 17:30:38 tsuruoka - * use MAX_LABEL_TYPES - * - * Revision 1.23 2006/07/25 13:19:53 tsuruoka - * sort _vs[] - * - * Revision 1.22 2006/07/18 11:13:15 tsuruoka - * modify comments - * - * Revision 1.21 2006/07/18 10:02:15 tsuruoka - * remove sample2feature[] - * speed up conditional_probability() - * - * Revision 1.20 2006/07/18 05:10:51 tsuruoka - * add ref_dist - * - * Revision 1.19 2005/12/23 10:33:02 tsuruoka - * support real-valued features - * - * Revision 1.18 2005/12/23 09:15:29 tsuruoka - * modify _train to reduce memory consumption - * - * Revision 1.17 2005/10/28 13:02:34 tsuruoka - * set_heldout(): add default value - * Feature() - * - * Revision 1.16 2005/09/12 13:51:16 tsuruoka - * Sample: list -> vector - * - * Revision 1.15 2005/09/12 13:27:10 tsuruoka - * add add_training_sample() - * - * Revision 1.14 2005/04/27 11:22:27 tsuruoka - * bugfix - * ME_Sample: list -> vector - * - * Revision 1.13 2005/04/27 10:20:19 tsuruoka - * MiniStringBag -> StringBag - * - * Revision 1.12 2005/04/27 10:00:42 tsuruoka - * remove tmpfb - * - * Revision 1.11 2005/04/26 14:25:53 tsuruoka - * add MiniStringBag, USE_HASH_MAP - * - * Revision 1.10 2004/10/04 05:50:25 tsuruoka - * add Clear() - * - * Revision 1.9 2004/08/09 12:27:21 tsuruoka - * change messages - * - * Revision 1.8 2004/08/04 13:55:19 tsuruoka - * modify _sample2feature - * - * Revision 1.7 2004/07/29 05:51:13 tsuruoka - * remove modeldata.h - * - * Revision 1.6 2004/07/28 13:42:58 tsuruoka - * add AGIS - * - * Revision 1.5 2004/07/28 05:54:14 tsuruoka - * get_class_name() -> get_class_label() - * ME_Feature: bugfix - * - * Revision 1.4 2004/07/27 16:58:47 tsuruoka - * modify the interface of classify() - * - * Revision 1.3 2004/07/26 17:23:46 tsuruoka - * _sample2feature: list -> vector - * - * Revision 1.2 2004/07/26 15:49:23 tsuruoka - * modify ME_Feature - * - * Revision 1.1 2004/07/26 13:10:55 tsuruoka - * add files - * - * Revision 1.18 2004/07/22 08:34:45 tsuruoka - * modify _sample2feature[] - * - * Revision 1.17 2004/07/21 16:33:01 tsuruoka - * remove some comments - * - */ |