diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-11-18 13:35:42 -0500 |
commit | 1b8181bf0d6e9137e6b9ccdbe414aec37377a1a9 (patch) | |
tree | 33e5f3aa5abff1f41314cf8f6afbd2c2c40e4bfd /training/lbfgs.h | |
parent | 7c4665949fb93fb3de402e4ce1d19bef67850d05 (diff) |
major restructure of the training code
Diffstat (limited to 'training/lbfgs.h')
-rw-r--r-- | training/lbfgs.h | 1459 |
1 files changed, 0 insertions, 1459 deletions
diff --git a/training/lbfgs.h b/training/lbfgs.h deleted file mode 100644 index e8baecab..00000000 --- a/training/lbfgs.h +++ /dev/null @@ -1,1459 +0,0 @@ -#ifndef SCITBX_LBFGS_H -#define SCITBX_LBFGS_H - -#include <cstdio> -#include <cstddef> -#include <cmath> -#include <stdexcept> -#include <algorithm> -#include <vector> -#include <string> -#include <iostream> -#include <sstream> - -namespace scitbx { - -//! Limited-memory Broyden-Fletcher-Goldfarb-Shanno (LBFGS) %minimizer. -/*! Implementation of the - Limited-memory Broyden-Fletcher-Goldfarb-Shanno (LBFGS) - algorithm for large-scale multidimensional minimization - problems. - - This code was manually derived from Java code which was - in turn derived from the Fortran program - <code>lbfgs.f</code>. The Java translation was - effected mostly mechanically, with some manual - clean-up; in particular, array indices start at 0 - instead of 1. Most of the comments from the Fortran - code have been pasted in. - - Information on the original LBFGS Fortran source code is - available at - http://www.netlib.org/opt/lbfgs_um.shar . The following - information is taken verbatim from the Netlib documentation - for the Fortran source. - - <pre> - file opt/lbfgs_um.shar - for unconstrained optimization problems - alg limited memory BFGS method - by J. Nocedal - contact nocedal@eecs.nwu.edu - ref D. C. Liu and J. Nocedal, ``On the limited memory BFGS method for - , large scale optimization methods'' Mathematical Programming 45 - , (1989), pp. 503-528. - , (Postscript file of this paper is available via anonymous ftp - , to eecs.nwu.edu in the directory pub/%lbfgs/lbfgs_um.) - </pre> - - @author Jorge Nocedal: original Fortran version, including comments - (July 1990).<br> - Robert Dodier: Java translation, August 1997.<br> - Ralf W. Grosse-Kunstleve: C++ port, March 2002.<br> - Chris Dyer: serialize/deserialize functionality - */ -namespace lbfgs { - - //! Generic exception class for %lbfgs %error messages. - /*! All exceptions thrown by the minimizer are derived from this class. - */ - class error : public std::exception { - public: - //! Constructor. - error(std::string const& msg) throw() - : msg_("lbfgs error: " + msg) - {} - //! Access to error message. - virtual const char* what() const throw() { return msg_.c_str(); } - protected: - virtual ~error() throw() {} - std::string msg_; - public: - static std::string itoa(unsigned long i) { - std::ostringstream os; - os << i; - return os.str(); - } - }; - - //! Specific exception class. - class error_internal_error : public error { - public: - //! Constructor. - error_internal_error(const char* file, unsigned long line) throw() - : error( - "Internal Error: " + std::string(file) + "(" + itoa(line) + ")") - {} - }; - - //! Specific exception class. - class error_improper_input_parameter : public error { - public: - //! Constructor. - error_improper_input_parameter(std::string const& msg) throw() - : error("Improper input parameter: " + msg) - {} - }; - - //! Specific exception class. - class error_improper_input_data : public error { - public: - //! Constructor. - error_improper_input_data(std::string const& msg) throw() - : error("Improper input data: " + msg) - {} - }; - - //! Specific exception class. - class error_search_direction_not_descent : public error { - public: - //! Constructor. - error_search_direction_not_descent() throw() - : error("The search direction is not a descent direction.") - {} - }; - - //! Specific exception class. - class error_line_search_failed : public error { - public: - //! Constructor. - error_line_search_failed(std::string const& msg) throw() - : error("Line search failed: " + msg) - {} - }; - - //! Specific exception class. - class error_line_search_failed_rounding_errors - : public error_line_search_failed { - public: - //! Constructor. - error_line_search_failed_rounding_errors(std::string const& msg) throw() - : error_line_search_failed(msg) - {} - }; - - namespace detail { - - template <typename NumType> - inline - NumType - pow2(NumType const& x) { return x * x; } - - template <typename NumType> - inline - NumType - abs(NumType const& x) { - if (x < NumType(0)) return -x; - return x; - } - - // This class implements an algorithm for multi-dimensional line search. - template <typename FloatType, typename SizeType = std::size_t> - class mcsrch - { - protected: - int infoc; - FloatType dginit; - bool brackt; - bool stage1; - FloatType finit; - FloatType dgtest; - FloatType width; - FloatType width1; - FloatType stx; - FloatType fx; - FloatType dgx; - FloatType sty; - FloatType fy; - FloatType dgy; - FloatType stmin; - FloatType stmax; - - static FloatType const& max3( - FloatType const& x, - FloatType const& y, - FloatType const& z) - { - return x < y ? (y < z ? z : y ) : (x < z ? z : x ); - } - - public: - /* Minimize a function along a search direction. This code is - a Java translation of the function <code>MCSRCH</code> from - <code>lbfgs.f</code>, which in turn is a slight modification - of the subroutine <code>CSRCH</code> of More' and Thuente. - The changes are to allow reverse communication, and do not - affect the performance of the routine. This function, in turn, - calls <code>mcstep</code>.<p> - - The Java translation was effected mostly mechanically, with - some manual clean-up; in particular, array indices start at 0 - instead of 1. Most of the comments from the Fortran code have - been pasted in here as well.<p> - - The purpose of <code>mcsrch</code> is to find a step which - satisfies a sufficient decrease condition and a curvature - condition.<p> - - At each stage this function updates an interval of uncertainty - with endpoints <code>stx</code> and <code>sty</code>. The - interval of uncertainty is initially chosen so that it - contains a minimizer of the modified function - <pre> - f(x+stp*s) - f(x) - ftol*stp*(gradf(x)'s). - </pre> - If a step is obtained for which the modified function has a - nonpositive function value and nonnegative derivative, then - the interval of uncertainty is chosen so that it contains a - minimizer of <code>f(x+stp*s)</code>.<p> - - The algorithm is designed to find a step which satisfies - the sufficient decrease condition - <pre> - f(x+stp*s) <= f(X) + ftol*stp*(gradf(x)'s), - </pre> - and the curvature condition - <pre> - abs(gradf(x+stp*s)'s)) <= gtol*abs(gradf(x)'s). - </pre> - If <code>ftol</code> is less than <code>gtol</code> and if, - for example, the function is bounded below, then there is - always a step which satisfies both conditions. If no step can - be found which satisfies both conditions, then the algorithm - usually stops when rounding errors prevent further progress. - In this case <code>stp</code> only satisfies the sufficient - decrease condition.<p> - - @author Original Fortran version by Jorge J. More' and - David J. Thuente as part of the Minpack project, June 1983, - Argonne National Laboratory. Java translation by Robert - Dodier, August 1997. - - @param n The number of variables. - - @param x On entry this contains the base point for the line - search. On exit it contains <code>x + stp*s</code>. - - @param f On entry this contains the value of the objective - function at <code>x</code>. On exit it contains the value - of the objective function at <code>x + stp*s</code>. - - @param g On entry this contains the gradient of the objective - function at <code>x</code>. On exit it contains the gradient - at <code>x + stp*s</code>. - - @param s The search direction. - - @param stp On entry this contains an initial estimate of a - satifactory step length. On exit <code>stp</code> contains - the final estimate. - - @param ftol Tolerance for the sufficient decrease condition. - - @param xtol Termination occurs when the relative width of the - interval of uncertainty is at most <code>xtol</code>. - - @param maxfev Termination occurs when the number of evaluations - of the objective function is at least <code>maxfev</code> by - the end of an iteration. - - @param info This is an output variable, which can have these - values: - <ul> - <li><code>info = -1</code> A return is made to compute - the function and gradient. - <li><code>info = 1</code> The sufficient decrease condition - and the directional derivative condition hold. - </ul> - - @param nfev On exit, this is set to the number of function - evaluations. - - @param wa Temporary storage array, of length <code>n</code>. - */ - void run( - FloatType const& gtol, - FloatType const& stpmin, - FloatType const& stpmax, - SizeType n, - FloatType* x, - FloatType f, - const FloatType* g, - FloatType* s, - SizeType is0, - FloatType& stp, - FloatType ftol, - FloatType xtol, - SizeType maxfev, - int& info, - SizeType& nfev, - FloatType* wa); - - /* The purpose of this function is to compute a safeguarded step - for a linesearch and to update an interval of uncertainty for - a minimizer of the function.<p> - - The parameter <code>stx</code> contains the step with the - least function value. The parameter <code>stp</code> contains - the current step. It is assumed that the derivative at - <code>stx</code> is negative in the direction of the step. If - <code>brackt</code> is <code>true</code> when - <code>mcstep</code> returns then a minimizer has been - bracketed in an interval of uncertainty with endpoints - <code>stx</code> and <code>sty</code>.<p> - - Variables that must be modified by <code>mcstep</code> are - implemented as 1-element arrays. - - @param stx Step at the best step obtained so far. - This variable is modified by <code>mcstep</code>. - @param fx Function value at the best step obtained so far. - This variable is modified by <code>mcstep</code>. - @param dx Derivative at the best step obtained so far. - The derivative must be negative in the direction of the - step, that is, <code>dx</code> and <code>stp-stx</code> must - have opposite signs. This variable is modified by - <code>mcstep</code>. - - @param sty Step at the other endpoint of the interval of - uncertainty. This variable is modified by <code>mcstep</code>. - @param fy Function value at the other endpoint of the interval - of uncertainty. This variable is modified by - <code>mcstep</code>. - - @param dy Derivative at the other endpoint of the interval of - uncertainty. This variable is modified by <code>mcstep</code>. - - @param stp Step at the current step. If <code>brackt</code> is set - then on input <code>stp</code> must be between <code>stx</code> - and <code>sty</code>. On output <code>stp</code> is set to the - new step. - @param fp Function value at the current step. - @param dp Derivative at the current step. - - @param brackt Tells whether a minimizer has been bracketed. - If the minimizer has not been bracketed, then on input this - variable must be set <code>false</code>. If the minimizer has - been bracketed, then on output this variable is - <code>true</code>. - - @param stpmin Lower bound for the step. - @param stpmax Upper bound for the step. - - If the return value is 1, 2, 3, or 4, then the step has - been computed successfully. A return value of 0 indicates - improper input parameters. - - @author Jorge J. More, David J. Thuente: original Fortran version, - as part of Minpack project. Argonne Nat'l Laboratory, June 1983. - Robert Dodier: Java translation, August 1997. - */ - static int mcstep( - FloatType& stx, - FloatType& fx, - FloatType& dx, - FloatType& sty, - FloatType& fy, - FloatType& dy, - FloatType& stp, - FloatType fp, - FloatType dp, - bool& brackt, - FloatType stpmin, - FloatType stpmax); - - void serialize(std::ostream* out) const { - out->write((const char*)&infoc,sizeof(infoc)); - out->write((const char*)&dginit,sizeof(dginit)); - out->write((const char*)&brackt,sizeof(brackt)); - out->write((const char*)&stage1,sizeof(stage1)); - out->write((const char*)&finit,sizeof(finit)); - out->write((const char*)&dgtest,sizeof(dgtest)); - out->write((const char*)&width,sizeof(width)); - out->write((const char*)&width1,sizeof(width1)); - out->write((const char*)&stx,sizeof(stx)); - out->write((const char*)&fx,sizeof(fx)); - out->write((const char*)&dgx,sizeof(dgx)); - out->write((const char*)&sty,sizeof(sty)); - out->write((const char*)&fy,sizeof(fy)); - out->write((const char*)&dgy,sizeof(dgy)); - out->write((const char*)&stmin,sizeof(stmin)); - out->write((const char*)&stmax,sizeof(stmax)); - } - - void deserialize(std::istream* in) const { - in->read((char*)&infoc, sizeof(infoc)); - in->read((char*)&dginit, sizeof(dginit)); - in->read((char*)&brackt, sizeof(brackt)); - in->read((char*)&stage1, sizeof(stage1)); - in->read((char*)&finit, sizeof(finit)); - in->read((char*)&dgtest, sizeof(dgtest)); - in->read((char*)&width, sizeof(width)); - in->read((char*)&width1, sizeof(width1)); - in->read((char*)&stx, sizeof(stx)); - in->read((char*)&fx, sizeof(fx)); - in->read((char*)&dgx, sizeof(dgx)); - in->read((char*)&sty, sizeof(sty)); - in->read((char*)&fy, sizeof(fy)); - in->read((char*)&dgy, sizeof(dgy)); - in->read((char*)&stmin, sizeof(stmin)); - in->read((char*)&stmax, sizeof(stmax)); - } - }; - - template <typename FloatType, typename SizeType> - void mcsrch<FloatType, SizeType>::run( - FloatType const& gtol, - FloatType const& stpmin, - FloatType const& stpmax, - SizeType n, - FloatType* x, - FloatType f, - const FloatType* g, - FloatType* s, - SizeType is0, - FloatType& stp, - FloatType ftol, - FloatType xtol, - SizeType maxfev, - int& info, - SizeType& nfev, - FloatType* wa) - { - if (info != -1) { - infoc = 1; - if ( n == 0 - || maxfev == 0 - || gtol < FloatType(0) - || xtol < FloatType(0) - || stpmin < FloatType(0) - || stpmax < stpmin) { - throw error_internal_error(__FILE__, __LINE__); - } - if (stp <= FloatType(0) || ftol < FloatType(0)) { - throw error_internal_error(__FILE__, __LINE__); - } - // Compute the initial gradient in the search direction - // and check that s is a descent direction. - dginit = FloatType(0); - for (SizeType j = 0; j < n; j++) { - dginit += g[j] * s[is0+j]; - } - if (dginit >= FloatType(0)) { - throw error_search_direction_not_descent(); - } - brackt = false; - stage1 = true; - nfev = 0; - finit = f; - dgtest = ftol*dginit; - width = stpmax - stpmin; - width1 = FloatType(2) * width; - std::copy(x, x+n, wa); - // The variables stx, fx, dgx contain the values of the step, - // function, and directional derivative at the best step. - // The variables sty, fy, dgy contain the value of the step, - // function, and derivative at the other endpoint of - // the interval of uncertainty. - // The variables stp, f, dg contain the values of the step, - // function, and derivative at the current step. - stx = FloatType(0); - fx = finit; - dgx = dginit; - sty = FloatType(0); - fy = finit; - dgy = dginit; - } - for (;;) { - if (info != -1) { - // Set the minimum and maximum steps to correspond - // to the present interval of uncertainty. - if (brackt) { - stmin = std::min(stx, sty); - stmax = std::max(stx, sty); - } - else { - stmin = stx; - stmax = stp + FloatType(4) * (stp - stx); - } - // Force the step to be within the bounds stpmax and stpmin. - stp = std::max(stp, stpmin); - stp = std::min(stp, stpmax); - // If an unusual termination is to occur then let - // stp be the lowest point obtained so far. - if ( (brackt && (stp <= stmin || stp >= stmax)) - || nfev >= maxfev - 1 || infoc == 0 - || (brackt && stmax - stmin <= xtol * stmax)) { - stp = stx; - } - // Evaluate the function and gradient at stp - // and compute the directional derivative. - // We return to main program to obtain F and G. - for (SizeType j = 0; j < n; j++) { - x[j] = wa[j] + stp * s[is0+j]; - } - info=-1; - break; - } - info = 0; - nfev++; - FloatType dg(0); - for (SizeType j = 0; j < n; j++) { - dg += g[j] * s[is0+j]; - } - FloatType ftest1 = finit + stp*dgtest; - // Test for convergence. - if ((brackt && (stp <= stmin || stp >= stmax)) || infoc == 0) { - throw error_line_search_failed_rounding_errors( - "Rounding errors prevent further progress." - " There may not be a step which satisfies the" - " sufficient decrease and curvature conditions." - " Tolerances may be too small."); - } - if (stp == stpmax && f <= ftest1 && dg <= dgtest) { - throw error_line_search_failed( - "The step is at the upper bound stpmax()."); - } - if (stp == stpmin && (f > ftest1 || dg >= dgtest)) { - throw error_line_search_failed( - "The step is at the lower bound stpmin()."); - } - if (nfev >= maxfev) { - throw error_line_search_failed( - "Number of function evaluations has reached maxfev()."); - } - if (brackt && stmax - stmin <= xtol * stmax) { - throw error_line_search_failed( - "Relative width of the interval of uncertainty" - " is at most xtol()."); - } - // Check for termination. - if (f <= ftest1 && abs(dg) <= gtol * (-dginit)) { - info = 1; - break; - } - // In the first stage we seek a step for which the modified - // function has a nonpositive value and nonnegative derivative. - if ( stage1 && f <= ftest1 - && dg >= std::min(ftol, gtol) * dginit) { - stage1 = false; - } - // A modified function is used to predict the step only if - // we have not obtained a step for which the modified - // function has a nonpositive function value and nonnegative - // derivative, and if a lower function value has been - // obtained but the decrease is not sufficient. - if (stage1 && f <= fx && f > ftest1) { - // Define the modified function and derivative values. - FloatType fm = f - stp*dgtest; - FloatType fxm = fx - stx*dgtest; - FloatType fym = fy - sty*dgtest; - FloatType dgm = dg - dgtest; - FloatType dgxm = dgx - dgtest; - FloatType dgym = dgy - dgtest; - // Call cstep to update the interval of uncertainty - // and to compute the new step. - infoc = mcstep(stx, fxm, dgxm, sty, fym, dgym, stp, fm, dgm, - brackt, stmin, stmax); - // Reset the function and gradient values for f. - fx = fxm + stx*dgtest; - fy = fym + sty*dgtest; - dgx = dgxm + dgtest; - dgy = dgym + dgtest; - } - else { - // Call mcstep to update the interval of uncertainty - // and to compute the new step. - infoc = mcstep(stx, fx, dgx, sty, fy, dgy, stp, f, dg, - brackt, stmin, stmax); - } - // Force a sufficient decrease in the size of the - // interval of uncertainty. - if (brackt) { - if (abs(sty - stx) >= FloatType(0.66) * width1) { - stp = stx + FloatType(0.5) * (sty - stx); - } - width1 = width; - width = abs(sty - stx); - } - } - } - - template <typename FloatType, typename SizeType> - int mcsrch<FloatType, SizeType>::mcstep( - FloatType& stx, - FloatType& fx, - FloatType& dx, - FloatType& sty, - FloatType& fy, - FloatType& dy, - FloatType& stp, - FloatType fp, - FloatType dp, - bool& brackt, - FloatType stpmin, - FloatType stpmax) - { - bool bound; - FloatType gamma, p, q, r, s, sgnd, stpc, stpf, stpq, theta; - int info = 0; - if ( ( brackt && (stp <= std::min(stx, sty) - || stp >= std::max(stx, sty))) - || dx * (stp - stx) >= FloatType(0) || stpmax < stpmin) { - return 0; - } - // Determine if the derivatives have opposite sign. - sgnd = dp * (dx / abs(dx)); - if (fp > fx) { - // First case. A higher function value. - // The minimum is bracketed. If the cubic step is closer - // to stx than the quadratic step, the cubic step is taken, - // else the average of the cubic and quadratic steps is taken. - info = 1; - bound = true; - theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp; - s = max3(abs(theta), abs(dx), abs(dp)); - gamma = s * std::sqrt(pow2(theta / s) - (dx / s) * (dp / s)); - if (stp < stx) gamma = - gamma; - p = (gamma - dx) + theta; - q = ((gamma - dx) + gamma) + dp; - r = p/q; - stpc = stx + r * (stp - stx); - stpq = stx - + ((dx / ((fx - fp) / (stp - stx) + dx)) / FloatType(2)) - * (stp - stx); - if (abs(stpc - stx) < abs(stpq - stx)) { - stpf = stpc; - } - else { - stpf = stpc + (stpq - stpc) / FloatType(2); - } - brackt = true; - } - else if (sgnd < FloatType(0)) { - // Second case. A lower function value and derivatives of - // opposite sign. The minimum is bracketed. If the cubic - // step is closer to stx than the quadratic (secant) step, - // the cubic step is taken, else the quadratic step is taken. - info = 2; - bound = false; - theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp; - s = max3(abs(theta), abs(dx), abs(dp)); - gamma = s * std::sqrt(pow2(theta / s) - (dx / s) * (dp / s)); - if (stp > stx) gamma = - gamma; - p = (gamma - dp) + theta; - q = ((gamma - dp) + gamma) + dx; - r = p/q; - stpc = stp + r * (stx - stp); - stpq = stp + (dp / (dp - dx)) * (stx - stp); - if (abs(stpc - stp) > abs(stpq - stp)) { - stpf = stpc; - } - else { - stpf = stpq; - } - brackt = true; - } - else if (abs(dp) < abs(dx)) { - // Third case. A lower function value, derivatives of the - // same sign, and the magnitude of the derivative decreases. - // The cubic step is only used if the cubic tends to infinity - // in the direction of the step or if the minimum of the cubic - // is beyond stp. Otherwise the cubic step is defined to be - // either stpmin or stpmax. The quadratic (secant) step is also - // computed and if the minimum is bracketed then the the step - // closest to stx is taken, else the step farthest away is taken. - info = 3; - bound = true; - theta = FloatType(3) * (fx - fp) / (stp - stx) + dx + dp; - s = max3(abs(theta), abs(dx), abs(dp)); - gamma = s * std::sqrt( - std::max(FloatType(0), pow2(theta / s) - (dx / s) * (dp / s))); - if (stp > stx) gamma = -gamma; - p = (gamma - dp) + theta; - q = (gamma + (dx - dp)) + gamma; - r = p/q; - if (r < FloatType(0) && gamma != FloatType(0)) { - stpc = stp + r * (stx - stp); - } - else if (stp > stx) { - stpc = stpmax; - } - else { - stpc = stpmin; - } - stpq = stp + (dp / (dp - dx)) * (stx - stp); - if (brackt) { - if (abs(stp - stpc) < abs(stp - stpq)) { - stpf = stpc; - } - else { - stpf = stpq; - } - } - else { - if (abs(stp - stpc) > abs(stp - stpq)) { - stpf = stpc; - } - else { - stpf = stpq; - } - } - } - else { - // Fourth case. A lower function value, derivatives of the - // same sign, and the magnitude of the derivative does - // not decrease. If the minimum is not bracketed, the step - // is either stpmin or stpmax, else the cubic step is taken. - info = 4; - bound = false; - if (brackt) { - theta = FloatType(3) * (fp - fy) / (sty - stp) + dy + dp; - s = max3(abs(theta), abs(dy), abs(dp)); - gamma = s * std::sqrt(pow2(theta / s) - (dy / s) * (dp / s)); - if (stp > sty) gamma = -gamma; - p = (gamma - dp) + theta; - q = ((gamma - dp) + gamma) + dy; - r = p/q; - stpc = stp + r * (sty - stp); - stpf = stpc; - } - else if (stp > stx) { - stpf = stpmax; - } - else { - stpf = stpmin; - } - } - // Update the interval of uncertainty. This update does not - // depend on the new step or the case analysis above. - if (fp > fx) { - sty = stp; - fy = fp; - dy = dp; - } - else { - if (sgnd < FloatType(0)) { - sty = stx; - fy = fx; - dy = dx; - } - stx = stp; - fx = fp; - dx = dp; - } - // Compute the new step and safeguard it. - stpf = std::min(stpmax, stpf); - stpf = std::max(stpmin, stpf); - stp = stpf; - if (brackt && bound) { - if (sty > stx) { - stp = std::min(stx + FloatType(0.66) * (sty - stx), stp); - } - else { - stp = std::max(stx + FloatType(0.66) * (sty - stx), stp); - } - } - return info; - } - - /* Compute the sum of a vector times a scalar plus another vector. - Adapted from the subroutine <code>daxpy</code> in - <code>lbfgs.f</code>. - */ - template <typename FloatType, typename SizeType> - void daxpy( - SizeType n, - FloatType da, - const FloatType* dx, - SizeType ix0, - SizeType incx, - FloatType* dy, - SizeType iy0, - SizeType incy) - { - SizeType i, ix, iy, m; - if (n == 0) return; - if (da == FloatType(0)) return; - if (!(incx == 1 && incy == 1)) { - ix = 0; - iy = 0; - for (i = 0; i < n; i++) { - dy[iy0+iy] += da * dx[ix0+ix]; - ix += incx; - iy += incy; - } - return; - } - m = n % 4; - for (i = 0; i < m; i++) { - dy[iy0+i] += da * dx[ix0+i]; - } - for (; i < n;) { - dy[iy0+i] += da * dx[ix0+i]; i++; - dy[iy0+i] += da * dx[ix0+i]; i++; - dy[iy0+i] += da * dx[ix0+i]; i++; - dy[iy0+i] += da * dx[ix0+i]; i++; - } - } - - template <typename FloatType, typename SizeType> - inline - void daxpy( - SizeType n, - FloatType da, - const FloatType* dx, - SizeType ix0, - FloatType* dy) - { - daxpy(n, da, dx, ix0, SizeType(1), dy, SizeType(0), SizeType(1)); - } - - /* Compute the dot product of two vectors. - Adapted from the subroutine <code>ddot</code> - in <code>lbfgs.f</code>. - */ - template <typename FloatType, typename SizeType> - FloatType ddot( - SizeType n, - const FloatType* dx, - SizeType ix0, - SizeType incx, - const FloatType* dy, - SizeType iy0, - SizeType incy) - { - SizeType i, ix, iy, m; - FloatType dtemp(0); - if (n == 0) return FloatType(0); - if (!(incx == 1 && incy == 1)) { - ix = 0; - iy = 0; - for (i = 0; i < n; i++) { - dtemp += dx[ix0+ix] * dy[iy0+iy]; - ix += incx; - iy += incy; - } - return dtemp; - } - m = n % 5; - for (i = 0; i < m; i++) { - dtemp += dx[ix0+i] * dy[iy0+i]; - } - for (; i < n;) { - dtemp += dx[ix0+i] * dy[iy0+i]; i++; - dtemp += dx[ix0+i] * dy[iy0+i]; i++; - dtemp += dx[ix0+i] * dy[iy0+i]; i++; - dtemp += dx[ix0+i] * dy[iy0+i]; i++; - dtemp += dx[ix0+i] * dy[iy0+i]; i++; - } - return dtemp; - } - - template <typename FloatType, typename SizeType> - inline - FloatType ddot( - SizeType n, - const FloatType* dx, - const FloatType* dy) - { - return ddot( - n, dx, SizeType(0), SizeType(1), dy, SizeType(0), SizeType(1)); - } - - } // namespace detail - - //! Interface to the LBFGS %minimizer. - /*! This class solves the unconstrained minimization problem - <pre> - min f(x), x = (x1,x2,...,x_n), - </pre> - using the limited-memory BFGS method. The routine is - especially effective on problems involving a large number of - variables. In a typical iteration of this method an - approximation Hk to the inverse of the Hessian - is obtained by applying <code>m</code> BFGS updates to a - diagonal matrix Hk0, using information from the - previous <code>m</code> steps. The user specifies the number - <code>m</code>, which determines the amount of storage - required by the routine. The user may also provide the - diagonal matrices Hk0 (parameter <code>diag</code> in the run() - function) if not satisfied with the default choice. The - algorithm is described in "On the limited memory BFGS method for - large scale optimization", by D. Liu and J. Nocedal, Mathematical - Programming B 45 (1989) 503-528. - - The user is required to calculate the function value - <code>f</code> and its gradient <code>g</code>. In order to - allow the user complete control over these computations, - reverse communication is used. The routine must be called - repeatedly under the control of the member functions - <code>requests_f_and_g()</code>, - <code>requests_diag()</code>. - If neither requests_f_and_g() nor requests_diag() is - <code>true</code> the user should check for convergence - (using class traditional_convergence_test or any - other custom test). If the convergence test is negative, - the minimizer may be called again for the next iteration. - - The steplength (stp()) is determined at each iteration - by means of the line search routine <code>mcsrch</code>, which is - a slight modification of the routine <code>CSRCH</code> written - by More' and Thuente. - - The only variables that are machine-dependent are - <code>xtol</code>, - <code>stpmin</code> and - <code>stpmax</code>. - - Fatal errors cause <code>error</code> exceptions to be thrown. - The generic class <code>error</code> is sub-classed (e.g. - class <code>error_line_search_failed</code>) to facilitate - granular %error handling. - - A note on performance: Using Compaq Fortran V5.4 and - Compaq C++ V6.5, the C++ implementation is about 15% slower - than the Fortran implementation. - */ - template <typename FloatType, typename SizeType = std::size_t> - class minimizer - { - public: - //! Default constructor. Some members are not initialized! - minimizer() - : n_(0), m_(0), maxfev_(0), - gtol_(0), xtol_(0), - stpmin_(0), stpmax_(0), - ispt(0), iypt(0) - {} - - //! Constructor. - /*! @param n The number of variables in the minimization problem. - Restriction: <code>n > 0</code>. - - @param m The number of corrections used in the BFGS update. - Values of <code>m</code> less than 3 are not recommended; - large values of <code>m</code> will result in excessive - computing time. <code>3 <= m <= 7</code> is - recommended. - Restriction: <code>m > 0</code>. - - @param maxfev Maximum number of function evaluations - <b>per line search</b>. - Termination occurs when the number of evaluations - of the objective function is at least <code>maxfev</code> by - the end of an iteration. - - @param gtol Controls the accuracy of the line search. - If the function and gradient evaluations are inexpensive with - respect to the cost of the iteration (which is sometimes the - case when solving very large problems) it may be advantageous - to set <code>gtol</code> to a small value. A typical small - value is 0.1. - Restriction: <code>gtol</code> should be greater than 1e-4. - - @param xtol An estimate of the machine precision (e.g. 10e-16 - on a SUN station 3/60). The line search routine will - terminate if the relative width of the interval of - uncertainty is less than <code>xtol</code>. - - @param stpmin Specifies the lower bound for the step - in the line search. - The default value is 1e-20. This value need not be modified - unless the exponent is too large for the machine being used, - or unless the problem is extremely badly scaled (in which - case the exponent should be increased). - - @param stpmax specifies the upper bound for the step - in the line search. - The default value is 1e20. This value need not be modified - unless the exponent is too large for the machine being used, - or unless the problem is extremely badly scaled (in which - case the exponent should be increased). - */ - explicit - minimizer( - SizeType n, - SizeType m = 5, - SizeType maxfev = 20, - FloatType gtol = FloatType(0.9), - FloatType xtol = FloatType(1.e-16), - FloatType stpmin = FloatType(1.e-20), - FloatType stpmax = FloatType(1.e20)) - : n_(n), m_(m), maxfev_(maxfev), - gtol_(gtol), xtol_(xtol), - stpmin_(stpmin), stpmax_(stpmax), - iflag_(0), requests_f_and_g_(false), requests_diag_(false), - iter_(0), nfun_(0), stp_(0), - stp1(0), ftol(0.0001), ys(0), point(0), npt(0), - ispt(n+2*m), iypt((n+2*m)+n*m), - info(0), bound(0), nfev(0) - { - if (n_ == 0) { - throw error_improper_input_parameter("n = 0."); - } - if (m_ == 0) { - throw error_improper_input_parameter("m = 0."); - } - if (maxfev_ == 0) { - throw error_improper_input_parameter("maxfev = 0."); - } - if (gtol_ <= FloatType(1.e-4)) { - throw error_improper_input_parameter("gtol <= 1.e-4."); - } - if (xtol_ < FloatType(0)) { - throw error_improper_input_parameter("xtol < 0."); - } - if (stpmin_ < FloatType(0)) { - throw error_improper_input_parameter("stpmin < 0."); - } - if (stpmax_ < stpmin) { - throw error_improper_input_parameter("stpmax < stpmin"); - } - w_.resize(n_*(2*m_+1)+2*m_); - scratch_array_.resize(n_); - } - - //! Number of free parameters (as passed to the constructor). - SizeType n() const { return n_; } - - //! Number of corrections kept (as passed to the constructor). - SizeType m() const { return m_; } - - /*! \brief Maximum number of evaluations of the objective function - per line search (as passed to the constructor). - */ - SizeType maxfev() const { return maxfev_; } - - /*! \brief Control of the accuracy of the line search. - (as passed to the constructor). - */ - FloatType gtol() const { return gtol_; } - - //! Estimate of the machine precision (as passed to the constructor). - FloatType xtol() const { return xtol_; } - - /*! \brief Lower bound for the step in the line search. - (as passed to the constructor). - */ - FloatType stpmin() const { return stpmin_; } - - /*! \brief Upper bound for the step in the line search. - (as passed to the constructor). - */ - FloatType stpmax() const { return stpmax_; } - - //! Status indicator for reverse communication. - /*! <code>true</code> if the run() function returns to request - evaluation of the objective function (<code>f</code>) and - gradients (<code>g</code>) for the current point - (<code>x</code>). To continue the minimization the - run() function is called again with the updated values for - <code>f</code> and <code>g</code>. - <p> - See also: requests_diag() - */ - bool requests_f_and_g() const { return requests_f_and_g_; } - - //! Status indicator for reverse communication. - /*! <code>true</code> if the run() function returns to request - evaluation of the diagonal matrix (<code>diag</code>) - for the current point (<code>x</code>). - To continue the minimization the run() function is called - again with the updated values for <code>diag</code>. - <p> - See also: requests_f_and_g() - */ - bool requests_diag() const { return requests_diag_; } - - //! Number of iterations so far. - /*! Note that one iteration may involve multiple evaluations - of the objective function. - <p> - See also: nfun() - */ - SizeType iter() const { return iter_; } - - //! Total number of evaluations of the objective function so far. - /*! The total number of function evaluations increases by the - number of evaluations required for the line search. The total - is only increased after a successful line search. - <p> - See also: iter() - */ - SizeType nfun() const { return nfun_; } - - //! Norm of gradient given gradient array of length n(). - FloatType euclidean_norm(const FloatType* a) const { - return std::sqrt(detail::ddot(n_, a, a)); - } - - //! Current stepsize. - FloatType stp() const { return stp_; } - - //! Execution of one step of the minimization. - /*! @param x On initial entry this must be set by the user to - the values of the initial estimate of the solution vector. - - @param f Before initial entry or on re-entry under the - control of requests_f_and_g(), <code>f</code> must be set - by the user to contain the value of the objective function - at the current point <code>x</code>. - - @param g Before initial entry or on re-entry under the - control of requests_f_and_g(), <code>g</code> must be set - by the user to contain the components of the gradient at - the current point <code>x</code>. - - The return value is <code>true</code> if either - requests_f_and_g() or requests_diag() is <code>true</code>. - Otherwise the user should check for convergence - (e.g. using class traditional_convergence_test) and - call the run() function again to continue the minimization. - If the return value is <code>false</code> the user - should <b>not</b> update <code>f</code>, <code>g</code> or - <code>diag</code> (other overload) before calling - the run() function again. - - Note that <code>x</code> is always modified by the run() - function. Depending on the situation it can therefore be - necessary to evaluate the objective function one more time - after the minimization is terminated. - */ - bool run( - FloatType* x, - FloatType f, - const FloatType* g) - { - return generic_run(x, f, g, false, 0); - } - - //! Execution of one step of the minimization. - /*! @param x See other overload. - - @param f See other overload. - - @param g See other overload. - - @param diag On initial entry or on re-entry under the - control of requests_diag(), <code>diag</code> must be set by - the user to contain the values of the diagonal matrix Hk0. - The routine will return at each iteration of the algorithm - with requests_diag() set to <code>true</code>. - <p> - Restriction: all elements of <code>diag</code> must be - positive. - */ - bool run( - FloatType* x, - FloatType f, - const FloatType* g, - const FloatType* diag) - { - return generic_run(x, f, g, true, diag); - } - - void serialize(std::ostream* out) const { - out->write((const char*)&n_, sizeof(n_)); // sanity check - out->write((const char*)&m_, sizeof(m_)); // sanity check - SizeType fs = sizeof(FloatType); - out->write((const char*)&fs, sizeof(fs)); // sanity check - - mcsrch_instance.serialize(out); - out->write((const char*)&iflag_, sizeof(iflag_)); - out->write((const char*)&requests_f_and_g_, sizeof(requests_f_and_g_)); - out->write((const char*)&requests_diag_, sizeof(requests_diag_)); - out->write((const char*)&iter_, sizeof(iter_)); - out->write((const char*)&nfun_, sizeof(nfun_)); - out->write((const char*)&stp_, sizeof(stp_)); - out->write((const char*)&stp1, sizeof(stp1)); - out->write((const char*)&ftol, sizeof(ftol)); - out->write((const char*)&ys, sizeof(ys)); - out->write((const char*)&point, sizeof(point)); - out->write((const char*)&npt, sizeof(npt)); - out->write((const char*)&info, sizeof(info)); - out->write((const char*)&bound, sizeof(bound)); - out->write((const char*)&nfev, sizeof(nfev)); - out->write((const char*)&w_[0], sizeof(FloatType) * w_.size()); - out->write((const char*)&scratch_array_[0], sizeof(FloatType) * scratch_array_.size()); - } - - void deserialize(std::istream* in) { - SizeType n, m, fs; - in->read((char*)&n, sizeof(n)); - in->read((char*)&m, sizeof(m)); - in->read((char*)&fs, sizeof(fs)); - assert(n == n_); - assert(m == m_); - assert(fs == sizeof(FloatType)); - - mcsrch_instance.deserialize(in); - in->read((char*)&iflag_, sizeof(iflag_)); - in->read((char*)&requests_f_and_g_, sizeof(requests_f_and_g_)); - in->read((char*)&requests_diag_, sizeof(requests_diag_)); - in->read((char*)&iter_, sizeof(iter_)); - in->read((char*)&nfun_, sizeof(nfun_)); - in->read((char*)&stp_, sizeof(stp_)); - in->read((char*)&stp1, sizeof(stp1)); - in->read((char*)&ftol, sizeof(ftol)); - in->read((char*)&ys, sizeof(ys)); - in->read((char*)&point, sizeof(point)); - in->read((char*)&npt, sizeof(npt)); - in->read((char*)&info, sizeof(info)); - in->read((char*)&bound, sizeof(bound)); - in->read((char*)&nfev, sizeof(nfev)); - in->read((char*)&w_[0], sizeof(FloatType) * w_.size()); - in->read((char*)&scratch_array_[0], sizeof(FloatType) * scratch_array_.size()); - } - - protected: - static void throw_diagonal_element_not_positive(SizeType i) { - throw error_improper_input_data( - "The " + error::itoa(i) + ". diagonal element of the" - " inverse Hessian approximation is not positive."); - } - - bool generic_run( - FloatType* x, - FloatType f, - const FloatType* g, - bool diagco, - const FloatType* diag); - - detail::mcsrch<FloatType, SizeType> mcsrch_instance; - const SizeType n_; - const SizeType m_; - const SizeType maxfev_; - const FloatType gtol_; - const FloatType xtol_; - const FloatType stpmin_; - const FloatType stpmax_; - int iflag_; - bool requests_f_and_g_; - bool requests_diag_; - SizeType iter_; - SizeType nfun_; - FloatType stp_; - FloatType stp1; - FloatType ftol; - FloatType ys; - SizeType point; - SizeType npt; - const SizeType ispt; - const SizeType iypt; - int info; - SizeType bound; - SizeType nfev; - std::vector<FloatType> w_; - std::vector<FloatType> scratch_array_; - }; - - template <typename FloatType, typename SizeType> - bool minimizer<FloatType, SizeType>::generic_run( - FloatType* x, - FloatType f, - const FloatType* g, - bool diagco, - const FloatType* diag) - { - bool execute_entire_while_loop = false; - if (!(requests_f_and_g_ || requests_diag_)) { - execute_entire_while_loop = true; - } - requests_f_and_g_ = false; - requests_diag_ = false; - FloatType* w = &(*(w_.begin())); - if (iflag_ == 0) { // Initialize. - nfun_ = 1; - if (diagco) { - for (SizeType i = 0; i < n_; i++) { - if (diag[i] <= FloatType(0)) { - throw_diagonal_element_not_positive(i); - } - } - } - else { - std::fill_n(scratch_array_.begin(), n_, FloatType(1)); - diag = &(*(scratch_array_.begin())); - } - for (SizeType i = 0; i < n_; i++) { - w[ispt + i] = -g[i] * diag[i]; - } - FloatType gnorm = std::sqrt(detail::ddot(n_, g, g)); - if (gnorm == FloatType(0)) return false; - stp1 = FloatType(1) / gnorm; - execute_entire_while_loop = true; - } - if (execute_entire_while_loop) { - bound = iter_; - iter_++; - info = 0; - if (iter_ != 1) { - if (iter_ > m_) bound = m_; - ys = detail::ddot( - n_, w, iypt + npt, SizeType(1), w, ispt + npt, SizeType(1)); - if (!diagco) { - FloatType yy = detail::ddot( - n_, w, iypt + npt, SizeType(1), w, iypt + npt, SizeType(1)); - std::fill_n(scratch_array_.begin(), n_, ys / yy); - diag = &(*(scratch_array_.begin())); - } - else { - iflag_ = 2; - requests_diag_ = true; - return true; - } - } - } - if (execute_entire_while_loop || iflag_ == 2) { - if (iter_ != 1) { - if (diag == 0) { - throw error_internal_error(__FILE__, __LINE__); - } - if (diagco) { - for (SizeType i = 0; i < n_; i++) { - if (diag[i] <= FloatType(0)) { - throw_diagonal_element_not_positive(i); - } - } - } - SizeType cp = point; - if (point == 0) cp = m_; - w[n_ + cp -1] = 1 / ys; - SizeType i; - for (i = 0; i < n_; i++) { - w[i] = -g[i]; - } - cp = point; - for (i = 0; i < bound; i++) { - if (cp == 0) cp = m_; - cp--; - FloatType sq = detail::ddot( - n_, w, ispt + cp * n_, SizeType(1), w, SizeType(0), SizeType(1)); - SizeType inmc=n_+m_+cp; - SizeType iycn=iypt+cp*n_; - w[inmc] = w[n_ + cp] * sq; - detail::daxpy(n_, -w[inmc], w, iycn, w); - } - for (i = 0; i < n_; i++) { - w[i] *= diag[i]; - } - for (i = 0; i < bound; i++) { - FloatType yr = detail::ddot( - n_, w, iypt + cp * n_, SizeType(1), w, SizeType(0), SizeType(1)); - FloatType beta = w[n_ + cp] * yr; - SizeType inmc=n_+m_+cp; - beta = w[inmc] - beta; - SizeType iscn=ispt+cp*n_; - detail::daxpy(n_, beta, w, iscn, w); - cp++; - if (cp == m_) cp = 0; - } - std::copy(w, w+n_, w+(ispt + point * n_)); - } - stp_ = FloatType(1); - if (iter_ == 1) stp_ = stp1; - std::copy(g, g+n_, w); - } - mcsrch_instance.run( - gtol_, stpmin_, stpmax_, n_, x, f, g, w, ispt + point * n_, - stp_, ftol, xtol_, maxfev_, info, nfev, &(*(scratch_array_.begin()))); - if (info == -1) { - iflag_ = 1; - requests_f_and_g_ = true; - return true; - } - if (info != 1) { - throw error_internal_error(__FILE__, __LINE__); - } - nfun_ += nfev; - npt = point*n_; - for (SizeType i = 0; i < n_; i++) { - w[ispt + npt + i] = stp_ * w[ispt + npt + i]; - w[iypt + npt + i] = g[i] - w[i]; - } - point++; - if (point == m_) point = 0; - return false; - } - - //! Traditional LBFGS convergence test. - /*! This convergence test is equivalent to the test embedded - in the <code>lbfgs.f</code> Fortran code. The test assumes that - there is a meaningful relation between the Euclidean norm of the - parameter vector <code>x</code> and the norm of the gradient - vector <code>g</code>. Therefore this test should not be used if - this assumption is not correct for a given problem. - */ - template <typename FloatType, typename SizeType = std::size_t> - class traditional_convergence_test - { - public: - //! Default constructor. - traditional_convergence_test() - : n_(0), eps_(0) - {} - - //! Constructor. - /*! @param n The number of variables in the minimization problem. - Restriction: <code>n > 0</code>. - - @param eps Determines the accuracy with which the solution - is to be found. - */ - explicit - traditional_convergence_test( - SizeType n, - FloatType eps = FloatType(1.e-5)) - : n_(n), eps_(eps) - { - if (n_ == 0) { - throw error_improper_input_parameter("n = 0."); - } - if (eps_ < FloatType(0)) { - throw error_improper_input_parameter("eps < 0."); - } - } - - //! Number of free parameters (as passed to the constructor). - SizeType n() const { return n_; } - - /*! \brief Accuracy with which the solution is to be found - (as passed to the constructor). - */ - FloatType eps() const { return eps_; } - - //! Execution of the convergence test for the given parameters. - /*! Returns <code>true</code> if - <pre> - ||g|| < eps * max(1,||x||), - </pre> - where <code>||.||</code> denotes the Euclidean norm. - - @param x Current solution vector. - - @param g Components of the gradient at the current - point <code>x</code>. - */ - bool - operator()(const FloatType* x, const FloatType* g) const - { - FloatType xnorm = std::sqrt(detail::ddot(n_, x, x)); - FloatType gnorm = std::sqrt(detail::ddot(n_, g, g)); - if (gnorm <= eps_ * std::max(FloatType(1), xnorm)) return true; - return false; - } - protected: - const SizeType n_; - const FloatType eps_; - }; - -}} // namespace scitbx::lbfgs - -template <typename T> -std::ostream& operator<<(std::ostream& os, const scitbx::lbfgs::minimizer<T>& min) { - return os << "ITER=" << min.iter() << "\tNFUN=" << min.nfun() << "\tSTP=" << min.stp() << "\tDIAG=" << min.requests_diag() << "\tF&G=" << min.requests_f_and_g(); -} - - -#endif // SCITBX_LBFGS_H |