utils/owlqn.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127

#include <vector>
#include <iostream>
#include <cmath>
#include <stdio.h>
#include "mathvec.h"
#include "lbfgs.h"
#include "maxent.h"

using namespace std;

const static int M = LBFGS_M;
const static double LINE_SEARCH_ALPHA = 0.1;
const static double LINE_SEARCH_BETA = 0.5;

// stopping criteria
int OWLQN_MAX_ITER = 300;
const static double MIN_GRAD_NORM = 0.0001;

Vec approximate_Hg(const int iter, const Vec& grad, const Vec s[],
                   const Vec y[], const double z[]);

inline int sign(double x) {
  if (x > 0) return 1;
  if (x < 0) return -1;
  return 0;
};

static Vec pseudo_gradient(const Vec& x, const Vec& grad0, const double C) {
  Vec grad = grad0;
  for (size_t i = 0; i < x.Size(); i++) {
    if (x[i] != 0) {
      grad[i] += C * sign(x[i]);
      continue;
    }
    const double gm = grad0[i] - C;
    if (gm > 0) {
      grad[i] = gm;
      continue;
    }
    const double gp = grad0[i] + C;
    if (gp < 0) {
      grad[i] = gp;
      continue;
    }
    grad[i] = 0;
  }

  return grad;
}

double ME_Model::regularized_func_grad(const double C, const Vec& x,
                                       Vec& grad) {
  double f = FunctionGradient(x.STLVec(), grad.STLVec());
  for (size_t i = 0; i < x.Size(); i++) {
    f += C * fabs(x[i]);
  }

  return f;
}

double ME_Model::constrained_line_search(double C, const Vec& x0,
                                         const Vec& grad0, const double f0,
                                         const Vec& dx, Vec& x, Vec& grad1) {
  // compute the orthant to explore
  Vec orthant = x0;
  for (size_t i = 0; i < orthant.Size(); i++) {
    if (orthant[i] == 0) orthant[i] = -grad0[i];
  }

  double t = 1.0 / LINE_SEARCH_BETA;

  double f;
  do {
    t *= LINE_SEARCH_BETA;
    x = x0 + t * dx;
    x.Project(orthant);
    //    for (size_t i = 0; i < x.Size(); i++) {
    //      if (x0[i] != 0 && sign(x[i]) != sign(x0[i])) x[i] = 0;
    //    }

    f = regularized_func_grad(C, x, grad1);
    //        cout << "*";
  } while (f > f0 + LINE_SEARCH_ALPHA * dot_product(x - x0, grad0));

  return f;
}

vector<double> ME_Model::perform_OWLQN(const vector<double>& x0,
                                       const double C) {
  const size_t dim = x0.size();
  Vec x = x0;

  Vec grad(dim), dx(dim);
  double f = regularized_func_grad(C, x, grad);

  Vec s[M], y[M];
  double z[M];  // rho

  for (int iter = 0; iter < OWLQN_MAX_ITER; iter++) {
    Vec pg = pseudo_gradient(x, grad, C);

    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, -f, _train_error);
    if (_nheldout > 0) {
      const double heldout_logl = heldout_likelihood();
      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl,
              _heldout_error);
    }
    fprintf(stderr, "\n");

    if (sqrt(dot_product(pg, pg)) < MIN_GRAD_NORM) break;

    dx = -1 * approximate_Hg(iter, pg, s, y, z);
    if (dot_product(dx, pg) >= 0) dx.Project(-1 * pg);

    Vec x1(dim), grad1(dim);
    f = constrained_line_search(C, x, pg, f, dx, x1, grad1);

    s[iter % M] = x1 - x;
    y[iter % M] = grad1 - grad;
    z[iter % M] = 1.0 / dot_product(y[iter % M], s[iter % M]);

    x = x1;
    grad = grad1;
  }

  return x.STLVec();
}