Rework the bad Weights class so it no longer keeps a copy of the weight vector inside it (its functions become static and operate on a caller-supplied vector)

author: Chris Dyer <cdyer@cs.cmu.edu> 2011-09-13 17:36:23 +0100
committer: Chris Dyer <cdyer@cs.cmu.edu> 2011-09-13 17:36:23 +0100
commit: 251da4347ea356f799e6c227ac8cf541c0cef2f2 (patch)
tree: 407e647e34aa89049754d83e9e1eb2cddff05de8 /utils
parent: 75bff8e374f3cdcf3dc141f8b7b37858d0611234 (diff)
4 files changed, 65 insertions, 42 deletions
diff --git a/utils/fdict.h b/utils/fdict.h
index 771e8b91..f0871b9a 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -28,6 +28,8 @@ struct FD {
   }
   static void EnableHash(const std::string& cmph_file) {  // body is empty unless HAVE_CMPH is defined
 #ifdef HAVE_CMPH
+    assert(dict_.max() == 0);  // precondition: nothing has been added to
+                               // the dictionary yet (checked only when asserts are enabled)
     hash_ = new PerfectHashFunction(cmph_file);  // NOTE(review): raw new; no matching delete is visible in this hunk
 #endif
   }
diff --git a/utils/phmt.cc b/utils/phmt.cc
index 1f59afaf..48d9f093 100644
--- a/utils/phmt.cc
+++ b/utils/phmt.cc
@@ -19,22 +19,18 @@ int main(int argc, char** argv) {
   cerr << "LexFE = " << FD::Convert("LexFE") << endl;
   cerr << "LexEF = " << FD::Convert("LexEF") << endl;
   {
-    Weights w;
     vector<weight_t> v(FD::NumFeats());
     v[FD::Convert("LexFE")] = 1.0;
     v[FD::Convert("LexEF")] = 0.5;
-    w.InitFromVector(v);
     cerr << "Writing...\n";
-    w.WriteToFile("weights.bin");
+    Weights::WriteToFile("weights.bin", v);
     cerr << "Done.\n";
   }
   {
-    Weights w;
     vector<weight_t> v(FD::NumFeats());
     cerr << "Reading...\n";
-    w.InitFromFile("weights.bin");
+    Weights::InitFromFile("weights.bin", &v);
     cerr << "Done.\n";
-    w.InitVector(&v);
     assert(v[FD::Convert("LexFE")] == 1.0);
     assert(v[FD::Convert("LexEF")] == 0.5);
   }
diff --git a/utils/weights.cc b/utils/weights.cc
index 0916b72a..c49000be 100644
--- a/utils/weights.cc
+++ b/utils/weights.cc
@@ -8,7 +8,10 @@
 
 using namespace std;
 
-void Weights::InitFromFile(const std::string& filename, vector<string>* feature_list) {
+void Weights::InitFromFile(const string& filename,
+                           vector<weight_t>* pweights,
+                           vector<string>* feature_list) {
+  vector<weight_t>& weights = *pweights;
   if (!SILENT) cerr << "Reading weights from " << filename << endl;
   ReadFile in_file(filename);
   istream& in = *in_file.stream();
@@ -47,16 +50,16 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
       int end = 0;
       while(end < buf.size() && buf[end] != ' ') ++end;
       const int fid = FD::Convert(buf.substr(start, end - start));
+      if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); }
       while(end < buf.size() && buf[end] == ' ') ++end;
       val = strtod(&buf.c_str()[end], NULL);
       if (isnan(val)) {
         cerr << FD::Convert(fid) << " has weight NaN!\n";
         abort();
       }
-      if (wv_.size() <= fid)
-        wv_.resize(fid + 1);
-      wv_[fid] = val;
-      if (feature_list) { feature_list->push_back(FD::Convert(fid)); }
+      if (weights.size() <= fid)
+        weights.resize(fid + 1);
+      weights[fid] = val;
       ++weight_count;
       if (!SILENT) {
         if (weight_count %   50000 == 0) { cerr << '.' << flush; fl = true; }
@@ -76,8 +79,8 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
       cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl;
       abort();
     }
-    wv_.resize(num_keys[0]);
-    in.get(reinterpret_cast<char*>(&wv_[0]), num_keys[0] * sizeof(weight_t));
+    weights.resize(num_keys[0]);
+    in.get(reinterpret_cast<char*>(&weights[0]), num_keys[0] * sizeof(weight_t));
     if (!in.good()) {
       cerr << "Error loading weights!\n";
       abort();
@@ -85,7 +88,10 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
   }
 }
 
-void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features, const string* extra) const {
+void Weights::WriteToFile(const string& fname,
+                          const vector<weight_t>& weights,
+                          bool hide_zero_value_features,
+                          const string* extra) {
   WriteFile out(fname);
   ostream& o = *out.stream();
   assert(o);
@@ -96,41 +102,54 @@ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_feature
     o.precision(17);
     const int num_feats = FD::NumFeats();
     for (int i = 1; i < num_feats; ++i) {
-      const weight_t val = (i < wv_.size() ? wv_[i] : 0.0);
+      const weight_t val = (i < weights.size() ? weights[i] : 0.0);
       if (hide_zero_value_features && val == 0.0) continue;
       o << FD::Convert(i) << ' ' << val << endl;
     }
   } else {
     o.write("_PHWf", 5);
     const size_t keys = FD::NumFeats();
-    assert(keys <= wv_.size());
+    assert(keys <= weights.size());
     o.write(reinterpret_cast<const char*>(&keys), sizeof(keys));
-    o.write(reinterpret_cast<const char*>(&wv_[0]), keys * sizeof(weight_t));
+    o.write(reinterpret_cast<const char*>(&weights[0]), keys * sizeof(weight_t));
   }
 }
 
-void Weights::InitVector(std::vector<weight_t>* w) const {
-  *w = wv_;
+void Weights::InitSparseVector(const vector<weight_t>& dv,  // dv: dense weights, read only
+                               SparseVector<weight_t>* sv) {  // sv: output, rebuilt from dv
+  sv->clear();  // reset the output first (presumably empties it; SparseVector is defined elsewhere)
+  for (unsigned i = 1; i < dv.size(); ++i) {  // starts at 1: entry 0 of dv is never copied
+    if (dv[i]) sv->set_value(i, dv[i]);  // only entries that test true (nonzero) are stored
+  }
 }
 
-void Weights::InitSparseVector(SparseVector<weight_t>* w) const {
-  for (int i = 1; i < wv_.size(); ++i) {
-    const weight_t& weight = wv_[i];
-    if (weight) w->set_value(i, weight);
+void Weights::SanityCheck(const vector<weight_t>& w) {  // asserts that no entry of w is NaN or infinite
+  for (int i = 0; i < w.size(); ++i) {  // every entry is visited, index 0 included
+    assert(!isnan(w[i]));  // NOTE(review): assert is compiled out under NDEBUG, so such builds check nothing
+    assert(!isinf(w[i]));  // rejects +inf and -inf
   }
 }
 
-void Weights::InitFromVector(const std::vector<weight_t>& w) {
-  wv_ = w;
-  if (wv_.size() > FD::NumFeats())
-    cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n";
-  wv_.resize(FD::NumFeats(), 0);
-}
+struct FComp {  // comparator over indices into a weight vector: larger |weight| sorts first
+  const vector<weight_t>& w_;  // held by reference, so the vector must outlive this comparator
+  FComp(const vector<weight_t>& w) : w_(w) {}  // binds w_ to w without copying it
+  bool operator()(int a, int b) const {  // a and b are used unchecked as indices into w_
+    return fabs(w_[a]) > fabs(w_[b]);  // true when index a has strictly larger magnitude than index b
+  }
+};
 
-void Weights::InitFromVector(const SparseVector<weight_t>& w) {
-  wv_.clear();
-  wv_.resize(FD::NumFeats(), 0.0);
-  for (int i = 1; i < FD::NumFeats(); ++i)
-    wv_[i] = w.value(i);
+void Weights::ShowLargestFeatures(const vector<weight_t>& w) {  // prints up to 10 largest-magnitude weights to cerr
+  vector<int> fnums(w.size());  // one slot per index of w
+  for (int i = 0; i < w.size(); ++i)  // fill with 0..size-1, so index 0 is a candidate too
+    fnums[i] = i;
+  vector<int>::iterator mid = fnums.begin();
+  mid += (w.size() > 10 ? 10 : w.size());  // mid marks min(10, w.size()) elements from the front
+  partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));  // [begin, mid) now holds the top indices, largest |w| first
+  cerr << "TOP FEATURES:";
+  for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {  // emit the selected indices in sorted order
+    cerr << ' ' << FD::Convert(*i) << '=' << w[*i];  // presumably FD::Convert maps the id to its feature name
+  }
+  cerr << endl;
 }
 
+
diff --git a/utils/weights.h b/utils/weights.h
index 7664810b..30f71db0 100644
--- a/utils/weights.h
+++ b/utils/weights.h
@@ -10,15 +10,21 @@ typedef double weight_t;
 
 class Weights {
  public:
-  Weights() {}
-  void InitFromFile(const std::string& fname, std::vector<std::string>* feature_list = NULL);
-  void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const;
-  void InitVector(std::vector<weight_t>* w) const;
-  void InitSparseVector(SparseVector<weight_t>* w) const;
-  void InitFromVector(const std::vector<weight_t>& w);
-  void InitFromVector(const SparseVector<weight_t>& w);
+  static void InitFromFile(const std::string& fname,  // reads the weights stored in fname
+                           std::vector<weight_t>* weights,  // output: resized as needed and filled in
+                           std::vector<std::string>* feature_list = NULL);  // if non-NULL, feature names parsed from text lines are appended
+  static void WriteToFile(const std::string& fname,  // writes weights to fname
+                          const std::vector<weight_t>& weights,  // values to write, indexed by feature id
+                          bool hide_zero_value_features = true,  // text output skips entries equal to 0.0 when true
+                          const std::string* extra = NULL);
+  static void InitSparseVector(const std::vector<weight_t>& dv,  // dense input
+                               SparseVector<weight_t>* sv);  // output: cleared, then given the nonzero entries of dv from index 1 on
+  // assert that no entry is NaN or infinite (assert-based, so inactive under NDEBUG)
+  static void SanityCheck(const std::vector<weight_t>& w);
+  // write up to 10 weights with the largest magnitude to cerr
+  static void ShowLargestFeatures(const std::vector<weight_t>& w);
  private:
-  std::vector<weight_t> wv_;
+  Weights();  // private; every public member is static, so presumably the class is not meant to be instantiated
 };
 
 #endif
author	Chris Dyer <cdyer@cs.cmu.edu>	2011-09-13 17:36:23 +0100
committer	Chris Dyer <cdyer@cs.cmu.edu>	2011-09-13 17:36:23 +0100
commit	251da4347ea356f799e6c227ac8cf541c0cef2f2 (patch)
tree	407e647e34aa89049754d83e9e1eb2cddff05de8 /utils
parent	75bff8e374f3cdcf3dc141f8b7b37858d0611234 (diff)