author    Chris Dyer <cdyer@cs.cmu.edu>    2011-09-13 17:36:23 +0100
committer Chris Dyer <cdyer@cs.cmu.edu>    2011-09-13 17:36:23 +0100
commit    251da4347ea356f799e6c227ac8cf541c0cef2f2 (patch)
tree      407e647e34aa89049754d83e9e1eb2cddff05de8 /utils
parent    75bff8e374f3cdcf3dc141f8b7b37858d0611234 (diff)
get rid of bad Weights class so it no longer keeps a copy of a vector inside it
Diffstat (limited to 'utils')
-rw-r--r--  utils/fdict.h     2
-rw-r--r--  utils/phmt.cc     8
-rw-r--r--  utils/weights.cc  75
-rw-r--r--  utils/weights.h   22
4 files changed, 65 insertions, 42 deletions
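In short, this commit makes Weights a non-instantiable utility class: the wv_ member vector is gone, every method becomes static, and callers own the std::vector<weight_t> themselves. A minimal before/after sketch of the calling pattern (the weight file names are hypothetical; the signatures are those declared in the new weights.h below):

    #include <vector>
    #include "weights.h"

    std::vector<weight_t> v;

    // Old instance-based API (removed by this commit):
    //   Weights w;
    //   w.InitFromFile("weights.txt");
    //   w.InitVector(&v);

    // New static API: the caller supplies and owns the vector.
    Weights::InitFromFile("weights.txt", &v);
    Weights::SanityCheck(v);                 // asserts no NaN/inf entries
    Weights::WriteToFile("weights.out", v);  // by default hides zero-valued features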
diff --git a/utils/fdict.h b/utils/fdict.h
index 771e8b91..f0871b9a 100644
--- a/utils/fdict.h
+++ b/utils/fdict.h
@@ -28,6 +28,8 @@ struct FD {
}
static void EnableHash(const std::string& cmph_file) {
#ifdef HAVE_CMPH
+ assert(dict_.max() == 0); // the dictionary must not have
+ // been added to yet
hash_ = new PerfectHashFunction(cmph_file);
#endif
}
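The new assertion makes an ordering constraint explicit: the cmph perfect hash must be installed while the feature dictionary is still empty, presumably because ids handed out by the ordinary dictionary would not agree with the ids the perfect hash assigns. A hedged sketch (the .cmph file name is hypothetical):

    FD::EnableHash("features.cmph");   // OK: nothing interned yet
    int fid = FD::Convert("LexFE");    // id now comes from the perfect hash

    // The reversed order would trip the new assert: calling Convert()
    // before EnableHash() grows the dictionary, so dict_.max() != 0.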
diff --git a/utils/phmt.cc b/utils/phmt.cc
index 1f59afaf..48d9f093 100644
--- a/utils/phmt.cc
+++ b/utils/phmt.cc
@@ -19,22 +19,18 @@ int main(int argc, char** argv) {
cerr << "LexFE = " << FD::Convert("LexFE") << endl;
cerr << "LexEF = " << FD::Convert("LexEF") << endl;
{
- Weights w;
vector<weight_t> v(FD::NumFeats());
v[FD::Convert("LexFE")] = 1.0;
v[FD::Convert("LexEF")] = 0.5;
- w.InitFromVector(v);
cerr << "Writing...\n";
- w.WriteToFile("weights.bin");
+ Weights::WriteToFile("weights.bin", v);
cerr << "Done.\n";
}
{
- Weights w;
vector<weight_t> v(FD::NumFeats());
cerr << "Reading...\n";
- w.InitFromFile("weights.bin");
+ Weights::InitFromFile("weights.bin", &v);
cerr << "Done.\n";
- w.InitVector(&v);
assert(v[FD::Convert("LexFE")] == 1.0);
assert(v[FD::Convert("LexEF")] == 0.5);
}
diff --git a/utils/weights.cc b/utils/weights.cc
index 0916b72a..c49000be 100644
--- a/utils/weights.cc
+++ b/utils/weights.cc
@@ -8,7 +8,10 @@
using namespace std;
-void Weights::InitFromFile(const std::string& filename, vector<string>* feature_list) {
+void Weights::InitFromFile(const string& filename,
+ vector<weight_t>* pweights,
+ vector<string>* feature_list) {
+ vector<weight_t>& weights = *pweights;
if (!SILENT) cerr << "Reading weights from " << filename << endl;
ReadFile in_file(filename);
istream& in = *in_file.stream();
@@ -47,16 +50,16 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
int end = 0;
while(end < buf.size() && buf[end] != ' ') ++end;
const int fid = FD::Convert(buf.substr(start, end - start));
+ if (feature_list) { feature_list->push_back(buf.substr(start, end - start)); }
while(end < buf.size() && buf[end] == ' ') ++end;
val = strtod(&buf.c_str()[end], NULL);
if (isnan(val)) {
cerr << FD::Convert(fid) << " has weight NaN!\n";
abort();
}
- if (wv_.size() <= fid)
- wv_.resize(fid + 1);
- wv_[fid] = val;
- if (feature_list) { feature_list->push_back(FD::Convert(fid)); }
+ if (weights.size() <= fid)
+ weights.resize(fid + 1);
+ weights[fid] = val;
++weight_count;
if (!SILENT) {
if (weight_count % 50000 == 0) { cerr << '.' << flush; fl = true; }
@@ -76,8 +79,8 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
cerr << "Hash function reports " << FD::NumFeats() << " keys but weights file contains " << num_keys[0] << endl;
abort();
}
- wv_.resize(num_keys[0]);
- in.get(reinterpret_cast<char*>(&wv_[0]), num_keys[0] * sizeof(weight_t));
+ weights.resize(num_keys[0]);
+ in.get(reinterpret_cast<char*>(&weights[0]), num_keys[0] * sizeof(weight_t));
if (!in.good()) {
cerr << "Error loading weights!\n";
abort();
@@ -85,7 +88,10 @@ void Weights::InitFromFile(const std::string& filename, vector<string>* feature_
}
}
-void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_features, const string* extra) const {
+void Weights::WriteToFile(const string& fname,
+ const vector<weight_t>& weights,
+ bool hide_zero_value_features,
+ const string* extra) {
WriteFile out(fname);
ostream& o = *out.stream();
assert(o);
@@ -96,41 +102,54 @@ void Weights::WriteToFile(const std::string& fname, bool hide_zero_value_feature
o.precision(17);
const int num_feats = FD::NumFeats();
for (int i = 1; i < num_feats; ++i) {
- const weight_t val = (i < wv_.size() ? wv_[i] : 0.0);
+ const weight_t val = (i < weights.size() ? weights[i] : 0.0);
if (hide_zero_value_features && val == 0.0) continue;
o << FD::Convert(i) << ' ' << val << endl;
}
} else {
o.write("_PHWf", 5);
const size_t keys = FD::NumFeats();
- assert(keys <= wv_.size());
+ assert(keys <= weights.size());
o.write(reinterpret_cast<const char*>(&keys), sizeof(keys));
- o.write(reinterpret_cast<const char*>(&wv_[0]), keys * sizeof(weight_t));
+ o.write(reinterpret_cast<const char*>(&weights[0]), keys * sizeof(weight_t));
}
}
-void Weights::InitVector(std::vector<weight_t>* w) const {
- *w = wv_;
+void Weights::InitSparseVector(const vector<weight_t>& dv,
+ SparseVector<weight_t>* sv) {
+ sv->clear();
+ for (unsigned i = 1; i < dv.size(); ++i) {
+ if (dv[i]) sv->set_value(i, dv[i]);
+ }
}
-void Weights::InitSparseVector(SparseVector<weight_t>* w) const {
- for (int i = 1; i < wv_.size(); ++i) {
- const weight_t& weight = wv_[i];
- if (weight) w->set_value(i, weight);
+void Weights::SanityCheck(const vector<weight_t>& w) {
+ for (int i = 0; i < w.size(); ++i) {
+ assert(!isnan(w[i]));
+ assert(!isinf(w[i]));
}
}
-void Weights::InitFromVector(const std::vector<weight_t>& w) {
- wv_ = w;
- if (wv_.size() > FD::NumFeats())
- cerr << "WARNING: initializing weight vector has more features than the global feature dictionary!\n";
- wv_.resize(FD::NumFeats(), 0);
-}
+struct FComp {
+ const vector<weight_t>& w_;
+ FComp(const vector<weight_t>& w) : w_(w) {}
+ bool operator()(int a, int b) const {
+ return fabs(w_[a]) > fabs(w_[b]);
+ }
+};
-void Weights::InitFromVector(const SparseVector<weight_t>& w) {
- wv_.clear();
- wv_.resize(FD::NumFeats(), 0.0);
- for (int i = 1; i < FD::NumFeats(); ++i)
- wv_[i] = w.value(i);
+void Weights::ShowLargestFeatures(const vector<weight_t>& w) {
+ vector<int> fnums(w.size());
+ for (int i = 0; i < w.size(); ++i)
+ fnums[i] = i;
+ vector<int>::iterator mid = fnums.begin();
+ mid += (w.size() > 10 ? 10 : w.size());
+ partial_sort(fnums.begin(), mid, fnums.end(), FComp(w));
+ cerr << "TOP FEATURES:";
+ for (vector<int>::iterator i = fnums.begin(); i != mid; ++i) {
+ cerr << ' ' << FD::Convert(*i) << '=' << w[*i];
+ }
+ cerr << endl;
}
+
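All of the surviving helpers now take the dense weight vector as an explicit argument. A short usage sketch of the converted and newly added functions (the file name and values are illustrative):

    std::vector<weight_t> dense;
    Weights::InitFromFile("weights.txt", &dense);

    // Dense-to-sparse conversion now takes both vectors explicitly.
    SparseVector<weight_t> sparse;
    Weights::InitSparseVector(dense, &sparse);

    // Print the (up to) ten largest-magnitude features to cerr.
    Weights::ShowLargestFeatures(dense);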
diff --git a/utils/weights.h b/utils/weights.h
index 7664810b..30f71db0 100644
--- a/utils/weights.h
+++ b/utils/weights.h
@@ -10,15 +10,21 @@ typedef double weight_t;
class Weights {
public:
- Weights() {}
- void InitFromFile(const std::string& fname, std::vector<std::string>* feature_list = NULL);
- void WriteToFile(const std::string& fname, bool hide_zero_value_features = true, const std::string* extra = NULL) const;
- void InitVector(std::vector<weight_t>* w) const;
- void InitSparseVector(SparseVector<weight_t>* w) const;
- void InitFromVector(const std::vector<weight_t>& w);
- void InitFromVector(const SparseVector<weight_t>& w);
+ static void InitFromFile(const std::string& fname,
+ std::vector<weight_t>* weights,
+ std::vector<std::string>* feature_list = NULL);
+ static void WriteToFile(const std::string& fname,
+ const std::vector<weight_t>& weights,
+ bool hide_zero_value_features = true,
+ const std::string* extra = NULL);
+ static void InitSparseVector(const std::vector<weight_t>& dv,
+ SparseVector<weight_t>* sv);
+ // check for infinities, NaNs, etc
+ static void SanityCheck(const std::vector<weight_t>& w);
+ // write weights with largest magnitude to cerr
+ static void ShowLargestFeatures(const std::vector<weight_t>& w);
private:
- std::vector<weight_t> wv_;
+ Weights();
};
#endif