summaryrefslogtreecommitdiff
path: root/decoder/freqdict.h
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@cs.cmu.edu> 2012-03-10 16:42:12 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-03-10 16:42:12 -0500
commit: dfbc278c1057555fda9312291c8024049e00b7d8 (patch)
tree: e922651d48b1c9f73857f0dabd31c55a3ce8a74b /decoder/freqdict.h
parent: 289f96779e665ba24adca3461a624c68aa37bd99 (diff)
frequency-based binning
Diffstat (limited to 'decoder/freqdict.h')
-rw-r--r-- decoder/freqdict.h 37
1 files changed, 32 insertions, 5 deletions
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
#ifndef _FREQDICT_H_
#define _FREQDICT_H_
+#include <iostream>
#include <map>
#include <string>
#include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
+template <typename T = float>
class FreqDict {
public:
- void Load(const std::string& fname);
- float LookUp(const WordID& word) const {
- std::map<WordID,float>::const_iterator i = counts_.find(word);
- if (i == counts_.end()) return 0;
+ FreqDict() : max_() {}
+ T Max() const { return max_; }
+ void Load(const std::string& fname) {
+ std::cerr << "Reading word statistics from: " << fname << std::endl;
+ ReadFile rf(fname);
+ std::istream& ifs = *rf.stream();
+ int cc=0;
+ std::string word;
+ while (ifs) {
+ ifs >> word;
+ if (word.size() == 0) continue;
+ if (word[0] == '#') continue;
+ T count = 0;
+ ifs >> count;
+ if (count > max_) max_ = count;
+ counts_[TD::Convert(word)]=count;
+ ++cc;
+ if (cc % 10000 == 0) { std::cerr << "."; }
+ }
+ std::cerr << "\n";
+ std::cerr << "Loaded " << cc << " words\n";
+ }
+
+ T LookUp(const WordID& word) const {
+ typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+ if (i == counts_.end()) return T();
return i->second;
}
private:
- std::map<WordID, float> counts_;
+ T max_;
+ std::map<WordID, T> counts_;
};
#endif