diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
commit | dfbc278c1057555fda9312291c8024049e00b7d8 (patch) | |
tree | e922651d48b1c9f73857f0dabd31c55a3ce8a74b /decoder/freqdict.h | |
parent | 289f96779e665ba24adca3461a624c68aa37bd99 (diff) |
frequency-based binning
Diffstat (limited to 'decoder/freqdict.h')
-rw-r--r-- | decoder/freqdict.h | 37 |
1 files changed, 32 insertions, 5 deletions
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
 #ifndef _FREQDICT_H_
 #define _FREQDICT_H_
 
+#include <iostream>
 #include <map>
 #include <string>
 #include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
 
+template <typename T = float>
 class FreqDict {
  public:
-  void Load(const std::string& fname);
-  float LookUp(const WordID& word) const {
-    std::map<WordID,float>::const_iterator i = counts_.find(word);
-    if (i == counts_.end()) return 0;
+  FreqDict() : max_() {}
+  T Max() const { return max_; }
+  void Load(const std::string& fname) {
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc=0;
+    std::string word;
+    while (ifs) {
+      ifs >> word;
+      if (word.size() == 0) continue;
+      if (word[0] == '#') continue;
+      T count = 0;
+      ifs >> count;
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)]=count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+
+  T LookUp(const WordID& word) const {
+    typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+    if (i == counts_.end()) return T();
     return i->second;
   }
  private:
-  std::map<WordID, float> counts_;
+  T max_;
+  std::map<WordID, T> counts_;
 };
 
 #endif