diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
commit | dfbc278c1057555fda9312291c8024049e00b7d8 (patch) | |
tree | e922651d48b1c9f73857f0dabd31c55a3ce8a74b /decoder | |
parent | 289f96779e665ba24adca3461a624c68aa37bd99 (diff) |
frequency-based binning
Diffstat (limited to 'decoder')
-rw-r--r-- | decoder/Makefile.am | 1 | ||||
-rw-r--r-- | decoder/ff_csplit.cc | 2 | ||||
-rw-r--r-- | decoder/freqdict.cc | 29 | ||||
-rw-r--r-- | decoder/freqdict.h | 37 |
4 files changed, 33 insertions, 36 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am index a00b18af..ec51d643 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -76,7 +76,6 @@ libcdec_a_SOURCES = \ ff_source_syntax.cc \ ff_bleu.cc \ ff_factory.cc \ - freqdict.cc \ lexalign.cc \ lextrans.cc \ tagger.cc \ diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index 3991d38f..c9ed996c 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl { const int fl1_; const int fl2_; const int bad_; - FreqDict freq_dict_; + FreqDict<float> freq_dict_; set<WordID> bad_words_; }; diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc deleted file mode 100644 index 9e25d346..00000000 --- a/decoder/freqdict.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include <iostream> -#include <fstream> -#include <cassert> -#include "freqdict.h" -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -void FreqDict::Load(const std::string& fname) { - cerr << "Reading word frequencies: " << fname << endl; - ReadFile rf(fname); - istream& ifs = *rf.stream(); - int cc=0; - while (ifs) { - std::string word; - ifs >> word; - if (word.size() == 0) continue; - if (word[0] == '#') continue; - double count = 0; - ifs >> count; - assert(count > 0.0); // use -log(f) - counts_[TD::Convert(word)]=count; - ++cc; - if (cc % 10000 == 0) { std::cerr << "."; } - } - std::cerr << "\n"; - std::cerr << "Loaded " << cc << " words\n"; -} diff --git a/decoder/freqdict.h b/decoder/freqdict.h index 9acf0c33..4e03fadd 100644 --- a/decoder/freqdict.h +++ b/decoder/freqdict.h @@ -1,20 +1,47 @@ #ifndef _FREQDICT_H_ #define _FREQDICT_H_ +#include <iostream> #include <map> #include <string> #include "wordid.h" +#include "filelib.h" +#include "tdict.h" +template <typename T = float> class FreqDict { public: - void Load(const std::string& fname); - float LookUp(const WordID& word) const { - std::map<WordID,float>::const_iterator i = counts_.find(word); - if (i == counts_.end()) return 0; + FreqDict() : max_() {} + T Max() const { return max_; } + void Load(const std::string& fname) { + std::cerr << "Reading word statistics from: " << fname << std::endl; + ReadFile rf(fname); + std::istream& ifs = *rf.stream(); + int cc=0; + std::string word; + while (ifs) { + ifs >> word; + if (word.size() == 0) continue; + if (word[0] == '#') continue; + T count = 0; + ifs >> count; + if (count > max_) max_ = count; + counts_[TD::Convert(word)]=count; + ++cc; + if (cc % 10000 == 0) { std::cerr << "."; } + } + std::cerr << "\n"; + std::cerr << "Loaded " << cc << " words\n"; + } + + T LookUp(const WordID& word) const { + typename std::map<WordID,T>::const_iterator i = counts_.find(word); + if (i == counts_.end()) return T(); return i->second; } private: - std::map<WordID, float> counts_; + T max_; + std::map<WordID, T> counts_; }; #endif |