summary | refs | log | tree | commit | diff
path: root/decoder
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2012-03-10 16:42:12 -0500
committer Chris Dyer <cdyer@cs.cmu.edu> 2012-03-10 16:42:12 -0500
commit dfbc278c1057555fda9312291c8024049e00b7d8 (patch)
tree e922651d48b1c9f73857f0dabd31c55a3ce8a74b /decoder
parent 289f96779e665ba24adca3461a624c68aa37bd99 (diff)
frequency-based binning
Diffstat (limited to 'decoder')
-rw-r--r-- decoder/Makefile.am | 1
-rw-r--r-- decoder/ff_csplit.cc | 2
-rw-r--r-- decoder/freqdict.cc | 29
-rw-r--r-- decoder/freqdict.h | 37
4 files changed, 33 insertions, 36 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index a00b18af..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -76,7 +76,6 @@ libcdec_a_SOURCES = \
ff_source_syntax.cc \
ff_bleu.cc \
ff_factory.cc \
- freqdict.cc \
lexalign.cc \
lextrans.cc \
tagger.cc \
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
const int fl1_;
const int fl2_;
const int bad_;
- FreqDict freq_dict_;
+ FreqDict<float> freq_dict_;
set<WordID> bad_words_;
};
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
- cerr << "Reading word frequencies: " << fname << endl;
- ReadFile rf(fname);
- istream& ifs = *rf.stream();
- int cc=0;
- while (ifs) {
- std::string word;
- ifs >> word;
- if (word.size() == 0) continue;
- if (word[0] == '#') continue;
- double count = 0;
- ifs >> count;
- assert(count > 0.0); // use -log(f)
- counts_[TD::Convert(word)]=count;
- ++cc;
- if (cc % 10000 == 0) { std::cerr << "."; }
- }
- std::cerr << "\n";
- std::cerr << "Loaded " << cc << " words\n";
-}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
#ifndef _FREQDICT_H_
#define _FREQDICT_H_
+#include <iostream>
#include <map>
#include <string>
#include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
+template <typename T = float>
class FreqDict {
public:
- void Load(const std::string& fname);
- float LookUp(const WordID& word) const {
- std::map<WordID,float>::const_iterator i = counts_.find(word);
- if (i == counts_.end()) return 0;
+ FreqDict() : max_() {}
+ T Max() const { return max_; }
+ void Load(const std::string& fname) {
+ std::cerr << "Reading word statistics from: " << fname << std::endl;
+ ReadFile rf(fname);
+ std::istream& ifs = *rf.stream();
+ int cc=0;
+ std::string word;
+ while (ifs) {
+ ifs >> word;
+ if (word.size() == 0) continue;
+ if (word[0] == '#') continue;
+ T count = 0;
+ ifs >> count;
+ if (count > max_) max_ = count;
+ counts_[TD::Convert(word)]=count;
+ ++cc;
+ if (cc % 10000 == 0) { std::cerr << "."; }
+ }
+ std::cerr << "\n";
+ std::cerr << "Loaded " << cc << " words\n";
+ }
+
+ T LookUp(const WordID& word) const {
+ typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+ if (i == counts_.end()) return T();
return i->second;
}
private:
- std::map<WordID, float> counts_;
+ T max_;
+ std::map<WordID, T> counts_;
};
#endif