diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2011-12-29 21:08:30 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2011-12-29 21:08:30 -0500 |
commit | aac3ef3e3fdf636406fc61a40096cee6381e5461 (patch) | |
tree | e014b029915614a23a8b2691a37fa14204a84a89 /gi/pf/unigrams.cc | |
parent | c96125ad4860aa823ff2f17aa3ccb565c8f99c0e (diff) |
lexical alignment samplers
Diffstat (limited to 'gi/pf/unigrams.cc')
-rw-r--r-- | gi/pf/unigrams.cc | 80 |
1 files changed, 80 insertions, 0 deletions
diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc new file mode 100644 index 00000000..40829775 --- /dev/null +++ b/gi/pf/unigrams.cc @@ -0,0 +1,80 @@ +#include "unigrams.h" + +#include <string> +#include <cmath> + +#include "stringlib.h" +#include "filelib.h" + +using namespace std; + +void UnigramModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; + } +} + +void UnigramWordModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + size_t cur = pos + 1; + vector<WordID> w; + while (cur < line.size()) { + const size_t len = UTF8Len(line[cur]); + w.push_back(TD::Convert(line.substr(cur, len))); + cur += len; + } + line[pos] = 0; + float p = atof(&line[0]); + probs_[w].logeq(p * log(10.0)); + } +} + |