From aac3ef3e3fdf636406fc61a40096cee6381e5461 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 29 Dec 2011 21:08:30 -0500 Subject: lexical alignment samplers --- gi/pf/unigrams.cc | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 gi/pf/unigrams.cc (limited to 'gi/pf/unigrams.cc') diff --git a/gi/pf/unigrams.cc b/gi/pf/unigrams.cc new file mode 100644 index 00000000..40829775 --- /dev/null +++ b/gi/pf/unigrams.cc @@ -0,0 +1,80 @@ +#include "unigrams.h" + +#include +#include + +#include "stringlib.h" +#include "filelib.h" + +using namespace std; + +void UnigramModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + const WordID w = TD::Convert(line.substr(pos + 1)); + line[pos] = 0; + float p = atof(&line[0]); + if (w < probs_.size()) probs_[w].logeq(p * log(10)); else cerr << "WARNING: don't know about '" << TD::Convert(w) << "'\n"; + } +} + +void UnigramWordModel::LoadUnigrams(const string& fname) { + cerr << "Loading unigram probabilities from " << fname << " ..." << endl; + ReadFile rf(fname); + string line; + istream& in = *rf.stream(); + assert(in); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\data\\"); + getline(in, line); + size_t pos = line.find("ngram 1="); + assert(pos == 0); + assert(line.size() > 8); + const size_t num_unigrams = atoi(&line[8]); + getline(in, line); + assert(line.empty()); + getline(in, line); + assert(line == "\\1-grams:"); + for (size_t i = 0; i < num_unigrams; ++i) { + getline(in, line); + assert(line.size() > 0); + pos = line.find('\t'); + assert(pos > 0); + assert(pos + 1 < line.size()); + size_t cur = pos + 1; + vector w; + while (cur < line.size()) { + const size_t len = UTF8Len(line[cur]); + w.push_back(TD::Convert(line.substr(cur, len))); + cur += len; + } + line[pos] = 0; + float p = atof(&line[0]); + probs_[w].logeq(p * log(10.0)); + } +} + -- cgit v1.2.3