From 967e1c98980b07909b03ff5e3e71442cbeb216e8 Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Wed, 19 Oct 2011 20:56:22 +0200
Subject: merged, compiles but not working
---
klm/lm/binary_format.cc | 4 --
klm/lm/search_trie.cc | 123 ------------------------------------------------
klm/lm/trie.cc | 10 ----
3 files changed, 137 deletions(-)
(limited to 'klm')
diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc
index eac8aa85..27cada13 100644
--- a/klm/lm/binary_format.cc
+++ b/klm/lm/binary_format.cc
@@ -182,10 +182,6 @@ void SeekPastHeader(int fd, const Parameters &params) {
SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
}
-void SeekPastHeader(int fd, const Parameters &params) {
- SeekOrThrow(fd, TotalHeaderSize(params.counts.size()));
-}
-
uint8_t *SetupBinary(const Config &config, const Parameters &params, std::size_t memory_size, Backing &backing) {
const off_t file_size = util::SizeFile(backing.file.get());
// The header is smaller than a page, so we have to map the whole header as well.
diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc
index 1bcfe27d..5d8c70db 100644
--- a/klm/lm/search_trie.cc
+++ b/klm/lm/search_trie.cc
@@ -234,19 +234,8 @@ class FindBlanks {
return unigrams_[index].prob;
}
-<<<<<<< HEAD
-// Phase to count n-grams, including blanks inserted because they were pruned but have extensions
-class JustCount {
- public:
- template JustCount(ContextReader * /*contexts*/, UnigramValue * /*unigrams*/, Middle * /*middle*/, Longest &/*longest*/, uint64_t *counts, unsigned char order)
- : counts_(counts), longest_counts_(counts + order - 1) {}
-
- void Unigrams(WordIndex begin, WordIndex end) {
- counts_[0] += end - begin;
-=======
void Unigram(WordIndex /*index*/) {
++counts_[0];
->>>>>>> upstream/master
}
void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char lower, float prob_basis) {
@@ -278,11 +267,7 @@ class JustCount {
// Phase to actually write n-grams to the trie.
template class WriteEntries {
public:
-<<<<<<< HEAD
- WriteEntries(ContextReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, const uint64_t * /*counts*/, unsigned char order) :
-=======
WriteEntries(RecordReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) :
->>>>>>> upstream/master
contexts_(contexts),
unigrams_(unigrams),
middle_(middle),
@@ -330,16 +315,8 @@ template class WriteEntries {
SRISucks &sri_;
};
-<<<<<<< HEAD
-template class RecursiveInsert {
- public:
- template RecursiveInsert(SortedFileReader *inputs, ContextReader *contexts, UnigramValue *unigrams, MiddleT *middle, LongestT &longest, uint64_t *counts, unsigned char order) :
- doing_(contexts, unigrams, middle, longest, counts, order), inputs_(inputs), inputs_end_(inputs + order - 1), order_minus_2_(order - 2) {
- }
-=======
struct Gram {
Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), end(in_begin + order) {}
->>>>>>> upstream/master
const WordIndex *begin, *end;
@@ -440,29 +417,6 @@ void SanityCheckCounts(const std::vector &initial, const std::vector void TrainQuantizer(uint8_t order, uint64_t count, SortedFileReader &reader, util::ErsatzProgress &progress, Quant &quant) {
- ProbBackoff weights;
- std::vector probs, backoffs;
- probs.reserve(count);
- backoffs.reserve(count);
- for (reader.Rewind(); !reader.Ended(); reader.NextHeader()) {
- uint64_t entries = reader.ReadCount();
- for (uint64_t c = 0; c < entries; ++c) {
- reader.ReadWord();
- reader.ReadWeights(weights);
- // kBlankProb isn't added yet.
- probs.push_back(weights.prob);
- if (weights.backoff != 0.0) backoffs.push_back(weights.backoff);
- ++progress;
- }
-=======
template void TrainQuantizer(uint8_t order, uint64_t count, const std::vector &additional, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) {
std::vector probs(additional), backoffs;
probs.reserve(count + additional.size());
@@ -472,26 +426,10 @@ template void TrainQuantizer(uint8_t order, uint64_t count, const
probs.push_back(weights.prob);
if (weights.backoff != 0.0) backoffs.push_back(weights.backoff);
++progress;
->>>>>>> upstream/master
}
quant.Train(order, probs, backoffs);
}
-<<<<<<< HEAD
-template void TrainProbQuantizer(uint8_t order, uint64_t count, SortedFileReader &reader, util::ErsatzProgress &progress, Quant &quant) {
- Prob weights;
- std::vector probs, backoffs;
- probs.reserve(count);
- for (reader.Rewind(); !reader.Ended(); reader.NextHeader()) {
- uint64_t entries = reader.ReadCount();
- for (uint64_t c = 0; c < entries; ++c) {
- reader.ReadWord();
- reader.ReadWeights(weights);
- // kBlankProb isn't added yet.
- probs.push_back(weights.prob);
- ++progress;
- }
-=======
template void TrainProbQuantizer(uint8_t order, uint64_t count, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) {
std::vector probs, backoffs;
probs.reserve(count);
@@ -499,18 +437,10 @@ template void TrainProbQuantizer(uint8_t order, uint64_t count, Re
const Prob &weights = *reinterpret_cast(reinterpret_cast(reader.Data()) + sizeof(WordIndex) * order);
probs.push_back(weights.prob);
++progress;
->>>>>>> upstream/master
}
quant.TrainProb(order, probs);
}
-<<<<<<< HEAD
-} // namespace
-
-template void BuildTrie(const std::string &file_prefix, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
- std::vector inputs(counts.size() - 1);
- std::vector contexts(counts.size() - 1);
-=======
void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) {
// Fill unigram probabilities.
try {
@@ -533,7 +463,6 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
template void BuildTrie(const std::string &file_prefix, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
RecordReader inputs[kMaxOrder - 1];
RecordReader contexts[kMaxOrder - 1];
->>>>>>> upstream/master
for (unsigned char i = 2; i <= counts.size(); ++i) {
std::stringstream assembled;
@@ -548,17 +477,12 @@ template void BuildTrie(const std::string &file_pre
SRISucks sri;
std::vector fixed_counts(counts.size());
{
-<<<<<<< HEAD
- RecursiveInsert counter(&*inputs.begin(), &*contexts.begin(), NULL, out.middle_begin_, out.longest, &*fixed_counts.begin(), counts.size());
- counter.Apply(config.messages, "Counting n-grams that should not have been pruned", counts[0]);
-=======
std::string temp(file_prefix); temp += "unigrams";
util::scoped_fd unigram_file(util::OpenReadOrThrow(temp.c_str()));
util::scoped_memory unigrams;
MapRead(util::POPULATE_OR_READ, unigram_file.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams);
FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast(unigrams.get()), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder);
->>>>>>> upstream/master
}
for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) {
if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading");
@@ -566,18 +490,6 @@ template void BuildTrie(const std::string &file_pre
SanityCheckCounts(counts, fixed_counts);
counts = fixed_counts;
-<<<<<<< HEAD
- out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch::Size(fixed_counts, config), backing), fixed_counts, config);
-
- if (Quant::kTrain) {
- util::ErsatzProgress progress(config.messages, "Quantizing", std::accumulate(counts.begin() + 1, counts.end(), 0));
- for (unsigned char i = 2; i < counts.size(); ++i) {
- TrainQuantizer(i, counts[i-1], inputs[i-2], progress, quant);
- }
- TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 2], progress, quant);
- quant.FinishedLoading(config);
- }
-=======
util::scoped_FILE unigram_file;
{
std::string name(file_prefix + "unigrams");
@@ -587,7 +499,6 @@ template void BuildTrie(const std::string &file_pre
sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs);
out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch::Size(fixed_counts, config), backing), fixed_counts, config);
->>>>>>> upstream/master
for (unsigned char i = 2; i <= counts.size(); ++i) {
inputs[i-2].Rewind();
@@ -610,30 +521,8 @@ template void BuildTrie(const std::string &file_pre
}
// Fill entries except unigram probabilities.
{
-<<<<<<< HEAD
- RecursiveInsert > inserter(&*inputs.begin(), &*contexts.begin(), unigrams, out.middle_begin_, out.longest, &*fixed_counts.begin(), counts.size());
- inserter.Apply(config.messages, "Building trie", fixed_counts[0]);
- }
-
- // Fill unigram probabilities.
- try {
- std::string name(file_prefix + "unigrams");
- util::scoped_FILE file(OpenOrThrow(name.c_str(), "r"));
- for (WordIndex i = 0; i < counts[0]; ++i) {
- ReadOrThrow(file.get(), &unigrams[i].weights, sizeof(ProbBackoff));
- if (contexts[0] && **contexts[0] == i) {
- SetExtension(unigrams[i].weights.backoff);
- ++contexts[0];
- }
- }
- RemoveOrThrow(name.c_str());
- } catch (util::Exception &e) {
- e << " while re-reading unigram probabilities";
- throw;
-=======
WriteEntries writer(contexts, unigrams, out.middle_begin_, out.longest, counts.size(), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer);
->>>>>>> upstream/master
}
// Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation.
@@ -687,17 +576,6 @@ template uint8_t *TrieSearch::Setup
}
longest.Init(start, quant_.Long(counts.size()), counts[0]);
return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]);
-<<<<<<< HEAD
-}
-
-template void TrieSearch::LoadedBinary() {
- unigram.LoadedBinary();
- for (Middle *i = middle_begin_; i != middle_end_; ++i) {
- i->LoadedBinary();
- }
- longest.LoadedBinary();
-}
-=======
}
template void TrieSearch::LoadedBinary() {
@@ -715,7 +593,6 @@ bool IsDirectory(const char *path) {
return S_ISDIR(info.st_mode);
}
} // namespace
->>>>>>> upstream/master
template void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) {
std::string temporary_directory;
diff --git a/klm/lm/trie.cc b/klm/lm/trie.cc
index a1136b6f..20075bb8 100644
--- a/klm/lm/trie.cc
+++ b/klm/lm/trie.cc
@@ -91,15 +91,6 @@ template bool BitPackedMiddle::Find
if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) {
return false;
}
-<<<<<<< HEAD
- uint64_t index = at_pointer;
- at_pointer *= total_bits_;
- at_pointer += word_bits_;
- quant_.Read(base_, at_pointer, prob, backoff);
- at_pointer += quant_.TotalBits();
-
- bhiksha_.ReadNext(base_, at_pointer, index, total_bits_, range);
-=======
pointer = at_pointer;
at_pointer *= total_bits_;
at_pointer += word_bits_;
@@ -108,7 +99,6 @@ template bool BitPackedMiddle::Find
at_pointer += quant_.TotalBits();
bhiksha_.ReadNext(base_, at_pointer, pointer, total_bits_, range);
->>>>>>> upstream/master
return true;
}
--
cgit v1.2.3