From ef2df950520a47ca7011736648334eedeae5297a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 19 Oct 2011 20:56:22 +0200 Subject: merged, compiles but not working --- .gitignore | 3 + decoder/ff_klm.cc | 19 ------- dtrain/dtrain.cc | 75 ++++++++++++++++--------- dtrain/dtrain.h | 2 + dtrain/kbestget.h | 6 +- dtrain/test/example/dtrain.ini | 8 +-- klm/lm/binary_format.cc | 4 -- klm/lm/search_trie.cc | 123 ----------------------------------------- klm/lm/trie.cc | 10 ---- utils/fdict.h | 1 - 10 files changed, 63 insertions(+), 188 deletions(-) diff --git a/.gitignore b/.gitignore index 7e63c4ef..43b48a97 100644 --- a/.gitignore +++ b/.gitignore @@ -155,3 +155,6 @@ training/compute_cllh dtrain/dtrain weights.gz dtrain/test/eval/ +phrasinator/gibbs_train_plm_notables +training/mpi_flex_optimize +utils/phmt diff --git a/decoder/ff_klm.cc b/decoder/ff_klm.cc index 28bcb6b9..ed6f731e 100644 --- a/decoder/ff_klm.cc +++ b/decoder/ff_klm.cc @@ -392,22 +392,3 @@ std::string KLanguageModelFactory::usage(bool params,bool verbose) const { return KLanguageModel::usage(params, verbose); } - switch (m) { - case HASH_PROBING: - return CreateModel(param); - case TRIE_SORTED: - return CreateModel(param); - case ARRAY_TRIE_SORTED: - return CreateModel(param); - case QUANT_TRIE_SORTED: - return CreateModel(param); - case QUANT_ARRAY_TRIE_SORTED: - return CreateModel(param); - default: - UTIL_THROW(util::Exception, "Unrecognized kenlm binary file type " << (unsigned)m); - } -} - -std::string KLanguageModelFactory::usage(bool params,bool verbose) const { - return KLanguageModel::usage(params, verbose); -} diff --git a/dtrain/dtrain.cc b/dtrain/dtrain.cc index 0a94f7aa..e96b65aa 100644 --- a/dtrain/dtrain.cc +++ b/dtrain/dtrain.cc @@ -20,8 +20,8 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("stop_after", po::value()->default_value(0), "stop after X input sentences") ("print_weights", po::value(), "weights to print on each iteration") ("hstreaming", po::value()->zero_tokens(), "run in hadoop streaming mode") - ("learning_rate", po::value()->default_value(0.0005), "learning rate") - ("gamma", po::value()->default_value(0), "gamma for SVM (0 for perceptron)") + ("learning_rate", po::value()->default_value(0.0005), "learning rate") + ("gamma", po::value()->default_value(0), "gamma for SVM (0 for perceptron)") ("tmp", po::value()->default_value("/tmp"), "temp dir to use") ("select_weights", po::value()->default_value("last"), "output 'best' or 'last' weights ('VOID' to throw away)") ("noup", po::value()->zero_tokens(), "do not update weights"); @@ -134,15 +134,14 @@ main(int argc, char** argv) observer->SetScorer(scorer); // init weights - Weights weights; - if (cfg.count("input_weights")) weights.InitFromFile(cfg["input_weights"].as()); - SparseVector lambdas; - weights.InitSparseVector(&lambdas); - vector dense_weights; + vector& dense_weights = decoder.CurrentWeightVector(); + SparseVector lambdas; + if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as(), &dense_weights); + Weights::InitSparseVector(dense_weights, &lambdas); // meta params for perceptron, SVM - double eta = cfg["learning_rate"].as(); - double gamma = cfg["gamma"].as(); + weight_t eta = cfg["learning_rate"].as(); + weight_t gamma = cfg["gamma"].as(); WordID __bias = FD::Convert("__bias"); lambdas.add_value(__bias, 0); @@ -160,7 +159,7 @@ main(int argc, char** argv) grammar_buf_out.open(grammar_buf_fn.c_str()); unsigned in_sz = 999999999; // input index, input size - vector > all_scores; + vector > all_scores; score_t max_score = 0.; unsigned best_it = 0; float overall_time = 0.; @@ -189,6 +188,15 @@ main(int argc, char** argv) } + //LogVal a(2.2); + //LogVal b(2.1); + //cout << a << endl; + //cout << log(a) << endl; + //LogVal c = a - b; + //cout << log(c) << endl; + //exit(0); + + for (unsigned t = 0; t < T; t++) // T epochs { @@ -196,7 +204,8 @@ main(int argc, char** argv) time(&start); igzstream grammar_buf_in; if (t > 0) grammar_buf_in.open(grammar_buf_fn.c_str()); - score_t score_sum = 0., model_sum = 0.; + score_t score_sum = 0.; + score_t model_sum(0); unsigned ii = 0, nup = 0, npairs = 0; if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." << endl; @@ -238,10 +247,7 @@ main(int argc, char** argv) if (next || stop) break; // weights - dense_weights.clear(); - weights.InitFromVector(lambdas); - weights.InitVector(&dense_weights); - decoder.SetWeights(dense_weights); + lambdas.init_vector(&dense_weights); // getting input vector in_split; // input: sid\tsrc\tref\tpsg @@ -289,7 +295,8 @@ main(int argc, char** argv) // get (scored) samples vector* samples = observer->GetSamples(); - if (verbose) { + // FIXME + /*if (verbose) { cout << "[ref: '"; if (t > 0) cout << ref_ids_buf[ii]; else cout << ref_ids; @@ -297,7 +304,15 @@ main(int argc, char** argv) cout << _p5 << _np << "1best: " << "'" << (*samples)[0].w << "'" << endl; cout << "SCORE=" << (*samples)[0].score << ",model="<< (*samples)[0].model << endl; cout << "F{" << (*samples)[0].f << "} ]" << endl << endl; - } + }*/ + /*cout << lambdas.get(FD::Convert("PhraseModel_0")) << endl; + cout << (*samples)[0].model << endl; + cout << "1best: "; + for (unsigned u = 0; u < (*samples)[0].w.size(); u++) cout << TD::Convert((*samples)[0].w[u]) << " "; + cout << endl; + cout << (*samples)[0].f << endl; + cout << "___" << endl;*/ + score_sum += (*samples)[0].score; model_sum += (*samples)[0].model; @@ -317,21 +332,21 @@ main(int argc, char** argv) if (!gamma) { // perceptron if (it->first.score - it->second.score < 0) { // rank error - SparseVector dv = it->second.f - it->first.f; + SparseVector dv = it->second.f - it->first.f; dv.add_value(__bias, -1); lambdas.plus_eq_v_times_s(dv, eta); nup++; } } else { // SVM - double rank_error = it->second.score - it->first.score; + score_t rank_error = it->second.score - it->first.score; if (rank_error > 0) { - SparseVector dv = it->second.f - it->first.f; + SparseVector dv = it->second.f - it->first.f; dv.add_value(__bias, -1); lambdas.plus_eq_v_times_s(dv, eta); } // regularization - double margin = it->first.model - it->second.model; + score_t margin = it->first.model - it->second.model; if (rank_error || margin < 1) { lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta); // reg /= #EXAMPLES or #UPDATES ? nup++; @@ -339,6 +354,15 @@ main(int argc, char** argv) } } } + + + vector x; + lambdas.init_vector(&x); + for (int q = 0; q < x.size(); q++) { + if (x[q] < -10 && x[q] != 0) + cout << FD::Convert(q) << " " << x[q] << endl; + } + cout << " --- " << endl; ++ii; @@ -358,7 +382,8 @@ main(int argc, char** argv) // print some stats score_t score_avg = score_sum/(score_t)in_sz; score_t model_avg = model_sum/(score_t)in_sz; - score_t score_diff, model_diff; + score_t score_diff; + score_t model_diff; if (t > 0) { score_diff = score_avg - all_scores[t-1].first; model_diff = model_avg - all_scores[t-1].second; @@ -402,10 +427,10 @@ main(int argc, char** argv) // write weights to file if (select_weights == "best") { - weights.InitFromVector(lambdas); string infix = "dtrain-weights-" + boost::lexical_cast(t); + lambdas.init_vector(&dense_weights); string w_fn = gettmpf(tmp_path, infix, "gz"); - weights.WriteToFile(w_fn, true); + Weights::WriteToFile(w_fn, dense_weights, true); weights_files.push_back(w_fn); } @@ -420,7 +445,7 @@ main(int argc, char** argv) ostream& o = *of.stream(); o.precision(17); o << _np; - for (SparseVector::const_iterator it = lambdas.begin(); it != lambdas.end(); ++it) { + for (SparseVector::const_iterator it = lambdas.begin(); it != lambdas.end(); ++it) { if (it->second == 0) continue; o << FD::Convert(it->first) << '\t' << it->second << endl; } diff --git a/dtrain/dtrain.h b/dtrain/dtrain.h index e98ef470..7c1509e4 100644 --- a/dtrain/dtrain.h +++ b/dtrain/dtrain.h @@ -11,6 +11,8 @@ #include "ksampler.h" #include "pairsampling.h" +#include "filelib.h" + #define DTRAIN_DOTS 100 // when to display a '.' #define DTRAIN_GRAMMAR_DELIM "########EOS########" diff --git a/dtrain/kbestget.h b/dtrain/kbestget.h index d141da60..4aadee7a 100644 --- a/dtrain/kbestget.h +++ b/dtrain/kbestget.h @@ -7,6 +7,7 @@ #include "ff_register.h" #include "decoder.h" #include "weights.h" +#include "logval.h" using namespace std; @@ -106,7 +107,8 @@ struct KBestGetter : public HypSampler ScoredHyp h; h.w = d->yield; h.f = d->feature_values; - h.model = log(d->score); + h.model = d->score; + cout << i << ". "<< h.model << endl; h.rank = i; h.score = scorer_->Score(h.w, *ref_, i); s_.push_back(h); @@ -125,7 +127,7 @@ struct KBestGetter : public HypSampler ScoredHyp h; h.w = d->yield; h.f = d->feature_values; - h.model = log(d->score); + h.model = -1*log(d->score); h.rank = i; h.score = scorer_->Score(h.w, *ref_, i); s_.push_back(h); diff --git a/dtrain/test/example/dtrain.ini b/dtrain/test/example/dtrain.ini index 9b83193a..96bdbf8e 100644 --- a/dtrain/test/example/dtrain.ini +++ b/dtrain/test/example/dtrain.ini @@ -1,14 +1,14 @@ decoder_config=test/example/cdec.ini k=100 N=3 -gamma=0.00001 +gamma=0 #.00001 epochs=2 input=test/example/nc-1k-tabs.gz scorer=stupid_bleu output=- -stop_after=10 +stop_after=5 sample_from=kbest -pair_sampling=108010 -select_weights=best +pair_sampling=all #108010 +select_weights=VOID print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 PhraseModel_1 PhraseModel_2 PhraseModel_3 PhraseModel_4 PassThrough tmp=/tmp diff --git a/klm/lm/binary_format.cc b/klm/lm/binary_format.cc index eac8aa85..27cada13 100644 --- a/klm/lm/binary_format.cc +++ b/klm/lm/binary_format.cc @@ -182,10 +182,6 @@ void SeekPastHeader(int fd, const Parameters ¶ms) { SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); } -void SeekPastHeader(int fd, const Parameters ¶ms) { - SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); -} - uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing) { const off_t file_size = util::SizeFile(backing.file.get()); // The header is smaller than a page, so we have to map the whole header as well. diff --git a/klm/lm/search_trie.cc b/klm/lm/search_trie.cc index 1bcfe27d..5d8c70db 100644 --- a/klm/lm/search_trie.cc +++ b/klm/lm/search_trie.cc @@ -234,19 +234,8 @@ class FindBlanks { return unigrams_[index].prob; } -<<<<<<< HEAD -// Phase to count n-grams, including blanks inserted because they were pruned but have extensions -class JustCount { - public: - template JustCount(ContextReader * /*contexts*/, UnigramValue * /*unigrams*/, Middle * /*middle*/, Longest &/*longest*/, uint64_t *counts, unsigned char order) - : counts_(counts), longest_counts_(counts + order - 1) {} - - void Unigrams(WordIndex begin, WordIndex end) { - counts_[0] += end - begin; -======= void Unigram(WordIndex /*index*/) { ++counts_[0]; ->>>>>>> upstream/master } void MiddleBlank(const unsigned char order, const WordIndex *indices, unsigned char lower, float prob_basis) { @@ -278,11 +267,7 @@ class JustCount { // Phase to actually write n-grams to the trie. template class WriteEntries { public: -<<<<<<< HEAD - WriteEntries(ContextReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, const uint64_t * /*counts*/, unsigned char order) : -======= WriteEntries(RecordReader *contexts, UnigramValue *unigrams, BitPackedMiddle *middle, BitPackedLongest &longest, unsigned char order, SRISucks &sri) : ->>>>>>> upstream/master contexts_(contexts), unigrams_(unigrams), middle_(middle), @@ -330,16 +315,8 @@ template class WriteEntries { SRISucks &sri_; }; -<<<<<<< HEAD -template class RecursiveInsert { - public: - template RecursiveInsert(SortedFileReader *inputs, ContextReader *contexts, UnigramValue *unigrams, MiddleT *middle, LongestT &longest, uint64_t *counts, unsigned char order) : - doing_(contexts, unigrams, middle, longest, counts, order), inputs_(inputs), inputs_end_(inputs + order - 1), order_minus_2_(order - 2) { - } -======= struct Gram { Gram(const WordIndex *in_begin, unsigned char order) : begin(in_begin), end(in_begin + order) {} ->>>>>>> upstream/master const WordIndex *begin, *end; @@ -440,29 +417,6 @@ void SanityCheckCounts(const std::vector &initial, const std::vector void TrainQuantizer(uint8_t order, uint64_t count, SortedFileReader &reader, util::ErsatzProgress &progress, Quant &quant) { - ProbBackoff weights; - std::vector probs, backoffs; - probs.reserve(count); - backoffs.reserve(count); - for (reader.Rewind(); !reader.Ended(); reader.NextHeader()) { - uint64_t entries = reader.ReadCount(); - for (uint64_t c = 0; c < entries; ++c) { - reader.ReadWord(); - reader.ReadWeights(weights); - // kBlankProb isn't added yet. - probs.push_back(weights.prob); - if (weights.backoff != 0.0) backoffs.push_back(weights.backoff); - ++progress; - } -======= template void TrainQuantizer(uint8_t order, uint64_t count, const std::vector &additional, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { std::vector probs(additional), backoffs; probs.reserve(count + additional.size()); @@ -472,26 +426,10 @@ template void TrainQuantizer(uint8_t order, uint64_t count, const probs.push_back(weights.prob); if (weights.backoff != 0.0) backoffs.push_back(weights.backoff); ++progress; ->>>>>>> upstream/master } quant.Train(order, probs, backoffs); } -<<<<<<< HEAD -template void TrainProbQuantizer(uint8_t order, uint64_t count, SortedFileReader &reader, util::ErsatzProgress &progress, Quant &quant) { - Prob weights; - std::vector probs, backoffs; - probs.reserve(count); - for (reader.Rewind(); !reader.Ended(); reader.NextHeader()) { - uint64_t entries = reader.ReadCount(); - for (uint64_t c = 0; c < entries; ++c) { - reader.ReadWord(); - reader.ReadWeights(weights); - // kBlankProb isn't added yet. - probs.push_back(weights.prob); - ++progress; - } -======= template void TrainProbQuantizer(uint8_t order, uint64_t count, RecordReader &reader, util::ErsatzProgress &progress, Quant &quant) { std::vector probs, backoffs; probs.reserve(count); @@ -499,18 +437,10 @@ template void TrainProbQuantizer(uint8_t order, uint64_t count, Re const Prob &weights = *reinterpret_cast(reinterpret_cast(reader.Data()) + sizeof(WordIndex) * order); probs.push_back(weights.prob); ++progress; ->>>>>>> upstream/master } quant.TrainProb(order, probs); } -<<<<<<< HEAD -} // namespace - -template void BuildTrie(const std::string &file_prefix, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { - std::vector inputs(counts.size() - 1); - std::vector contexts(counts.size() - 1); -======= void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &contexts, UnigramValue *unigrams) { // Fill unigram probabilities. try { @@ -533,7 +463,6 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c template void BuildTrie(const std::string &file_prefix, std::vector &counts, const Config &config, TrieSearch &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { RecordReader inputs[kMaxOrder - 1]; RecordReader contexts[kMaxOrder - 1]; ->>>>>>> upstream/master for (unsigned char i = 2; i <= counts.size(); ++i) { std::stringstream assembled; @@ -548,17 +477,12 @@ template void BuildTrie(const std::string &file_pre SRISucks sri; std::vector fixed_counts(counts.size()); { -<<<<<<< HEAD - RecursiveInsert counter(&*inputs.begin(), &*contexts.begin(), NULL, out.middle_begin_, out.longest, &*fixed_counts.begin(), counts.size()); - counter.Apply(config.messages, "Counting n-grams that should not have been pruned", counts[0]); -======= std::string temp(file_prefix); temp += "unigrams"; util::scoped_fd unigram_file(util::OpenReadOrThrow(temp.c_str())); util::scoped_memory unigrams; MapRead(util::POPULATE_OR_READ, unigram_file.get(), 0, counts[0] * sizeof(ProbBackoff), unigrams); FindBlanks finder(&*fixed_counts.begin(), counts.size(), reinterpret_cast(unigrams.get()), sri); RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Identifying n-grams omitted by SRI", finder); ->>>>>>> upstream/master } for (const RecordReader *i = inputs; i != inputs + counts.size() - 2; ++i) { if (*i) UTIL_THROW(FormatLoadException, "There's a bug in the trie implementation: the " << (i - inputs + 2) << "-gram table did not complete reading"); @@ -566,18 +490,6 @@ template void BuildTrie(const std::string &file_pre SanityCheckCounts(counts, fixed_counts); counts = fixed_counts; -<<<<<<< HEAD - out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch::Size(fixed_counts, config), backing), fixed_counts, config); - - if (Quant::kTrain) { - util::ErsatzProgress progress(config.messages, "Quantizing", std::accumulate(counts.begin() + 1, counts.end(), 0)); - for (unsigned char i = 2; i < counts.size(); ++i) { - TrainQuantizer(i, counts[i-1], inputs[i-2], progress, quant); - } - TrainProbQuantizer(counts.size(), counts.back(), inputs[counts.size() - 2], progress, quant); - quant.FinishedLoading(config); - } -======= util::scoped_FILE unigram_file; { std::string name(file_prefix + "unigrams"); @@ -587,7 +499,6 @@ template void BuildTrie(const std::string &file_pre sri.ObtainBackoffs(counts.size(), unigram_file.get(), inputs); out.SetupMemory(GrowForSearch(config, vocab.UnkCountChangePadding(), TrieSearch::Size(fixed_counts, config), backing), fixed_counts, config); ->>>>>>> upstream/master for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Rewind(); @@ -610,30 +521,8 @@ template void BuildTrie(const std::string &file_pre } // Fill entries except unigram probabilities. { -<<<<<<< HEAD - RecursiveInsert > inserter(&*inputs.begin(), &*contexts.begin(), unigrams, out.middle_begin_, out.longest, &*fixed_counts.begin(), counts.size()); - inserter.Apply(config.messages, "Building trie", fixed_counts[0]); - } - - // Fill unigram probabilities. - try { - std::string name(file_prefix + "unigrams"); - util::scoped_FILE file(OpenOrThrow(name.c_str(), "r")); - for (WordIndex i = 0; i < counts[0]; ++i) { - ReadOrThrow(file.get(), &unigrams[i].weights, sizeof(ProbBackoff)); - if (contexts[0] && **contexts[0] == i) { - SetExtension(unigrams[i].weights.backoff); - ++contexts[0]; - } - } - RemoveOrThrow(name.c_str()); - } catch (util::Exception &e) { - e << " while re-reading unigram probabilities"; - throw; -======= WriteEntries writer(contexts, unigrams, out.middle_begin_, out.longest, counts.size(), sri); RecursiveInsert(counts.size(), counts[0], inputs, config.messages, "Writing trie", writer); ->>>>>>> upstream/master } // Do not disable this error message or else too little state will be returned. Both WriteEntries::Middle and returning state based on found n-grams will need to be fixed to handle this situation. @@ -687,17 +576,6 @@ template uint8_t *TrieSearch::Setup } longest.Init(start, quant_.Long(counts.size()), counts[0]); return start + Longest::Size(Quant::LongestBits(config), counts.back(), counts[0]); -<<<<<<< HEAD -} - -template void TrieSearch::LoadedBinary() { - unigram.LoadedBinary(); - for (Middle *i = middle_begin_; i != middle_end_; ++i) { - i->LoadedBinary(); - } - longest.LoadedBinary(); -} -======= } template void TrieSearch::LoadedBinary() { @@ -715,7 +593,6 @@ bool IsDirectory(const char *path) { return S_ISDIR(info.st_mode); } } // namespace ->>>>>>> upstream/master template void TrieSearch::InitializeFromARPA(const char *file, util::FilePiece &f, std::vector &counts, const Config &config, SortedVocabulary &vocab, Backing &backing) { std::string temporary_directory; diff --git a/klm/lm/trie.cc b/klm/lm/trie.cc index a1136b6f..20075bb8 100644 --- a/klm/lm/trie.cc +++ b/klm/lm/trie.cc @@ -91,15 +91,6 @@ template bool BitPackedMiddle::Find if (!FindBitPacked(base_, word_mask_, word_bits_, total_bits_, range.begin, range.end, max_vocab_, word, at_pointer)) { return false; } -<<<<<<< HEAD - uint64_t index = at_pointer; - at_pointer *= total_bits_; - at_pointer += word_bits_; - quant_.Read(base_, at_pointer, prob, backoff); - at_pointer += quant_.TotalBits(); - - bhiksha_.ReadNext(base_, at_pointer, index, total_bits_, range); -======= pointer = at_pointer; at_pointer *= total_bits_; at_pointer += word_bits_; @@ -108,7 +99,6 @@ template bool BitPackedMiddle::Find at_pointer += quant_.TotalBits(); bhiksha_.ReadNext(base_, at_pointer, pointer, total_bits_, range); ->>>>>>> upstream/master return true; } diff --git a/utils/fdict.h b/utils/fdict.h index 9c8d7cde..f0871b9a 100644 --- a/utils/fdict.h +++ b/utils/fdict.h @@ -33,7 +33,6 @@ struct FD { hash_ = new PerfectHashFunction(cmph_file); #endif } ->>>>>>> upstream/master static inline int NumFeats() { #ifdef HAVE_CMPH if (hash_) return hash_->number_of_keys(); -- cgit v1.2.3