From a45af4a3704531a8382cd231f6445b3a33b598a3 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 10 Mar 2012 16:42:12 -0500 Subject: frequency-based binning --- decoder/Makefile.am | 1 - decoder/ff_csplit.cc | 2 +- decoder/freqdict.cc | 29 ----------------------------- decoder/freqdict.h | 37 ++++++++++++++++++++++++++++++++----- gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++------- gi/pf/make-freq-bins.pl | 26 ++++++++++++++++++++++++++ gi/pf/pyp_tm.cc | 24 +++++++++++++++++------- gi/pf/pyp_tm.h | 7 ++++--- 8 files changed, 97 insertions(+), 53 deletions(-) delete mode 100644 decoder/freqdict.cc create mode 100755 gi/pf/make-freq-bins.pl diff --git a/decoder/Makefile.am b/decoder/Makefile.am index a00b18af..ec51d643 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -76,7 +76,6 @@ libcdec_a_SOURCES = \ ff_source_syntax.cc \ ff_bleu.cc \ ff_factory.cc \ - freqdict.cc \ lexalign.cc \ lextrans.cc \ tagger.cc \ diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc index 3991d38f..c9ed996c 100644 --- a/decoder/ff_csplit.cc +++ b/decoder/ff_csplit.cc @@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl { const int fl1_; const int fl2_; const int bad_; - FreqDict freq_dict_; + FreqDict freq_dict_; set bad_words_; }; diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc deleted file mode 100644 index 9e25d346..00000000 --- a/decoder/freqdict.cc +++ /dev/null @@ -1,29 +0,0 @@ -#include -#include -#include -#include "freqdict.h" -#include "tdict.h" -#include "filelib.h" - -using namespace std; - -void FreqDict::Load(const std::string& fname) { - cerr << "Reading word frequencies: " << fname << endl; - ReadFile rf(fname); - istream& ifs = *rf.stream(); - int cc=0; - while (ifs) { - std::string word; - ifs >> word; - if (word.size() == 0) continue; - if (word[0] == '#') continue; - double count = 0; - ifs >> count; - assert(count > 0.0); // use -log(f) - counts_[TD::Convert(word)]=count; - ++cc; - if (cc % 10000 == 0) { std::cerr << "."; } - } - std::cerr << "\n"; - std::cerr << "Loaded " << cc << " words\n"; -} diff --git a/decoder/freqdict.h b/decoder/freqdict.h index 9acf0c33..4e03fadd 100644 --- a/decoder/freqdict.h +++ b/decoder/freqdict.h @@ -1,20 +1,47 @@ #ifndef _FREQDICT_H_ #define _FREQDICT_H_ +#include #include #include #include "wordid.h" +#include "filelib.h" +#include "tdict.h" +template class FreqDict { public: - void Load(const std::string& fname); - float LookUp(const WordID& word) const { - std::map::const_iterator i = counts_.find(word); - if (i == counts_.end()) return 0; + FreqDict() : max_() {} + T Max() const { return max_; } + void Load(const std::string& fname) { + std::cerr << "Reading word statistics from: " << fname << std::endl; + ReadFile rf(fname); + std::istream& ifs = *rf.stream(); + int cc=0; + std::string word; + while (ifs) { + ifs >> word; + if (word.size() == 0) continue; + if (word[0] == '#') continue; + T count = 0; + ifs >> count; + if (count > max_) max_ = count; + counts_[TD::Convert(word)]=count; + ++cc; + if (cc % 10000 == 0) { std::cerr << "."; } + } + std::cerr << "\n"; + std::cerr << "Loaded " << cc << " words\n"; + } + + T LookUp(const WordID& word) const { + typename std::map::const_iterator i = counts_.find(word); + if (i == counts_.end()) return T(); return i->second; } private: - std::map counts_; + T max_; + std::map counts_; }; #endif diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 6c054753..942dcf51 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("samples,s",po::value()->default_value(1000),"Number of samples") + ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") + ("p_null,0", po::value()->default_value(0.08), "probability of aligning to null") + ("align_alpha,a", po::value()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") ("input,i",po::value(),"Read parallel data from") ("random_seed,S",po::value(), "Random seed"); po::options_description clo("Command line options"); @@ -59,9 +62,13 @@ struct AlignedSentencePair { }; struct Aligner { - Aligner(const vector >& lets, int num_letters, vector* c) : + Aligner(const vector >& lets, + int num_letters, + const po::variables_map& conf, + vector* c) : corpus(*c), - paj_model(4, 0.08), + paj_model(conf["align_alpha"].as(), conf["p_null"].as()), + infer_paj(conf.count("infer_alignment_hyperparameters") > 0), model(lets, num_letters), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); @@ -69,12 +76,13 @@ struct Aligner { vector& corpus; QuasiModel2 paj_model; + const bool infer_paj; PYPLexicalTranslation model; const WordID kNULL; void ResampleHyperparameters() { model.ResampleHyperparameters(prng); - paj_model.ResampleHyperparameters(prng); + if (infer_paj) paj_model.ResampleHyperparameters(prng); } void InitializeRandom() { @@ -117,8 +125,6 @@ struct Aligner { paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() - << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } prob_t Likelihood() const { @@ -211,13 +217,17 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Aligner aligner(letters, letset.size(), &corpus); + Aligner aligner(letters, letset.size(), conf, &corpus); aligner.InitializeRandom(); const unsigned samples = conf["samples"].as(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 10 == 9) aligner.ResampleHyperparameters(); + if (i % 10 == 9) { + aligner.ResampleHyperparameters(); + cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() + << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; + } aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i #include -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" @@ -15,9 +12,19 @@ using namespace std; using namespace std::tr1; -template +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict fd_; +}; + +template struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -46,7 +53,9 @@ struct ConditionalPYPWordModel { if (it == r.end()) { it = r.insert(make_pair(src, CCRP >(0.5,1.0))).first; static const WordID kNULL = TD::Convert("NULL"); - btr.Add(src == kNULL ? 0 : 1, &it->second); + unsigned bin = (src == kNULL ? 0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); @@ -75,6 +84,7 @@ struct ConditionalPYPWordModel { // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + const Binner* binner; BinTiedResampler > > btr; typedef unordered_map > > RuleModelHash; RuleModelHash r; @@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector >& lets const unsigned num_letters) : letters(lets), up0(new PYPWordModel(num_letters)), - tmodel(new ConditionalPYPWordModel(up0)), + tmodel(new ConditionalPYPWordModel(up0, new FreqBinner("10k.freq"))), kX(-TD::Convert("X")) {} void PYPLexicalTranslation::Summary() const { diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h index fa0fb28f..63e7c96d 100644 --- a/gi/pf/pyp_tm.h +++ b/gi/pf/pyp_tm.h @@ -5,10 +5,11 @@ #include "wordid.h" #include "prob.h" #include "sampler.h" +#include "freqdict.h" -struct TRule; +struct FreqBinner; struct PYPWordModel; -template struct ConditionalPYPWordModel; +template struct ConditionalPYPWordModel; struct PYPLexicalTranslation { explicit PYPLexicalTranslation(const std::vector >& lets, @@ -26,7 +27,7 @@ struct PYPLexicalTranslation { private: const std::vector >& letters; // spelling dictionary PYPWordModel* up0; // base distribuction (model English word) - ConditionalPYPWordModel* tmodel; // translation distributions + ConditionalPYPWordModel* tmodel; // translation distributions // (model English word | French word) const WordID kX; }; -- cgit v1.2.3