diff options
author | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@cs.cmu.edu> | 2012-03-10 16:42:12 -0500 |
commit | dfbc278c1057555fda9312291c8024049e00b7d8 (patch) | |
tree | e922651d48b1c9f73857f0dabd31c55a3ce8a74b /gi | |
parent | 289f96779e665ba24adca3461a624c68aa37bd99 (diff) |
frequency-based binning
Diffstat (limited to 'gi')
-rw-r--r-- | gi/pf/align-lexonly-pyp.cc | 24 | ||||
-rwxr-xr-x | gi/pf/make-freq-bins.pl | 26 | ||||
-rw-r--r-- | gi/pf/pyp_tm.cc | 24 | ||||
-rw-r--r-- | gi/pf/pyp_tm.h | 7 |
4 files changed, 64 insertions, 17 deletions
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc index 6c054753..942dcf51 100644 --- a/gi/pf/align-lexonly-pyp.cc +++ b/gi/pf/align-lexonly-pyp.cc @@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples") + ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed") + ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null") + ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?") ("input,i",po::value<string>(),"Read parallel data from") ("random_seed,S",po::value<uint32_t>(), "Random seed"); po::options_description clo("Command line options"); @@ -59,9 +62,13 @@ struct AlignedSentencePair { }; struct Aligner { - Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) : + Aligner(const vector<vector<WordID> >& lets, + int num_letters, + const po::variables_map& conf, + vector<AlignedSentencePair>* c) : corpus(*c), - paj_model(4, 0.08), + paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()), + infer_paj(conf.count("infer_alignment_hyperparameters") > 0), model(lets, num_letters), kNULL(TD::Convert("NULL")) { assert(lets[kNULL].size() == 0); @@ -69,12 +76,13 @@ struct Aligner { vector<AlignedSentencePair>& corpus; QuasiModel2 paj_model; + const bool infer_paj; PYPLexicalTranslation model; const WordID kNULL; void ResampleHyperparameters() { model.ResampleHyperparameters(prng); - paj_model.ResampleHyperparameters(prng); + if (infer_paj) paj_model.ResampleHyperparameters(prng); } void InitializeRandom() { @@ -117,8 +125,6 @@ struct Aligner { paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size()); } } - cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood() - << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl; } prob_t Likelihood() const { @@ -211,13 +217,17 @@ int main(int argc, char** argv) { ExtractLetters(vocabf, &letters, NULL); letters[TD::Convert("NULL")].clear(); - Aligner aligner(letters, letset.size(), &corpus); + Aligner aligner(letters, letset.size(), conf, &corpus); aligner.InitializeRandom(); const unsigned samples = conf["samples"].as<unsigned>(); for (int i = 0; i < samples; ++i) { for (int j = 65; j < 67; ++j) Debug(corpus[j]); - if (i % 10 == 9) aligner.ResampleHyperparameters(); + if (i % 10 == 9) { + aligner.ResampleHyperparameters(); + cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood() + << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl; + } aligner.ResampleCorpus(); if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]); } diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl new file mode 100755 index 00000000..fdcd3555 --- /dev/null +++ b/gi/pf/make-freq-bins.pl @@ -0,0 +1,26 @@ +#!/usr/bin/perl -w +use strict; + +my $BASE = 6; +my $CUTOFF = 3; + +my %d; +my $num = 0; +while(<>){ + chomp; + my @words = split /\s+/; + for my $w (@words) {$d{$w}++; $num++;} +} + +my @vocab = sort {$d{$b} <=> $d{$a}} keys %d; + +for (my $i=0; $i<scalar @vocab; $i++) { + my $most = $d{$vocab[$i]}; + my $least = 1; + + my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF); + if ($nl < 0) { $nl = 0; } + print "$vocab[$i] $nl\n" +} + + diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc index 34ef0ba2..e21f0267 100644 --- a/gi/pf/pyp_tm.cc +++ b/gi/pf/pyp_tm.cc @@ -4,9 +4,6 @@ #include <iostream> #include <queue> -#include "base_distributions.h" -#include "monotonic_pseg.h" -#include "conditional_pseg.h" #include "tdict.h" #include "ccrp.h" #include "pyp_word_model.h" @@ -15,9 +12,19 @@ using namespace std; using namespace std::tr1; -template <typename Base> +struct FreqBinner { + FreqBinner(const std::string& fname) { fd_.Load(fname); } + unsigned NumberOfBins() const { return fd_.Max() + 1; } + unsigned Bin(const WordID& w) const { return fd_.LookUp(w); } + FreqDict<unsigned> fd_; +}; + +template <typename Base, class Binner = FreqBinner> struct ConditionalPYPWordModel { - ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {} + ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) : + base(*b), + binner(bnr), + btr(binner ? binner->NumberOfBins() + 1u : 2u) {} void Summary() const { cerr << "Number of conditioning contexts: " << r.size() << endl; @@ -46,7 +53,9 @@ struct ConditionalPYPWordModel { if (it == r.end()) { it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first; static const WordID kNULL = TD::Convert("NULL"); - btr.Add(src == kNULL ? 0 : 1, &it->second); + unsigned bin = (src == kNULL ? 0 : 1); + if (binner && bin) { bin = binner->Bin(src) + 1; } + btr.Add(bin, &it->second); } if (it->second.increment(trglets, base(trglets), rng)) base.Increment(trglets, rng); @@ -75,6 +84,7 @@ struct ConditionalPYPWordModel { // TODO tie PYP hyperparameters based on source word frequency bins Base& base; + const Binner* binner; BinTiedResampler<CCRP<vector<WordID> > > btr; typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash; RuleModelHash r; @@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets const unsigned num_letters) : letters(lets), up0(new PYPWordModel(num_letters)), - tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)), + tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))), kX(-TD::Convert("X")) {} void PYPLexicalTranslation::Summary() const { diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h index fa0fb28f..63e7c96d 100644 --- a/gi/pf/pyp_tm.h +++ b/gi/pf/pyp_tm.h @@ -5,10 +5,11 @@ #include "wordid.h" #include "prob.h" #include "sampler.h" +#include "freqdict.h" -struct TRule; +struct FreqBinner; struct PYPWordModel; -template <typename T> struct ConditionalPYPWordModel; +template <typename T, class B> struct ConditionalPYPWordModel; struct PYPLexicalTranslation { explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets, @@ -26,7 +27,7 @@ struct PYPLexicalTranslation { private: const std::vector<std::vector<WordID> >& letters; // spelling dictionary PYPWordModel* up0; // base distribuction (model English word) - ConditionalPYPWordModel<PYPWordModel>* tmodel; // translation distributions + ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel; // translation distributions // (model English word | French word) const WordID kX; }; |