summaryrefslogtreecommitdiff
path: root/gi/pf
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-03-10 16:42:12 -0500
committerChris Dyer <cdyer@cs.cmu.edu>2012-03-10 16:42:12 -0500
commita45af4a3704531a8382cd231f6445b3a33b598a3 (patch)
treecb6be837287be58fcb9834da4118b03dca213962 /gi/pf
parent280d5aa74b6a41f8f6deb5dd374140b7e3ab2703 (diff)
frequency-based binning
Diffstat (limited to 'gi/pf')
-rw-r--r--gi/pf/align-lexonly-pyp.cc24
-rwxr-xr-xgi/pf/make-freq-bins.pl26
-rw-r--r--gi/pf/pyp_tm.cc24
-rw-r--r--gi/pf/pyp_tm.h7
4 files changed, 64 insertions, 17 deletions
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+ ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+ ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward being along the diagonal?")
("input,i",po::value<string>(),"Read parallel data from")
("random_seed,S",po::value<uint32_t>(), "Random seed");
po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
};
struct Aligner {
- Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+ Aligner(const vector<vector<WordID> >& lets,
+ int num_letters,
+ const po::variables_map& conf,
+ vector<AlignedSentencePair>* c) :
corpus(*c),
- paj_model(4, 0.08),
+ paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+ infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
model(lets, num_letters),
kNULL(TD::Convert("NULL")) {
assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
vector<AlignedSentencePair>& corpus;
QuasiModel2 paj_model;
+ const bool infer_paj;
PYPLexicalTranslation model;
const WordID kNULL;
void ResampleHyperparameters() {
model.ResampleHyperparameters(prng);
- paj_model.ResampleHyperparameters(prng);
+ if (infer_paj) paj_model.ResampleHyperparameters(prng);
}
void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
}
}
- cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
- << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
}
prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
ExtractLetters(vocabf, &letters, NULL);
letters[TD::Convert("NULL")].clear();
- Aligner aligner(letters, letset.size(), &corpus);
+ Aligner aligner(letters, letset.size(), conf, &corpus);
aligner.InitializeRandom();
const unsigned samples = conf["samples"].as<unsigned>();
for (int i = 0; i < samples; ++i) {
for (int j = 65; j < 67; ++j) Debug(corpus[j]);
- if (i % 10 == 9) aligner.ResampleHyperparameters();
+ if (i % 10 == 9) {
+ aligner.ResampleHyperparameters();
+ cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood()
+ << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+ }
aligner.ResampleCorpus();
if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
}
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $BASE = 6;
+my $CUTOFF = 3;
+
+my %d;
+my $num = 0;
+while(<>){
+ chomp;
+ my @words = split /\s+/;
+ for my $w (@words) {$d{$w}++; $num++;}
+}
+
+my @vocab = sort {$d{$b} <=> $d{$a}} keys %d;
+
+for (my $i=0; $i<scalar @vocab; $i++) {
+ my $most = $d{$vocab[$i]};
+ my $least = 1;
+
+ my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF);
+ if ($nl < 0) { $nl = 0; }
+ print "$vocab[$i] $nl\n"
+}
+
+
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 34ef0ba2..e21f0267 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -4,9 +4,6 @@
#include <iostream>
#include <queue>
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
#include "tdict.h"
#include "ccrp.h"
#include "pyp_word_model.h"
@@ -15,9 +12,19 @@
using namespace std;
using namespace std::tr1;
-template <typename Base>
+struct FreqBinner {
+ FreqBinner(const std::string& fname) { fd_.Load(fname); }
+ unsigned NumberOfBins() const { return fd_.Max() + 1; }
+ unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }
+ FreqDict<unsigned> fd_;
+};
+
+template <typename Base, class Binner = FreqBinner>
struct ConditionalPYPWordModel {
- ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
+ ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
+ base(*b),
+ binner(bnr),
+ btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
void Summary() const {
cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -46,7 +53,9 @@ struct ConditionalPYPWordModel {
if (it == r.end()) {
it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
static const WordID kNULL = TD::Convert("NULL");
- btr.Add(src == kNULL ? 0 : 1, &it->second);
+ unsigned bin = (src == kNULL ? 0 : 1);
+ if (binner && bin) { bin = binner->Bin(src) + 1; }
+ btr.Add(bin, &it->second);
}
if (it->second.increment(trglets, base(trglets), rng))
base.Increment(trglets, rng);
@@ -75,6 +84,7 @@ struct ConditionalPYPWordModel {
// TODO tie PYP hyperparameters based on source word frequency bins
Base& base;
+ const Binner* binner;
BinTiedResampler<CCRP<vector<WordID> > > btr;
typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
RuleModelHash r;
@@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
const unsigned num_letters) :
letters(lets),
up0(new PYPWordModel(num_letters)),
- tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+ tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))),
kX(-TD::Convert("X")) {}
void PYPLexicalTranslation::Summary() const {
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
index fa0fb28f..63e7c96d 100644
--- a/gi/pf/pyp_tm.h
+++ b/gi/pf/pyp_tm.h
@@ -5,10 +5,11 @@
#include "wordid.h"
#include "prob.h"
#include "sampler.h"
+#include "freqdict.h"
-struct TRule;
+struct FreqBinner;
struct PYPWordModel;
-template <typename T> struct ConditionalPYPWordModel;
+template <typename T, class B> struct ConditionalPYPWordModel;
struct PYPLexicalTranslation {
explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
@@ -26,7 +27,7 @@ struct PYPLexicalTranslation {
private:
const std::vector<std::vector<WordID> >& letters; // spelling dictionary
PYPWordModel* up0; // base distribuction (model English word)
- ConditionalPYPWordModel<PYPWordModel>* tmodel; // translation distributions
+ ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel; // translation distributions
// (model English word | French word)
const WordID kX;
};