summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cs.cmu.edu>2012-03-10 16:42:12 -0500
committerChris Dyer <cdyer@cs.cmu.edu>2012-03-10 16:42:12 -0500
commita45af4a3704531a8382cd231f6445b3a33b598a3 (patch)
treecb6be837287be58fcb9834da4118b03dca213962
parent280d5aa74b6a41f8f6deb5dd374140b7e3ab2703 (diff)
frequency-based binning
-rw-r--r--decoder/Makefile.am1
-rw-r--r--decoder/ff_csplit.cc2
-rw-r--r--decoder/freqdict.cc29
-rw-r--r--decoder/freqdict.h37
-rw-r--r--gi/pf/align-lexonly-pyp.cc24
-rwxr-xr-xgi/pf/make-freq-bins.pl26
-rw-r--r--gi/pf/pyp_tm.cc24
-rw-r--r--gi/pf/pyp_tm.h7
8 files changed, 97 insertions, 53 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index a00b18af..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -76,7 +76,6 @@ libcdec_a_SOURCES = \
ff_source_syntax.cc \
ff_bleu.cc \
ff_factory.cc \
- freqdict.cc \
lexalign.cc \
lextrans.cc \
tagger.cc \
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
const int fl1_;
const int fl2_;
const int bad_;
- FreqDict freq_dict_;
+ FreqDict<float> freq_dict_;
set<WordID> bad_words_;
};
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
- cerr << "Reading word frequencies: " << fname << endl;
- ReadFile rf(fname);
- istream& ifs = *rf.stream();
- int cc=0;
- while (ifs) {
- std::string word;
- ifs >> word;
- if (word.size() == 0) continue;
- if (word[0] == '#') continue;
- double count = 0;
- ifs >> count;
- assert(count > 0.0); // use -log(f)
- counts_[TD::Convert(word)]=count;
- ++cc;
- if (cc % 10000 == 0) { std::cerr << "."; }
- }
- std::cerr << "\n";
- std::cerr << "Loaded " << cc << " words\n";
-}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
#ifndef _FREQDICT_H_
#define _FREQDICT_H_
+#include <iostream>
#include <map>
#include <string>
#include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
+template <typename T = float>
class FreqDict {
public:
- void Load(const std::string& fname);
- float LookUp(const WordID& word) const {
- std::map<WordID,float>::const_iterator i = counts_.find(word);
- if (i == counts_.end()) return 0;
+ FreqDict() : max_() {}
+ T Max() const { return max_; }
+  // Populates the dictionary from the file `fname`, which is read through
+  // ReadFile as a sequence of whitespace-separated "word value" pairs.
+  //
+  //  * A token starting with '#' begins a comment; the rest of that line is
+  //    ignored (so words that themselves start with '#' cannot be entries).
+  //  * Each value is parsed as T and stored under TD::Convert(word); a later
+  //    entry for the same word replaces an earlier one.
+  //  * max_ is raised to the largest value seen (it is never lowered).
+  //  * Reading stops, with a message on stderr, at the first word whose value
+  //    is missing or cannot be parsed as T.
+  //
+  // Progress (one '.' per 10000 entries) and a final count go to stderr.
+  void Load(const std::string& fname) {
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc = 0;
+    std::string word;
+    // Test the extraction itself rather than the stream state beforehand: a
+    // failed >> leaves `word` holding the previous token, so checking first
+    // would re-process the last word at EOF and overwrite its value with T().
+    while (ifs >> word) {
+      if (word[0] == '#') {
+        // Comment: drop everything up to the end of the current line.
+        std::string rest_of_line;
+        std::getline(ifs, rest_of_line);
+        continue;
+      }
+      T count = T();
+      if (!(ifs >> count)) {
+        std::cerr << "Missing or malformed value for word '" << word
+                  << "' in " << fname << "\n";
+        break;
+      }
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)] = count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+
+ T LookUp(const WordID& word) const {
+ typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+ if (i == counts_.end()) return T();
return i->second;
}
private:
- std::map<WordID, float> counts_;
+ T max_;
+ std::map<WordID, T> counts_;
};
#endif
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
opts.add_options()
("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+ ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+ ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+ ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
("input,i",po::value<string>(),"Read parallel data from")
("random_seed,S",po::value<uint32_t>(), "Random seed");
po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
};
struct Aligner {
- Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+ Aligner(const vector<vector<WordID> >& lets,
+ int num_letters,
+ const po::variables_map& conf,
+ vector<AlignedSentencePair>* c) :
corpus(*c),
- paj_model(4, 0.08),
+ paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+ infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
model(lets, num_letters),
kNULL(TD::Convert("NULL")) {
assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
vector<AlignedSentencePair>& corpus;
QuasiModel2 paj_model;
+ const bool infer_paj;
PYPLexicalTranslation model;
const WordID kNULL;
void ResampleHyperparameters() {
model.ResampleHyperparameters(prng);
- paj_model.ResampleHyperparameters(prng);
+ if (infer_paj) paj_model.ResampleHyperparameters(prng);
}
void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
}
}
- cerr << "LLH = " << Likelihood() << " \t(Amodel=" << paj_model.Likelihood()
- << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
}
prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
ExtractLetters(vocabf, &letters, NULL);
letters[TD::Convert("NULL")].clear();
- Aligner aligner(letters, letset.size(), &corpus);
+ Aligner aligner(letters, letset.size(), conf, &corpus);
aligner.InitializeRandom();
const unsigned samples = conf["samples"].as<unsigned>();
for (int i = 0; i < samples; ++i) {
for (int j = 65; j < 67; ++j) Debug(corpus[j]);
- if (i % 10 == 9) aligner.ResampleHyperparameters();
+ if (i % 10 == 9) {
+ aligner.ResampleHyperparameters();
+ cerr << "LLH = " << aligner.Likelihood() << " \t(Amodel=" << aligner.paj_model.Likelihood()
+ << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+ }
aligner.ResampleCorpus();
if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
}
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Reads whitespace-tokenized text from the files named on the command line
+# (or stdin) and prints one "word bin" line per distinct word, most frequent
+# word first (ties broken alphabetically so the output is reproducible).
+#
+# With f = count(word) / total_tokens, the bin is
+#   max(0, -int(log_BASE(f) + CUTOFF))
+# i.e. bin 0 holds the frequent words and, for k >= 1, bin k holds words with
+#   BASE**-(CUTOFF+k+1) < f <= BASE**-(CUTOFF+k)   (up to rounding).
+
+my $BASE = 6;     # each successive bin covers a BASE-times lower frequency band
+my $CUTOFF = 3;   # shifts where bin 0 ends
+
+my %d;            # word -> number of occurrences
+my $num = 0;      # total number of tokens read
+while (<>) {
+  chomp;
+  # split ' ' skips leading whitespace; split /\s+/ would produce an empty
+  # first field on indented lines, which would then be counted as a word.
+  my @words = split ' ';
+  for my $w (@words) { $d{$w}++; $num++; }
+}
+
+my @vocab = sort { $d{$b} <=> $d{$a} || $a cmp $b } keys %d;
+
+for my $w (@vocab) {
+  # @vocab is non-empty only if at least one token was read, so $num > 0 here.
+  my $nl = -int(log($d{$w} / $num) / log($BASE) + $CUTOFF);
+  if ($nl < 0) { $nl = 0; }
+  print "$w $nl\n";
+}
+
+
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 34ef0ba2..e21f0267 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -4,9 +4,6 @@
#include <iostream>
#include <queue>
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
#include "tdict.h"
#include "ccrp.h"
#include "pyp_word_model.h"
@@ -15,9 +12,19 @@
using namespace std;
using namespace std::tr1;
-template <typename Base>
+// Assigns words to frequency bins using a word -> bin-id table loaded from a
+// file into a FreqDict<unsigned>.
+struct FreqBinner {
+  // Loads the table from `fname` via FreqDict::Load. Marked explicit so that a
+  // std::string is never silently converted into a FreqBinner (which would
+  // trigger a file load).
+  explicit FreqBinner(const std::string& fname) { fd_.Load(fname); }
+  // Largest bin id in the loaded table plus one, so bin ids are valid indices
+  // in [0, NumberOfBins()).
+  unsigned NumberOfBins() const { return fd_.Max() + 1; }
+  // Bin id recorded for `w`; FreqDict::LookUp yields T() -- here 0 -- for
+  // words that are not in the table.
+  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }
+  FreqDict<unsigned> fd_;  // word -> bin id
+};
+
+template <typename Base, class Binner = FreqBinner>
struct ConditionalPYPWordModel {
- ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
+  // b   : base distribution; stored by reference, so it must outlive this model.
+  // bnr : optional binner. When given, the tied resampler gets
+  //       bnr->NumberOfBins() + 1 groups; when NULL it gets 2 groups.
+  //       NOTE(review): the pointer is only stored here -- ownership is not
+  //       visible in this hunk; confirm who deletes it.
+  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
+      base(*b),
+      binner(bnr),
+      // Sized from the parameter rather than the `binner` member so the result
+      // does not depend on the declaration order of `binner` and `btr`.
+      btr(bnr ? bnr->NumberOfBins() + 1u : 2u) {}
void Summary() const {
cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -46,7 +53,9 @@ struct ConditionalPYPWordModel {
if (it == r.end()) {
it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
static const WordID kNULL = TD::Convert("NULL");
- btr.Add(src == kNULL ? 0 : 1, &it->second);
+ unsigned bin = (src == kNULL ? 0 : 1);
+ if (binner && bin) { bin = binner->Bin(src) + 1; }
+ btr.Add(bin, &it->second);
}
if (it->second.increment(trglets, base(trglets), rng))
base.Increment(trglets, rng);
@@ -75,6 +84,7 @@ struct ConditionalPYPWordModel {
// TODO tie PYP hyperparameters based on source word frequency bins
Base& base;
+ const Binner* binner;
BinTiedResampler<CCRP<vector<WordID> > > btr;
typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
RuleModelHash r;
@@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
const unsigned num_letters) :
letters(lets),
up0(new PYPWordModel(num_letters)),
- tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+ tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))),
kX(-TD::Convert("X")) {}
void PYPLexicalTranslation::Summary() const {
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
index fa0fb28f..63e7c96d 100644
--- a/gi/pf/pyp_tm.h
+++ b/gi/pf/pyp_tm.h
@@ -5,10 +5,11 @@
#include "wordid.h"
#include "prob.h"
#include "sampler.h"
+#include "freqdict.h"
-struct TRule;
+struct FreqBinner;
struct PYPWordModel;
-template <typename T> struct ConditionalPYPWordModel;
+template <typename T, class B> struct ConditionalPYPWordModel;
struct PYPLexicalTranslation {
explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
@@ -26,7 +27,7 @@ struct PYPLexicalTranslation {
private:
const std::vector<std::vector<WordID> >& letters; // spelling dictionary
PYPWordModel* up0; // base distribution (model English word)
- ConditionalPYPWordModel<PYPWordModel>* tmodel; // translation distributions
+ ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel; // translation distributions
// (model English word | French word)
const WordID kX;
};