From a45af4a3704531a8382cd231f6445b3a33b598a3 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@cs.cmu.edu>
Date: Sat, 10 Mar 2012 16:42:12 -0500
Subject: frequency-based binning

---
 decoder/Makefile.am        |  1 -
 decoder/ff_csplit.cc       |  2 +-
 decoder/freqdict.cc        | 29 -----------------------------
 decoder/freqdict.h         | 37 ++++++++++++++++++++++++++++++++-----
 gi/pf/align-lexonly-pyp.cc | 24 +++++++++++++++++-------
 gi/pf/make-freq-bins.pl    | 26 ++++++++++++++++++++++++++
 gi/pf/pyp_tm.cc            | 24 +++++++++++++++++-------
 gi/pf/pyp_tm.h             |  7 ++++---
 8 files changed, 97 insertions(+), 53 deletions(-)
 delete mode 100644 decoder/freqdict.cc
 create mode 100755 gi/pf/make-freq-bins.pl
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index a00b18af..ec51d643 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -76,7 +76,6 @@ libcdec_a_SOURCES = \
   ff_source_syntax.cc \
   ff_bleu.cc \
   ff_factory.cc \
-  freqdict.cc \
   lexalign.cc \
   lextrans.cc \
   tagger.cc \
diff --git a/decoder/ff_csplit.cc b/decoder/ff_csplit.cc
index 3991d38f..c9ed996c 100644
--- a/decoder/ff_csplit.cc
+++ b/decoder/ff_csplit.cc
@@ -72,7 +72,7 @@ struct BasicCSplitFeaturesImpl {
   const int fl1_;
   const int fl2_;
   const int bad_;
-  FreqDict freq_dict_;
+  FreqDict<float> freq_dict_;
   set<WordID> bad_words_;
 };
 
diff --git a/decoder/freqdict.cc b/decoder/freqdict.cc
deleted file mode 100644
index 9e25d346..00000000
--- a/decoder/freqdict.cc
+++ /dev/null
@@ -1,29 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <cassert>
-#include "freqdict.h"
-#include "tdict.h"
-#include "filelib.h"
-
-using namespace std;
-
-void FreqDict::Load(const std::string& fname) {
-  cerr << "Reading word frequencies: " << fname << endl;
-  ReadFile rf(fname);
-  istream& ifs = *rf.stream();
-  int cc=0;
-  while (ifs) {
-    std::string word;
-    ifs >> word;
-    if (word.size() == 0) continue;
-    if (word[0] == '#') continue;
-    double count = 0;
-    ifs >> count;
-    assert(count > 0.0);  // use -log(f)
-    counts_[TD::Convert(word)]=count;
-    ++cc;
-    if (cc % 10000 == 0) { std::cerr << "."; }
-  }
-  std::cerr << "\n";
-  std::cerr << "Loaded " << cc << " words\n";
-}
diff --git a/decoder/freqdict.h b/decoder/freqdict.h
index 9acf0c33..4e03fadd 100644
--- a/decoder/freqdict.h
+++ b/decoder/freqdict.h
@@ -1,20 +1,47 @@
 #ifndef _FREQDICT_H_
 #define _FREQDICT_H_
 
+#include <iostream>
 #include <map>
 #include <string>
 #include "wordid.h"
+#include "filelib.h"
+#include "tdict.h"
 
+// Table mapping a word to a numeric statistic of type T, loaded from a text file.
+template <typename T = float>
 class FreqDict {
  public:
-  void Load(const std::string& fname);
-  float LookUp(const WordID& word) const {
-    std::map<WordID,float>::const_iterator i = counts_.find(word);
-    if (i == counts_.end()) return 0;
+  FreqDict() : max_() {}
+  T Max() const { return max_; }  // largest value loaded so far (starts at T())
+  // Reads whitespace-separated "word value" pairs from fname; a token starting with '#' comments out the rest of its line.
+  void Load(const std::string& fname) {
+    std::cerr << "Reading word statistics from: " << fname << std::endl;
+    ReadFile rf(fname);
+    std::istream& ifs = *rf.stream();
+    int cc=0;
+    std::string word;
+    while (ifs >> word) {  // a failed read leaves `word` unchanged, so stop here instead of re-storing the last word as 0
+      if (word[0] == '#') { std::getline(ifs, word); continue; }  // discard the remainder of the comment line
+      T count = 0;
+      if (!(ifs >> count)) break;  // missing or unparsable value: keep what was read so far
+      if (count > max_) max_ = count;
+      counts_[TD::Convert(word)]=count;
+      ++cc;
+      if (cc % 10000 == 0) { std::cerr << "."; }  // progress dot every 10000 entries
+    }
+    std::cerr << "\n";
+    std::cerr << "Loaded " << cc << " words\n";
+  }
+
+  T LookUp(const WordID& word) const {
+    typename std::map<WordID,T>::const_iterator i = counts_.find(word);
+    if (i == counts_.end()) return T();  // words that were never loaded read as T()
     return i->second;
   }
  private:
-  std::map<WordID, float> counts_;
+  T max_;
+  std::map<WordID, T> counts_;
 };
 
 #endif
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward be along the diagonal?")
         ("input,i",po::value<string>(),"Read parallel data from")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
 };
 
 struct Aligner {
-  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+  Aligner(const vector<vector<WordID> >& lets,
+          int num_letters,
+          const po::variables_map& conf,
+          vector<AlignedSentencePair>* c) :
       corpus(*c),
-      paj_model(4, 0.08),
+      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+      infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
       model(lets, num_letters),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
 
   vector<AlignedSentencePair>& corpus;
   QuasiModel2 paj_model;
+  const bool infer_paj;
   PYPLexicalTranslation model;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
-    paj_model.ResampleHyperparameters(prng);
+    if (infer_paj) paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
         paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood()
-         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  Aligner aligner(letters, letset.size(), &corpus);
+  Aligner aligner(letters, letset.size(), conf, &corpus);
   aligner.InitializeRandom();
 
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 10 == 9) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) {
+      aligner.ResampleHyperparameters();
+      cerr << "LLH = " << aligner.Likelihood() << "    \t(Amodel=" << aligner.paj_model.Likelihood()
+           << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+    }
     aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+# Reads whitespace-tokenized text (files named on the command line, else STDIN) and
+# prints one "word bin" line per distinct word, most frequent word first.
+use strict;
+
+my $BASE = 6;    # logarithm base used to turn a relative frequency into a bin number
+my $CUTOFF = 3;  # added to the log before truncation; shifts which frequencies fall into bin 0
+
+my %d;        # word -> number of occurrences
+my $num = 0;  # total number of tokens read
+while(<>){
+ chomp;
+ # split ' ' skips leading whitespace, so an indented line does not produce an empty "word"
+ my @words = split ' ';
+ for my $w (@words) {$d{$w}++; $num++;}
+}
+
+my @vocab = sort {$d{$b} <=> $d{$a}} keys %d;
+
+for (my $i=0; $i<scalar @vocab; $i++) {
+  my $most = $d{$vocab[$i]};
+  # bin = -trunc(log_BASE(count / total) + CUTOFF); negative results are clamped to 0
+  my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF);
+  if ($nl < 0) { $nl = 0; }
+  print "$vocab[$i] $nl\n";
+}
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 34ef0ba2..e21f0267 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -4,9 +4,6 @@
 #include <iostream>
 #include <queue>
 
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
 #include "tdict.h"
 #include "ccrp.h"
 #include "pyp_word_model.h"
@@ -15,9 +12,19 @@
 using namespace std;
 using namespace std::tr1;
 
-template <typename Base>
+struct FreqBinner {  // looks up the per-word bin id that was loaded from a FreqDict file
+  explicit FreqBinner(const std::string& fname) { fd_.Load(fname); }  // explicit: no silent string -> FreqBinner conversion
+  unsigned NumberOfBins() const { return fd_.Max() + 1; }  // assumes bin ids run 0..Max() -- TODO confirm against the bin file
+  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }  // whatever LookUp yields for w
+  FreqDict<unsigned> fd_;  // word -> bin id, filled once by the constructor
+};
+
+template <typename Base, class Binner = FreqBinner>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
+  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
+      base(*b),
+      binner(bnr),
+      btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -46,7 +53,9 @@ struct ConditionalPYPWordModel {
     if (it == r.end()) {
       it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
       static const WordID kNULL = TD::Convert("NULL");
-      btr.Add(src == kNULL ? 0 : 1, &it->second);
+      unsigned bin = (src == kNULL ? 0 : 1);
+      if (binner && bin) { bin = binner->Bin(src) + 1; }
+      btr.Add(bin, &it->second);
     }
     if (it->second.increment(trglets, base(trglets), rng))
       base.Increment(trglets, rng);
@@ -75,6 +84,7 @@ struct ConditionalPYPWordModel {
 
   // TODO tie PYP hyperparameters based on source word frequency bins
   Base& base;
+  const Binner* binner;
   BinTiedResampler<CCRP<vector<WordID> > > btr;
   typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
   RuleModelHash r;
@@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets
                                              const unsigned num_letters) :
     letters(lets),
     up0(new PYPWordModel(num_letters)),
-    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))),
     kX(-TD::Convert("X")) {}
 
 void PYPLexicalTranslation::Summary() const {
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
index fa0fb28f..63e7c96d 100644
--- a/gi/pf/pyp_tm.h
+++ b/gi/pf/pyp_tm.h
@@ -5,10 +5,11 @@
 #include "wordid.h"
 #include "prob.h"
 #include "sampler.h"
+#include "freqdict.h"
 
-struct TRule;
+struct FreqBinner;
 struct PYPWordModel;
-template <typename T> struct ConditionalPYPWordModel;
+template <typename T, class B> struct ConditionalPYPWordModel;
 
 struct PYPLexicalTranslation {
   explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
@@ -26,7 +27,7 @@ struct PYPLexicalTranslation {
  private:
   const std::vector<std::vector<WordID> >& letters;   // spelling dictionary
   PYPWordModel* up0;  // base distribuction (model English word)
-  ConditionalPYPWordModel<PYPWordModel>* tmodel;  // translation distributions
+  ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel;  // translation distributions
                       // (model English word | French word)
   const WordID kX;
 };
-- 
cgit v1.2.3