Diffstat (limited to 'gi')
-rw-r--r--  gi/pf/align-lexonly-pyp.cc  | 24
-rwxr-xr-x  gi/pf/make-freq-bins.pl     | 26
-rw-r--r--  gi/pf/pyp_tm.cc             | 24
-rw-r--r--  gi/pf/pyp_tm.h              |  7
4 files changed, 64 insertions, 17 deletions
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
index 6c054753..942dcf51 100644
--- a/gi/pf/align-lexonly-pyp.cc
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -20,6 +20,9 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
         ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("infer_alignment_hyperparameters,I", "Infer alpha and p_null, otherwise fixed values will be assumed")
+        ("p_null,0", po::value<double>()->default_value(0.08), "probability of aligning to null")
+        ("align_alpha,a", po::value<double>()->default_value(4.0), "how 'tight' is the bias toward being along the diagonal?")
         ("input,i",po::value<string>(),"Read parallel data from")
         ("random_seed,S",po::value<uint32_t>(), "Random seed");
   po::options_description clo("Command line options");
@@ -59,9 +62,13 @@ struct AlignedSentencePair {
 };
 
 struct Aligner {
-  Aligner(const vector<vector<WordID> >& lets, int num_letters, vector<AlignedSentencePair>* c) :
+  Aligner(const vector<vector<WordID> >& lets,
+          int num_letters,
+          const po::variables_map& conf,
+          vector<AlignedSentencePair>* c) :
       corpus(*c),
-      paj_model(4, 0.08),
+      paj_model(conf["align_alpha"].as<double>(), conf["p_null"].as<double>()),
+      infer_paj(conf.count("infer_alignment_hyperparameters") > 0),
       model(lets, num_letters),
       kNULL(TD::Convert("NULL")) {
     assert(lets[kNULL].size() == 0);
@@ -69,12 +76,13 @@ struct Aligner {
 
   vector<AlignedSentencePair>& corpus;
   QuasiModel2 paj_model;
+  const bool infer_paj;
   PYPLexicalTranslation model;
   const WordID kNULL;
 
   void ResampleHyperparameters() {
     model.ResampleHyperparameters(prng);
-    paj_model.ResampleHyperparameters(prng);
+    if (infer_paj) paj_model.ResampleHyperparameters(prng);
   }
 
   void InitializeRandom() {
@@ -117,8 +125,6 @@ struct Aligner {
         paj_model.Increment(a_j, j, asp.src.size(), asp.trg.size());
       }
     }
-    cerr << "LLH = " << Likelihood() << "    \t(Amodel=" << paj_model.Likelihood()
-         << " TModel=" << model.Likelihood() << ") contexts=" << model.UniqueConditioningContexts() << endl;
   }
 
   prob_t Likelihood() const {
@@ -211,13 +217,17 @@ int main(int argc, char** argv) {
   ExtractLetters(vocabf, &letters, NULL);
   letters[TD::Convert("NULL")].clear();
 
-  Aligner aligner(letters, letset.size(), &corpus);
+  Aligner aligner(letters, letset.size(), conf, &corpus);
   aligner.InitializeRandom();
 
   const unsigned samples = conf["samples"].as<unsigned>();
   for (int i = 0; i < samples; ++i) {
     for (int j = 65; j < 67; ++j) Debug(corpus[j]);
-    if (i % 10 == 9) aligner.ResampleHyperparameters();
+    if (i % 10 == 9) {
+      aligner.ResampleHyperparameters();
+      cerr << "LLH = " << aligner.Likelihood() << "    \t(Amodel=" << aligner.paj_model.Likelihood()
+           << " TModel=" << aligner.model.Likelihood() << ") contexts=" << aligner.model.UniqueConditioningContexts() << endl;
+    }
     aligner.ResampleCorpus();
     if (i > (samples / 5) && (i % 6 == 5)) for (int j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
   }
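The three new flags expose what was previously hard-coded as paj_model(4, 0.08). As a rough guide to what they control, here is a minimal sketch of a diagonal-biased alignment prior in the usual quasi-model-2 form; the function and its normalization details are illustrative assumptions, not cdec's actual QuasiModel2 code:

#include <cmath>
#include <vector>

// Illustrative sketch (not cdec's QuasiModel2): position 0 is the NULL word
// and receives p_null outright; source position i in 1..n shares the rest of
// the mass in proportion to exp(-alpha * |i/n - j/m|). A larger align_alpha
// concentrates mass near the diagonal i/n ~ j/m; alpha = 0 is uniform over 1..n.
std::vector<double> AlignmentPrior(unsigned j, unsigned n, unsigned m,
                                   double alpha, double p_null) {
  std::vector<double> p(n + 1, 0.0);
  p[0] = p_null;
  double z = 0;
  for (unsigned i = 1; i <= n; ++i) {
    p[i] = std::exp(-alpha * std::fabs(double(i) / n - double(j) / m));
    z += p[i];
  }
  for (unsigned i = 1; i <= n; ++i) p[i] *= (1.0 - p_null) / z;
  return p;  // p[a_j] approximates Pr(a_j | j, n, m)
}

Passing -I additionally resamples alpha and p_null during inference instead of holding them at the command-line values.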
diff --git a/gi/pf/make-freq-bins.pl b/gi/pf/make-freq-bins.pl
new file mode 100755
index 00000000..fdcd3555
--- /dev/null
+++ b/gi/pf/make-freq-bins.pl
@@ -0,0 +1,26 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $BASE = 6;
+my $CUTOFF = 3;
+
+my %d;
+my $num = 0;
+while(<>){
+ chomp;
+ my @words = split /\s+/;
+ for my $w (@words) {$d{$w}++; $num++;}
+}
+
+my @vocab = sort {$d{$b} <=> $d{$a}} keys %d;
+
+for (my $i=0; $i<scalar @vocab; $i++) {
+  my $most = $d{$vocab[$i]};
+  my $least = 1;
+
+  my $nl = -int(log($most / $num) / log($BASE) + $CUTOFF);
+  if ($nl < 0) { $nl = 0; }
+  print "$vocab[$i] $nl\n"
+}
+
+
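The bin a word lands in depends only on its relative frequency, on a log base-6 scale, clipped at zero. A hypothetical C++ helper mirroring the script's rule (not part of the commit) makes the arithmetic concrete:

#include <cmath>

// Hypothetical helper mirroring make-freq-bins.pl (not part of the commit):
// bin(w) = max(0, -int(log_BASE(count/total) + CUTOFF)). Perl's int() and the
// cast below both truncate toward zero.
unsigned FreqBin(double count, double total, double base = 6.0, int cutoff = 3) {
  const int nl = -static_cast<int>(std::log(count / total) / std::log(base) + cutoff);
  return nl < 0 ? 0u : static_cast<unsigned>(nl);
}

// With BASE=6, CUTOFF=3 and a 1M-token corpus: a word seen 10,000 times
// (rel. freq. 1e-2) gets bin 0; seen 100 times (1e-4), bin 2; seen once, bin 4.
// Everything more frequent than 6^-4 of the corpus collapses into bin 0.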
diff --git a/gi/pf/pyp_tm.cc b/gi/pf/pyp_tm.cc
index 34ef0ba2..e21f0267 100644
--- a/gi/pf/pyp_tm.cc
+++ b/gi/pf/pyp_tm.cc
@@ -4,9 +4,6 @@
 #include <iostream>
 #include <queue>
 
-#include "base_distributions.h"
-#include "monotonic_pseg.h"
-#include "conditional_pseg.h"
 #include "tdict.h"
 #include "ccrp.h"
 #include "pyp_word_model.h"
@@ -15,9 +12,19 @@
 using namespace std;
 using namespace std::tr1;
 
-template <typename Base>
+struct FreqBinner {
+  FreqBinner(const std::string& fname) { fd_.Load(fname); }
+  unsigned NumberOfBins() const { return fd_.Max() + 1; }
+  unsigned Bin(const WordID& w) const { return fd_.LookUp(w); }
+  FreqDict<unsigned> fd_;
+};
+
+template <typename Base, class Binner = FreqBinner>
 struct ConditionalPYPWordModel {
-  ConditionalPYPWordModel(Base* b) : base(*b), btr(2) {}
+  ConditionalPYPWordModel(Base* b, const Binner* bnr = NULL) :
+      base(*b),
+      binner(bnr),
+      btr(binner ? binner->NumberOfBins() + 1u : 2u) {}
 
   void Summary() const {
     cerr << "Number of conditioning contexts: " << r.size() << endl;
@@ -46,7 +53,9 @@ struct ConditionalPYPWordModel {
     if (it == r.end()) {
       it = r.insert(make_pair(src, CCRP<vector<WordID> >(0.5,1.0))).first;
       static const WordID kNULL = TD::Convert("NULL");
-      btr.Add(src == kNULL ? 0 : 1, &it->second);
+      unsigned bin = (src == kNULL ? 0 : 1);
+      if (binner && bin) { bin = binner->Bin(src) + 1; }
+      btr.Add(bin, &it->second);
     }
     if (it->second.increment(trglets, base(trglets), rng))
       base.Increment(trglets, rng);
@@ -75,6 +84,7 @@ struct ConditionalPYPWordModel {
 
   // TODO tie PYP hyperparameters based on source word frequency bins
   Base& base;
+  const Binner* binner;
   BinTiedResampler<CCRP<vector<WordID> > > btr;
   typedef unordered_map<WordID, CCRP<vector<WordID> > > RuleModelHash;
   RuleModelHash r;
@@ -84,7 +94,7 @@ PYPLexicalTranslation::PYPLexicalTranslation(const vector<vector<WordID> >& lets,
                                              const unsigned num_letters) :
     letters(lets),
     up0(new PYPWordModel(num_letters)),
-    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0)),
+    tmodel(new ConditionalPYPWordModel<PYPWordModel>(up0, new FreqBinner("10k.freq"))),
     kX(-TD::Convert("X")) {}
 
 void PYPLexicalTranslation::Summary() const {
diff --git a/gi/pf/pyp_tm.h b/gi/pf/pyp_tm.h
index fa0fb28f..63e7c96d 100644
--- a/gi/pf/pyp_tm.h
+++ b/gi/pf/pyp_tm.h
@@ -5,10 +5,11 @@
 #include "wordid.h"
 #include "prob.h"
 #include "sampler.h"
+#include "freqdict.h"
 
-struct TRule;
+struct FreqBinner;
 struct PYPWordModel;
-template <typename T> struct ConditionalPYPWordModel;
+template <typename T, class B> struct ConditionalPYPWordModel;
 
 struct PYPLexicalTranslation {
   explicit PYPLexicalTranslation(const std::vector<std::vector<WordID> >& lets,
@@ -26,7 +27,7 @@ struct PYPLexicalTranslation {
  private:
   const std::vector<std::vector<WordID> >& letters;   // spelling dictionary
   PYPWordModel* up0;  // base distribution (model English word)
-  ConditionalPYPWordModel<PYPWordModel>* tmodel;  // translation distributions
+  ConditionalPYPWordModel<PYPWordModel, FreqBinner>* tmodel;  // translation distributions
                                                   // (model English word | French word)
   const WordID kX;
 };
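The new FreqBinner needs only three things from FreqDict<unsigned> — Load(), Max(), and LookUp() — applied to the "word bin" pairs that make-freq-bins.pl prints. A toy stand-in with that shape, purely illustrative (the real class is cdec's freqdict.h, which keys on WordID rather than strings):

#include <fstream>
#include <map>
#include <string>

// Illustrative stand-in for the FreqDict<unsigned> interface the diff relies
// on; input is what make-freq-bins.pl prints: one "<word> <bin>" per line.
template <typename T>
struct ToyFreqDict {
  ToyFreqDict() : max_() {}
  void Load(const std::string& fname) {
    std::ifstream in(fname.c_str());
    std::string word;
    T bin;
    while (in >> word >> bin) {
      d_[word] = bin;
      if (bin > max_) max_ = bin;
    }
  }
  T Max() const { return max_; }  // FreqBinner sizes its bin set as Max() + 1
  T LookUp(const std::string& word) const {
    typename std::map<std::string, T>::const_iterator it = d_.find(word);
    return it == d_.end() ? max_ : it->second;  // unseen words: rarest bin
  }
 private:
  std::map<std::string, T> d_;
  T max_;
};

Note that bin 0 stays reserved for NULL — real source words are shifted up by one via binner->Bin(src) + 1 — so BinTiedResampler resamples one set of PYP hyperparameters per frequency band, with NULL in a band of its own. The bin file name is currently hard-coded as 10k.freq.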
