From 4ebb11b25cf87dc5938b5eb65e884d0e3f4ee146 Mon Sep 17 00:00:00 2001
From: Chris Dyer
Date: Mon, 23 Jan 2012 15:47:29 -0500
Subject: more alignment stuff

---
 gi/pf/Makefile.am           |   4 +-
 gi/pf/align-lexonly-pyp.cc  | 327 ++++++++++++++++++++++++++++++++++++++++++++
 gi/pf/base_measures.cc      |  47 +++++++
 gi/pf/base_measures.h       |  18 +++
 gi/pf/conditional_pseg.h    |  74 ++++++++++
 word-aligner/stemmers/ur.pl |  38 +++++
 6 files changed, 507 insertions(+), 1 deletion(-)
 create mode 100644 gi/pf/align-lexonly-pyp.cc
 create mode 100755 word-aligner/stemmers/ur.pl

diff --git a/gi/pf/Makefile.am b/gi/pf/Makefile.am
index 7c8e89d0..28367e67 100644
--- a/gi/pf/Makefile.am
+++ b/gi/pf/Makefile.am
@@ -1,10 +1,12 @@
-bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly
+bin_PROGRAMS = cbgi brat dpnaive pfbrat pfdist itg pfnaive condnaive align-lexonly align-lexonly-pyp
 
 noinst_LIBRARIES = libpf.a
 libpf_a_SOURCES = base_measures.cc reachability.cc cfg_wfst_composer.cc corpus.cc unigrams.cc ngram_base.cc
 
 align_lexonly_SOURCES = align-lexonly.cc
 
+align_lexonly_pyp_SOURCES = align-lexonly-pyp.cc
+
 itg_SOURCES = itg.cc
 
 condnaive_SOURCES = condnaive.cc
diff --git a/gi/pf/align-lexonly-pyp.cc b/gi/pf/align-lexonly-pyp.cc
new file mode 100644
index 00000000..d2630a2b
--- /dev/null
+++ b/gi/pf/align-lexonly-pyp.cc
@@ -0,0 +1,327 @@
+#include <iostream>
+#include <tr1/memory>
+#include <queue>
+
+#include <boost/multi_array.hpp>
+#include <boost/program_options.hpp>
+#include <boost/program_options/variables_map.hpp>
+
+#include "array2d.h"
+#include "base_measures.h"
+#include "monotonic_pseg.h"
+#include "conditional_pseg.h"
+#include "trule.h"
+#include "tdict.h"
+#include "stringlib.h"
+#include "filelib.h"
+#include "dict.h"
+#include "sampler.h"
+#include "mfcr.h"
+#include "corpus.h"
+#include "ngram_base.h"
+
+using namespace std;
+using namespace tr1;
+namespace po = boost::program_options;
+
+void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
+  po::options_description opts("Configuration options");
+  opts.add_options()
+        ("samples,s",po::value<unsigned>()->default_value(1000),"Number of samples")
+        ("input,i",po::value<string>(),"Read parallel data from")
+        ("random_seed,S",po::value<uint32_t>(), "Random seed");
+  po::options_description clo("Command line options");
+  clo.add_options()
+        ("config", po::value<string>(), "Configuration file")
+        ("help,h", "Print this help message and exit");
+  po::options_description dconfig_options, dcmdline_options;
+  dconfig_options.add(opts);
+  dcmdline_options.add(opts).add(clo);
+
+  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
+  if (conf->count("config")) {
+    ifstream config((*conf)["config"].as<string>().c_str());
+    po::store(po::parse_config_file(config, dconfig_options), *conf);
+  }
+  po::notify(*conf);
+
+  if (conf->count("help") || (conf->count("input") == 0)) {
+    cerr << dcmdline_options << endl;
+    exit(1);
+  }
+}
+
+shared_ptr<MT19937> prng;
+
+struct LexicalAlignment {
+  unsigned char src_index;
+  bool is_transliteration;
+  vector<pair<short, short> > derivation;
+};
+
+struct AlignedSentencePair {
+  vector<WordID> src;
+  vector<WordID> trg;
+  vector<LexicalAlignment> a;
+  Array2D<short> posterior;
+};
+
+struct HierarchicalWordBase {
+  explicit HierarchicalWordBase(const unsigned vocab_e_size) :
+      base(prob_t::One()), r(1,1,1,25,25), u0(-log(vocab_e_size)), l(1,1.0), v(1, 0.0) {}
+
+  void ResampleHyperparameters(MT19937* rng) {
+    r.resample_hyperparameters(rng);
+  }
+
+  inline double logp0(const vector<WordID>& s) const {
+    return s.size() * u0;
+  }
+
+  // return p0 of rule.e_
+  prob_t operator()(const TRule& rule) const {
+    v[0] = exp(logp0(rule.e_));
+    return prob_t(r.prob(rule.e_, v, l));
+  }
+
+  void Increment(const TRule& rule) {
+    v[0] = exp(logp0(rule.e_));
+    if (r.increment(rule.e_, v, l, &*prng).count) {
+      base *= prob_t(v[0] * l[0]);
+    }
+  }
+
+  void Decrement(const TRule& rule) {
+    if (r.decrement(rule.e_, &*prng).count) {
+      base /= prob_t(exp(logp0(rule.e_)));
+    }
+  }
+
+  prob_t Likelihood() const {
+    prob_t p; p.logeq(r.log_crp_prob());
+    p *= base;
+    return p;
+  }
+
+  void Summary() const {
+    cerr << "NUMBER OF CUSTOMERS: " << r.num_customers() << "  (d=" << r.d() << ",\\alpha=" << r.alpha() << ')' << endl;
+    for (MFCR<vector<WordID> >::const_iterator it = r.begin(); it != r.end(); ++it)
+      cerr << "   " << it->second.total_dish_count_ << " (on " << it->second.table_counts_.size() << " tables) " << TD::GetString(it->first) << endl;
+  }
+
+  prob_t base;
+  MFCR<vector<WordID> > r;
+  const double u0;
+  const vector<double> l;
+  mutable vector<double> v;
+};
+
+struct BasicLexicalAlignment {
+  explicit BasicLexicalAlignment(const vector<vector<WordID> >& lets,
+                                 const unsigned words_e,
+                                 const unsigned letters_e,
+                                 vector<AlignedSentencePair>* corp) :
+      letters(lets),
+      corpus(*corp),
+      //up0(words_e),
+      //up0("en.chars.1gram", letters_e),
+      //up0("en.words.1gram"),
+      up0(letters_e),
+      //up0("en.chars.2gram"),
+      tmodel(up0) {
+  }
+
+  void InstantiateRule(const WordID src,
+                       const WordID trg,
+                       TRule* rule) const {
+    static const WordID kX = TD::Convert("X") * -1;
+    rule->lhs_ = kX;
+    rule->e_ = letters[trg];
+    rule->f_ = letters[src];
+  }
+
+  void InitializeRandom() {
+    const WordID kNULL = TD::Convert("NULL");
+    cerr << "Initializing with random alignments ...\n";
+    for (unsigned i = 0; i < corpus.size(); ++i) {
+      AlignedSentencePair& asp = corpus[i];
+      asp.a.resize(asp.trg.size());
+      for (unsigned j = 0; j < asp.trg.size(); ++j) {
+        const unsigned char a_j = prng->next() * (1 + asp.src.size());
+        const WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+        TRule r;
+        InstantiateRule(f_a_j, asp.trg[j], &r);
+        asp.a[j].is_transliteration = false;
+        asp.a[j].src_index = a_j;
+        if (tmodel.IncrementRule(r, &*prng))
+          up0.Increment(r);
+      }
+    }
+    cerr << "  LLH = " << Likelihood() << endl;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = tmodel.Likelihood();
+    p *= up0.Likelihood();
+    return p;
+  }
+
+  void ResampleHyperparameters() {
+    cerr << "  LLH_prev = " << Likelihood() << flush;
+    tmodel.ResampleHyperparameters(&*prng);
+    up0.ResampleHyperparameters(&*prng);
+    cerr << "\tLLH_post = " << Likelihood() << endl;
+  }
+
+  void ResampleCorpus();
+
+  const vector<vector<WordID> >& letters; // spelling dictionary
+  vector<AlignedSentencePair>& corpus;
+  //PhraseConditionalUninformativeBase up0;
+  //PhraseConditionalUninformativeUnigramBase up0;
+  //UnigramWordBase up0;
+  //HierarchicalUnigramBase up0;
+  HierarchicalWordBase up0;
+  //CompletelyUniformBase up0;
+  //FixedNgramBase up0;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeBase> tmodel;
+  //ConditionalTranslationModel<PhraseConditionalUninformativeUnigramBase> tmodel;
+  //ConditionalTranslationModel<UnigramWordBase> tmodel;
+  //ConditionalTranslationModel<HierarchicalUnigramBase> tmodel;
+  MConditionalTranslationModel<HierarchicalWordBase> tmodel;
+  //ConditionalTranslationModel<FixedNgramBase> tmodel;
+  //ConditionalTranslationModel<CompletelyUniformBase> tmodel;
+};
+
+void BasicLexicalAlignment::ResampleCorpus() {
+  static const WordID kNULL = TD::Convert("NULL");
+  for (unsigned i = 0; i < corpus.size(); ++i) {
+    AlignedSentencePair& asp = corpus[i];
+    SampleSet<prob_t> ss; ss.resize(asp.src.size() + 1);
+    for (unsigned j = 0; j < asp.trg.size(); ++j) {
+      TRule r;
+      unsigned char& a_j = asp.a[j].src_index;
+      WordID f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.DecrementRule(r, &*prng))
+        up0.Decrement(r);
+
+      for (unsigned prop_a_j = 0; prop_a_j <= asp.src.size(); ++prop_a_j) {
+        const WordID prop_f = (prop_a_j ? asp.src[prop_a_j - 1] : kNULL);
+        InstantiateRule(prop_f, asp.trg[j], &r);
+        ss[prop_a_j] = tmodel.RuleProbability(r);
+      }
+      a_j = prng->SelectSample(ss);
+      f_a_j = (a_j ? asp.src[a_j - 1] : kNULL);
+      InstantiateRule(f_a_j, asp.trg[j], &r);
+      if (tmodel.IncrementRule(r, &*prng))
+        up0.Increment(r);
+    }
+  }
+  cerr << "  LLH = " << tmodel.Likelihood() << endl;
+}
+
+void ExtractLetters(const set<WordID>& v, vector<vector<WordID> >* l, set<WordID>* letset = NULL) {
+  for (set<WordID>::const_iterator it = v.begin(); it != v.end(); ++it) {
+    vector<WordID>& letters = (*l)[*it];
+    if (letters.size()) continue;   // if e and f have the same word
+
+    const string& w = TD::Convert(*it);
+
+    size_t cur = 0;
+    while (cur < w.size()) {
+      const size_t len = UTF8Len(w[cur]);
+      letters.push_back(TD::Convert(w.substr(cur, len)));
+      if (letset) letset->insert(letters.back());
+      cur += len;
+    }
+  }
+}
+
+void Debug(const AlignedSentencePair& asp) {
+  cerr << TD::GetString(asp.src) << endl << TD::GetString(asp.trg) << endl;
+  Array2D<bool> a(asp.src.size(), asp.trg.size());
+  for (unsigned j = 0; j < asp.trg.size(); ++j)
+    if (asp.a[j].src_index) a(asp.a[j].src_index - 1, j) = true;
+  cerr << a << endl;
+}
+
+void AddSample(AlignedSentencePair* asp) {
+  for (unsigned j = 0; j < asp->trg.size(); ++j)
+    asp->posterior(asp->a[j].src_index, j)++;
+}
+
+void WriteAlignments(const AlignedSentencePair& asp) {
+  bool first = true;
+  for (unsigned j = 0; j < asp.trg.size(); ++j) {
+    int src_index = -1;
+    int mc = -1;
+    for (unsigned i = 0; i <= asp.src.size(); ++i) {
+      if (asp.posterior(i, j) > mc) {
+        mc = asp.posterior(i, j);
+        src_index = i;
+      }
+    }
+
+    if (src_index) {
+      if (first) first = false; else cout << ' ';
+      cout << (src_index - 1) << '-' << j;
+    }
+  }
+  cout << endl;
+}
+
+int main(int argc, char** argv) {
+  po::variables_map conf;
+  InitCommandLine(argc, argv, &conf);
+
+  if (conf.count("random_seed"))
+    prng.reset(new MT19937(conf["random_seed"].as<uint32_t>()));
+  else
+    prng.reset(new MT19937);
+//  MT19937& rng = *prng;
+
+  vector<vector<WordID> > corpuse, corpusf;
+  set<WordID> vocabe, vocabf;
+  corpus::ReadParallelCorpus(conf["input"].as<string>(), &corpusf, &corpuse, &vocabf, &vocabe);
+  cerr << "f-Corpus size: " << corpusf.size() << " sentences\n";
+  cerr << "f-Vocabulary size: " << vocabf.size() << " types\n";
+  cerr << "e-Corpus size: " << corpuse.size() << " sentences\n";
+  cerr << "e-Vocabulary size: " << vocabe.size() << " types\n";
+  assert(corpusf.size() == corpuse.size());
+
+  vector<AlignedSentencePair> corpus(corpuse.size());
+  for (unsigned i = 0; i < corpuse.size(); ++i) {
+    corpus[i].src.swap(corpusf[i]);
+    corpus[i].trg.swap(corpuse[i]);
+    corpus[i].posterior.resize(corpus[i].src.size() + 1, corpus[i].trg.size());
+  }
+  corpusf.clear(); corpuse.clear();
+
+  vocabf.insert(TD::Convert("NULL"));
+  vector<vector<WordID> > letters(TD::NumWords());
+  set<WordID> letset;
+  ExtractLetters(vocabe, &letters, &letset);
+  ExtractLetters(vocabf, &letters, NULL);
+  letters[TD::Convert("NULL")].clear();
+
+  BasicLexicalAlignment x(letters, vocabe.size(), letset.size(), &corpus);
+  x.InitializeRandom();
+  const unsigned samples = conf["samples"].as<unsigned>();
+  for (unsigned i = 0; i < samples; ++i) {
+    for (int j = 65; j < 67; ++j) Debug(corpus[j]);
+    cerr << i << "\t" << x.tmodel.r.size() << "\t";
+    if (i % 10 == 0) x.ResampleHyperparameters();
+    x.ResampleCorpus();
+    if (i > (samples / 5) && (i % 10 == 9)) for (unsigned j = 0; j < corpus.size(); ++j) AddSample(&corpus[j]);
+  }
+  for (unsigned i = 0; i < corpus.size(); ++i)
+    WriteAlignments(corpus[i]);
+  //ModelAndData posterior(x, &corpus, vocabe, vocabf);
+  x.tmodel.Summary();
+  x.up0.Summary();
+
+  //posterior.Sample();
+
+  return 0;
+}
diff --git a/gi/pf/base_measures.cc b/gi/pf/base_measures.cc
index 97b4e698..7894d3e7 100644
--- a/gi/pf/base_measures.cc
+++ b/gi/pf/base_measures.cc
@@ -6,6 +6,53 @@
 
 using namespace std;
 
+TableLookupBase::TableLookupBase(const string& fname) {
+  cerr << "TableLookupBase reading from " << fname << " ..." << endl;
+  ReadFile rf(fname);
+  istream& in = *rf.stream();
+  string line;
+  unsigned lc = 0;
+  const WordID kDIV = TD::Convert("|||");
+  vector<WordID> tmp;
+  vector<WordID> le, lf;
+  TRule x;
+  x.lhs_ = -TD::Convert("X");
+  bool flag = false;
+  while(getline(in, line)) {
+    ++lc;
+    if (lc % 1000000 == 0) { cerr << " [" << lc << ']' << endl; flag = false; }
+    else if (lc % 25000 == 0) { cerr << '.' << flush; flag = true; }
+    tmp.clear();
+    TD::ConvertSentence(line, &tmp);
+    x.f_.clear();
+    x.e_.clear();
+    size_t pos = 0;
+    int cc = 0;
+    while(pos < tmp.size()) {
+      const WordID cur = tmp[pos++];
+      if (cur == kDIV) {
+        ++cc;
+      } else if (cc == 0) {
+        x.f_.push_back(cur);
+      } else if (cc == 1) {
+        x.e_.push_back(cur);
+      } else if (cc == 2) {
+        table[x] = atof(TD::Convert(cur));
+        ++cc;
+      } else {
+        if (flag) cerr << endl;
+        cerr << "Bad format in " << lc << ": " << line << endl; abort();
+      }
+    }
+    if (cc != 3) {
+      if (flag) cerr << endl;
+      cerr << "Bad format in " << lc << ": " << line << endl; abort();
+    }
+  }
+  if (flag) cerr << endl;
+  cerr << " read " << lc << " entries\n";
+}
+
 prob_t PhraseConditionalUninformativeUnigramBase::p0(const vector<WordID>& vsrc,
                                                      const vector<WordID>& vtrg,
                                                      int start_src, int start_trg) const {
diff --git a/gi/pf/base_measures.h b/gi/pf/base_measures.h
index a4e9ac28..7214aa22 100644
--- a/gi/pf/base_measures.h
+++ b/gi/pf/base_measures.h
@@ -72,6 +72,24 @@ struct UnigramWordBase {
   const UnigramWordModel un;
 };
 
+struct RuleHasher {
+  size_t operator()(const TRule& r) const {
+    return hash_value(r);
+  }
+};
+
+struct TableLookupBase {
+  TableLookupBase(const std::string& fname);
+
+  prob_t operator()(const TRule& rule) const {
+    const std::tr1::unordered_map<TRule,prob_t,RuleHasher>::const_iterator it = table.find(rule);
+    assert(it != table.end());
+    return it->second;
+  }
+
+  std::tr1::unordered_map<TRule,prob_t,RuleHasher> table;
+};
+
 struct PhraseConditionalUninformativeBase {
   explicit PhraseConditionalUninformativeBase(const unsigned vocab_e_size) :
       kUNIFORM_TARGET(1.0 / vocab_e_size) {
diff --git a/gi/pf/conditional_pseg.h b/gi/pf/conditional_pseg.h
index edcdc813..db951d15 100644
--- a/gi/pf/conditional_pseg.h
+++ b/gi/pf/conditional_pseg.h
@@ -8,10 +8,84 @@
 #include "prob.h"
 #include "ccrp_nt.h"
+#include "mfcr.h"
 #include "trule.h"
 #include "base_measures.h"
 #include "tdict.h"
 
+template <typename ConditionalBaseMeasure>
+struct MConditionalTranslationModel {
+  explicit MConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
+    rp0(rcp0), lambdas(1, 1.0), p0s(1) {}
+
+  void Summary() const {
+    std::cerr << "Number of conditioning contexts: " << r.size() << std::endl;
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      std::cerr << TD::GetString(it->first) << "   \t(d=" << it->second.d() << ",\\alpha = " << it->second.alpha() << ") --------------------------" << std::endl;
+      for (MFCR<TRule>::const_iterator i2 = it->second.begin(); i2 != it->second.end(); ++i2)
+        std::cerr << "    " << -1 << '\t' << i2->first << std::endl;
+    }
+  }
+
+  void ResampleHyperparameters(MT19937* rng) {
+    for (RuleModelHash::iterator it = r.begin(); it != r.end(); ++it)
+      it->second.resample_hyperparameters(rng);
+  }
+
+  int DecrementRule(const TRule& rule, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    assert(it != r.end());
+    const TableCount delta = it->second.decrement(rule, rng);
+    if (delta.count) {
+      if (it->second.num_customers() == 0) r.erase(it);
+    }
+    return delta.count;
+  }
+
+  int IncrementRule(const TRule& rule, MT19937* rng) {
+    RuleModelHash::iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      it = r.insert(make_pair(rule.f_, MFCR<TRule>(1, 1.0, 1.0, 1.0, 1.0, 1e-9, 4.0))).first;
+    }
+    p0s[0] = rp0(rule).as_float();
+    TableCount delta = it->second.increment(rule, p0s, lambdas, rng);
+    return delta.count;
+  }
+
+  prob_t RuleProbability(const TRule& rule) const {
+    prob_t p;
+    RuleModelHash::const_iterator it = r.find(rule.f_);
+    if (it == r.end()) {
+      p.logeq(log(rp0(rule)));
+    } else {
+      p0s[0] = rp0(rule).as_float();
+      p = prob_t(it->second.prob(rule, p0s, lambdas));
+    }
+    return p;
+  }
+
+  prob_t Likelihood() const {
+    prob_t p = prob_t::One();
+#if 0
+    for (RuleModelHash::const_iterator it = r.begin(); it != r.end(); ++it) {
+      prob_t q; q.logeq(it->second.log_crp_prob());
+      p *= q;
+      for (CCRP_NoTable<TRule>::const_iterator i2 = it->second.begin();
+           i2 != it->second.end(); ++i2)
+        p *= rp0(i2->first);
+    }
+#endif
+    return p;
+  }
+
+  const ConditionalBaseMeasure& rp0;
+  typedef std::tr1::unordered_map<std::vector<WordID>,
+                                  MFCR<TRule>,
+                                  boost::hash<std::vector<WordID> > > RuleModelHash;
+  RuleModelHash r;
+  std::vector<double> lambdas;
+  mutable std::vector<double> p0s;
+};
+
 template <typename ConditionalBaseMeasure>
 struct ConditionalTranslationModel {
   explicit ConditionalTranslationModel(ConditionalBaseMeasure& rcp0) :
diff --git a/word-aligner/stemmers/ur.pl b/word-aligner/stemmers/ur.pl
new file mode 100755
index 00000000..3a4f5a45
--- /dev/null
+++ b/word-aligner/stemmers/ur.pl
@@ -0,0 +1,38 @@
+#!/usr/bin/perl -w
+
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT,":utf8");
+
+my $vocab = undef;
+if (scalar @ARGV > 0) {
+  die "Only allow --vocab" unless ($ARGV[0] eq '--vocab' && scalar @ARGV == 1);
+  $vocab = 1;
+}
+
+my %dict;
+while(<STDIN>) {
+  chomp;
+  my @words = split /\s+/;
+  my @out = ();
+  for my $w (@words) {
+    my $tw = $dict{$w};
+    if (!defined $tw) {
+      my $el = 4;
+      if ($w =~ /^(al|Al)/) { $el++; }
+      if ($el > length($w)) { $el = length($w); }
+      $tw = substr $w, 0, $el;
+      $dict{$w} = $tw;
+    }
+    push @out, $tw;
+  }
+  if ($vocab) {
+    die "Expected exactly one word per line with --vocab: $_" unless scalar @out == 1;
+    print "$_ @out\n";
+  } else {
+    print "@out\n";
+  }
+}
+
--
cgit v1.2.3
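
The new word-aligner/stemmers/ur.pl prefix-stems each whitespace-separated token: it keeps the first four characters (five when the token begins with "al" or "Al", and never more than the whole word) and memoizes the result in %dict. Below is a minimal standalone sketch of that truncation rule; the stem() helper and the sample words are illustrative only and are not part of the commit.

#!/usr/bin/perl -w
use strict;
use utf8;

# Same truncation rule as word-aligner/stemmers/ur.pl: keep the first
# 4 characters, 5 if the word begins with "al"/"Al", capped at the
# length of the word itself.
sub stem {
  my ($w) = @_;
  my $el = 4;
  $el++ if $w =~ /^(al|Al)/;
  $el = length($w) if $el > length($w);
  return substr $w, 0, $el;
}

# Hypothetical inputs, chosen only to exercise the three branches.
for my $w (qw(Alhamra karachi ki)) {
  print "$w -> ", stem($w), "\n";
}

On these inputs the sketch prints Alham, kara, and ki, which matches what piping the same words through ur.pl one per line would emit.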