From e0bca5fea3b0267819186d0fc34c036e6b77679c Mon Sep 17 00:00:00 2001
From: philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Fri, 23 Jul 2010 16:04:32 +0000
Subject: Changed the initialisation of the sampler; hopefully this will work
 better.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@376 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pyp-topics/src/contexts_corpus.hh    |  8 ++++----
 gi/pyp-topics/src/corpus.hh             |  2 +-
 gi/pyp-topics/src/makefile.depend       | 13 +++++++++----
 gi/pyp-topics/src/mpi-pyp-topics.cc     | 26 +++++++++++++++++---------
 gi/pyp-topics/src/mpi-pyp-topics.hh     |  8 ++++----
 gi/pyp-topics/src/mpi-train-contexts.cc | 20 ++++++++++++++------
 gi/pyp-topics/src/pyp-topics.cc         |  6 ++++--
 7 files changed, 53 insertions(+), 30 deletions(-)

(limited to 'gi/pyp-topics/src')

diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
index f3c25454..f7dad21f 100644
--- a/gi/pyp-topics/src/contexts_corpus.hh
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -53,9 +53,9 @@ public:
     ContextsCorpus() : m_backoff(new TermBackoff) {}
     virtual ~ContextsCorpus() {}
 
-    unsigned read_contexts(const std::string &filename, 
-                           BackoffGenerator* backoff_gen=0,
-                           bool filter_singeltons=false);
+    virtual unsigned read_contexts(const std::string &filename, 
+                                   BackoffGenerator* backoff_gen=0,
+                                   bool filter_singeltons=false);
 
     TermBackoffPtr backoff_index() {
       return m_backoff;
@@ -77,7 +77,7 @@ public:
       return m_keys.at(i);
     }
 
-private:
+protected:
     TermBackoffPtr m_backoff;
     Dict m_dict;
     std::vector<std::string> m_keys;
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
index 24981946..2aa03527 100644
--- a/gi/pyp-topics/src/corpus.hh
+++ b/gi/pyp-topics/src/corpus.hh
@@ -25,7 +25,7 @@ public:
     Corpus();
     virtual ~Corpus() {}
 
-    unsigned read(const std::string &filename);
+    virtual unsigned read(const std::string &filename);
 
     const_iterator begin() const { return m_documents.begin(); }
     const_iterator end() const { return m_documents.end(); }
diff --git a/gi/pyp-topics/src/makefile.depend b/gi/pyp-topics/src/makefile.depend
index d8ef78d8..9b8e306c 100644
--- a/gi/pyp-topics/src/makefile.depend
+++ b/gi/pyp-topics/src/makefile.depend
@@ -1442,6 +1442,9 @@ mpi-pyp-topics.o: mpi-pyp-topics.cc \
  /home/pblunsom/packages/include/boost/random/linear_congruential.hpp \
  /home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \
  /home/pblunsom/packages/include/boost/random/detail/seed.hpp \
+ /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \
+ /home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \
+ /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \
  /home/pblunsom/packages/include/boost/mpi/environment.hpp mpi-pyp.hh \
  /home/pblunsom/packages/include/boost/tuple/tuple.hpp \
  /home/pblunsom/packages/include/boost/ref.hpp \
@@ -2151,7 +2154,10 @@ mpi-train-contexts.o: mpi-train-contexts.cc \
  /home/pblunsom/packages/include/boost/random/mersenne_twister.hpp \
  /home/pblunsom/packages/include/boost/random/linear_congruential.hpp \
  /home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \
- /home/pblunsom/packages/include/boost/random/detail/seed.hpp mpi-pyp.hh \
+ /home/pblunsom/packages/include/boost/random/detail/seed.hpp \
+ /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \
+ /home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \
+ /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp mpi-pyp.hh \
  /home/pblunsom/packages/include/boost/tuple/tuple.hpp \
  /home/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \
  /home/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \
@@ -2239,14 +2245,13 @@ mpi-train-contexts.o: mpi-train-contexts.cc \
  /home/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \
  /home/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \
  /home/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \
- slice-sampler.h log_add.h mt19937ar.h corpus.hh contexts_corpus.hh \
- contexts_lexer.h ../../../decoder/dict.h \
+ slice-sampler.h log_add.h mt19937ar.h corpus.hh mpi-corpus.hh \
+ contexts_corpus.hh contexts_lexer.h ../../../decoder/dict.h \
  /home/pblunsom/packages/include/boost/functional/hash.hpp \
  /home/pblunsom/packages/include/boost/functional/hash/hash.hpp \
  /home/pblunsom/packages/include/boost/functional/hash/hash_fwd.hpp \
  /home/pblunsom/packages/include/boost/functional/hash/detail/hash_float.hpp \
  /home/pblunsom/packages/include/boost/functional/hash/detail/float_functions.hpp \
- /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \
  /home/pblunsom/packages/include/boost/functional/hash/detail/limits.hpp \
  /home/pblunsom/packages/include/boost/integer/static_log2.hpp \
  /home/pblunsom/packages/include/boost/functional/hash/detail/hash_float_generic.hpp \
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc
index fa951156..967c3a77 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.cc
+++ b/gi/pyp-topics/src/mpi-pyp-topics.cc
@@ -4,13 +4,14 @@
 #include "mpi-pyp-topics.hh"
 
 //#include <boost/date_time/posix_time/posix_time_types.hpp>
-void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,
+void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples,
                               int freq_cutoff_start, int freq_cutoff_end,
                               int freq_cutoff_interval,
                               int max_contexts_per_document) {
   Timer timer;
 
-  int documents = corpus.num_documents();
+  //int documents = corpus.num_documents();
+  /*
   m_mpi_start = 0;
   m_mpi_end = documents;
   if (m_size != 1) {
@@ -19,6 +20,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,
       if (m_rank == m_size-1) m_mpi_end = documents;
       else m_mpi_end = (documents / m_size)*(m_rank+1);
   }
+  */
+  corpus.bounds(&m_mpi_start, &m_mpi_end);
   int local_documents = m_mpi_end - m_mpi_start;
 
   if (!m_backoff.get()) {
@@ -74,7 +77,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,
       int new_topic = -1;
       if (freq > frequency_cutoff
           && (!max_contexts_per_document || term_index < max_contexts_per_document)) {
-        new_topic = document_id % m_num_topics;
+        new_topic = sample(document_id, term);
+        //new_topic = document_id % m_num_topics;
 
         // add the new topic to the PYPs
         increment(term, new_topic);
@@ -336,7 +340,8 @@ MPIPYPTopics::F MPIPYPTopics::word_pyps_p0(const Term& term, int topic, int leve
     Term backoff_term = (*m_backoff)[term];
     if (!m_backoff->is_null(backoff_term)) {
       assert (level < m_backoff->order());
-      p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+      //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+      p0 = prob(backoff_term, topic, level+1);
     }
     else
       p0 = m_term_p0;
@@ -373,10 +378,11 @@ int MPIPYPTopics::max_topic() const {
     }
   }
   assert(current_topic >= 0);
-  return current_topic;
+  assert(current_max >= 0);
+  return current_max;
 }
 
-int MPIPYPTopics::max(const DocumentId& true_doc) const {
+std::pair<int,MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc) const {
   //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;
   // collect probs
   F current_max=0.0;
@@ -399,10 +405,11 @@ int MPIPYPTopics::max(const DocumentId& true_doc) const {
     }
   }
   assert(current_topic >= 0);
-  return current_topic;
+  assert(current_max >= 0);
+  return std::make_pair(current_topic, current_max);
 }
 
-int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {
+std::pair<int,MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {
   //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;
   // collect probs
   F current_max=0.0;
@@ -426,7 +433,8 @@ int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {
     }
   }
   assert(current_topic >= 0);
-  return current_topic;
+  assert(current_max >= 0);
+  return std::make_pair(current_topic, current_max);
 }
 
 std::ostream& MPIPYPTopics::print_document_topics(std::ostream& out) const {
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh
index 4a4433e6..d96bc4e5 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.hh
+++ b/gi/pyp-topics/src/mpi-pyp-topics.hh
@@ -16,7 +16,7 @@
 
 
 #include "mpi-pyp.hh"
-#include "corpus.hh"
+#include "mpi-corpus.hh"
 
 class MPIPYPTopics {
 public:
@@ -37,14 +37,14 @@ public:
       m_am_root = (m_rank == 0);
     }
 
-  void sample_corpus(const Corpus& corpus, int samples,
+  void sample_corpus(const MPICorpus& corpus, int samples,
                      int freq_cutoff_start=0, int freq_cutoff_end=0, 
                      int freq_cutoff_interval=0,
                      int max_contexts_per_document=0);
 
   int sample(const DocumentId& doc, const Term& term);
-  int max(const DocumentId& doc, const Term& term) const;
-  int max(const DocumentId& doc) const;
+  std::pair<int,F> max(const DocumentId& doc, const Term& term) const;
+  std::pair<int,F> max(const DocumentId& doc) const;
   int max_topic() const;
 
   void set_backoff(const std::string& filename) {
diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc
index 4f966a65..7bb890d2 100644
--- a/gi/pyp-topics/src/mpi-train-contexts.cc
+++ b/gi/pyp-topics/src/mpi-train-contexts.cc
@@ -15,7 +15,7 @@
 // Local
 #include "mpi-pyp-topics.hh"
 #include "corpus.hh"
-#include "contexts_corpus.hh"
+#include "mpi-corpus.hh"
 #include "gzstream.hh"
 
 static const char *REVISION = "$Rev: 170 $";
@@ -105,8 +105,13 @@ int main(int argc, char **argv)
     }
   }
 
-  ContextsCorpus contexts_corpus;
+  //ContextsCorpus contexts_corpus;
+  MPICorpus contexts_corpus;
   contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false);
+  int mpi_start = 0, mpi_end = 0;
+  contexts_corpus.bounds(&mpi_start, &mpi_end);
+  std::cerr << "\tProcess " << rank << " has documents " << mpi_start << " -> " << mpi_end << "." << std::endl;
+
   model.set_backoff(contexts_corpus.backoff_index());
 
   if (backoff_gen) 
@@ -121,13 +126,15 @@ int main(int argc, char **argv)
 
   if (vm.count("document-topics-out")) {
     std::ofstream documents_out((vm["document-topics-out"].as<string>() + ".pyp-process-" + boost::lexical_cast<std::string>(rank)).c_str());
-    int documents = contexts_corpus.num_documents();
+    //int documents = contexts_corpus.num_documents();
+    /*
     int mpi_start = 0, mpi_end = documents;
     if (world.size() != 1) {
       mpi_start = (documents / world.size()) * rank;
       if (rank == world.size()-1) mpi_end = documents;
       else mpi_end = (documents / world.size())*(rank+1);
     }
+    */
 
     map<int,int> all_terms;
     for (int document_id=mpi_start; document_id<mpi_end; ++document_id) {
@@ -143,13 +150,14 @@ int main(int argc, char **argv)
           all_terms[*docIt] = all_terms[*docIt] + 1;
       }
       documents_out << contexts_corpus.key(document_id) << '\t';
-      documents_out << model.max(document_id) << " " << doc.size() << " ||| ";
+      documents_out << model.max(document_id).first << " " << doc.size() << " ||| ";
       for (std::vector<int>::const_iterator termIt=unique_terms.begin(); termIt != unique_terms.end(); ++termIt) {
         if (termIt != unique_terms.begin())
           documents_out << " ||| ";
         vector<std::string> strings = contexts_corpus.context2string(*termIt);
         copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " "));
-        documents_out << "||| C=" << model.max(document_id, *termIt);
+        std::pair<int,MPIPYPTopics::F> maxinfo = model.max(document_id, *termIt);
+        documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second;
       }
       documents_out <<endl;
     }
@@ -173,7 +181,7 @@ int main(int argc, char **argv)
       default_topics << model.max_topic() <<endl;
       for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
         vector<std::string> strings = contexts_corpus.context2string(termIt->first);
-        default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| ";
+        default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| ";
         copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));
         default_topics <<endl;
       }
diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc
index 06499291..4c777f0c 100644
--- a/gi/pyp-topics/src/pyp-topics.cc
+++ b/gi/pyp-topics/src/pyp-topics.cc
@@ -58,7 +58,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
       int new_topic = -1;
       if (freq > frequency_cutoff
           && (!max_contexts_per_document || term_index < max_contexts_per_document)) {
-        new_topic = document_id % m_num_topics;
+        new_topic = sample(document_id, term);
+        //new_topic = document_id % m_num_topics;
 
         // add the new topic to the PYPs
         increment(term, new_topic);
@@ -314,7 +315,8 @@ PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) con
     Term backoff_term = (*m_backoff)[term];
     if (!m_backoff->is_null(backoff_term)) {
       assert (level < m_backoff->order());
-      p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+      //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1);
+      p0 = prob(backoff_term, topic, level+1);
     }
     else
       p0 = m_term_p0;
-- 
cgit v1.2.3