working on the mpi version

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@257 ec762483-ff6d-05da-a07a-a48fb63a330f
author: philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 03:39:32 +0000
committer: philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 03:39:32 +0000
commit: 901d45bb36e1395018de8460e4214cc9561f5b32 (patch)
tree: 2773688f7128b45c1c6ce59adb5a9fe4f5e509db /gi/pyp-topics/src
parent: 48c6d106041ae1172d31a5252e13b38f973c4396 (diff)
7 files changed, 166 insertions, 148 deletions
diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am
index a3a30acd..c22819db 100644
--- a/gi/pyp-topics/src/Makefile.am
+++ b/gi/pyp-topics/src/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = pyp-topics-train pyp-contexts-train mpi-pyp-contexts-train
+bin_PROGRAMS = pyp-topics-train pyp-contexts-train #mpi-pyp-contexts-train
 
 contexts_lexer.cc: contexts_lexer.l
 	$(LEX) -s -CF -8 -o$@ $<
@@ -9,8 +9,8 @@ pyp_topics_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
 pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc pyp-topics.cc contexts_lexer.cc contexts_corpus.cc train-contexts.cc
 pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
 
-mpi_pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc mpi-pyp-topics.cc contexts_lexer.cc contexts_corpus.cc mpi-train-contexts.cc
-mpi_pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
+#mpi_pyp_contexts_train_SOURCES = mt19937ar.c corpus.cc gzstream.cc mpi-pyp-topics.cc contexts_lexer.cc contexts_corpus.cc mpi-train-contexts.cc
+#mpi_pyp_contexts_train_LDADD = $(top_srcdir)/decoder/libcdec.a -lz
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops
 
diff --git a/gi/pyp-topics/src/Makefile.mpi b/gi/pyp-topics/src/Makefile.mpi
new file mode 100644
index 00000000..b22cc7e7
--- /dev/null
+++ b/gi/pyp-topics/src/Makefile.mpi
@@ -0,0 +1,25 @@
+BLD_ARCH=$(shell uname -s)
+-include macros.${BLD_ARCH}
+
+local_objs = mt19937ar.o corpus.o gzstream.o mpi-pyp-topics.o contexts_lexer.o contexts_corpus.o mpi-train-contexts.o
+
+all: mpi-pyp-contexts-train
+
+-include makefile.depend
+
+#-----------------------#
+# Link the MPI trainer; objects are built in this directory by make's implicit rules
+#-----------------------#
+
+mpi-pyp-contexts-train: $(local_objs)
+	$(CXX) -o $@ $^ $(LDFLAGS)
+
+.PHONY: all depend clean
+depend:
+	$(CXX) -MM $(CXXFLAGS) *.cc *.c > makefile.depend
+
+clean:
+	rm -f $(local_objs) mpi-pyp-contexts-train
+
+#clobber: clean
+#	rm makefile.depend ../bin/${ARCH}/*
diff --git a/gi/pyp-topics/src/macros.Linux b/gi/pyp-topics/src/macros.Linux
new file mode 100644
index 00000000..ade6d92d
--- /dev/null
+++ b/gi/pyp-topics/src/macros.Linux
@@ -0,0 +1,20 @@
+#CC=gcc-4.1
+#CXX=g++-4.1
+#LD=g++-4.1
+#FC=gfortran-4.1
+CC = mpicc
+CXX = mpicxx
+LD = mpicxx
+FC = mpif77
+
+PACKAGES ?= /home/pblunsom/packages
+CXXFLAGS = -Wall -I$(PACKAGES)/include
+CFLAGS = -Wall -I$(PACKAGES)/include
+FFLAGS = -Wall
+LDFLAGS = -L../../../decoder -L$(PACKAGES)/lib -lcdec \
+	-lboost_program_options -lboost_mpi -lboost_serialization -lboost_regex -lz -lm
+
+FFLAGS += -g -O6  -march=native
+CFLAGS += -g -O6  -march=native
+CXXFLAGS += -g -O6  -march=native
+LDFLAGS += -g -O6  -march=native
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc
index d2daad4f..2ad28278 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.cc
+++ b/gi/pyp-topics/src/mpi-pyp-topics.cc
@@ -1,3 +1,5 @@
+#include <boost/mpi/communicator.hpp>
+
 #include "timing.h"
 #include "mpi-pyp-topics.hh"
 
@@ -6,37 +8,51 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
                               int freq_cutoff_start, int freq_cutoff_end,
                               int freq_cutoff_interval,
                               int max_contexts_per_document) {
+  std::cout << "I am process " << m_rank << " of " << m_size << "." << std::endl;
   Timer timer;
+  std::cout << m_am_root << std::endl;
+
+  int documents = corpus.num_documents();
+  m_mpi_start = 0;
+  m_mpi_end = documents;
+  if (m_size != 1) {
+      assert(documents < std::numeric_limits<int>::max());
+      m_mpi_start = (documents / m_size) * m_rank;
+      if (m_rank == m_size-1) m_mpi_end = documents;
+      else m_mpi_end = (documents / m_size)*(m_rank+1);
+  }
+  int local_documents = m_mpi_end - m_mpi_start;
+
 
   if (!m_backoff.get()) {
     m_word_pyps.clear();
     m_word_pyps.push_back(PYPs());
   }
 
-  std::cerr << "\n Training with " << m_word_pyps.size()-1 << " backoff level"
+  if (m_am_root) std::cerr << "\n Training with " << m_word_pyps.size()-1 << " backoff level"
     << (m_word_pyps.size()==2 ? ":" : "s:") << std::endl;
 
   for (int i=0; i<(int)m_word_pyps.size(); ++i)
   {
     m_word_pyps.at(i).reserve(m_num_topics);
     for (int j=0; j<m_num_topics; ++j)
-      m_word_pyps.at(i).push_back(new PYP<int>(0.5, 1.0, m_seed));
+      m_word_pyps.at(i).push_back(new PYP<int>(0.5, 1.0));
   }
-  std::cerr << std::endl;
+  if (m_am_root) std::cerr << std::endl;
 
   m_document_pyps.reserve(corpus.num_documents());
   for (int j=0; j<corpus.num_documents(); ++j)
-    m_document_pyps.push_back(new PYP<int>(0.5, 1.0, m_seed));
+    m_document_pyps.push_back(new PYP<int>(0.5, 1.0));
 
   m_topic_p0 = 1.0/m_num_topics;
   m_term_p0 = 1.0/corpus.num_types();
   m_backoff_p0 = 1.0/corpus.num_documents();
 
-  std::cerr << " Documents: " << corpus.num_documents() << " Terms: "
+  if (m_am_root) std::cerr << " Documents: " << corpus.num_documents() << " Terms: "
     << corpus.num_types() << std::endl;
 
   int frequency_cutoff = freq_cutoff_start;
-  std::cerr << " Context frequency cutoff set to " << frequency_cutoff << std::endl;
+  if (m_am_root) std::cerr << " Context frequency cutoff set to " << frequency_cutoff << std::endl;
 
   timer.Reset();
   // Initialisation pass
@@ -74,11 +90,11 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
       m_corpus_topics[document_id][term_index] = new_topic;
     }
   }
-  std::cerr << "  Initialized in " << timer.Elapsed() << " seconds\n";
+  if (m_am_root) std::cerr << "  Initialized in " << timer.Elapsed() << " seconds\n";
 
-  int* randomDocIndices = new int[corpus.num_documents()];
-  for (int i = 0; i < corpus.num_documents(); ++i)
-	  randomDocIndices[i] = i;
+  int* randomDocIndices = new int[local_documents];
+  for (int i = 0; i < local_documents; ++i)
+	  randomDocIndices[i] = i+m_mpi_start;
 
   // Sampling phase
   for (int curr_sample=0; curr_sample < samples; ++curr_sample) {
@@ -86,16 +102,15 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
         && curr_sample % freq_cutoff_interval == 1
         && frequency_cutoff > freq_cutoff_end) {
       frequency_cutoff--;
-      std::cerr << "\n Context frequency cutoff set to " << frequency_cutoff << std::endl;
+      if (m_am_root) std::cerr << "\n Context frequency cutoff set to " << frequency_cutoff << std::endl;
     }
 
-    std::cerr << "\n  -- Sample " << curr_sample << " "; std::cerr.flush();
+    if (m_am_root) std::cerr << "\n  -- Sample " << curr_sample << " "; std::cerr.flush();
 
     // Randomize the corpus indexing array
     int tmp;
     int processed_terms=0;
-    for (int i = corpus.num_documents()-1; i > 0; --i)
-    {
+    for (int i = local_documents-1; i > 0; --i) {
         //i+1 since j \in [0,i] but rnd() \in [0,1)
     	int j = (int)(rnd() * (i+1));
       assert(j >= 0 && j <= i);
@@ -106,7 +121,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
 
     // for each document in the corpus
     int document_id;
-    for (int i=0; i<corpus.num_documents(); ++i) {
+    for (int i=0; i<local_documents; ++i) {
     	document_id = randomDocIndices[i];
 
       // for each term in the document
@@ -151,15 +166,16 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
         else m_document_pyps[document_id].increment(new_topic, m_topic_p0);
       }
       if (document_id && document_id % 10000 == 0) {
-        std::cerr << "."; std::cerr.flush();
+        if (m_am_root) std::cerr << "."; std::cerr.flush();
       }
     }
-    std::cerr << " ||| sampled " << processed_terms << " terms.";
+    m_world.barrier();
+    if (m_am_root) std::cerr << " ||| sampled " << processed_terms << " terms.";
 
     if (curr_sample != 0 && curr_sample % 10 == 0) {
-      std::cerr << " ||| time=" << (timer.Elapsed() / 10.0) << " sec/sample" << std::endl;
+      if (m_am_root) std::cerr << " ||| time=" << (timer.Elapsed() / 10.0) << " sec/sample" << std::endl;
       timer.Reset();
-      std::cerr << "     ... Resampling hyperparameters (" << max_threads << " threads)"; std::cerr.flush();
+      if (m_am_root) std::cerr << "     ... Resampling hyperparameters"; std::cerr.flush();
 
       // resample the hyperparamters
       F log_p=0.0;
@@ -172,21 +188,10 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
         }
       }
 
-      WorkerPtrVect workers;
-      for (int i = 0; i < max_threads; ++i)
-      {
-        JobReturnsF job = boost::bind(&PYPTopics::hresample_docs, this, max_threads, i);
-        workers.push_back(new SimpleResampleWorker(job));
-      }
-
-      WorkerPtrVect::iterator workerIt;
-      for (workerIt = workers.begin(); workerIt != workers.end(); ++workerIt)
-      {
-        //std::cerr << "Retrieving worker result.."; std::cerr.flush();
-        F wresult = workerIt->getResult(); //blocks until worker done
-        log_p += wresult;
-        //std::cerr << ".. got " << wresult << std::endl; std::cerr.flush();
-
+      for (PYPs::iterator pypIt=m_document_pyps.begin();
+           pypIt != m_document_pyps.end(); ++pypIt) {
+        pypIt->resample_prior();
+        log_p += pypIt->log_restaurant_prob();
       }
 
       if (m_use_topic_pyp) {
@@ -195,64 +200,25 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,
       }
 
       std::cerr.precision(10);
-      std::cerr << " ||| LLH=" << log_p << " ||| resampling time=" << timer.Elapsed() << " sec" << std::endl;
+      if (m_am_root) std::cerr << " ||| LLH=" << log_p << " ||| resampling time=" << timer.Elapsed() << " sec" << std::endl;
       timer.Reset();
 
       int k=0;
-      std::cerr << "Topics distribution: ";
+      if (m_am_root) std::cerr << "Topics distribution: ";
       std::cerr.precision(2);
       for (PYPs::iterator pypIt=m_word_pyps.front().begin();
            pypIt != m_word_pyps.front().end(); ++pypIt, ++k) {
-        if (k % 5 == 0) std::cerr << std::endl << '\t';
-        std::cerr << "<" << k << ":" << pypIt->num_customers() << ","
+        if (m_am_root && k % 5 == 0) std::cerr << std::endl << '\t';
+        if (m_am_root) std::cerr << "<" << k << ":" << pypIt->num_customers() << ","
           << pypIt->num_types() << "," << m_topic_pyp.prob(k, m_topic_p0) << "> ";
       }
       std::cerr.precision(4);
-      std::cerr << std::endl;
+      if (m_am_root) std::cerr << std::endl;
     }
   }
   delete [] randomDocIndices;
 }
 
-PYPTopics::F PYPTopics::hresample_docs(int num_threads, int thread_id)
-{
-  int resample_counter=0;
-  F log_p = 0.0;
-  PYPs::iterator pypIt = m_document_pyps.begin();
-  PYPs::iterator end = m_document_pyps.end();
-  pypIt += thread_id;
-//  std::cerr << thread_id << " started " << std::endl; std::cerr.flush();
-
-  while (pypIt < end)
-  {
-    pypIt->resample_prior();
-    log_p += pypIt->log_restaurant_prob();
-    if (resample_counter++ % 5000 == 0) {
-      std::cerr << "."; std::cerr.flush();
-    }
-    pypIt += num_threads;
-  }
-//  std::cerr << thread_id << " did " << resample_counter << " with answer " << log_p << std::endl; std::cerr.flush();
-
-  return log_p;
-}
-
-//PYPTopics::F PYPTopics::hresample_topics()
-//{
-//  F log_p = 0.0;
-//  for (std::vector<PYPs>::iterator levelIt=m_word_pyps.begin();
-//      levelIt != m_word_pyps.end(); ++levelIt) {
-//    for (PYPs::iterator pypIt=levelIt->begin();
-//        pypIt != levelIt->end(); ++pypIt) {
-//
-//      pypIt->resample_prior();
-//      log_p += pypIt->log_restaurant_prob();
-//    }
-//  }
-//  //std::cerr << "topicworker has answer " << log_p << std::endl; std::cerr.flush();
-//
-// return log_p;
-//}
 
 void PYPTopics::decrement(const Term& term, int topic, int level) {
   //std::cerr << "PYPTopics::decrement(" << term << "," << topic << "," << level << ")" << std::endl;
diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh
index d978c7a1..5da35d82 100644
--- a/gi/pyp-topics/src/mpi-pyp-topics.hh
+++ b/gi/pyp-topics/src/mpi-pyp-topics.hh
@@ -3,15 +3,16 @@
 
 #include <vector>
 #include <iostream>
-#include <boost/ptr_container/ptr_vector.hpp>
 
+#include <boost/ptr_container/ptr_vector.hpp>
 #include <boost/random/uniform_real.hpp>
 #include <boost/random/variate_generator.hpp>
 #include <boost/random/mersenne_twister.hpp>
+#include <boost/mpi/environment.hpp>
+#include <boost/mpi/communicator.hpp>
 
 #include "mpi-pyp.hh"
 #include "corpus.hh"
-#include "workers.hh"
 
 class PYPTopics {
 public:
@@ -20,13 +21,17 @@ public:
   typedef double F;
 
 public:
-  PYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0,
-        int max_threads = 1) 
+  PYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0) 
     : m_num_topics(num_topics), m_word_pyps(1), 
-    m_topic_pyp(0.5,1.0,seed), m_use_topic_pyp(use_topic_pyp),
+    m_topic_pyp(0.5,1.0), m_use_topic_pyp(use_topic_pyp),
     m_seed(seed),
     uni_dist(0,1), rng(seed == 0 ? (unsigned long)this : seed), 
-    rnd(rng, uni_dist), max_threads(max_threads) {}
+    rnd(rng, uni_dist), m_mpi_start(-1), m_mpi_end(-1) {
+      // Query the m_world member (default-constructed, attached to MPI_COMM_WORLD); a local would shadow it.
+      m_rank = m_world.rank(); 
+      m_size = m_world.size();
+      m_am_root = (m_rank == 0);
+    }
 
   void sample_corpus(const Corpus& corpus, int samples,
                      int freq_cutoff_start=0, int freq_cutoff_end=0, 
@@ -81,17 +86,12 @@ private:
   gen_type rnd; //instantiate: rnd(rng, uni_dist)
                 //call: rnd() generates uniform on [0,1)
 
-  typedef boost::function<F()> JobReturnsF;
-  typedef SimpleWorker<JobReturnsF, F> SimpleResampleWorker;
-  typedef boost::ptr_vector<SimpleResampleWorker> WorkerPtrVect;
-
-  F hresample_docs(int num_threads, int thread_id);
-
-//  F hresample_topics();
-  
-  int max_threads;
-
   TermBackoffPtr m_backoff;
+
+  boost::mpi::communicator m_world;
+  bool m_am_root;
+  int m_rank, m_size;
+  int m_mpi_start, m_mpi_end;
 };
 
 #endif // PYP_TOPICS_HH
diff --git a/gi/pyp-topics/src/mpi-pyp.hh b/gi/pyp-topics/src/mpi-pyp.hh
index dc47244b..3396f92b 100644
--- a/gi/pyp-topics/src/mpi-pyp.hh
+++ b/gi/pyp-topics/src/mpi-pyp.hh
@@ -32,7 +32,7 @@ public:
 //  using google::sparse_hash_map<Dish,int>::begin;
 //  using google::sparse_hash_map<Dish,int>::end;
 
-  PYP(double a, double b, unsigned long seed = 0, Hash hash=Hash());
+  PYP(double a, double b, Hash hash=Hash());
 
   int increment(Dish d, double p0);
   int decrement(Dish d);
@@ -153,7 +153,7 @@ private:
 };
 
 template <typename Dish, typename Hash>
-PYP<Dish,Hash>::PYP(double a, double b, unsigned long seed, Hash)
+PYP<Dish,Hash>::PYP(double a, double b, Hash)
 : std::tr1::unordered_map<Dish, int, Hash>(10), _a(a), _b(b), 
 //: google::sparse_hash_map<Dish, int, Hash>(10), _a(a), _b(b), 
   _a_beta_a(1), _a_beta_b(1), _b_gamma_s(1), _b_gamma_c(1),
diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc
index 6309fe93..956ce123 100644
--- a/gi/pyp-topics/src/mpi-train-contexts.cc
+++ b/gi/pyp-topics/src/mpi-train-contexts.cc
@@ -8,6 +8,8 @@
 #include <boost/program_options/parsers.hpp>
 #include <boost/program_options/variables_map.hpp>
 #include <boost/scoped_ptr.hpp>
+#include <boost/mpi/environment.hpp>
+#include <boost/mpi/communicator.hpp>
 
 // Local
 #include "mpi-pyp-topics.hh"
@@ -24,8 +26,12 @@ using namespace std;
 
 int main(int argc, char **argv)
 {
- cout << "Pitman Yor topic models: Copyright 2010 Phil Blunsom\n";
- cout << REVISION << '\n' <<endl;
+  mpi::environment env(argc, argv);
+  mpi::communicator world;
+  bool am_root = (world.rank() == 0);
+  if (am_root) std::cout << "I am process " << world.rank() << " of " << world.size() << "." << std::endl;
+  if (am_root) cout << "Pitman Yor topic models: Copyright 2010 Phil Blunsom\n";
+  if (am_root) cout << REVISION << '\n' <<endl;
 
   ////////////////////////////////////////////////////////////////////////////////////////////
   // Command line processing
@@ -53,7 +59,6 @@ int main(int argc, char **argv)
       ("freq-cutoff-start", value<int>()->default_value(0), "initial frequency cutoff.")
       ("freq-cutoff-end", value<int>()->default_value(0), "final frequency cutoff.")
       ("freq-cutoff-interval", value<int>()->default_value(0), "number of iterations between frequency decrement.")
-      ("max-threads", value<int>()->default_value(1), "maximum number of simultaneous threads allowed")
       ("max-contexts-per-document", value<int>()->default_value(0), "Only sample the n most frequent contexts for a document.")
       ;
 
@@ -81,7 +86,7 @@ int main(int argc, char **argv)
 
   // seed the random number generator: 0 = automatic, specify value otherwise
   unsigned long seed = 0; 
-  PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed, vm["max-threads"].as<int>());
+  PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed);
 
   // read the data
   BackoffGenerator* backoff_gen=0;
@@ -112,58 +117,60 @@ int main(int argc, char **argv)
                       vm["freq-cutoff-interval"].as<int>(),
                       vm["max-contexts-per-document"].as<int>());
 
-  if (vm.count("document-topics-out")) {
-    ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
-
-    int document_id=0;
-    map<int,int> all_terms;
-    for (Corpus::const_iterator corpusIt=contexts_corpus.begin(); 
-         corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) {
-      vector<int> unique_terms;
-      for (Document::const_iterator docIt=corpusIt->begin();
-           docIt != corpusIt->end(); ++docIt) {
-        if (unique_terms.empty() || *docIt != unique_terms.back())
-          unique_terms.push_back(*docIt);
-        // increment this terms frequency
-        pair<map<int,int>::iterator,bool> insert_result = all_terms.insert(make_pair(*docIt,1));
-        if (!insert_result.second) 
-          all_terms[*docIt] = all_terms[*docIt] + 1;
+  if (world.rank() == 0) {
+    if (vm.count("document-topics-out")) {
+      ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
+
+      int document_id=0;
+      map<int,int> all_terms;
+      for (Corpus::const_iterator corpusIt=contexts_corpus.begin(); 
+           corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) {
+        vector<int> unique_terms;
+        for (Document::const_iterator docIt=corpusIt->begin();
+             docIt != corpusIt->end(); ++docIt) {
+          if (unique_terms.empty() || *docIt != unique_terms.back())
+            unique_terms.push_back(*docIt);
+          // increment this terms frequency
+          pair<map<int,int>::iterator,bool> insert_result = all_terms.insert(make_pair(*docIt,1));
+          if (!insert_result.second) 
+            all_terms[*docIt] = all_terms[*docIt] + 1;
           //insert_result.first++;
+        }
+        documents_out << contexts_corpus.key(document_id) << '\t';
+        documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| ";
+        for (std::vector<int>::const_iterator termIt=unique_terms.begin();
+             termIt != unique_terms.end(); ++termIt) {
+          if (termIt != unique_terms.begin())
+            documents_out << " ||| ";
+          vector<std::string> strings = contexts_corpus.context2string(*termIt);
+          copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " "));
+          documents_out << "||| C=" << model.max(document_id, *termIt);
+
+        }
+        documents_out <<endl;
       }
-      documents_out << contexts_corpus.key(document_id) << '\t';
-      documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| ";
-      for (std::vector<int>::const_iterator termIt=unique_terms.begin();
-           termIt != unique_terms.end(); ++termIt) {
-        if (termIt != unique_terms.begin())
-          documents_out << " ||| ";
-       vector<std::string> strings = contexts_corpus.context2string(*termIt);
-       copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " "));
-        documents_out << "||| C=" << model.max(document_id, *termIt);
-
+      documents_out.close();
+
+      if (vm.count("default-topics-out")) {
+        ofstream default_topics(vm["default-topics-out"].as<string>().c_str());
+        default_topics << model.max_topic() <<endl;
+        for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
+          vector<std::string> strings = contexts_corpus.context2string(termIt->first);
+          default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| ";
+          copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));
+          default_topics <<endl;
+        }
       }
-      documents_out <<endl;
     }
-    documents_out.close();
-
-    if (vm.count("default-topics-out")) {
-      ofstream default_topics(vm["default-topics-out"].as<string>().c_str());
-      default_topics << model.max_topic() <<endl;
-      for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) {
-       vector<std::string> strings = contexts_corpus.context2string(termIt->first);
-        default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| ";
-       copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));
-        default_topics <<endl;
-      }
+
+    if (vm.count("topic-words-out")) {
+      ogzstream topics_out(vm["topic-words-out"].as<string>().c_str());
+      model.print_topic_terms(topics_out);
+      topics_out.close();
     }
-  }
 
-  if (vm.count("topic-words-out")) {
-    ogzstream topics_out(vm["topic-words-out"].as<string>().c_str());
-    model.print_topic_terms(topics_out);
-    topics_out.close();
+    cout <<endl;
   }
 
- cout <<endl;
-
   return 0;
 }
author	philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-15 03:39:32 +0000
committer	philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-15 03:39:32 +0000
commit	901d45bb36e1395018de8460e4214cc9561f5b32 (patch)
tree	2773688f7128b45c1c6ce59adb5a9fe4f5e509db /gi/pyp-topics/src
parent	48c6d106041ae1172d31a5252e13b38f973c4396 (diff)