diff options
| author | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 16:04:32 +0000 | 
|---|---|---|
| committer | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-23 16:04:32 +0000 | 
| commit | e0bca5fea3b0267819186d0fc34c036e6b77679c (patch) | |
| tree | 5461a308d1a0f848a692ddcff5852c2c8d880089 /gi | |
| parent | 04cce54639520ca6a8175194a463d0f5297b01b5 (diff) | |
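Changed the initialisation of the sampler: initial topics are now drawn with sample(document_id, term) instead of being assigned as document_id % m_num_topics. Hopefully this will work better.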
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@376 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
| -rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 8 | ||||
| -rw-r--r-- | gi/pyp-topics/src/corpus.hh | 2 | ||||
| -rw-r--r-- | gi/pyp-topics/src/makefile.depend | 13 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.cc | 26 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.hh | 8 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-train-contexts.cc | 20 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.cc | 6 | 
7 files changed, 53 insertions, 30 deletions
| diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index f3c25454..f7dad21f 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -53,9 +53,9 @@ public:      ContextsCorpus() : m_backoff(new TermBackoff) {}      virtual ~ContextsCorpus() {} -    unsigned read_contexts(const std::string &filename,  -                           BackoffGenerator* backoff_gen=0, -                           bool filter_singeltons=false); +    virtual unsigned read_contexts(const std::string &filename,  +                                   BackoffGenerator* backoff_gen=0, +                                   bool filter_singeltons=false);      TermBackoffPtr backoff_index() {        return m_backoff; @@ -77,7 +77,7 @@ public:        return m_keys.at(i);      } -private: +protected:      TermBackoffPtr m_backoff;      Dict m_dict;      std::vector<std::string> m_keys; diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh index 24981946..2aa03527 100644 --- a/gi/pyp-topics/src/corpus.hh +++ b/gi/pyp-topics/src/corpus.hh @@ -25,7 +25,7 @@ public:      Corpus();      virtual ~Corpus() {} -    unsigned read(const std::string &filename); +    virtual unsigned read(const std::string &filename);      const_iterator begin() const { return m_documents.begin(); }      const_iterator end() const { return m_documents.end(); } diff --git a/gi/pyp-topics/src/makefile.depend b/gi/pyp-topics/src/makefile.depend index d8ef78d8..9b8e306c 100644 --- a/gi/pyp-topics/src/makefile.depend +++ b/gi/pyp-topics/src/makefile.depend @@ -1442,6 +1442,9 @@ mpi-pyp-topics.o: mpi-pyp-topics.cc \   /home/pblunsom/packages/include/boost/random/linear_congruential.hpp \   /home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \   /home/pblunsom/packages/include/boost/random/detail/seed.hpp \ + /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \ + 
/home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \ + /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \   /home/pblunsom/packages/include/boost/mpi/environment.hpp mpi-pyp.hh \   /home/pblunsom/packages/include/boost/tuple/tuple.hpp \   /home/pblunsom/packages/include/boost/ref.hpp \ @@ -2151,7 +2154,10 @@ mpi-train-contexts.o: mpi-train-contexts.cc \   /home/pblunsom/packages/include/boost/random/mersenne_twister.hpp \   /home/pblunsom/packages/include/boost/random/linear_congruential.hpp \   /home/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ - /home/pblunsom/packages/include/boost/random/detail/seed.hpp mpi-pyp.hh \ + /home/pblunsom/packages/include/boost/random/detail/seed.hpp \ + /home/pblunsom/packages/include/boost/random/inversive_congruential.hpp \ + /home/pblunsom/packages/include/boost/random/lagged_fibonacci.hpp \ + /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp mpi-pyp.hh \   /home/pblunsom/packages/include/boost/tuple/tuple.hpp \   /home/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \   /home/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \ @@ -2239,14 +2245,13 @@ mpi-train-contexts.o: mpi-train-contexts.cc \   /home/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \   /home/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \   /home/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \ - slice-sampler.h log_add.h mt19937ar.h corpus.hh contexts_corpus.hh \ - contexts_lexer.h ../../../decoder/dict.h \ + slice-sampler.h log_add.h mt19937ar.h corpus.hh mpi-corpus.hh \ + contexts_corpus.hh contexts_lexer.h ../../../decoder/dict.h \   /home/pblunsom/packages/include/boost/functional/hash.hpp \   /home/pblunsom/packages/include/boost/functional/hash/hash.hpp \   /home/pblunsom/packages/include/boost/functional/hash/hash_fwd.hpp \   /home/pblunsom/packages/include/boost/functional/hash/detail/hash_float.hpp \   
/home/pblunsom/packages/include/boost/functional/hash/detail/float_functions.hpp \ - /home/pblunsom/packages/include/boost/config/no_tr1/cmath.hpp \   /home/pblunsom/packages/include/boost/functional/hash/detail/limits.hpp \   /home/pblunsom/packages/include/boost/integer/static_log2.hpp \   /home/pblunsom/packages/include/boost/functional/hash/detail/hash_float_generic.hpp \ diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc index fa951156..967c3a77 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.cc +++ b/gi/pyp-topics/src/mpi-pyp-topics.cc @@ -4,13 +4,14 @@  #include "mpi-pyp-topics.hh"  //#include <boost/date_time/posix_time/posix_time_types.hpp> -void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, +void MPIPYPTopics::sample_corpus(const MPICorpus& corpus, int samples,                                int freq_cutoff_start, int freq_cutoff_end,                                int freq_cutoff_interval,                                int max_contexts_per_document) {    Timer timer; -  int documents = corpus.num_documents(); +  //int documents = corpus.num_documents(); +  /*    m_mpi_start = 0;    m_mpi_end = documents;    if (m_size != 1) { @@ -19,6 +20,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,        if (m_rank == m_size-1) m_mpi_end = documents;        else m_mpi_end = (documents / m_size)*(m_rank+1);    } +  */ +  corpus.bounds(&m_mpi_start, &m_mpi_end);    int local_documents = m_mpi_end - m_mpi_start;    if (!m_backoff.get()) { @@ -74,7 +77,8 @@ void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,        int new_topic = -1;        if (freq > frequency_cutoff            && (!max_contexts_per_document || term_index < max_contexts_per_document)) { -        new_topic = document_id % m_num_topics; +        new_topic = sample(document_id, term); +        //new_topic = document_id % m_num_topics;          // add the new topic to the PYPs          increment(term, new_topic); 
@@ -336,7 +340,8 @@ MPIPYPTopics::F MPIPYPTopics::word_pyps_p0(const Term& term, int topic, int leve      Term backoff_term = (*m_backoff)[term];      if (!m_backoff->is_null(backoff_term)) {        assert (level < m_backoff->order()); -      p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); +      //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); +      p0 = prob(backoff_term, topic, level+1);      }      else        p0 = m_term_p0; @@ -373,10 +378,11 @@ int MPIPYPTopics::max_topic() const {      }    }    assert(current_topic >= 0); -  return current_topic; +  assert(current_max >= 0); +  return current_max;  } -int MPIPYPTopics::max(const DocumentId& true_doc) const { +std::pair<int,MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc) const {    //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;    // collect probs    F current_max=0.0; @@ -399,10 +405,11 @@ int MPIPYPTopics::max(const DocumentId& true_doc) const {      }    }    assert(current_topic >= 0); -  return current_topic; +  assert(current_max >= 0); +  return std::make_pair(current_topic, current_max);  } -int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const { +std::pair<int,MPIPYPTopics::F> MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {    //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;    // collect probs    F current_max=0.0; @@ -426,7 +433,8 @@ int MPIPYPTopics::max(const DocumentId& true_doc, const Term& term) const {      }    }    assert(current_topic >= 0); -  return current_topic; +  assert(current_max >= 0); +  return std::make_pair(current_topic, current_max);  }  std::ostream& MPIPYPTopics::print_document_topics(std::ostream& out) const { diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh index 4a4433e6..d96bc4e5 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.hh +++ 
b/gi/pyp-topics/src/mpi-pyp-topics.hh @@ -16,7 +16,7 @@  #include "mpi-pyp.hh" -#include "corpus.hh" +#include "mpi-corpus.hh"  class MPIPYPTopics {  public: @@ -37,14 +37,14 @@ public:        m_am_root = (m_rank == 0);      } -  void sample_corpus(const Corpus& corpus, int samples, +  void sample_corpus(const MPICorpus& corpus, int samples,                       int freq_cutoff_start=0, int freq_cutoff_end=0,                        int freq_cutoff_interval=0,                       int max_contexts_per_document=0);    int sample(const DocumentId& doc, const Term& term); -  int max(const DocumentId& doc, const Term& term) const; -  int max(const DocumentId& doc) const; +  std::pair<int,F> max(const DocumentId& doc, const Term& term) const; +  std::pair<int,F> max(const DocumentId& doc) const;    int max_topic() const;    void set_backoff(const std::string& filename) { diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc index 4f966a65..7bb890d2 100644 --- a/gi/pyp-topics/src/mpi-train-contexts.cc +++ b/gi/pyp-topics/src/mpi-train-contexts.cc @@ -15,7 +15,7 @@  // Local  #include "mpi-pyp-topics.hh"  #include "corpus.hh" -#include "contexts_corpus.hh" +#include "mpi-corpus.hh"  #include "gzstream.hh"  static const char *REVISION = "$Rev: 170 $"; @@ -105,8 +105,13 @@ int main(int argc, char **argv)      }    } -  ContextsCorpus contexts_corpus; +  //ContextsCorpus contexts_corpus; +  MPICorpus contexts_corpus;    contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false); +  int mpi_start = 0, mpi_end = 0; +  contexts_corpus.bounds(&mpi_start, &mpi_end); +  std::cerr << "\tProcess " << rank << " has documents " << mpi_start << " -> " << mpi_end << "." 
<< std::endl; +    model.set_backoff(contexts_corpus.backoff_index());    if (backoff_gen)  @@ -121,13 +126,15 @@ int main(int argc, char **argv)    if (vm.count("document-topics-out")) {      std::ofstream documents_out((vm["document-topics-out"].as<string>() + ".pyp-process-" + boost::lexical_cast<std::string>(rank)).c_str()); -    int documents = contexts_corpus.num_documents(); +    //int documents = contexts_corpus.num_documents(); +    /*      int mpi_start = 0, mpi_end = documents;      if (world.size() != 1) {        mpi_start = (documents / world.size()) * rank;        if (rank == world.size()-1) mpi_end = documents;        else mpi_end = (documents / world.size())*(rank+1);      } +    */      map<int,int> all_terms;      for (int document_id=mpi_start; document_id<mpi_end; ++document_id) { @@ -143,13 +150,14 @@ int main(int argc, char **argv)            all_terms[*docIt] = all_terms[*docIt] + 1;        }        documents_out << contexts_corpus.key(document_id) << '\t'; -      documents_out << model.max(document_id) << " " << doc.size() << " ||| "; +      documents_out << model.max(document_id).first << " " << doc.size() << " ||| ";        for (std::vector<int>::const_iterator termIt=unique_terms.begin(); termIt != unique_terms.end(); ++termIt) {          if (termIt != unique_terms.begin())            documents_out << " ||| ";          vector<std::string> strings = contexts_corpus.context2string(*termIt);          copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " ")); -        documents_out << "||| C=" << model.max(document_id, *termIt); +        std::pair<int,MPIPYPTopics::F> maxinfo = model.max(document_id, *termIt); +        documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second;        }        documents_out <<endl;      } @@ -173,7 +181,7 @@ int main(int argc, char **argv)        default_topics << model.max_topic() <<endl;        for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != 
all_terms.end(); ++termIt) {          vector<std::string> strings = contexts_corpus.context2string(termIt->first); -        default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| "; +        default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| ";          copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " "));          default_topics <<endl;        } diff --git a/gi/pyp-topics/src/pyp-topics.cc b/gi/pyp-topics/src/pyp-topics.cc index 06499291..4c777f0c 100644 --- a/gi/pyp-topics/src/pyp-topics.cc +++ b/gi/pyp-topics/src/pyp-topics.cc @@ -58,7 +58,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,        int new_topic = -1;        if (freq > frequency_cutoff            && (!max_contexts_per_document || term_index < max_contexts_per_document)) { -        new_topic = document_id % m_num_topics; +        new_topic = sample(document_id, term); +        //new_topic = document_id % m_num_topics;          // add the new topic to the PYPs          increment(term, new_topic); @@ -314,7 +315,8 @@ PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) con      Term backoff_term = (*m_backoff)[term];      if (!m_backoff->is_null(backoff_term)) {        assert (level < m_backoff->order()); -      p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); +      //p0 = (1.0/(double)m_backoff->terms_at_level(level))*prob(backoff_term, topic, level+1); +      p0 = prob(backoff_term, topic, level+1);      }      else        p0 = m_term_p0; | 
