diff options
| -rw-r--r-- | gi/pyp-topics/src/Makefile.mpi | 3 | ||||
| -rw-r--r-- | gi/pyp-topics/src/contexts_corpus.cc | 4 | ||||
| -rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 2 | ||||
| -rw-r--r-- | gi/pyp-topics/src/makefile.depend | 228 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.cc | 148 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.hh | 17 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-pyp.hh | 273 | ||||
| -rw-r--r-- | gi/pyp-topics/src/mpi-train-contexts.cc | 2 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp.hh | 9 | 
9 files changed, 582 insertions, 104 deletions
| diff --git a/gi/pyp-topics/src/Makefile.mpi b/gi/pyp-topics/src/Makefile.mpi index 8c859881..b7b8a290 100644 --- a/gi/pyp-topics/src/Makefile.mpi +++ b/gi/pyp-topics/src/Makefile.mpi @@ -16,7 +16,8 @@ mpi-pyp-contexts-train: mpi-train-contexts.o $(local_objs)  .PHONY: depend echo  depend: -	$(CXX) -MM $(CXXFLAGS) *.cc *.c | sed 's/^\(.*\.o:\)/obj\/\1/' > makefile.depend +#$(CXX) -MM $(CXXFLAGS) *.cc *.c | sed 's/^\(.*\.o:\)/obj\/\1/' > makefile.depend +	$(CXX) -MM $(CXXFLAGS) *.cc *.c > makefile.depend  clean:  	rm -f *.o diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc index 26d5718a..1cf69429 100644 --- a/gi/pyp-topics/src/contexts_corpus.cc +++ b/gi/pyp-topics/src/contexts_corpus.cc @@ -28,7 +28,7 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void*    Document* doc(new Document());    //cout << "READ: " << new_contexts.phrase << "\t"; -  for (int i=0; i < new_contexts.counts.size(); ++i) { +  for (int i=0; i < (int)new_contexts.counts.size(); ++i) {      int cache_word_count = corpus_ptr->m_dict.max();      //string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]); @@ -101,7 +101,7 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void    map<string,int>* context_counts = (static_cast<map<string,int>*>(extra)); -  for (int i=0; i < new_contexts.counts.size(); ++i) { +  for (int i=0; i < (int)new_contexts.counts.size(); ++i) {      int context_index = new_contexts.counts.at(i).first;      int count = new_contexts.counts.at(i).second;      //int count = new_contexts.counts[i]; diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index 66b71783..4d3d5669 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -63,6 +63,8 @@ public:      std::vector<std::string> context2string(const WordID& id) const {        std::vector<std::string> res; +      assert (id >= 0); +      std::cerr << m_dict.Convert(id) << std::endl;        m_dict.AsVector(id, &res);        return res;      } diff --git a/gi/pyp-topics/src/makefile.depend b/gi/pyp-topics/src/makefile.depend index 88bab79e..88bc73c1 100644 --- a/gi/pyp-topics/src/makefile.depend +++ b/gi/pyp-topics/src/makefile.depend @@ -1,4 +1,4 @@ -obj/contexts_corpus.o: contexts_corpus.cc contexts_corpus.hh \ +contexts_corpus.o: contexts_corpus.cc contexts_corpus.hh \    /Users/pblunsom/packages/include/boost/ptr_container/ptr_vector.hpp \    /Users/pblunsom/packages/include/boost/ptr_container/ptr_sequence_adapter.hpp \    /Users/pblunsom/packages/include/boost/ptr_container/detail/reversible_ptr_container.hpp \ @@ -432,7 +432,7 @@ obj/contexts_corpus.o: contexts_corpus.cc contexts_corpus.hh \    /Users/pblunsom/packages/include/boost/type_traits/add_cv.hpp \    /Users/pblunsom/packages/include/boost/type_traits/remove_volatile.hpp \    /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp -obj/contexts_lexer.o: contexts_lexer.cc contexts_lexer.h \ +contexts_lexer.o: contexts_lexer.cc contexts_lexer.h \    ../../../decoder/dict.h \    /Users/pblunsom/packages/include/boost/functional/hash.hpp \    /Users/pblunsom/packages/include/boost/functional/hash/hash.hpp \ @@ -463,7 +463,7 @@ obj/contexts_lexer.o: contexts_lexer.cc contexts_lexer.h \    /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \    ../../../decoder/wordid.h ../../../decoder/filelib.h \    ../../../decoder/gzstream.h -obj/corpus.o: corpus.cc corpus.hh \ +corpus.o: corpus.cc corpus.hh \    /Users/pblunsom/packages/include/boost/shared_ptr.hpp \    /Users/pblunsom/packages/include/boost/smart_ptr/shared_ptr.hpp \    /Users/pblunsom/packages/include/boost/config.hpp \ @@ -874,8 +874,8 @@ obj/corpus.o: corpus.cc corpus.hh \    /Users/pblunsom/packages/include/boost/detail/is_incrementable.hpp \    /Users/pblunsom/packages/include/boost/ptr_container/detail/void_ptr_iterator.hpp \    gzstream.hh -obj/gzstream.o: gzstream.cc gzstream.hh -obj/mpi-pyp-topics.o: mpi-pyp-topics.cc \ +gzstream.o: gzstream.cc gzstream.hh +mpi-pyp-topics.o: mpi-pyp-topics.cc \    /Users/pblunsom/packages/include/boost/mpi/communicator.hpp \    /Users/pblunsom/packages/include/boost/mpi/config.hpp \    /Users/pblunsom/packages/include/boost/config.hpp \ @@ -1448,8 +1448,108 @@ obj/mpi-pyp-topics.o: mpi-pyp-topics.cc \    /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \    /Users/pblunsom/packages/include/boost/random/detail/seed.hpp \    /Users/pblunsom/packages/include/boost/mpi/environment.hpp mpi-pyp.hh \ -  log_add.h slice-sampler.h mt19937ar.h corpus.hh -obj/mpi-train-contexts.o: mpi-train-contexts.cc \ +  /Users/pblunsom/packages/include/boost/tuple/tuple.hpp \ +  /Users/pblunsom/packages/include/boost/ref.hpp \ +  /Users/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/add_volatile.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/add_cv.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/remove_volatile.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/map.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/utility.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/collections_save_imp.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/collections_load_imp.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/detail/stack_constructor.hpp \ +  /Users/pblunsom/packages/include/boost/aligned_storage.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/type_with_alignment.hpp \ +  /Users/pblunsom/packages/include/boost/mpi.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/all_gather.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/vector.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/collection_traits.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/broadcast.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives_fwd.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/gather.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/all_reduce.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/reduce.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/computation_tree.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/operations.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/all_to_all.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/scatter.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/scan.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/graph_communicator.hpp \ +  /Users/pblunsom/packages/include/boost/graph/graph_traits.hpp \ +  /Users/pblunsom/packages/include/boost/pending/property.hpp \ +  /Users/pblunsom/packages/include/boost/pending/detail/property.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/same_traits.hpp \ +  /Users/pblunsom/packages/include/boost/graph/properties.hpp \ +  /Users/pblunsom/packages/include/boost/property_map/property_map.hpp \ +  /Users/pblunsom/packages/include/boost/pending/cstddef.hpp \ +  /Users/pblunsom/packages/include/boost/concept_check.hpp \ +  /Users/pblunsom/packages/include/boost/concept/assert.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/general.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/has_constraints.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/conversion_traits.hpp \ +  /Users/pblunsom/packages/include/boost/concept/usage.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/concept_def.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/concept_undef.hpp \ +  /Users/pblunsom/packages/include/boost/concept_archetype.hpp \ +  /Users/pblunsom/packages/include/boost/property_map/vector_property_map.hpp \ +  /Users/pblunsom/packages/include/boost/graph/property_maps/constant_property_map.hpp \ +  /Users/pblunsom/packages/include/boost/graph/property_maps/null_property_map.hpp \ +  /Users/pblunsom/packages/include/boost/iterator/counting_iterator.hpp \ +  /Users/pblunsom/packages/include/boost/detail/numeric_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_assign.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_trivial_assign.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_constructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_trivial_constructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_copy.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_destructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_trivial_destructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_virtual_destructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_signed.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_unsigned.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_compound.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_floating_point.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_member_object_pointer.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_object.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_stateless.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/rank.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/extent.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/remove_all_extents.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/aligned_storage.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/promote.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/make_unsigned.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_signed.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_unsigned.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/make_signed.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/decay.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_complex.hpp \ +  /Users/pblunsom/packages/include/boost/detail/select_type.hpp \ +  /Users/pblunsom/packages/include/boost/graph/iteration_macros.hpp \ +  /Users/pblunsom/packages/include/boost/shared_array.hpp \ +  /Users/pblunsom/packages/include/boost/smart_ptr/shared_array.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/group.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/intercommunicator.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/nonblocking.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/skeleton_and_content.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_iarchive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_oarchive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/ignore_iprimitive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/ignore_oprimitive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/content_oarchive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \ +  slice-sampler.h log_add.h mt19937ar.h corpus.hh +mpi-train-contexts.o: mpi-train-contexts.cc \    /Users/pblunsom/packages/include/boost/program_options/parsers.hpp \    /Users/pblunsom/packages/include/boost/program_options/config.hpp \    /Users/pblunsom/packages/include/boost/config.hpp \ @@ -2064,8 +2164,100 @@ obj/mpi-train-contexts.o: mpi-train-contexts.cc \    /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \    /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \    /Users/pblunsom/packages/include/boost/random/detail/seed.hpp \ -  mpi-pyp.hh log_add.h slice-sampler.h mt19937ar.h corpus.hh \ -  contexts_corpus.hh contexts_lexer.h ../../../decoder/dict.h \ +  mpi-pyp.hh /Users/pblunsom/packages/include/boost/tuple/tuple.hpp \ +  /Users/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/add_cv.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/remove_volatile.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/map.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/utility.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/collections_save_imp.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/collections_load_imp.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/detail/stack_constructor.hpp \ +  /Users/pblunsom/packages/include/boost/aligned_storage.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/type_with_alignment.hpp \ +  /Users/pblunsom/packages/include/boost/mpi.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/all_gather.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/vector.hpp \ +  /Users/pblunsom/packages/include/boost/serialization/collection_traits.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/broadcast.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives_fwd.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/gather.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/all_reduce.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/reduce.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/computation_tree.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/operations.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/all_to_all.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/scatter.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/collectives/scan.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/graph_communicator.hpp \ +  /Users/pblunsom/packages/include/boost/graph/graph_traits.hpp \ +  /Users/pblunsom/packages/include/boost/pending/property.hpp \ +  /Users/pblunsom/packages/include/boost/pending/detail/property.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/same_traits.hpp \ +  /Users/pblunsom/packages/include/boost/graph/properties.hpp \ +  /Users/pblunsom/packages/include/boost/property_map/property_map.hpp \ +  /Users/pblunsom/packages/include/boost/pending/cstddef.hpp \ +  /Users/pblunsom/packages/include/boost/concept_check.hpp \ +  /Users/pblunsom/packages/include/boost/concept/assert.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/general.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/has_constraints.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/conversion_traits.hpp \ +  /Users/pblunsom/packages/include/boost/concept/usage.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/concept_def.hpp \ +  /Users/pblunsom/packages/include/boost/concept/detail/concept_undef.hpp \ +  /Users/pblunsom/packages/include/boost/concept_archetype.hpp \ +  /Users/pblunsom/packages/include/boost/property_map/vector_property_map.hpp \ +  /Users/pblunsom/packages/include/boost/graph/property_maps/constant_property_map.hpp \ +  /Users/pblunsom/packages/include/boost/graph/property_maps/null_property_map.hpp \ +  /Users/pblunsom/packages/include/boost/iterator/counting_iterator.hpp \ +  /Users/pblunsom/packages/include/boost/detail/numeric_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_assign.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_trivial_assign.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_constructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_trivial_constructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_copy.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_destructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/has_virtual_destructor.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_compound.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_floating_point.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_member_object_pointer.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_object.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_stateless.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/rank.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/extent.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/remove_all_extents.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/aligned_storage.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/promote.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/make_signed.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/decay.hpp \ +  /Users/pblunsom/packages/include/boost/type_traits/is_complex.hpp \ +  /Users/pblunsom/packages/include/boost/detail/select_type.hpp \ +  /Users/pblunsom/packages/include/boost/graph/iteration_macros.hpp \ +  /Users/pblunsom/packages/include/boost/shared_array.hpp \ +  /Users/pblunsom/packages/include/boost/smart_ptr/shared_array.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/group.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/intercommunicator.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/nonblocking.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/skeleton_and_content.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_iarchive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_oarchive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/ignore_iprimitive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/ignore_oprimitive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/content_oarchive.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \ +  /Users/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \ +  slice-sampler.h log_add.h mt19937ar.h corpus.hh contexts_corpus.hh \ +  contexts_lexer.h ../../../decoder/dict.h \    /Users/pblunsom/packages/include/boost/functional/hash.hpp \    /Users/pblunsom/packages/include/boost/functional/hash/hash.hpp \    /Users/pblunsom/packages/include/boost/functional/hash/hash_fwd.hpp \ @@ -2078,7 +2270,7 @@ obj/mpi-train-contexts.o: mpi-train-contexts.cc \    /Users/pblunsom/packages/include/boost/functional/hash/extensions.hpp \    /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \    ../../../decoder/wordid.h gzstream.hh -obj/pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \ +pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \    /Users/pblunsom/packages/include/boost/ptr_container/ptr_vector.hpp \    /Users/pblunsom/packages/include/boost/ptr_container/ptr_sequence_adapter.hpp \    /Users/pblunsom/packages/include/boost/ptr_container/detail/reversible_ptr_container.hpp \ @@ -2484,7 +2676,7 @@ obj/pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \    /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \    /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \    /Users/pblunsom/packages/include/boost/random/detail/seed.hpp pyp.hh \ -  log_add.h slice-sampler.h mt19937ar.h corpus.hh \ +  slice-sampler.h log_add.h mt19937ar.h corpus.hh \    /Users/pblunsom/packages/include/boost/shared_ptr.hpp \    /Users/pblunsom/packages/include/boost/smart_ptr/shared_ptr.hpp \    /Users/pblunsom/packages/include/boost/config/no_tr1/memory.hpp \ @@ -2658,7 +2850,7 @@ obj/pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \    /Users/pblunsom/packages/include/boost/smart_ptr/scoped_ptr.hpp \    /Users/pblunsom/packages/include/boost/type_traits/is_fundamental.hpp \    /Users/pblunsom/packages/include/boost/thread/condition.hpp -obj/train-contexts.o: train-contexts.cc \ +train-contexts.o: train-contexts.cc \    /Users/pblunsom/packages/include/boost/program_options/parsers.hpp \    /Users/pblunsom/packages/include/boost/program_options/config.hpp \    /Users/pblunsom/packages/include/boost/config.hpp \ @@ -3140,7 +3332,7 @@ obj/train-contexts.o: train-contexts.cc \    /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \    /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \    /Users/pblunsom/packages/include/boost/random/detail/seed.hpp pyp.hh \ -  log_add.h slice-sampler.h mt19937ar.h corpus.hh workers.hh \ +  slice-sampler.h log_add.h mt19937ar.h corpus.hh workers.hh \    /Users/pblunsom/packages/include/boost/bind.hpp \    /Users/pblunsom/packages/include/boost/bind/bind.hpp \    /Users/pblunsom/packages/include/boost/is_placeholder.hpp \ @@ -3275,7 +3467,7 @@ obj/train-contexts.o: train-contexts.cc \    /Users/pblunsom/packages/include/boost/functional/hash/extensions.hpp \    /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \    ../../../decoder/wordid.h gzstream.hh -obj/train.o: train.cc \ +train.o: train.cc \    /Users/pblunsom/packages/include/boost/program_options/parsers.hpp \    /Users/pblunsom/packages/include/boost/program_options/config.hpp \    /Users/pblunsom/packages/include/boost/config.hpp \ @@ -3757,7 +3949,7 @@ obj/train.o: train.cc \    /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \    /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \    /Users/pblunsom/packages/include/boost/random/detail/seed.hpp pyp.hh \ -  log_add.h slice-sampler.h mt19937ar.h corpus.hh workers.hh \ +  slice-sampler.h log_add.h mt19937ar.h corpus.hh workers.hh \    /Users/pblunsom/packages/include/boost/bind.hpp \    /Users/pblunsom/packages/include/boost/bind/bind.hpp \    /Users/pblunsom/packages/include/boost/is_placeholder.hpp \ @@ -3892,6 +4084,6 @@ obj/train.o: train.cc \    /Users/pblunsom/packages/include/boost/functional/hash/extensions.hpp \    /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \    ../../../decoder/wordid.h gzstream.hh -obj/clock_gettime_stub.o: clock_gettime_stub.c -obj/gammadist.o: gammadist.c gammadist.h mt19937ar.h -obj/mt19937ar.o: mt19937ar.c mt19937ar.h +clock_gettime_stub.o: clock_gettime_stub.c +gammadist.o: gammadist.c gammadist.h mt19937ar.h +mt19937ar.o: mt19937ar.c mt19937ar.h diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc index 2ad28278..4525302e 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.cc +++ b/gi/pyp-topics/src/mpi-pyp-topics.cc @@ -4,7 +4,7 @@  #include "mpi-pyp-topics.hh"  //#include <boost/date_time/posix_time/posix_time_types.hpp> -void PYPTopics::sample_corpus(const Corpus& corpus, int samples, +void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples,                                int freq_cutoff_start, int freq_cutoff_end,                                int freq_cutoff_interval,                                int max_contexts_per_document) { @@ -23,33 +23,33 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,    }    int local_documents = m_mpi_end - m_mpi_start; -    if (!m_backoff.get()) {      m_word_pyps.clear(); -    m_word_pyps.push_back(PYPs()); +    m_word_pyps.push_back(MPIPYPs());    }    if (m_am_root) std::cerr << "\n Training with " << m_word_pyps.size()-1 << " backoff level" -    << (m_word_pyps.size()==2 ? ":" : "s:") << std::endl; +    << (m_word_pyps.size()>1 ? ":" : "s:") << std::endl; -  for (int i=0; i<(int)m_word_pyps.size(); ++i) -  { +  for (int i=0; i<(int)m_word_pyps.size(); ++i) {      m_word_pyps.at(i).reserve(m_num_topics);      for (int j=0; j<m_num_topics; ++j) -      m_word_pyps.at(i).push_back(new PYP<int>(0.5, 1.0)); +      m_word_pyps.at(i).push_back(new MPIPYP<int>(0.5, 1.0));    }    if (m_am_root) std::cerr << std::endl; -  m_document_pyps.reserve(corpus.num_documents()); -  for (int j=0; j<corpus.num_documents(); ++j) +  m_document_pyps.reserve(local_documents); +  //m_document_pyps.reserve(corpus.num_documents()); +  //for (int j=0; j<corpus.num_documents(); ++j) +  for (int j=0; j<local_documents; ++j)      m_document_pyps.push_back(new PYP<int>(0.5, 1.0));    m_topic_p0 = 1.0/m_num_topics;    m_term_p0 = 1.0/corpus.num_types();    m_backoff_p0 = 1.0/corpus.num_documents(); -  if (m_am_root) std::cerr << " Documents: " << corpus.num_documents() << " Terms: " -    << corpus.num_types() << std::endl; +  if (m_am_root) std::cerr << " Documents: " << corpus.num_documents() << "("  +    << local_documents << ")" << " Terms: " << corpus.num_types() << std::endl;    int frequency_cutoff = freq_cutoff_start;    if (m_am_root) std::cerr << " Context frequency cutoff set to " << frequency_cutoff << std::endl; @@ -57,13 +57,16 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,    timer.Reset();    // Initialisation pass    int document_id=0, topic_counter=0; -  for (Corpus::const_iterator corpusIt=corpus.begin(); -       corpusIt != corpus.end(); ++corpusIt, ++document_id) { -    m_corpus_topics.push_back(DocumentTopics(corpusIt->size(), 0)); +  for (int i=0; i<local_documents; ++i) { +    document_id = i+m_mpi_start; + +  //for (Corpus::const_iterator corpusIt=corpus.begin(); +  //     corpusIt != corpus.end(); ++corpusIt, ++document_id) { +    m_corpus_topics.push_back(DocumentTopics(corpus.at(document_id).size(), 0));      int term_index=0; -    for (Document::const_iterator docIt=corpusIt->begin(); -         docIt != corpusIt->end(); ++docIt, ++term_index) { +    for (Document::const_iterator docIt=corpus.at(document_id).begin(); +         docIt != corpus.at(document_id).end(); ++docIt, ++term_index) {        topic_counter++;        Term term = *docIt; @@ -80,21 +83,41 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,          if (m_use_topic_pyp) {            F p0 = m_topic_pyp.prob(new_topic, m_topic_p0); -          int table_delta = m_document_pyps[document_id].increment(new_topic, p0); +          int table_delta = m_document_pyps.at(i).increment(new_topic, p0);            if (table_delta)              m_topic_pyp.increment(new_topic, m_topic_p0);          } -        else m_document_pyps[document_id].increment(new_topic, m_topic_p0); +        else m_document_pyps.at(i).increment(new_topic, m_topic_p0);        } -      m_corpus_topics[document_id][term_index] = new_topic; +      m_corpus_topics.at(i).at(term_index) = new_topic; +    } +  } + +  // Synchronise the topic->word counds across the processes. +  for (std::vector<MPIPYPs>::iterator levelIt=m_word_pyps.begin(); +       levelIt != m_word_pyps.end(); ++levelIt) { +    for (MPIPYPs::iterator pypIt=levelIt->begin(); +         pypIt != levelIt->end(); ++pypIt) { +      if (!m_am_root) boost::mpi::communicator().barrier(); +      std::cerr << "Before Sync Process " << m_rank << ":"; +      pypIt->debug_info(std::cerr); std::cerr << std::endl; +      if (m_am_root) boost::mpi::communicator().barrier(); + +      pypIt->synchronise(); + +      if (!m_am_root) boost::mpi::communicator().barrier(); +      std::cerr << "After Sync Process " << m_rank << ":"; +      pypIt->debug_info(std::cerr); std::cerr << std::endl; +      if (m_am_root) boost::mpi::communicator().barrier();      }    } +    if (m_am_root) std::cerr << "  Initialized in " << timer.Elapsed() << " seconds\n";    int* randomDocIndices = new int[local_documents];    for (int i = 0; i < local_documents; ++i) -	  randomDocIndices[i] = i+m_mpi_start; +	  randomDocIndices[i] = i;    // Sampling phase    for (int curr_sample=0; curr_sample < samples; ++curr_sample) { @@ -110,8 +133,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,      // Randomize the corpus indexing array      int tmp;      int processed_terms=0; -    for (int i = local_documents-1; i > 0; --i) { -        //i+1 since j \in [0,i] but rnd() \in [0,1) +    for (int i = (local_documents-1); i > 0; --i) { +      //i+1 since j \in [0,i] but rnd() \in [0,1)      	int j = (int)(rnd() * (i+1));        assert(j >= 0 && j <= i);       	tmp = randomDocIndices[i]; @@ -120,15 +143,17 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,      }      // for each document in the corpus -    int document_id; -    for (int i=0; i<local_documents; ++i) { -    	document_id = randomDocIndices[i]; +    for (int rand_doc=0; rand_doc<local_documents; ++rand_doc) { +    	int doc_index = randomDocIndices[rand_doc]; +    	int document_id = doc_index + m_mpi_start; +      const Document& doc = corpus.at(document_id);        // for each term in the document        int term_index=0; -      Document::const_iterator docEnd = corpus.at(document_id).end(); -      for (Document::const_iterator docIt=corpus.at(document_id).begin(); +      Document::const_iterator docEnd = doc.end(); +      for (Document::const_iterator docIt=doc.begin();             docIt != docEnd; ++docIt, ++term_index) { +          if (max_contexts_per_document && term_index > max_contexts_per_document)            break; @@ -140,36 +165,49 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,          processed_terms++;          // remove the prevous topic from the PYPs -        int current_topic = m_corpus_topics[document_id][term_index]; +        int current_topic = m_corpus_topics.at(doc_index).at(term_index);          // a negative label mean that term hasn't been sampled yet          if (current_topic >= 0) {            decrement(term, current_topic); -          int table_delta = m_document_pyps[document_id].decrement(current_topic); +          int table_delta = m_document_pyps.at(doc_index).decrement(current_topic);            if (m_use_topic_pyp && table_delta < 0)              m_topic_pyp.decrement(current_topic);          }          // sample a new_topic -        int new_topic = sample(document_id, term); +        int new_topic = sample(doc_index, term);          // add the new topic to the PYPs -        m_corpus_topics[document_id][term_index] = new_topic; +        m_corpus_topics.at(doc_index).at(term_index) = new_topic;          increment(term, new_topic);          if (m_use_topic_pyp) {            F p0 = m_topic_pyp.prob(new_topic, m_topic_p0); -          int table_delta = m_document_pyps[document_id].increment(new_topic, p0); +          int table_delta = m_document_pyps.at(doc_index).increment(new_topic, p0);            if (table_delta)              m_topic_pyp.increment(new_topic, m_topic_p0);          } -        else m_document_pyps[document_id].increment(new_topic, m_topic_p0); +        else m_document_pyps.at(doc_index).increment(new_topic, m_topic_p0);        }        if (document_id && document_id % 10000 == 0) {          if (m_am_root) std::cerr << "."; std::cerr.flush();        }      }      m_world.barrier(); +    // Synchronise the topic->word counds across the processes. +    for (std::vector<MPIPYPs>::iterator levelIt=m_word_pyps.begin(); +         levelIt != m_word_pyps.end(); ++levelIt) { +      for (MPIPYPs::iterator pypIt=levelIt->begin(); +           pypIt != levelIt->end(); ++pypIt) { +        std::cerr << "Before Sync Process " << m_rank << ":"; +        pypIt->debug_info(std::cerr); std::cerr << std::endl; +        pypIt->synchronise(); +        std::cerr << "After Sync Process " << m_rank << ":"; +        pypIt->debug_info(std::cerr); std::cerr << std::endl; +      } +    } +      if (m_am_root) std::cerr << " ||| sampled " << processed_terms << " terms.";      if (curr_sample != 0 && curr_sample % 10 == 0) { @@ -179,9 +217,9 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,        // resample the hyperparamters        F log_p=0.0; -      for (std::vector<PYPs>::iterator levelIt=m_word_pyps.begin(); +      for (std::vector<MPIPYPs>::iterator levelIt=m_word_pyps.begin();             levelIt != m_word_pyps.end(); ++levelIt) { -        for (PYPs::iterator pypIt=levelIt->begin(); +        for (MPIPYPs::iterator pypIt=levelIt->begin();               pypIt != levelIt->end(); ++pypIt) {            pypIt->resample_prior();            log_p += pypIt->log_restaurant_prob(); @@ -206,7 +244,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,        int k=0;        if (m_am_root) std::cerr << "Topics distribution: ";        std::cerr.precision(2); -      for (PYPs::iterator pypIt=m_word_pyps.front().begin(); +      for (MPIPYPs::iterator pypIt=m_word_pyps.front().begin();             pypIt != m_word_pyps.front().end(); ++pypIt, ++k) {          if (m_am_root && k % 5 == 0) std::cerr << std::endl << '\t';          if (m_am_root) std::cerr << "<" << k << ":" << pypIt->num_customers() << "," @@ -220,8 +258,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples,  } -void PYPTopics::decrement(const Term& term, int topic, int level) { -  //std::cerr << "PYPTopics::decrement(" << term << "," << topic << "," << level << ")" << std::endl; +void MPIPYPTopics::decrement(const Term& term, int topic, int level) { +  //std::cerr << "MPIPYPTopics::decrement(" << term << "," << topic << "," << level << ")" << std::endl;    m_word_pyps.at(level).at(topic).decrement(term);    if (m_backoff.get()) {      Term backoff_term = (*m_backoff)[term]; @@ -230,8 +268,8 @@ void PYPTopics::decrement(const Term& term, int topic, int level) {    }  } -void PYPTopics::increment(const Term& term, int topic, int level) { -  //std::cerr << "PYPTopics::increment(" << term << "," << topic << "," << level << ")" << std::endl; +void MPIPYPTopics::increment(const Term& term, int topic, int level) { +  //std::cerr << "MPIPYPTopics::increment(" << term << "," << topic << "," << level << ")" << std::endl;    m_word_pyps.at(level).at(topic).increment(term, word_pyps_p0(term, topic, level));    if (m_backoff.get()) { @@ -241,7 +279,7 @@ void PYPTopics::increment(const Term& term, int topic, int level) {    }  } -int PYPTopics::sample(const DocumentId& doc, const Term& term) { +int MPIPYPTopics::sample(const DocumentId& doc, const Term& term) {    // First pass: collect probs    F sum=0.0;    std::vector<F> sums; @@ -252,7 +290,7 @@ int PYPTopics::sample(const DocumentId& doc, const Term& term) {      if (m_use_topic_pyp) topic_prob = m_topic_pyp.prob(k, m_topic_p0);      //F p_k_d = m_document_pyps[doc].prob(k, topic_prob); -    F p_k_d = m_document_pyps[doc].unnormalised_prob(k, topic_prob); +    F p_k_d = m_document_pyps.at(doc).unnormalised_prob(k, topic_prob);      sum += (p_w_k*p_k_d);      sums.push_back(sum); @@ -266,9 +304,9 @@ int PYPTopics::sample(const DocumentId& doc, const Term& term) {    assert(false);  } -PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) const { +MPIPYPTopics::F MPIPYPTopics::word_pyps_p0(const Term& term, int topic, int level) const {    //for (int i=0; i<level+1; ++i) std::cerr << "  "; -  //std::cerr << "PYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ")" << std::endl; +  //std::cerr << "MPIPYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ")" << std::endl;    F p0 = m_term_p0;    if (m_backoff.get()) { @@ -283,24 +321,24 @@ PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) con        p0 = m_term_p0;    }    //for (int i=0; i<level+1; ++i) std::cerr << "  "; -  //std::cerr << "PYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ") = " << p0 << std::endl; +  //std::cerr << "MPIPYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ") = " << p0 << std::endl;    return p0;  } -PYPTopics::F PYPTopics::prob(const Term& term, int topic, int level) const { +MPIPYPTopics::F MPIPYPTopics::prob(const Term& term, int topic, int level) const {    //for (int i=0; i<level+1; ++i) std::cerr << "  "; -  //std::cerr << "PYPTopics::prob(" << term << "," << topic << "," << level << " " << factor << ")" << std::endl; +  //std::cerr << "MPIPYPTopics::prob(" << term << "," << topic << "," << level << " " << factor << ")" << std::endl;    F p0 = word_pyps_p0(term, topic, level);    F p_w_k = m_word_pyps.at(level).at(topic).prob(term, p0);    //for (int i=0; i<level+1; ++i) std::cerr << "  "; -  //std::cerr << "PYPTopics::prob(" << term << "," << topic << "," << level << ") = " << p_w_k << std::endl; +  //std::cerr << "MPIPYPTopics::prob(" << term << "," << topic << "," << level << ") = " << p_w_k << std::endl;    return p_w_k;  } -int PYPTopics::max_topic() const { +int MPIPYPTopics::max_topic() const {    if (!m_use_topic_pyp)      return -1; @@ -317,8 +355,8 @@ int PYPTopics::max_topic() const {    return current_topic;  } -int PYPTopics::max(const DocumentId& doc) const { -  //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl; +int MPIPYPTopics::max(const DocumentId& doc) const { +  //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;    // collect probs    F current_max=0.0;    int current_topic=-1; @@ -342,8 +380,8 @@ int PYPTopics::max(const DocumentId& doc) const {    return current_topic;  } -int PYPTopics::max(const DocumentId& doc, const Term& term) const { -  //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl; +int MPIPYPTopics::max(const DocumentId& doc, const Term& term) const { +  //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl;    // collect probs    F current_max=0.0;    int current_topic=-1; @@ -368,7 +406,7 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) const {    return current_topic;  } -std::ostream& PYPTopics::print_document_topics(std::ostream& out) const { +std::ostream& MPIPYPTopics::print_document_topics(std::ostream& out) const {    for (CorpusTopics::const_iterator corpusIt=m_corpus_topics.begin();         corpusIt != m_corpus_topics.end(); ++corpusIt) {      int term_index=0; @@ -382,7 +420,7 @@ std::ostream& PYPTopics::print_document_topics(std::ostream& out) const {    return out;  } -std::ostream& PYPTopics::print_topic_terms(std::ostream& out) const { +std::ostream& MPIPYPTopics::print_topic_terms(std::ostream& out) const {    for (PYPs::const_iterator pypsIt=m_word_pyps.front().begin();         pypsIt != m_word_pyps.front().end(); ++pypsIt) {      int term_index=0; diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh index 5da35d82..a85a776d 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.hh +++ b/gi/pyp-topics/src/mpi-pyp-topics.hh @@ -1,5 +1,5 @@ -#ifndef PYP_TOPICS_HH -#define PYP_TOPICS_HH +#ifndef MPI_PYP_TOPICS_HH +#define MPI_PYP_TOPICS_HH  #include <vector>  #include <iostream> @@ -14,14 +14,14 @@  #include "mpi-pyp.hh"  #include "corpus.hh" -class PYPTopics { +class MPIPYPTopics {  public:    typedef std::vector<int> DocumentTopics;    typedef std::vector<DocumentTopics> CorpusTopics;    typedef double F;  public: -  PYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0)  +  MPIPYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0)       : m_num_topics(num_topics), m_word_pyps(1),       m_topic_pyp(0.5,1.0), m_use_topic_pyp(use_topic_pyp),      m_seed(seed), @@ -47,12 +47,12 @@ public:      m_backoff.reset(new TermBackoff);      m_backoff->read(filename);      m_word_pyps.clear(); -    m_word_pyps.resize(m_backoff->order(), PYPs()); +    m_word_pyps.resize(m_backoff->order(), MPIPYPs());    }    void set_backoff(TermBackoffPtr backoff) {      m_backoff = backoff;      m_word_pyps.clear(); -    m_word_pyps.resize(m_backoff->order(), PYPs()); +    m_word_pyps.resize(m_backoff->order(), MPIPYPs());    }    F prob(const Term& term, int topic, int level=0) const; @@ -70,9 +70,10 @@ private:    CorpusTopics m_corpus_topics;    typedef boost::ptr_vector< PYP<int> > PYPs; +  typedef boost::ptr_vector< MPIPYP<int> > MPIPYPs;    PYPs m_document_pyps; -  std::vector<PYPs> m_word_pyps; -  PYP<int> m_topic_pyp; +  std::vector<MPIPYPs> m_word_pyps; +  MPIPYP<int> m_topic_pyp;    bool m_use_topic_pyp;    unsigned long m_seed; diff --git a/gi/pyp-topics/src/mpi-pyp.hh b/gi/pyp-topics/src/mpi-pyp.hh index 58be7c5c..65358d20 100644 --- a/gi/pyp-topics/src/mpi-pyp.hh +++ b/gi/pyp-topics/src/mpi-pyp.hh @@ -1,5 +1,5 @@ -#ifndef _pyp_hh -#define _pyp_hh +#ifndef _mpipyp_hh +#define _mpipyp_hh  #include <math.h>  #include <map> @@ -9,11 +9,15 @@  #include <boost/random/uniform_real.hpp>  #include <boost/random/variate_generator.hpp>  #include <boost/random/mersenne_twister.hpp> +#include <boost/tuple/tuple.hpp> +#include <boost/serialization/map.hpp> +#include <boost/mpi.hpp> +#include <boost/mpi/environment.hpp> +#include <boost/mpi/communicator.hpp> +#include <boost/mpi/operations.hpp> -#include "pyp.h" -#include "log_add.h" -#include "slice-sampler.h" -#include "mt19937ar.h" + +#include "pyp.hh"  //  // Pitman-Yor process with customer and table tracking @@ -28,25 +32,104 @@ public:    virtual int decrement(Dish d);    void clear(); +  void reset_deltas(); -  void reset_deltas() { m_count_delta.clear(); } +  void synchronise();  private:    typedef std::map<Dish, int> dish_delta_type; -  typedef std::map<Dish, TableCounter> table_delta_type; +  typedef std::map<Dish, typename PYP<Dish,Hash>::TableCounter> table_delta_type;    dish_delta_type m_count_delta;    table_delta_type m_table_delta;  };  template <typename Dish, typename Hash> -MPIPYP<Dish,Hash>::MPIPYP(double a, double b, Hash) -: PYP(a, b, Hash) {} +MPIPYP<Dish,Hash>::MPIPYP(double a, double b, Hash h) +: PYP<Dish,Hash>(a, b, 0, h) {}  template <typename Dish, typename Hash>  int   MPIPYP<Dish,Hash>::increment(Dish dish, double p0) { -  int delta = PYP<Dish,Hash>::increment(dish, p0); +  int delta = 0; +  int table_joined=-1; +  typename PYP<Dish,Hash>::TableCounter &tc = PYP<Dish,Hash>::_dish_tables[dish]; + +  // seated on a new or existing table? +  int c = PYP<Dish,Hash>::count(dish);  +  int t = PYP<Dish,Hash>::num_tables(dish);  +  int T = PYP<Dish,Hash>::num_tables(); +  double& a = PYP<Dish,Hash>::_a; +  double& b = PYP<Dish,Hash>::_b; +  double pshare = (c > 0) ? (c - a*t) : 0.0; +  double pnew = (b + a*T) * p0; +  assert (pshare >= 0.0); + +  if (mt_genrand_res53() < pnew / (pshare + pnew)) { +    // assign to a new table +    tc.tables += 1; +    tc.table_histogram[1] += 1; +    PYP<Dish,Hash>::_total_tables += 1; +    delta = 1; +  } +  else { +    // randomly assign to an existing table +    // remove constant denominator from inner loop +    double r = mt_genrand_res53() * (c - a*t); +    for (std::map<int,int>::iterator +         hit = tc.table_histogram.begin(); +         hit != tc.table_histogram.end(); ++hit) { +      r -= ((hit->first - a) * hit->second); +      if (r <= 0) { +        tc.table_histogram[hit->first+1] += 1; +        hit->second -= 1; +        if (hit->second == 0) +          tc.table_histogram.erase(hit); +        table_joined = hit->first+1; +        break; +      } +    } +    if (r > 0) { +      std::cerr << r << " " << c << " " << a << " " << t << std::endl; +      assert(false); +    } +    delta = 0; +  } + +  std::tr1::unordered_map<Dish,int,Hash>::operator[](dish) += 1; +  //google::sparse_hash_map<Dish,int,Hash>::operator[](dish) += 1; +  PYP<Dish,Hash>::_total_customers += 1; + +  // MPI Delta handling +  // track the customer entering +  typename dish_delta_type::iterator customer_it;  +  bool customer_insert_result;  +  boost::tie(customer_it, customer_insert_result)  +    = m_count_delta.insert(std::make_pair(dish,0));  + +  customer_it->second += 1; +  if (customer_it->second == 0) +    m_count_delta.erase(customer_it); + +  // increment the histogram bar for the table joined +  if (!delta) { +    assert (table_joined >= 0); +    std::map<int,int> &histogram = m_table_delta[dish].table_histogram; +    typename std::map<int,int>::iterator table_it; bool table_insert_result;  +    boost::tie(table_it, table_insert_result) = histogram.insert(std::make_pair(table_joined,0));  +    table_it->second += 1; +    if (table_it->second == 0) histogram.erase(table_it); + +    // decrement the histogram bar for the table left  +    boost::tie(table_it, table_insert_result) = histogram.insert(std::make_pair(table_joined-1,0));  +    table_it->second -= 1; +    if (table_it->second == 0) histogram.erase(table_it); +  } +  else { +    typename PYP<Dish,Hash>::TableCounter &delta_tc = m_table_delta[dish]; +    delta_tc.tables += 1; +    delta_tc.table_histogram[1] += 1; +  }    return delta;  } @@ -55,15 +138,177 @@ template <typename Dish, typename Hash>  int   MPIPYP<Dish,Hash>::decrement(Dish dish)  { -  int delta = PYP<Dish,Hash>::decrement(dish); +  typename std::tr1::unordered_map<Dish, int>::iterator dcit = find(dish); +  //typename google::sparse_hash_map<Dish, int>::iterator dcit = find(dish); +  if (dcit == PYP<Dish,Hash>::end()) { +    std::cerr << dish << std::endl; +    assert(false); +  }  + +  int delta = 0, table_left=-1; + +  typename std::tr1::unordered_map<Dish, typename PYP<Dish,Hash>::TableCounter>::iterator dtit  +    = PYP<Dish,Hash>::_dish_tables.find(dish); +  //typename google::sparse_hash_map<Dish, TableCounter>::iterator dtit = _dish_tables.find(dish); +  if (dtit == PYP<Dish,Hash>::_dish_tables.end()) { +    std::cerr << dish << std::endl; +    assert(false); +  }  +  typename PYP<Dish,Hash>::TableCounter &tc = dtit->second; + +  double r = mt_genrand_res53() * PYP<Dish,Hash>::count(dish); +  for (std::map<int,int>::iterator hit = tc.table_histogram.begin(); +       hit != tc.table_histogram.end(); ++hit) { +    r -= (hit->first * hit->second); +    if (r <= 0) { +      table_left = hit->first; +      if (hit->first > 1) { +        tc.table_histogram[hit->first-1] += 1; +      } +      else { +        delta = -1; +        tc.tables -= 1; +        PYP<Dish,Hash>::_total_tables -= 1; +      } + +      hit->second -= 1; +      if (hit->second == 0) tc.table_histogram.erase(hit); +      break; +    } +  } +  if (r > 0) { +    std::cerr << r << " " << PYP<Dish,Hash>::count(dish) << " " << PYP<Dish,Hash>::_a << " "  +      << PYP<Dish,Hash>::num_tables(dish) << std::endl; +    assert(false); +  } + +  // remove the customer +  dcit->second -= 1; +  PYP<Dish,Hash>::_total_customers -= 1; +  assert(dcit->second >= 0); +  if (dcit->second == 0) { +    PYP<Dish,Hash>::erase(dcit); +    PYP<Dish,Hash>::_dish_tables.erase(dtit); +  } + +  typename dish_delta_type::iterator it;  +  bool insert_result;  +  boost::tie(it, insert_result) = m_count_delta.insert(std::make_pair(dish,0));  + +  it->second -= 1; + +  if (it->second == 0) +    m_count_delta.erase(it); + +  assert (table_left >= 0); +  typename PYP<Dish,Hash>::TableCounter& delta_tc = m_table_delta[dish]; +  if (table_left > 1) +    delta_tc.table_histogram[table_left-1] += 1; +  else delta_tc.tables -= 1; + +  std::map<int,int>::iterator tit = delta_tc.table_histogram.find(table_left); +  //assert (tit != delta_tc.table_histogram.end()); +  tit->second -= 1; +  if (tit->second == 0) delta_tc.table_histogram.erase(tit); +    return delta;  }  template <typename Dish, typename Hash>  void  -MPIPYP<Dish,Hash>::clear() -{ +MPIPYP<Dish,Hash>::clear() {    PYP<Dish,Hash>::clear(); +  reset_deltas(); +} + +template <typename Dish, typename Hash> +void  +MPIPYP<Dish,Hash>::reset_deltas() {  +  m_count_delta.clear();  +  m_table_delta.clear(); +} + +template <typename Dish> +struct sum_maps { +  typedef std::map<Dish,int> map_type; +  map_type& operator() (map_type& l, map_type const & r) const { +    for (typename map_type::const_iterator it=r.begin(); it != r.end(); it++) +      l[it->first] += it->second; +    return l; +  } +}; + +// Needed Boost definitions +namespace boost {  +  namespace mpi { +    template <> +    struct is_commutative< sum_maps<int>, std::map<int,int> > : mpl::true_ {}; +  } + +  namespace serialization { +    template<class Archive> +    void serialize(Archive & ar, PYP<int>::TableCounter& t, const unsigned int version) { +      ar & t.table_histogram; +      ar & t.tables; +    } + +  } // namespace serialization +} // namespace boost + + +template <typename Dish, typename Hash> +void  +MPIPYP<Dish,Hash>::synchronise() { +  boost::mpi::communicator world;  +  int rank = world.rank(), size = world.size(); + +  // communicate the customer count deltas +  dish_delta_type global_dish_delta; // the “merged” map +  boost::mpi::all_reduce(world, m_count_delta, global_dish_delta, sum_maps<Dish>()); + +  // update this restaurant +  for (typename dish_delta_type::const_iterator it=global_dish_delta.begin();  +       it != global_dish_delta.end(); ++it) { +    std::tr1::unordered_map<Dish,int,Hash>::operator[](it->first) += (it->second - m_count_delta[it->first]); +    PYP<Dish,Hash>::_total_customers += (it->second - m_count_delta[it->first]); +    //std::cerr << "Process " << rank << " adding " <<  (it->second - m_count_delta[it->first]) << " customers." << std::endl; +  } + +  // communicate the table count deltas +//  for (int process = 0; process < size; ++process) { +//    if (rank == process) { +//      // broadcast deltas +//      std::cerr << " -- Rank " << rank << " broadcasting -- " << std::endl; +// +//      boost::mpi::broadcast(world, m_table_delta, process); +// +//      std::cerr << " -- Rank " << rank << " done broadcasting -- " << std::endl; +//    } +//    else { +//      std::cerr << " -- Rank " << rank << " receiving -- " << std::endl; +//      // receive deltas +//      table_delta_type recv_table_delta; +// +//      boost::mpi::broadcast(world, recv_table_delta, process); +// +//      std::cerr << " -- Rank " << rank << " done receiving -- " << std::endl; +// +//      for (typename table_delta_type::const_iterator dish_it=recv_table_delta.begin();  +//           dish_it != recv_table_delta.end(); ++dish_it) { +//        typename PYP<Dish,Hash>::TableCounter &tc = PYP<Dish,Hash>::_dish_tables[dish_it->first]; +// +//        for (std::map<int,int>::const_iterator it=dish_it->second.table_histogram.begin();  +//             it != dish_it->second.table_histogram.end(); ++it) { +//          tc.table_histogram[it->first] += it->second; +//        } +//        tc.tables += dish_it->second.tables; +//        PYP<Dish,Hash>::_total_tables += dish_it->second.tables; +//      } +//    } +//  } +//  std::cerr << " -- Done Reducing -- " << std::endl; + +  reset_deltas();  }  #endif diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc index 956ce123..0651ecac 100644 --- a/gi/pyp-topics/src/mpi-train-contexts.cc +++ b/gi/pyp-topics/src/mpi-train-contexts.cc @@ -86,7 +86,7 @@ int main(int argc, char **argv)    // seed the random number generator: 0 = automatic, specify value otherwise    unsigned long seed = 0;  -  PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed); +  MPIPYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed);    // read the data    BackoffGenerator* backoff_gen=0; diff --git a/gi/pyp-topics/src/pyp.hh b/gi/pyp-topics/src/pyp.hh index 26f6ab2e..84decb0f 100644 --- a/gi/pyp-topics/src/pyp.hh +++ b/gi/pyp-topics/src/pyp.hh @@ -1,6 +1,7 @@  #ifndef _pyp_hh  #define _pyp_hh +#include "slice-sampler.h"  #include <math.h>  #include <map>  #include <tr1/unordered_map> @@ -11,7 +12,6 @@  #include <boost/random/mersenne_twister.hpp>  #include "log_add.h" -#include "slice-sampler.h"  #include "mt19937ar.h"  // @@ -63,7 +63,7 @@ public:    double b() const { return _b; }    void set_b(double b) { _b = b; } -  void clear(); +  virtual void clear();    std::ostream& debug_info(std::ostream& os) const;    double log_restaurant_prob() const; @@ -75,13 +75,12 @@ public:    void resample_prior_a();    void resample_prior_b(); -private: +protected:    double _a, _b; // parameters of the Pitman-Yor distribution    double _a_beta_a, _a_beta_b; // parameters of Beta prior on a    double _b_gamma_s, _b_gamma_c; // parameters of Gamma prior on b -  struct TableCounter -  { +  struct TableCounter {      TableCounter() : tables(0) {};      int tables;      std::map<int, int> table_histogram; // num customers at table -> number tables | 
