diff options
Diffstat (limited to 'gi/pyp-topics')
-rw-r--r-- | gi/pyp-topics/src/Makefile.mpi | 3 | ||||
-rw-r--r-- | gi/pyp-topics/src/contexts_corpus.cc | 4 | ||||
-rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 2 | ||||
-rw-r--r-- | gi/pyp-topics/src/makefile.depend | 228 | ||||
-rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.cc | 148 | ||||
-rw-r--r-- | gi/pyp-topics/src/mpi-pyp-topics.hh | 17 | ||||
-rw-r--r-- | gi/pyp-topics/src/mpi-pyp.hh | 273 | ||||
-rw-r--r-- | gi/pyp-topics/src/mpi-train-contexts.cc | 2 | ||||
-rw-r--r-- | gi/pyp-topics/src/pyp.hh | 9 |
9 files changed, 582 insertions, 104 deletions
diff --git a/gi/pyp-topics/src/Makefile.mpi b/gi/pyp-topics/src/Makefile.mpi index 8c859881..b7b8a290 100644 --- a/gi/pyp-topics/src/Makefile.mpi +++ b/gi/pyp-topics/src/Makefile.mpi @@ -16,7 +16,8 @@ mpi-pyp-contexts-train: mpi-train-contexts.o $(local_objs) .PHONY: depend echo depend: - $(CXX) -MM $(CXXFLAGS) *.cc *.c | sed 's/^\(.*\.o:\)/obj\/\1/' > makefile.depend +#$(CXX) -MM $(CXXFLAGS) *.cc *.c | sed 's/^\(.*\.o:\)/obj\/\1/' > makefile.depend + $(CXX) -MM $(CXXFLAGS) *.cc *.c > makefile.depend clean: rm -f *.o diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc index 26d5718a..1cf69429 100644 --- a/gi/pyp-topics/src/contexts_corpus.cc +++ b/gi/pyp-topics/src/contexts_corpus.cc @@ -28,7 +28,7 @@ void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* Document* doc(new Document()); //cout << "READ: " << new_contexts.phrase << "\t"; - for (int i=0; i < new_contexts.counts.size(); ++i) { + for (int i=0; i < (int)new_contexts.counts.size(); ++i) { int cache_word_count = corpus_ptr->m_dict.max(); //string context_str = corpus_ptr->m_dict.toString(new_contexts.contexts[i]); @@ -101,7 +101,7 @@ void filter_callback(const ContextsLexer::PhraseContextsType& new_contexts, void map<string,int>* context_counts = (static_cast<map<string,int>*>(extra)); - for (int i=0; i < new_contexts.counts.size(); ++i) { + for (int i=0; i < (int)new_contexts.counts.size(); ++i) { int context_index = new_contexts.counts.at(i).first; int count = new_contexts.counts.at(i).second; //int count = new_contexts.counts[i]; diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh index 66b71783..4d3d5669 100644 --- a/gi/pyp-topics/src/contexts_corpus.hh +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -63,6 +63,8 @@ public: std::vector<std::string> context2string(const WordID& id) const { std::vector<std::string> res; + assert (id >= 0); + std::cerr << m_dict.Convert(id) << std::endl; m_dict.AsVector(id, &res); return res; } diff --git a/gi/pyp-topics/src/makefile.depend b/gi/pyp-topics/src/makefile.depend index 88bab79e..88bc73c1 100644 --- a/gi/pyp-topics/src/makefile.depend +++ b/gi/pyp-topics/src/makefile.depend @@ -1,4 +1,4 @@ -obj/contexts_corpus.o: contexts_corpus.cc contexts_corpus.hh \ +contexts_corpus.o: contexts_corpus.cc contexts_corpus.hh \ /Users/pblunsom/packages/include/boost/ptr_container/ptr_vector.hpp \ /Users/pblunsom/packages/include/boost/ptr_container/ptr_sequence_adapter.hpp \ /Users/pblunsom/packages/include/boost/ptr_container/detail/reversible_ptr_container.hpp \ @@ -432,7 +432,7 @@ obj/contexts_corpus.o: contexts_corpus.cc contexts_corpus.hh \ /Users/pblunsom/packages/include/boost/type_traits/add_cv.hpp \ /Users/pblunsom/packages/include/boost/type_traits/remove_volatile.hpp \ /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp -obj/contexts_lexer.o: contexts_lexer.cc contexts_lexer.h \ +contexts_lexer.o: contexts_lexer.cc contexts_lexer.h \ ../../../decoder/dict.h \ /Users/pblunsom/packages/include/boost/functional/hash.hpp \ /Users/pblunsom/packages/include/boost/functional/hash/hash.hpp \ @@ -463,7 +463,7 @@ obj/contexts_lexer.o: contexts_lexer.cc contexts_lexer.h \ /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \ ../../../decoder/wordid.h ../../../decoder/filelib.h \ ../../../decoder/gzstream.h -obj/corpus.o: corpus.cc corpus.hh \ +corpus.o: corpus.cc corpus.hh \ /Users/pblunsom/packages/include/boost/shared_ptr.hpp \ /Users/pblunsom/packages/include/boost/smart_ptr/shared_ptr.hpp \ /Users/pblunsom/packages/include/boost/config.hpp \ @@ -874,8 +874,8 @@ obj/corpus.o: corpus.cc corpus.hh \ /Users/pblunsom/packages/include/boost/detail/is_incrementable.hpp \ /Users/pblunsom/packages/include/boost/ptr_container/detail/void_ptr_iterator.hpp \ gzstream.hh -obj/gzstream.o: gzstream.cc gzstream.hh -obj/mpi-pyp-topics.o: mpi-pyp-topics.cc \ +gzstream.o: gzstream.cc gzstream.hh +mpi-pyp-topics.o: mpi-pyp-topics.cc \ /Users/pblunsom/packages/include/boost/mpi/communicator.hpp \ /Users/pblunsom/packages/include/boost/mpi/config.hpp \ /Users/pblunsom/packages/include/boost/config.hpp \ @@ -1448,8 +1448,108 @@ obj/mpi-pyp-topics.o: mpi-pyp-topics.cc \ /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ /Users/pblunsom/packages/include/boost/random/detail/seed.hpp \ /Users/pblunsom/packages/include/boost/mpi/environment.hpp mpi-pyp.hh \ - log_add.h slice-sampler.h mt19937ar.h corpus.hh -obj/mpi-train-contexts.o: mpi-train-contexts.cc \ + /Users/pblunsom/packages/include/boost/tuple/tuple.hpp \ + /Users/pblunsom/packages/include/boost/ref.hpp \ + /Users/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/add_volatile.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/add_cv.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/remove_volatile.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ + /Users/pblunsom/packages/include/boost/serialization/map.hpp \ + /Users/pblunsom/packages/include/boost/serialization/utility.hpp \ + /Users/pblunsom/packages/include/boost/serialization/collections_save_imp.hpp \ + /Users/pblunsom/packages/include/boost/serialization/collections_load_imp.hpp \ + /Users/pblunsom/packages/include/boost/serialization/detail/stack_constructor.hpp \ + /Users/pblunsom/packages/include/boost/aligned_storage.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/type_with_alignment.hpp \ + /Users/pblunsom/packages/include/boost/mpi.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/all_gather.hpp \ + /Users/pblunsom/packages/include/boost/serialization/vector.hpp \ + /Users/pblunsom/packages/include/boost/serialization/collection_traits.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/broadcast.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives_fwd.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/gather.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/all_reduce.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/reduce.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/computation_tree.hpp \ + /Users/pblunsom/packages/include/boost/mpi/operations.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/all_to_all.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/scatter.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/scan.hpp \ + /Users/pblunsom/packages/include/boost/mpi/graph_communicator.hpp \ + /Users/pblunsom/packages/include/boost/graph/graph_traits.hpp \ + /Users/pblunsom/packages/include/boost/pending/property.hpp \ + /Users/pblunsom/packages/include/boost/pending/detail/property.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/same_traits.hpp \ + /Users/pblunsom/packages/include/boost/graph/properties.hpp \ + /Users/pblunsom/packages/include/boost/property_map/property_map.hpp \ + /Users/pblunsom/packages/include/boost/pending/cstddef.hpp \ + /Users/pblunsom/packages/include/boost/concept_check.hpp \ + /Users/pblunsom/packages/include/boost/concept/assert.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/general.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/has_constraints.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/conversion_traits.hpp \ + /Users/pblunsom/packages/include/boost/concept/usage.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/concept_def.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/concept_undef.hpp \ + /Users/pblunsom/packages/include/boost/concept_archetype.hpp \ + /Users/pblunsom/packages/include/boost/property_map/vector_property_map.hpp \ + /Users/pblunsom/packages/include/boost/graph/property_maps/constant_property_map.hpp \ + /Users/pblunsom/packages/include/boost/graph/property_maps/null_property_map.hpp \ + /Users/pblunsom/packages/include/boost/iterator/counting_iterator.hpp \ + /Users/pblunsom/packages/include/boost/detail/numeric_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_assign.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_trivial_assign.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_constructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_trivial_constructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_copy.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_destructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_trivial_destructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_virtual_destructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_signed.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_unsigned.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_compound.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_floating_point.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_member_object_pointer.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_object.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_stateless.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/rank.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/extent.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/remove_all_extents.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/aligned_storage.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/promote.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/make_unsigned.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_signed.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_unsigned.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/make_signed.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/decay.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_complex.hpp \ + /Users/pblunsom/packages/include/boost/detail/select_type.hpp \ + /Users/pblunsom/packages/include/boost/graph/iteration_macros.hpp \ + /Users/pblunsom/packages/include/boost/shared_array.hpp \ + /Users/pblunsom/packages/include/boost/smart_ptr/shared_array.hpp \ + /Users/pblunsom/packages/include/boost/mpi/group.hpp \ + /Users/pblunsom/packages/include/boost/mpi/intercommunicator.hpp \ + /Users/pblunsom/packages/include/boost/mpi/nonblocking.hpp \ + /Users/pblunsom/packages/include/boost/mpi/skeleton_and_content.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_iarchive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_oarchive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/ignore_iprimitive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/ignore_oprimitive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/content_oarchive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \ + /Users/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \ + slice-sampler.h log_add.h mt19937ar.h corpus.hh +mpi-train-contexts.o: mpi-train-contexts.cc \ /Users/pblunsom/packages/include/boost/program_options/parsers.hpp \ /Users/pblunsom/packages/include/boost/program_options/config.hpp \ /Users/pblunsom/packages/include/boost/config.hpp \ @@ -2064,8 +2164,100 @@ obj/mpi-train-contexts.o: mpi-train-contexts.cc \ /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \ /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ /Users/pblunsom/packages/include/boost/random/detail/seed.hpp \ - mpi-pyp.hh log_add.h slice-sampler.h mt19937ar.h corpus.hh \ - contexts_corpus.hh contexts_lexer.h ../../../decoder/dict.h \ + mpi-pyp.hh /Users/pblunsom/packages/include/boost/tuple/tuple.hpp \ + /Users/pblunsom/packages/include/boost/tuple/detail/tuple_basic.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/cv_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/add_cv.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/remove_volatile.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ + /Users/pblunsom/packages/include/boost/serialization/map.hpp \ + /Users/pblunsom/packages/include/boost/serialization/utility.hpp \ + /Users/pblunsom/packages/include/boost/serialization/collections_save_imp.hpp \ + /Users/pblunsom/packages/include/boost/serialization/collections_load_imp.hpp \ + /Users/pblunsom/packages/include/boost/serialization/detail/stack_constructor.hpp \ + /Users/pblunsom/packages/include/boost/aligned_storage.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/type_with_alignment.hpp \ + /Users/pblunsom/packages/include/boost/mpi.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/all_gather.hpp \ + /Users/pblunsom/packages/include/boost/serialization/vector.hpp \ + /Users/pblunsom/packages/include/boost/serialization/collection_traits.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/broadcast.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives_fwd.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/gather.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/all_reduce.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/reduce.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/computation_tree.hpp \ + /Users/pblunsom/packages/include/boost/mpi/operations.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/all_to_all.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/scatter.hpp \ + /Users/pblunsom/packages/include/boost/mpi/collectives/scan.hpp \ + /Users/pblunsom/packages/include/boost/mpi/graph_communicator.hpp \ + /Users/pblunsom/packages/include/boost/graph/graph_traits.hpp \ + /Users/pblunsom/packages/include/boost/pending/property.hpp \ + /Users/pblunsom/packages/include/boost/pending/detail/property.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/same_traits.hpp \ + /Users/pblunsom/packages/include/boost/graph/properties.hpp \ + /Users/pblunsom/packages/include/boost/property_map/property_map.hpp \ + /Users/pblunsom/packages/include/boost/pending/cstddef.hpp \ + /Users/pblunsom/packages/include/boost/concept_check.hpp \ + /Users/pblunsom/packages/include/boost/concept/assert.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/general.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/has_constraints.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/conversion_traits.hpp \ + /Users/pblunsom/packages/include/boost/concept/usage.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/concept_def.hpp \ + /Users/pblunsom/packages/include/boost/concept/detail/concept_undef.hpp \ + /Users/pblunsom/packages/include/boost/concept_archetype.hpp \ + /Users/pblunsom/packages/include/boost/property_map/vector_property_map.hpp \ + /Users/pblunsom/packages/include/boost/graph/property_maps/constant_property_map.hpp \ + /Users/pblunsom/packages/include/boost/graph/property_maps/null_property_map.hpp \ + /Users/pblunsom/packages/include/boost/iterator/counting_iterator.hpp \ + /Users/pblunsom/packages/include/boost/detail/numeric_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_assign.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_trivial_assign.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_constructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_trivial_constructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_copy.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_nothrow_destructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/has_virtual_destructor.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_compound.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_floating_point.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_member_object_pointer.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_object.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_stateless.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/rank.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/extent.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/remove_all_extents.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/function_traits.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/aligned_storage.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/promote.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/integral_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/floating_point_promotion.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/make_signed.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/decay.hpp \ + /Users/pblunsom/packages/include/boost/type_traits/is_complex.hpp \ + /Users/pblunsom/packages/include/boost/detail/select_type.hpp \ + /Users/pblunsom/packages/include/boost/graph/iteration_macros.hpp \ + /Users/pblunsom/packages/include/boost/shared_array.hpp \ + /Users/pblunsom/packages/include/boost/smart_ptr/shared_array.hpp \ + /Users/pblunsom/packages/include/boost/mpi/group.hpp \ + /Users/pblunsom/packages/include/boost/mpi/intercommunicator.hpp \ + /Users/pblunsom/packages/include/boost/mpi/nonblocking.hpp \ + /Users/pblunsom/packages/include/boost/mpi/skeleton_and_content.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_iarchive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/forward_skeleton_oarchive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/ignore_iprimitive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/ignore_oprimitive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/content_oarchive.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/broadcast_sc.hpp \ + /Users/pblunsom/packages/include/boost/mpi/detail/communicator_sc.hpp \ + /Users/pblunsom/packages/include/boost/mpi/timer.hpp pyp.hh \ + slice-sampler.h log_add.h mt19937ar.h corpus.hh contexts_corpus.hh \ + contexts_lexer.h ../../../decoder/dict.h \ /Users/pblunsom/packages/include/boost/functional/hash.hpp \ /Users/pblunsom/packages/include/boost/functional/hash/hash.hpp \ /Users/pblunsom/packages/include/boost/functional/hash/hash_fwd.hpp \ @@ -2078,7 +2270,7 @@ obj/mpi-train-contexts.o: mpi-train-contexts.cc \ /Users/pblunsom/packages/include/boost/functional/hash/extensions.hpp \ /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \ ../../../decoder/wordid.h gzstream.hh -obj/pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \ +pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \ /Users/pblunsom/packages/include/boost/ptr_container/ptr_vector.hpp \ /Users/pblunsom/packages/include/boost/ptr_container/ptr_sequence_adapter.hpp \ /Users/pblunsom/packages/include/boost/ptr_container/detail/reversible_ptr_container.hpp \ @@ -2484,7 +2676,7 @@ obj/pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \ /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \ /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ /Users/pblunsom/packages/include/boost/random/detail/seed.hpp pyp.hh \ - log_add.h slice-sampler.h mt19937ar.h corpus.hh \ + slice-sampler.h log_add.h mt19937ar.h corpus.hh \ /Users/pblunsom/packages/include/boost/shared_ptr.hpp \ /Users/pblunsom/packages/include/boost/smart_ptr/shared_ptr.hpp \ /Users/pblunsom/packages/include/boost/config/no_tr1/memory.hpp \ @@ -2658,7 +2850,7 @@ obj/pyp-topics.o: pyp-topics.cc timing.h clock_gettime_stub.c pyp-topics.hh \ /Users/pblunsom/packages/include/boost/smart_ptr/scoped_ptr.hpp \ /Users/pblunsom/packages/include/boost/type_traits/is_fundamental.hpp \ /Users/pblunsom/packages/include/boost/thread/condition.hpp -obj/train-contexts.o: train-contexts.cc \ +train-contexts.o: train-contexts.cc \ /Users/pblunsom/packages/include/boost/program_options/parsers.hpp \ /Users/pblunsom/packages/include/boost/program_options/config.hpp \ /Users/pblunsom/packages/include/boost/config.hpp \ @@ -3140,7 +3332,7 @@ obj/train-contexts.o: train-contexts.cc \ /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \ /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ /Users/pblunsom/packages/include/boost/random/detail/seed.hpp pyp.hh \ - log_add.h slice-sampler.h mt19937ar.h corpus.hh workers.hh \ + slice-sampler.h log_add.h mt19937ar.h corpus.hh workers.hh \ /Users/pblunsom/packages/include/boost/bind.hpp \ /Users/pblunsom/packages/include/boost/bind/bind.hpp \ /Users/pblunsom/packages/include/boost/is_placeholder.hpp \ @@ -3275,7 +3467,7 @@ obj/train-contexts.o: train-contexts.cc \ /Users/pblunsom/packages/include/boost/functional/hash/extensions.hpp \ /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \ ../../../decoder/wordid.h gzstream.hh -obj/train.o: train.cc \ +train.o: train.cc \ /Users/pblunsom/packages/include/boost/program_options/parsers.hpp \ /Users/pblunsom/packages/include/boost/program_options/config.hpp \ /Users/pblunsom/packages/include/boost/config.hpp \ @@ -3757,7 +3949,7 @@ obj/train.o: train.cc \ /Users/pblunsom/packages/include/boost/random/linear_congruential.hpp \ /Users/pblunsom/packages/include/boost/random/detail/const_mod.hpp \ /Users/pblunsom/packages/include/boost/random/detail/seed.hpp pyp.hh \ - log_add.h slice-sampler.h mt19937ar.h corpus.hh workers.hh \ + slice-sampler.h log_add.h mt19937ar.h corpus.hh workers.hh \ /Users/pblunsom/packages/include/boost/bind.hpp \ /Users/pblunsom/packages/include/boost/bind/bind.hpp \ /Users/pblunsom/packages/include/boost/is_placeholder.hpp \ @@ -3892,6 +4084,6 @@ obj/train.o: train.cc \ /Users/pblunsom/packages/include/boost/functional/hash/extensions.hpp \ /Users/pblunsom/packages/include/boost/detail/container_fwd.hpp \ ../../../decoder/wordid.h gzstream.hh -obj/clock_gettime_stub.o: clock_gettime_stub.c -obj/gammadist.o: gammadist.c gammadist.h mt19937ar.h -obj/mt19937ar.o: mt19937ar.c mt19937ar.h +clock_gettime_stub.o: clock_gettime_stub.c +gammadist.o: gammadist.c gammadist.h mt19937ar.h +mt19937ar.o: mt19937ar.c mt19937ar.h diff --git a/gi/pyp-topics/src/mpi-pyp-topics.cc b/gi/pyp-topics/src/mpi-pyp-topics.cc index 2ad28278..4525302e 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.cc +++ b/gi/pyp-topics/src/mpi-pyp-topics.cc @@ -4,7 +4,7 @@ #include "mpi-pyp-topics.hh" //#include <boost/date_time/posix_time/posix_time_types.hpp> -void PYPTopics::sample_corpus(const Corpus& corpus, int samples, +void MPIPYPTopics::sample_corpus(const Corpus& corpus, int samples, int freq_cutoff_start, int freq_cutoff_end, int freq_cutoff_interval, int max_contexts_per_document) { @@ -23,33 +23,33 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, } int local_documents = m_mpi_end - m_mpi_start; - if (!m_backoff.get()) { m_word_pyps.clear(); - m_word_pyps.push_back(PYPs()); + m_word_pyps.push_back(MPIPYPs()); } if (m_am_root) std::cerr << "\n Training with " << m_word_pyps.size()-1 << " backoff level" - << (m_word_pyps.size()==2 ? ":" : "s:") << std::endl; + << (m_word_pyps.size()>1 ? ":" : "s:") << std::endl; - for (int i=0; i<(int)m_word_pyps.size(); ++i) - { + for (int i=0; i<(int)m_word_pyps.size(); ++i) { m_word_pyps.at(i).reserve(m_num_topics); for (int j=0; j<m_num_topics; ++j) - m_word_pyps.at(i).push_back(new PYP<int>(0.5, 1.0)); + m_word_pyps.at(i).push_back(new MPIPYP<int>(0.5, 1.0)); } if (m_am_root) std::cerr << std::endl; - m_document_pyps.reserve(corpus.num_documents()); - for (int j=0; j<corpus.num_documents(); ++j) + m_document_pyps.reserve(local_documents); + //m_document_pyps.reserve(corpus.num_documents()); + //for (int j=0; j<corpus.num_documents(); ++j) + for (int j=0; j<local_documents; ++j) m_document_pyps.push_back(new PYP<int>(0.5, 1.0)); m_topic_p0 = 1.0/m_num_topics; m_term_p0 = 1.0/corpus.num_types(); m_backoff_p0 = 1.0/corpus.num_documents(); - if (m_am_root) std::cerr << " Documents: " << corpus.num_documents() << " Terms: " - << corpus.num_types() << std::endl; + if (m_am_root) std::cerr << " Documents: " << corpus.num_documents() << "(" + << local_documents << ")" << " Terms: " << corpus.num_types() << std::endl; int frequency_cutoff = freq_cutoff_start; if (m_am_root) std::cerr << " Context frequency cutoff set to " << frequency_cutoff << std::endl; @@ -57,13 +57,16 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, timer.Reset(); // Initialisation pass int document_id=0, topic_counter=0; - for (Corpus::const_iterator corpusIt=corpus.begin(); - corpusIt != corpus.end(); ++corpusIt, ++document_id) { - m_corpus_topics.push_back(DocumentTopics(corpusIt->size(), 0)); + for (int i=0; i<local_documents; ++i) { + document_id = i+m_mpi_start; + + //for (Corpus::const_iterator corpusIt=corpus.begin(); + // corpusIt != corpus.end(); ++corpusIt, ++document_id) { + m_corpus_topics.push_back(DocumentTopics(corpus.at(document_id).size(), 0)); int term_index=0; - for (Document::const_iterator docIt=corpusIt->begin(); - docIt != corpusIt->end(); ++docIt, ++term_index) { + for (Document::const_iterator docIt=corpus.at(document_id).begin(); + docIt != corpus.at(document_id).end(); ++docIt, ++term_index) { topic_counter++; Term term = *docIt; @@ -80,21 +83,41 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, if (m_use_topic_pyp) { F p0 = m_topic_pyp.prob(new_topic, m_topic_p0); - int table_delta = m_document_pyps[document_id].increment(new_topic, p0); + int table_delta = m_document_pyps.at(i).increment(new_topic, p0); if (table_delta) m_topic_pyp.increment(new_topic, m_topic_p0); } - else m_document_pyps[document_id].increment(new_topic, m_topic_p0); + else m_document_pyps.at(i).increment(new_topic, m_topic_p0); } - m_corpus_topics[document_id][term_index] = new_topic; + m_corpus_topics.at(i).at(term_index) = new_topic; + } + } + + // Synchronise the topic->word counds across the processes. + for (std::vector<MPIPYPs>::iterator levelIt=m_word_pyps.begin(); + levelIt != m_word_pyps.end(); ++levelIt) { + for (MPIPYPs::iterator pypIt=levelIt->begin(); + pypIt != levelIt->end(); ++pypIt) { + if (!m_am_root) boost::mpi::communicator().barrier(); + std::cerr << "Before Sync Process " << m_rank << ":"; + pypIt->debug_info(std::cerr); std::cerr << std::endl; + if (m_am_root) boost::mpi::communicator().barrier(); + + pypIt->synchronise(); + + if (!m_am_root) boost::mpi::communicator().barrier(); + std::cerr << "After Sync Process " << m_rank << ":"; + pypIt->debug_info(std::cerr); std::cerr << std::endl; + if (m_am_root) boost::mpi::communicator().barrier(); } } + if (m_am_root) std::cerr << " Initialized in " << timer.Elapsed() << " seconds\n"; int* randomDocIndices = new int[local_documents]; for (int i = 0; i < local_documents; ++i) - randomDocIndices[i] = i+m_mpi_start; + randomDocIndices[i] = i; // Sampling phase for (int curr_sample=0; curr_sample < samples; ++curr_sample) { @@ -110,8 +133,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, // Randomize the corpus indexing array int tmp; int processed_terms=0; - for (int i = local_documents-1; i > 0; --i) { - //i+1 since j \in [0,i] but rnd() \in [0,1) + for (int i = (local_documents-1); i > 0; --i) { + //i+1 since j \in [0,i] but rnd() \in [0,1) int j = (int)(rnd() * (i+1)); assert(j >= 0 && j <= i); tmp = randomDocIndices[i]; @@ -120,15 +143,17 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, } // for each document in the corpus - int document_id; - for (int i=0; i<local_documents; ++i) { - document_id = randomDocIndices[i]; + for (int rand_doc=0; rand_doc<local_documents; ++rand_doc) { + int doc_index = randomDocIndices[rand_doc]; + int document_id = doc_index + m_mpi_start; + const Document& doc = corpus.at(document_id); // for each term in the document int term_index=0; - Document::const_iterator docEnd = corpus.at(document_id).end(); - for (Document::const_iterator docIt=corpus.at(document_id).begin(); + Document::const_iterator docEnd = doc.end(); + for (Document::const_iterator docIt=doc.begin(); docIt != docEnd; ++docIt, ++term_index) { + if (max_contexts_per_document && term_index > max_contexts_per_document) break; @@ -140,36 +165,49 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, processed_terms++; // remove the prevous topic from the PYPs - int current_topic = m_corpus_topics[document_id][term_index]; + int current_topic = m_corpus_topics.at(doc_index).at(term_index); // a negative label mean that term hasn't been sampled yet if (current_topic >= 0) { decrement(term, current_topic); - int table_delta = m_document_pyps[document_id].decrement(current_topic); + int table_delta = m_document_pyps.at(doc_index).decrement(current_topic); if (m_use_topic_pyp && table_delta < 0) m_topic_pyp.decrement(current_topic); } // sample a new_topic - int new_topic = sample(document_id, term); + int new_topic = sample(doc_index, term); // add the new topic to the PYPs - m_corpus_topics[document_id][term_index] = new_topic; + m_corpus_topics.at(doc_index).at(term_index) = new_topic; increment(term, new_topic); if (m_use_topic_pyp) { F p0 = m_topic_pyp.prob(new_topic, m_topic_p0); - int table_delta = m_document_pyps[document_id].increment(new_topic, p0); + int table_delta = m_document_pyps.at(doc_index).increment(new_topic, p0); if (table_delta) m_topic_pyp.increment(new_topic, m_topic_p0); } - else m_document_pyps[document_id].increment(new_topic, m_topic_p0); + else m_document_pyps.at(doc_index).increment(new_topic, m_topic_p0); } if (document_id && document_id % 10000 == 0) { if (m_am_root) std::cerr << "."; std::cerr.flush(); } } m_world.barrier(); + // Synchronise the topic->word counds across the processes. + for (std::vector<MPIPYPs>::iterator levelIt=m_word_pyps.begin(); + levelIt != m_word_pyps.end(); ++levelIt) { + for (MPIPYPs::iterator pypIt=levelIt->begin(); + pypIt != levelIt->end(); ++pypIt) { + std::cerr << "Before Sync Process " << m_rank << ":"; + pypIt->debug_info(std::cerr); std::cerr << std::endl; + pypIt->synchronise(); + std::cerr << "After Sync Process " << m_rank << ":"; + pypIt->debug_info(std::cerr); std::cerr << std::endl; + } + } + if (m_am_root) std::cerr << " ||| sampled " << processed_terms << " terms."; if (curr_sample != 0 && curr_sample % 10 == 0) { @@ -179,9 +217,9 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, // resample the hyperparamters F log_p=0.0; - for (std::vector<PYPs>::iterator levelIt=m_word_pyps.begin(); + for (std::vector<MPIPYPs>::iterator levelIt=m_word_pyps.begin(); levelIt != m_word_pyps.end(); ++levelIt) { - for (PYPs::iterator pypIt=levelIt->begin(); + for (MPIPYPs::iterator pypIt=levelIt->begin(); pypIt != levelIt->end(); ++pypIt) { pypIt->resample_prior(); log_p += pypIt->log_restaurant_prob(); @@ -206,7 +244,7 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, int k=0; if (m_am_root) std::cerr << "Topics distribution: "; std::cerr.precision(2); - for (PYPs::iterator pypIt=m_word_pyps.front().begin(); + for (MPIPYPs::iterator pypIt=m_word_pyps.front().begin(); pypIt != m_word_pyps.front().end(); ++pypIt, ++k) { if (m_am_root && k % 5 == 0) std::cerr << std::endl << '\t'; if (m_am_root) std::cerr << "<" << k << ":" << pypIt->num_customers() << "," @@ -220,8 +258,8 @@ void PYPTopics::sample_corpus(const Corpus& corpus, int samples, } -void PYPTopics::decrement(const Term& term, int topic, int level) { - //std::cerr << "PYPTopics::decrement(" << term << "," << topic << "," << level << ")" << std::endl; +void MPIPYPTopics::decrement(const Term& term, int topic, int level) { + //std::cerr << "MPIPYPTopics::decrement(" << term << "," << topic << "," << level << ")" << std::endl; m_word_pyps.at(level).at(topic).decrement(term); if (m_backoff.get()) { Term backoff_term = (*m_backoff)[term]; @@ -230,8 +268,8 @@ void PYPTopics::decrement(const Term& term, int topic, int level) { } } -void PYPTopics::increment(const Term& term, int topic, int level) { - //std::cerr << "PYPTopics::increment(" << term << "," << topic << "," << level << ")" << std::endl; +void MPIPYPTopics::increment(const Term& term, int topic, int level) { + //std::cerr << "MPIPYPTopics::increment(" << term << "," << topic << "," << level << ")" << std::endl; m_word_pyps.at(level).at(topic).increment(term, word_pyps_p0(term, topic, level)); if (m_backoff.get()) { @@ -241,7 +279,7 @@ void PYPTopics::increment(const Term& term, int topic, int level) { } } -int PYPTopics::sample(const DocumentId& doc, const Term& term) { +int MPIPYPTopics::sample(const DocumentId& doc, const Term& term) { // First pass: collect probs F sum=0.0; std::vector<F> sums; @@ -252,7 +290,7 @@ int PYPTopics::sample(const DocumentId& doc, const Term& term) { if (m_use_topic_pyp) topic_prob = m_topic_pyp.prob(k, m_topic_p0); //F p_k_d = m_document_pyps[doc].prob(k, topic_prob); - F p_k_d = m_document_pyps[doc].unnormalised_prob(k, topic_prob); + F p_k_d = m_document_pyps.at(doc).unnormalised_prob(k, topic_prob); sum += (p_w_k*p_k_d); sums.push_back(sum); @@ -266,9 +304,9 @@ int PYPTopics::sample(const DocumentId& doc, const Term& term) { assert(false); } -PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) const { +MPIPYPTopics::F MPIPYPTopics::word_pyps_p0(const Term& term, int topic, int level) const { //for (int i=0; i<level+1; ++i) std::cerr << " "; - //std::cerr << "PYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ")" << std::endl; + //std::cerr << "MPIPYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ")" << std::endl; F p0 = m_term_p0; if (m_backoff.get()) { @@ -283,24 +321,24 @@ PYPTopics::F PYPTopics::word_pyps_p0(const Term& term, int topic, int level) con p0 = m_term_p0; } //for (int i=0; i<level+1; ++i) std::cerr << " "; - //std::cerr << "PYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ") = " << p0 << std::endl; + //std::cerr << "MPIPYPTopics::word_pyps_p0(" << term << "," << topic << "," << level << ") = " << p0 << std::endl; return p0; } -PYPTopics::F PYPTopics::prob(const Term& term, int topic, int level) const { +MPIPYPTopics::F MPIPYPTopics::prob(const Term& term, int topic, int level) const { //for (int i=0; i<level+1; ++i) std::cerr << " "; - //std::cerr << "PYPTopics::prob(" << term << "," << topic << "," << level << " " << factor << ")" << std::endl; + //std::cerr << "MPIPYPTopics::prob(" << term << "," << topic << "," << level << " " << factor << ")" << std::endl; F p0 = word_pyps_p0(term, topic, level); F p_w_k = m_word_pyps.at(level).at(topic).prob(term, p0); //for (int i=0; i<level+1; ++i) std::cerr << " "; - //std::cerr << "PYPTopics::prob(" << term << "," << topic << "," << level << ") = " << p_w_k << std::endl; + //std::cerr << "MPIPYPTopics::prob(" << term << "," << topic << "," << level << ") = " << p_w_k << std::endl; return p_w_k; } -int PYPTopics::max_topic() const { +int MPIPYPTopics::max_topic() const { if (!m_use_topic_pyp) return -1; @@ -317,8 +355,8 @@ int PYPTopics::max_topic() const { return current_topic; } -int PYPTopics::max(const DocumentId& doc) const { - //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl; +int MPIPYPTopics::max(const DocumentId& doc) const { + //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; int current_topic=-1; @@ -342,8 +380,8 @@ int PYPTopics::max(const DocumentId& doc) const { return current_topic; } -int PYPTopics::max(const DocumentId& doc, const Term& term) const { - //std::cerr << "PYPTopics::max(" << doc << "," << term << ")" << std::endl; +int MPIPYPTopics::max(const DocumentId& doc, const Term& term) const { + //std::cerr << "MPIPYPTopics::max(" << doc << "," << term << ")" << std::endl; // collect probs F current_max=0.0; int current_topic=-1; @@ -368,7 +406,7 @@ int PYPTopics::max(const DocumentId& doc, const Term& term) const { return current_topic; } -std::ostream& PYPTopics::print_document_topics(std::ostream& out) const { +std::ostream& MPIPYPTopics::print_document_topics(std::ostream& out) const { for (CorpusTopics::const_iterator corpusIt=m_corpus_topics.begin(); corpusIt != m_corpus_topics.end(); ++corpusIt) { int term_index=0; @@ -382,7 +420,7 @@ std::ostream& PYPTopics::print_document_topics(std::ostream& out) const { return out; } -std::ostream& PYPTopics::print_topic_terms(std::ostream& out) const { +std::ostream& MPIPYPTopics::print_topic_terms(std::ostream& out) const { for (PYPs::const_iterator pypsIt=m_word_pyps.front().begin(); pypsIt != m_word_pyps.front().end(); ++pypsIt) { int term_index=0; diff --git a/gi/pyp-topics/src/mpi-pyp-topics.hh b/gi/pyp-topics/src/mpi-pyp-topics.hh index 5da35d82..a85a776d 100644 --- a/gi/pyp-topics/src/mpi-pyp-topics.hh +++ b/gi/pyp-topics/src/mpi-pyp-topics.hh @@ -1,5 +1,5 @@ -#ifndef PYP_TOPICS_HH -#define PYP_TOPICS_HH +#ifndef MPI_PYP_TOPICS_HH +#define MPI_PYP_TOPICS_HH #include <vector> #include <iostream> @@ -14,14 +14,14 @@ #include "mpi-pyp.hh" #include "corpus.hh" -class PYPTopics { +class MPIPYPTopics { public: typedef std::vector<int> DocumentTopics; typedef std::vector<DocumentTopics> CorpusTopics; typedef double F; public: - PYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0) + MPIPYPTopics(int num_topics, bool use_topic_pyp=false, unsigned long seed = 0) : m_num_topics(num_topics), m_word_pyps(1), m_topic_pyp(0.5,1.0), m_use_topic_pyp(use_topic_pyp), m_seed(seed), @@ -47,12 +47,12 @@ public: m_backoff.reset(new TermBackoff); m_backoff->read(filename); m_word_pyps.clear(); - m_word_pyps.resize(m_backoff->order(), PYPs()); + m_word_pyps.resize(m_backoff->order(), MPIPYPs()); } void set_backoff(TermBackoffPtr backoff) { m_backoff = backoff; m_word_pyps.clear(); - m_word_pyps.resize(m_backoff->order(), PYPs()); + m_word_pyps.resize(m_backoff->order(), MPIPYPs()); } F prob(const Term& term, int topic, int level=0) const; @@ -70,9 +70,10 @@ private: CorpusTopics m_corpus_topics; typedef boost::ptr_vector< PYP<int> > PYPs; + typedef boost::ptr_vector< MPIPYP<int> > MPIPYPs; PYPs m_document_pyps; - std::vector<PYPs> m_word_pyps; - PYP<int> m_topic_pyp; + std::vector<MPIPYPs> m_word_pyps; + MPIPYP<int> m_topic_pyp; bool m_use_topic_pyp; unsigned long m_seed; diff --git a/gi/pyp-topics/src/mpi-pyp.hh b/gi/pyp-topics/src/mpi-pyp.hh index 58be7c5c..65358d20 100644 --- a/gi/pyp-topics/src/mpi-pyp.hh +++ b/gi/pyp-topics/src/mpi-pyp.hh @@ -1,5 +1,5 @@ -#ifndef _pyp_hh -#define _pyp_hh +#ifndef _mpipyp_hh +#define _mpipyp_hh #include <math.h> #include <map> @@ -9,11 +9,15 @@ #include <boost/random/uniform_real.hpp> #include <boost/random/variate_generator.hpp> #include <boost/random/mersenne_twister.hpp> +#include <boost/tuple/tuple.hpp> +#include <boost/serialization/map.hpp> +#include <boost/mpi.hpp> +#include <boost/mpi/environment.hpp> +#include <boost/mpi/communicator.hpp> +#include <boost/mpi/operations.hpp> -#include "pyp.h" -#include "log_add.h" -#include "slice-sampler.h" -#include "mt19937ar.h" + +#include "pyp.hh" // // Pitman-Yor process with customer and table tracking @@ -28,25 +32,104 @@ public: virtual int decrement(Dish d); void clear(); + void reset_deltas(); - void reset_deltas() { m_count_delta.clear(); } + void synchronise(); private: typedef std::map<Dish, int> dish_delta_type; - typedef std::map<Dish, TableCounter> table_delta_type; + typedef std::map<Dish, typename PYP<Dish,Hash>::TableCounter> table_delta_type; dish_delta_type m_count_delta; table_delta_type m_table_delta; }; template <typename Dish, typename Hash> -MPIPYP<Dish,Hash>::MPIPYP(double a, double b, Hash) -: PYP(a, b, Hash) {} +MPIPYP<Dish,Hash>::MPIPYP(double a, double b, Hash h) +: PYP<Dish,Hash>(a, b, 0, h) {} template <typename Dish, typename Hash> int MPIPYP<Dish,Hash>::increment(Dish dish, double p0) { - int delta = PYP<Dish,Hash>::increment(dish, p0); + int delta = 0; + int table_joined=-1; + typename PYP<Dish,Hash>::TableCounter &tc = PYP<Dish,Hash>::_dish_tables[dish]; + + // seated on a new or existing table? + int c = PYP<Dish,Hash>::count(dish); + int t = PYP<Dish,Hash>::num_tables(dish); + int T = PYP<Dish,Hash>::num_tables(); + double& a = PYP<Dish,Hash>::_a; + double& b = PYP<Dish,Hash>::_b; + double pshare = (c > 0) ? (c - a*t) : 0.0; + double pnew = (b + a*T) * p0; + assert (pshare >= 0.0); + + if (mt_genrand_res53() < pnew / (pshare + pnew)) { + // assign to a new table + tc.tables += 1; + tc.table_histogram[1] += 1; + PYP<Dish,Hash>::_total_tables += 1; + delta = 1; + } + else { + // randomly assign to an existing table + // remove constant denominator from inner loop + double r = mt_genrand_res53() * (c - a*t); + for (std::map<int,int>::iterator + hit = tc.table_histogram.begin(); + hit != tc.table_histogram.end(); ++hit) { + r -= ((hit->first - a) * hit->second); + if (r <= 0) { + tc.table_histogram[hit->first+1] += 1; + hit->second -= 1; + if (hit->second == 0) + tc.table_histogram.erase(hit); + table_joined = hit->first+1; + break; + } + } + if (r > 0) { + std::cerr << r << " " << c << " " << a << " " << t << std::endl; + assert(false); + } + delta = 0; + } + + std::tr1::unordered_map<Dish,int,Hash>::operator[](dish) += 1; + //google::sparse_hash_map<Dish,int,Hash>::operator[](dish) += 1; + PYP<Dish,Hash>::_total_customers += 1; + + // MPI Delta handling + // track the customer entering + typename dish_delta_type::iterator customer_it; + bool customer_insert_result; + boost::tie(customer_it, customer_insert_result) + = m_count_delta.insert(std::make_pair(dish,0)); + + customer_it->second += 1; + if (customer_it->second == 0) + m_count_delta.erase(customer_it); + + // increment the histogram bar for the table joined + if (!delta) { + assert (table_joined >= 0); + std::map<int,int> &histogram = m_table_delta[dish].table_histogram; + typename std::map<int,int>::iterator table_it; bool table_insert_result; + boost::tie(table_it, table_insert_result) = histogram.insert(std::make_pair(table_joined,0)); + table_it->second += 1; + if (table_it->second == 0) histogram.erase(table_it); + + // decrement the histogram bar for the table left + boost::tie(table_it, table_insert_result) = histogram.insert(std::make_pair(table_joined-1,0)); + table_it->second -= 1; + if (table_it->second == 0) histogram.erase(table_it); + } + else { + typename PYP<Dish,Hash>::TableCounter &delta_tc = m_table_delta[dish]; + delta_tc.tables += 1; + delta_tc.table_histogram[1] += 1; + } return delta; } @@ -55,15 +138,177 @@ template <typename Dish, typename Hash> int MPIPYP<Dish,Hash>::decrement(Dish dish) { - int delta = PYP<Dish,Hash>::decrement(dish); + typename std::tr1::unordered_map<Dish, int>::iterator dcit = find(dish); + //typename google::sparse_hash_map<Dish, int>::iterator dcit = find(dish); + if (dcit == PYP<Dish,Hash>::end()) { + std::cerr << dish << std::endl; + assert(false); + } + + int delta = 0, table_left=-1; + + typename std::tr1::unordered_map<Dish, typename PYP<Dish,Hash>::TableCounter>::iterator dtit + = PYP<Dish,Hash>::_dish_tables.find(dish); + //typename google::sparse_hash_map<Dish, TableCounter>::iterator dtit = _dish_tables.find(dish); + if (dtit == PYP<Dish,Hash>::_dish_tables.end()) { + std::cerr << dish << std::endl; + assert(false); + } + typename PYP<Dish,Hash>::TableCounter &tc = dtit->second; + + double r = mt_genrand_res53() * PYP<Dish,Hash>::count(dish); + for (std::map<int,int>::iterator hit = tc.table_histogram.begin(); + hit != tc.table_histogram.end(); ++hit) { + r -= (hit->first * hit->second); + if (r <= 0) { + table_left = hit->first; + if (hit->first > 1) { + tc.table_histogram[hit->first-1] += 1; + } + else { + delta = -1; + tc.tables -= 1; + PYP<Dish,Hash>::_total_tables -= 1; + } + + hit->second -= 1; + if (hit->second == 0) tc.table_histogram.erase(hit); + break; + } + } + if (r > 0) { + std::cerr << r << " " << PYP<Dish,Hash>::count(dish) << " " << PYP<Dish,Hash>::_a << " " + << PYP<Dish,Hash>::num_tables(dish) << std::endl; + assert(false); + } + + // remove the customer + dcit->second -= 1; + PYP<Dish,Hash>::_total_customers -= 1; + assert(dcit->second >= 0); + if (dcit->second == 0) { + PYP<Dish,Hash>::erase(dcit); + PYP<Dish,Hash>::_dish_tables.erase(dtit); + } + + typename dish_delta_type::iterator it; + bool insert_result; + boost::tie(it, insert_result) = m_count_delta.insert(std::make_pair(dish,0)); + + it->second -= 1; + + if (it->second == 0) + m_count_delta.erase(it); + + assert (table_left >= 0); + typename PYP<Dish,Hash>::TableCounter& delta_tc = m_table_delta[dish]; + if (table_left > 1) + delta_tc.table_histogram[table_left-1] += 1; + else delta_tc.tables -= 1; + + std::map<int,int>::iterator tit = delta_tc.table_histogram.find(table_left); + //assert (tit != delta_tc.table_histogram.end()); + tit->second -= 1; + if (tit->second == 0) delta_tc.table_histogram.erase(tit); + return delta; } template <typename Dish, typename Hash> void -MPIPYP<Dish,Hash>::clear() -{ +MPIPYP<Dish,Hash>::clear() { PYP<Dish,Hash>::clear(); + reset_deltas(); +} + +template <typename Dish, typename Hash> +void +MPIPYP<Dish,Hash>::reset_deltas() { + m_count_delta.clear(); + m_table_delta.clear(); +} + +template <typename Dish> +struct sum_maps { + typedef std::map<Dish,int> map_type; + map_type& operator() (map_type& l, map_type const & r) const { + for (typename map_type::const_iterator it=r.begin(); it != r.end(); it++) + l[it->first] += it->second; + return l; + } +}; + +// Needed Boost definitions +namespace boost { + namespace mpi { + template <> + struct is_commutative< sum_maps<int>, std::map<int,int> > : mpl::true_ {}; + } + + namespace serialization { + template<class Archive> + void serialize(Archive & ar, PYP<int>::TableCounter& t, const unsigned int version) { + ar & t.table_histogram; + ar & t.tables; + } + + } // namespace serialization +} // namespace boost + + +template <typename Dish, typename Hash> +void +MPIPYP<Dish,Hash>::synchronise() { + boost::mpi::communicator world; + int rank = world.rank(), size = world.size(); + + // communicate the customer count deltas + dish_delta_type global_dish_delta; // the “merged” map + boost::mpi::all_reduce(world, m_count_delta, global_dish_delta, sum_maps<Dish>()); + + // update this restaurant + for (typename dish_delta_type::const_iterator it=global_dish_delta.begin(); + it != global_dish_delta.end(); ++it) { + std::tr1::unordered_map<Dish,int,Hash>::operator[](it->first) += (it->second - m_count_delta[it->first]); + PYP<Dish,Hash>::_total_customers += (it->second - m_count_delta[it->first]); + //std::cerr << "Process " << rank << " adding " << (it->second - m_count_delta[it->first]) << " customers." << std::endl; + } + + // communicate the table count deltas +// for (int process = 0; process < size; ++process) { +// if (rank == process) { +// // broadcast deltas +// std::cerr << " -- Rank " << rank << " broadcasting -- " << std::endl; +// +// boost::mpi::broadcast(world, m_table_delta, process); +// +// std::cerr << " -- Rank " << rank << " done broadcasting -- " << std::endl; +// } +// else { +// std::cerr << " -- Rank " << rank << " receiving -- " << std::endl; +// // receive deltas +// table_delta_type recv_table_delta; +// +// boost::mpi::broadcast(world, recv_table_delta, process); +// +// std::cerr << " -- Rank " << rank << " done receiving -- " << std::endl; +// +// for (typename table_delta_type::const_iterator dish_it=recv_table_delta.begin(); +// dish_it != recv_table_delta.end(); ++dish_it) { +// typename PYP<Dish,Hash>::TableCounter &tc = PYP<Dish,Hash>::_dish_tables[dish_it->first]; +// +// for (std::map<int,int>::const_iterator it=dish_it->second.table_histogram.begin(); +// it != dish_it->second.table_histogram.end(); ++it) { +// tc.table_histogram[it->first] += it->second; +// } +// tc.tables += dish_it->second.tables; +// PYP<Dish,Hash>::_total_tables += dish_it->second.tables; +// } +// } +// } +// std::cerr << " -- Done Reducing -- " << std::endl; + + reset_deltas(); } #endif diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc index 956ce123..0651ecac 100644 --- a/gi/pyp-topics/src/mpi-train-contexts.cc +++ b/gi/pyp-topics/src/mpi-train-contexts.cc @@ -86,7 +86,7 @@ int main(int argc, char **argv) // seed the random number generator: 0 = automatic, specify value otherwise unsigned long seed = 0; - PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed); + MPIPYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed); // read the data BackoffGenerator* backoff_gen=0; diff --git a/gi/pyp-topics/src/pyp.hh b/gi/pyp-topics/src/pyp.hh index 26f6ab2e..84decb0f 100644 --- a/gi/pyp-topics/src/pyp.hh +++ b/gi/pyp-topics/src/pyp.hh @@ -1,6 +1,7 @@ #ifndef _pyp_hh #define _pyp_hh +#include "slice-sampler.h" #include <math.h> #include <map> #include <tr1/unordered_map> @@ -11,7 +12,6 @@ #include <boost/random/mersenne_twister.hpp> #include "log_add.h" -#include "slice-sampler.h" #include "mt19937ar.h" // @@ -63,7 +63,7 @@ public: double b() const { return _b; } void set_b(double b) { _b = b; } - void clear(); + virtual void clear(); std::ostream& debug_info(std::ostream& os) const; double log_restaurant_prob() const; @@ -75,13 +75,12 @@ public: void resample_prior_a(); void resample_prior_b(); -private: +protected: double _a, _b; // parameters of the Pitman-Yor distribution double _a_beta_a, _a_beta_b; // parameters of Beta prior on a double _b_gamma_s, _b_gamma_c; // parameters of Gamma prior on b - struct TableCounter - { + struct TableCounter { TableCounter() : tables(0) {}; int tables; std::map<int, int> table_histogram; // num customers at table -> number tables |