diff options
author | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-15 03:39:32 +0000 |
---|---|---|
committer | philblunsom <philblunsom@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-15 03:39:32 +0000 |
commit | c0eee37401d3731555346cb260330049b5dc99e7 (patch) | |
tree | a34bcd2309c23ca4d02ab3938f888a349aad12b2 /gi/pyp-topics/src/mpi-train-contexts.cc | |
parent | a5b1162c68d6ff5bc52b646efb563a0077cd1d2a (diff) |
working on the mpi version
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@257 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/src/mpi-train-contexts.cc')
-rw-r--r-- | gi/pyp-topics/src/mpi-train-contexts.cc | 105 |
1 files changed, 56 insertions, 49 deletions
diff --git a/gi/pyp-topics/src/mpi-train-contexts.cc b/gi/pyp-topics/src/mpi-train-contexts.cc index 6309fe93..956ce123 100644 --- a/gi/pyp-topics/src/mpi-train-contexts.cc +++ b/gi/pyp-topics/src/mpi-train-contexts.cc @@ -8,6 +8,8 @@ #include <boost/program_options/parsers.hpp> #include <boost/program_options/variables_map.hpp> #include <boost/scoped_ptr.hpp> +#include <boost/mpi/environment.hpp> +#include <boost/mpi/communicator.hpp> // Local #include "mpi-pyp-topics.hh" @@ -24,8 +26,12 @@ using namespace std; int main(int argc, char **argv) { - cout << "Pitman Yor topic models: Copyright 2010 Phil Blunsom\n"; - cout << REVISION << '\n' <<endl; + mpi::environment env(argc, argv); + mpi::communicator world; + bool am_root = (world.rank() == 0); + if (am_root) std::cout << "I am process " << world.rank() << " of " << world.size() << "." << std::endl; + if (am_root) cout << "Pitman Yor topic models: Copyright 2010 Phil Blunsom\n"; + if (am_root) cout << REVISION << '\n' <<endl; //////////////////////////////////////////////////////////////////////////////////////////// // Command line processing @@ -53,7 +59,6 @@ int main(int argc, char **argv) ("freq-cutoff-start", value<int>()->default_value(0), "initial frequency cutoff.") ("freq-cutoff-end", value<int>()->default_value(0), "final frequency cutoff.") ("freq-cutoff-interval", value<int>()->default_value(0), "number of iterations between frequency decrement.") - ("max-threads", value<int>()->default_value(1), "maximum number of simultaneous threads allowed") ("max-contexts-per-document", value<int>()->default_value(0), "Only sample the n most frequent contexts for a document.") ; @@ -81,7 +86,7 @@ int main(int argc, char **argv) // seed the random number generator: 0 = automatic, specify value otherwise unsigned long seed = 0; - PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed, vm["max-threads"].as<int>()); + PYPTopics model(vm["topics"].as<int>(), vm.count("hierarchical-topics"), seed); // read the data BackoffGenerator* backoff_gen=0; @@ -112,58 +117,60 @@ int main(int argc, char **argv) vm["freq-cutoff-interval"].as<int>(), vm["max-contexts-per-document"].as<int>()); - if (vm.count("document-topics-out")) { - ogzstream documents_out(vm["document-topics-out"].as<string>().c_str()); - - int document_id=0; - map<int,int> all_terms; - for (Corpus::const_iterator corpusIt=contexts_corpus.begin(); - corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) { - vector<int> unique_terms; - for (Document::const_iterator docIt=corpusIt->begin(); - docIt != corpusIt->end(); ++docIt) { - if (unique_terms.empty() || *docIt != unique_terms.back()) - unique_terms.push_back(*docIt); - // increment this terms frequency - pair<map<int,int>::iterator,bool> insert_result = all_terms.insert(make_pair(*docIt,1)); - if (!insert_result.second) - all_terms[*docIt] = all_terms[*docIt] + 1; + if (world.rank() == 0) { + if (vm.count("document-topics-out")) { + ogzstream documents_out(vm["document-topics-out"].as<string>().c_str()); + + int document_id=0; + map<int,int> all_terms; + for (Corpus::const_iterator corpusIt=contexts_corpus.begin(); + corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) { + vector<int> unique_terms; + for (Document::const_iterator docIt=corpusIt->begin(); + docIt != corpusIt->end(); ++docIt) { + if (unique_terms.empty() || *docIt != unique_terms.back()) + unique_terms.push_back(*docIt); + // increment this terms frequency + pair<map<int,int>::iterator,bool> insert_result = all_terms.insert(make_pair(*docIt,1)); + if (!insert_result.second) + all_terms[*docIt] = all_terms[*docIt] + 1; //insert_result.first++; + } + documents_out << contexts_corpus.key(document_id) << '\t'; + documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| "; + for (std::vector<int>::const_iterator termIt=unique_terms.begin(); + termIt != unique_terms.end(); ++termIt) { + if (termIt != unique_terms.begin()) + documents_out << " ||| "; + vector<std::string> strings = contexts_corpus.context2string(*termIt); + copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " ")); + documents_out << "||| C=" << model.max(document_id, *termIt); + + } + documents_out <<endl; } - documents_out << contexts_corpus.key(document_id) << '\t'; - documents_out << model.max(document_id) << " " << corpusIt->size() << " ||| "; - for (std::vector<int>::const_iterator termIt=unique_terms.begin(); - termIt != unique_terms.end(); ++termIt) { - if (termIt != unique_terms.begin()) - documents_out << " ||| "; - vector<std::string> strings = contexts_corpus.context2string(*termIt); - copy(strings.begin(), strings.end(),ostream_iterator<std::string>(documents_out, " ")); - documents_out << "||| C=" << model.max(document_id, *termIt); - + documents_out.close(); + + if (vm.count("default-topics-out")) { + ofstream default_topics(vm["default-topics-out"].as<string>().c_str()); + default_topics << model.max_topic() <<endl; + for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) { + vector<std::string> strings = contexts_corpus.context2string(termIt->first); + default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| "; + copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " ")); + default_topics <<endl; + } } - documents_out <<endl; } - documents_out.close(); - - if (vm.count("default-topics-out")) { - ofstream default_topics(vm["default-topics-out"].as<string>().c_str()); - default_topics << model.max_topic() <<endl; - for (std::map<int,int>::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) { - vector<std::string> strings = contexts_corpus.context2string(termIt->first); - default_topics << model.max(-1, termIt->first) << " ||| " << termIt->second << " ||| "; - copy(strings.begin(), strings.end(),ostream_iterator<std::string>(default_topics, " ")); - default_topics <<endl; - } + + if (vm.count("topic-words-out")) { + ogzstream topics_out(vm["topic-words-out"].as<string>().c_str()); + model.print_topic_terms(topics_out); + topics_out.close(); } - } - if (vm.count("topic-words-out")) { - ogzstream topics_out(vm["topic-words-out"].as<string>().c_str()); - model.print_topic_terms(topics_out); - topics_out.close(); + cout <<endl; } - cout <<endl; - return 0; } |