summaryrefslogtreecommitdiff
path: root/gi/pyp-topics/src/train.cc
diff options
context:
space:
mode:
authorChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
committerChris Dyer <cdyer@cab.ark.cs.cmu.edu>2012-10-02 00:19:43 -0400
commite26434979adc33bd949566ba7bf02dff64e80a3e (patch)
treed1c72495e3af6301bd28e7e66c42de0c7a944d1f /gi/pyp-topics/src/train.cc
parent0870d4a1f5e14cc7daf553b180d599f09f6614a2 (diff)
cdec cleanup, remove bayesian stuff, parsing stuff
Diffstat (limited to 'gi/pyp-topics/src/train.cc')
-rw-r--r--gi/pyp-topics/src/train.cc135
1 files changed, 0 insertions, 135 deletions
diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc
deleted file mode 100644
index db7ca46e..00000000
--- a/gi/pyp-topics/src/train.cc
+++ /dev/null
@@ -1,135 +0,0 @@
-// STL
-#include <iostream>
-#include <fstream>
-
-// Boost
-#include <boost/program_options/parsers.hpp>
-#include <boost/program_options/variables_map.hpp>
-#include <boost/scoped_ptr.hpp>
-
-// Local
-#include "pyp-topics.hh"
-#include "corpus.hh"
-#include "contexts_corpus.hh"
-#include "gzstream.hh"
-
-static const char *REVISION = "$Rev$";
-
-// Namespaces
-using namespace boost;
-using namespace boost::program_options;
-using namespace std;
-
-int main(int argc, char **argv)
-{
- std::cout << "Pitman Yor topic models: Copyright 2010 Phil Blunsom\n";
- std::cout << REVISION << '\n' << std::endl;
-
- ////////////////////////////////////////////////////////////////////////////////////////////
- // Command line processing
- variables_map vm;
-
- // Command line processing
- options_description cmdline_specific("Command line specific options");
- cmdline_specific.add_options()
- ("help,h", "print help message")
- ("config,c", value<string>(), "config file specifying additional command line options")
- ;
- options_description generic("Allowed options");
- generic.add_options()
- ("documents,d", value<string>(), "file containing the documents")
- ("topics,t", value<int>()->default_value(50), "number of topics")
- ("document-topics-out,o", value<string>(), "file to write the document topics to")
- ("topic-words-out,w", value<string>(), "file to write the topic word distribution to")
- ("samples,s", value<int>()->default_value(10), "number of sampling passes through the data")
- ("test-corpus", value<string>(), "file containing the test data")
- ("backoff-paths", value<string>(), "file containing the term backoff paths")
- ;
- options_description config_options, cmdline_options;
- config_options.add(generic);
- cmdline_options.add(generic).add(cmdline_specific);
-
- store(parse_command_line(argc, argv, cmdline_options), vm);
- if (vm.count("config") > 0) {
- ifstream config(vm["config"].as<string>().c_str());
- store(parse_config_file(config, cmdline_options), vm);
- }
- notify(vm);
- ////////////////////////////////////////////////////////////////////////////////////////////
-
- if (vm.count("documents") == 0) {
- cerr << "Please specify a file containing the documents." << endl;
- cout << cmdline_options << "\n";
- return 1;
- }
-
- if (vm.count("help")) {
- cout << cmdline_options << "\n";
- return 1;
- }
-
- // seed the random number generator: 0 = automatic, specify value otherwise
- unsigned long seed = 0;
- PYPTopics model(vm["topics"].as<int>(), false, seed);
-
- // read the data
- Corpus corpus;
- corpus.read(vm["documents"].as<string>());
-
- // read the backoff dictionary
- if (vm.count("backoff-paths"))
- model.set_backoff(vm["backoff-paths"].as<string>());
-
- // train the sampler
- model.sample_corpus(corpus, vm["samples"].as<int>());
-
- if (vm.count("document-topics-out")) {
- ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
- //model.print_document_topics(documents_out);
-
- int document_id=0;
- for (Corpus::const_iterator corpusIt=corpus.begin();
- corpusIt != corpus.end(); ++corpusIt, ++document_id) {
- std::vector<int> unique_terms;
- for (Document::const_iterator docIt=corpusIt->begin();
- docIt != corpusIt->end(); ++docIt) {
- if (unique_terms.empty() || *docIt != unique_terms.back())
- unique_terms.push_back(*docIt);
- }
- documents_out << unique_terms.size();
- for (std::vector<int>::const_iterator termIt=unique_terms.begin();
- termIt != unique_terms.end(); ++termIt)
- documents_out << " " << *termIt << ":" << model.max(document_id, *termIt).first;
- documents_out << std::endl;
- }
- documents_out.close();
- }
-
- if (vm.count("topic-words-out")) {
- ogzstream topics_out(vm["topic-words-out"].as<string>().c_str());
- model.print_topic_terms(topics_out);
- topics_out.close();
- }
-
- if (vm.count("test-corpus")) {
- TestCorpus test_corpus;
- test_corpus.read(vm["test-corpus"].as<string>());
- ogzstream topics_out((vm["test-corpus"].as<string>() + ".topics.gz").c_str());
-
- for (TestCorpus::const_iterator corpusIt=test_corpus.begin();
- corpusIt != test_corpus.end(); ++corpusIt) {
- int index=0;
- for (DocumentTerms::const_iterator instanceIt=corpusIt->begin();
- instanceIt != corpusIt->end(); ++instanceIt, ++index) {
- int topic = model.max(instanceIt->doc, instanceIt->term).first;
- if (index != 0) topics_out << " ";
- topics_out << topic;
- }
- topics_out << std::endl;
- }
- topics_out.close();
- }
- std::cout << std::endl;
-
- return 0;
-}