1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
|
// STL
#include <iostream>
#include <fstream>
#include <algorithm>
#include <exception>
#include <iterator>
// Boost
#include <boost/program_options/parsers.hpp>
#include <boost/program_options/variables_map.hpp>
#include <boost/scoped_ptr.hpp>
// Local
#include "pyp-topics.hh"
#include "corpus.hh"
#include "contexts_corpus.hh"
#include "gzstream.hh"
#include "mt19937ar.h"
// Revision string printed in the startup banner by main(). "$Rev$" looks like a
// version-control keyword placeholder (presumably expanded by Subversion at
// checkout/commit time -- TODO confirm).
static const char *REVISION = "$Rev$";
// Namespaces
using namespace boost;
using namespace boost::program_options;
using namespace std;
int main(int argc, char **argv)
{
std::cout << "Pitman Yor topic models: Copyright 2010 Phil Blunsom\n";
std::cout << REVISION << '\n' << std::endl;
////////////////////////////////////////////////////////////////////////////////////////////
// Command line processing
variables_map vm;
// Command line processing
{
options_description cmdline_options("Allowed options");
cmdline_options.add_options()
("help,h", "print help message")
("data,d", value<string>(), "file containing the documents and context terms")
("topics,t", value<int>()->default_value(50), "number of topics")
("document-topics-out,o", value<string>(), "file to write the document topics to")
("topic-words-out,w", value<string>(), "file to write the topic word distribution to")
("samples,s", value<int>()->default_value(10), "number of sampling passes through the data")
("backoff-type", value<string>(), "backoff type: none|simple")
("filter-singleton-contexts", "filter singleton contexts")
;
store(parse_command_line(argc, argv, cmdline_options), vm);
notify(vm);
if (vm.count("help")) {
cout << cmdline_options << "\n";
return 1;
}
}
////////////////////////////////////////////////////////////////////////////////////////////
if (!vm.count("data")) {
cerr << "Please specify a file containing the data." << endl;
return 1;
}
// seed the random number generator
//mt_init_genrand(time(0));
PYPTopics model(vm["topics"].as<int>());
// read the data
BackoffGenerator* backoff_gen=0;
if (vm.count("backoff-type")) {
if (vm["backoff-type"].as<std::string>() == "none") {
backoff_gen = 0;
}
else if (vm["backoff-type"].as<std::string>() == "simple") {
backoff_gen = new SimpleBackoffGenerator();
}
else {
std::cerr << "Backoff type (--backoff-type) must be one of none|simple." << std::endl;
return(1);
}
}
ContextsCorpus contexts_corpus;
contexts_corpus.read_contexts(vm["data"].as<string>(), backoff_gen, vm.count("filter-singleton-contexts"));
model.set_backoff(contexts_corpus.backoff_index());
if (backoff_gen)
delete backoff_gen;
// train the sampler
model.sample(contexts_corpus, vm["samples"].as<int>());
if (vm.count("document-topics-out")) {
ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
int document_id=0;
for (Corpus::const_iterator corpusIt=contexts_corpus.begin();
corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) {
std::vector<int> unique_terms;
for (Document::const_iterator docIt=corpusIt->begin();
docIt != corpusIt->end(); ++docIt) {
if (unique_terms.empty() || *docIt != unique_terms.back())
unique_terms.push_back(*docIt);
}
documents_out << contexts_corpus.key(document_id) << '\t';
for (std::vector<int>::const_iterator termIt=unique_terms.begin();
termIt != unique_terms.end(); ++termIt) {
if (termIt != unique_terms.begin())
documents_out << " ||| ";
std::vector<std::string> strings = contexts_corpus.context2string(*termIt);
std::copy(strings.begin(), strings.end(), std::ostream_iterator<std::string>(documents_out, " "));
documents_out << "||| C=" << model.max(document_id, *termIt);
}
documents_out << std::endl;
}
documents_out.close();
}
if (vm.count("topic-words-out")) {
ogzstream topics_out(vm["topic-words-out"].as<string>().c_str());
model.print_topic_terms(topics_out);
topics_out.close();
}
std::cout << std::endl;
return 0;
}
|