Diffstat (limited to 'gi/clda/src/clda.cc')
-rw-r--r-- | gi/clda/src/clda.cc | 148
1 file changed, 0 insertions, 148 deletions
diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc
deleted file mode 100644
index f548997f..00000000
--- a/gi/clda/src/clda.cc
+++ /dev/null
@@ -1,148 +0,0 @@
-#include <iostream>
-#include <vector>
-#include <map>
-#include <string>
-
-#include "timer.h"
-#include "crp.h"
-#include "ccrp.h"
-#include "sampler.h"
-#include "tdict.h"
-const size_t MAX_DOC_LEN_CHARS = 10000000;
-
-using namespace std;
-
-void ShowTopWordsForTopic(const map<WordID, int>& counts) {
-  multimap<int, WordID> ms;
-  for (map<WordID,int>::const_iterator it = counts.begin(); it != counts.end(); ++it)
-    ms.insert(make_pair(it->second, it->first));
-  int cc = 0;
-  for (multimap<int, WordID>::reverse_iterator it = ms.rbegin(); it != ms.rend(); ++it) {
-    cerr << it->first << ':' << TD::Convert(it->second) << " ";
-    ++cc;
-    if (cc==20) break;
-  }
-  cerr << endl;
-}
-
-int main(int argc, char** argv) {
-  if (argc != 3) {
-    cerr << "Usage: " << argv[0] << " num-classes num-samples\n";
-    return 1;
-  }
-  const int num_classes = atoi(argv[1]);
-  const int num_iterations = atoi(argv[2]);
-  const int burnin_size = num_iterations * 0.9;
-  if (num_classes < 2) {
-    cerr << "Must request more than 1 class\n";
-    return 1;
-  }
-  if (num_iterations < 5) {
-    cerr << "Must request more than 5 iterations\n";
-    return 1;
-  }
-  cerr << "CLASSES: " << num_classes << endl;
-  char* buf = new char[MAX_DOC_LEN_CHARS];
-  vector<vector<int> > wji;  // w[j][i] - observed word i of doc j
-  vector<vector<int> > zji;  // z[j][i] - topic assignment for word i of doc j
-  cerr << "READING DOCUMENTS\n";
-  while(cin) {
-    cin.getline(buf, MAX_DOC_LEN_CHARS);
-    if (buf[0] == 0) continue;
-    wji.push_back(vector<WordID>());
-    TD::ConvertSentence(buf, &wji.back());
-  }
-  cerr << "READ " << wji.size() << " DOCUMENTS\n";
-  MT19937 rng;
-  cerr << "INITIALIZING RANDOM TOPIC ASSIGNMENTS\n";
-  zji.resize(wji.size());
-  double disc = 0.1;
-  double beta = 10.0;
-  double alpha = 50.0;
-  const double uniform_topic = 1.0 / num_classes;
-  const double uniform_word = 1.0 / TD::NumWords();
-  vector<CCRP<int> > dr(zji.size(), CCRP<int>(1,1,1,1,disc, beta));  // dr[i] describes the probability of using a topic in document i
-  vector<CCRP<int> > wr(num_classes, CCRP<int>(1,1,1,1,disc, alpha));  // wr[k] describes the probability of generating a word in topic k
-  for (int j = 0; j < zji.size(); ++j) {
-    const size_t num_words = wji[j].size();
-    vector<int>& zj = zji[j];
-    const vector<int>& wj = wji[j];
-    zj.resize(num_words);
-    for (int i = 0; i < num_words; ++i) {
-      int random_topic = rng.next() * num_classes;
-      if (random_topic == num_classes) { --random_topic; }
-      zj[i] = random_topic;
-      const int word = wj[i];
-      dr[j].increment(random_topic, uniform_topic, &rng);
-      wr[random_topic].increment(word, uniform_word, &rng);
-    }
-  }
-  cerr << "SAMPLING\n";
-  vector<map<WordID, int> > t2w(num_classes);
-  Timer timer;
-  SampleSet<double> ss;
-  ss.resize(num_classes);
-  double total_time = 0;
-  for (int iter = 0; iter < num_iterations; ++iter) {
-    cerr << '.';
-    if (iter && iter % 10 == 0) {
-      total_time += timer.Elapsed();
-      timer.Reset();
-      double llh = 0;
-#if 1
-      for (int j = 0; j < dr.size(); ++j)
-        dr[j].resample_hyperparameters(&rng);
-      for (int j = 0; j < wr.size(); ++j)
-        wr[j].resample_hyperparameters(&rng);
-#endif
-
-      for (int j = 0; j < dr.size(); ++j)
-        llh += dr[j].log_crp_prob();
-      for (int j = 0; j < wr.size(); ++j)
-        llh += wr[j].log_crp_prob();
-      cerr << " [LLH=" << llh << " I=" << iter << "]\n";
-    }
-    for (int j = 0; j < zji.size(); ++j) {
-      const size_t num_words = wji[j].size();
-      vector<int>& zj = zji[j];
-      const vector<int>& wj = wji[j];
-      for (int i = 0; i < num_words; ++i) {
-        const int word = wj[i];
-        const int cur_topic = zj[i];
-        dr[j].decrement(cur_topic, &rng);
-        wr[cur_topic].decrement(word, &rng);
-
-        for (int k = 0; k < num_classes; ++k) {
-          ss[k] = dr[j].prob(k, uniform_topic) * wr[k].prob(word, uniform_word);
-        }
-        const int new_topic = rng.SelectSample(ss);
-        dr[j].increment(new_topic, uniform_topic, &rng);
-        wr[new_topic].increment(word, uniform_word, &rng);
-        zj[i] = new_topic;
-        if (iter > burnin_size) {
-          ++t2w[cur_topic][word];
-        }
-      }
-    }
-  }
-  for (int i = 0; i < num_classes; ++i) {
-    cerr << "---------------------------------\n";
-    cerr << " final PYP(" << wr[i].discount() << "," << wr[i].concentration() << ")\n";
-    ShowTopWordsForTopic(t2w[i]);
-  }
-  cerr << "-------------\n";
-#if 0
-  for (int j = 0; j < zji.size(); ++j) {
-    const size_t num_words = wji[j].size();
-    vector<int>& zj = zji[j];
-    const vector<int>& wj = wji[j];
-    zj.resize(num_words);
-    for (int i = 0; i < num_words; ++i) {
-      cerr << TD::Convert(wji[j][i]) << '(' << zj[i] << ") ";
-    }
-    cerr << endl;
-  }
-#endif
-  return 0;
-}
-