diff options
Diffstat (limited to 'gi/clda/src/clda.cc')
-rw-r--r-- | gi/clda/src/clda.cc | 24 |
1 files changed, 18 insertions, 6 deletions
diff --git a/gi/clda/src/clda.cc b/gi/clda/src/clda.cc index 482a1c4c..574fa038 100644 --- a/gi/clda/src/clda.cc +++ b/gi/clda/src/clda.cc @@ -6,9 +6,6 @@ #include "crp.h" #include "sampler.h" #include "tdict.h" -Dict TD::dict_; -std::string TD::empty = ""; -std::string TD::space = " "; const size_t MAX_DOC_LEN_CHARS = 1000000; using namespace std; @@ -57,8 +54,8 @@ int main(int argc, char** argv) { MT19937 rng; cerr << "INITIALIZING RANDOM TOPIC ASSIGNMENTS\n"; zji.resize(wji.size()); - double beta = 0.01; - double alpha = 0.001; + double beta = 0.1; + double alpha = 50.0 / num_classes; vector<CRP<int> > dr(zji.size(), CRP<int>(beta)); // dr[i] describes the probability of using a topic in document i vector<CRP<int> > wr(num_classes, CRP<int>(alpha)); // wr[k] describes the probability of generating a word in topic k int random_topic = rng.next() * num_classes; @@ -79,9 +76,11 @@ int main(int argc, char** argv) { vector<map<WordID, int> > t2w(num_classes); Timer timer; SampleSet ss; - const int num_types = TD::dict_.max(); + const int num_types = TD::NumWords(); const prob_t class_p0(1.0 / num_classes); const prob_t word_p0(1.0 / num_types); + cerr << "CLASS PRIOR PROB: " << class_p0 << endl; + cerr << " WORD PRIOR LOGPROB: " << log(word_p0) << endl; ss.resize(num_classes); double total_time = 0; for (int iter = 0; iter < num_iterations; ++iter) { @@ -131,6 +130,19 @@ int main(int argc, char** argv) { cerr << "---------------------------------\n"; ShowTopWordsForTopic(t2w[i]); } + cerr << "-------------\n"; +#if 0 + for (int j = 0; j < zji.size(); ++j) { + const size_t num_words = wji[j].size(); + vector<int>& zj = zji[j]; + const vector<int>& wj = wji[j]; + zj.resize(num_words); + for (int i = 0; i < num_words; ++i) { + cerr << TD::Convert(wji[j][i]) << '(' << zj[i] << ") "; + } + cerr << endl; + } +#endif return 0; } |