From e26434979adc33bd949566ba7bf02dff64e80a3e Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 2 Oct 2012 00:19:43 -0400 Subject: cdec cleanup, remove bayesian stuff, parsing stuff --- gi/pyp-topics/src/train-contexts.cc | 174 ------------------------------------ 1 file changed, 174 deletions(-) delete mode 100644 gi/pyp-topics/src/train-contexts.cc (limited to 'gi/pyp-topics/src/train-contexts.cc') diff --git a/gi/pyp-topics/src/train-contexts.cc b/gi/pyp-topics/src/train-contexts.cc deleted file mode 100644 index 9463f9fc..00000000 --- a/gi/pyp-topics/src/train-contexts.cc +++ /dev/null @@ -1,174 +0,0 @@ -// STL -#include -#include -#include -#include - -// Boost -#include -#include -#include - -// Local -#include "pyp-topics.hh" -#include "corpus.hh" -#include "contexts_corpus.hh" -#include "gzstream.hh" - -static const char *REVISION = "$Rev$"; - -// Namespaces -using namespace boost; -using namespace boost::program_options; -using namespace std; - -int main(int argc, char **argv) -{ - cout << "Pitman Yor topic models: Copyright 2010 Phil Blunsom\n"; - cout << REVISION << '\n' <(), "config file specifying additional command line options") - ; - options_description config_options("Allowed options"); - config_options.add_options() - ("data,d", value(), "file containing the documents and context terms") - ("topics,t", value()->default_value(50), "number of topics") - ("document-topics-out,o", value(), "file to write the document topics to") - ("default-topics-out", value(), "file to write default term topic assignments.") - ("topic-words-out,w", value(), "file to write the topic word distribution to") - ("samples,s", value()->default_value(10), "number of sampling passes through the data") - ("backoff-type", value(), "backoff type: none|simple") -// ("filter-singleton-contexts", "filter singleton contexts") - ("hierarchical-topics", "Use a backoff hierarchical PYP as the P0 for the document topics distribution.") - ("freq-cutoff-start", value()->default_value(0), "initial frequency cutoff.") - ("freq-cutoff-end", value()->default_value(0), "final frequency cutoff.") - ("freq-cutoff-interval", value()->default_value(0), "number of iterations between frequency decrement.") - ("max-threads", value()->default_value(1), "maximum number of simultaneous threads allowed") - ("max-contexts-per-document", value()->default_value(0), "Only sample the n most frequent contexts for a document.") - ("num-jobs", value()->default_value(1), "allows finer control over parallelization") - ("temp-start", value()->default_value(1.0), "starting annealing temperature.") - ("temp-end", value()->default_value(1.0), "end annealing temperature.") - ; - - cmdline_specific.add(config_options); - - store(parse_command_line(argc, argv, cmdline_specific), vm); - notify(vm); - - if (vm.count("config") > 0) { - ifstream config(vm["config"].as().c_str()); - store(parse_config_file(config, config_options), vm); - } - - if (vm.count("help")) { - cout << cmdline_specific << "\n"; - return 1; - } - } - //////////////////////////////////////////////////////////////////////////////////////////// - - if (!vm.count("data")) { - cerr << "Please specify a file containing the data." << endl; - return 1; - } - assert(vm["max-threads"].as() > 0); - assert(vm["num-jobs"].as() > -1); - // seed the random number generator: 0 = automatic, specify value otherwise - unsigned long seed = 0; - PYPTopics model(vm["topics"].as(), vm.count("hierarchical-topics"), seed, vm["max-threads"].as(), vm["num-jobs"].as()); - - // read the data - BackoffGenerator* backoff_gen=0; - if (vm.count("backoff-type")) { - if (vm["backoff-type"].as() == "none") { - backoff_gen = 0; - } - else if (vm["backoff-type"].as() == "simple") { - backoff_gen = new SimpleBackoffGenerator(); - } - else { - cerr << "Backoff type (--backoff-type) must be one of none|simple." <(), backoff_gen, /*vm.count("filter-singleton-contexts")*/ false); - model.set_backoff(contexts_corpus.backoff_index()); - - if (backoff_gen) - delete backoff_gen; - - // train the sampler - model.sample_corpus(contexts_corpus, vm["samples"].as(), - vm["freq-cutoff-start"].as(), - vm["freq-cutoff-end"].as(), - vm["freq-cutoff-interval"].as(), - vm["max-contexts-per-document"].as(), - vm["temp-start"].as(), vm["temp-end"].as()); - - if (vm.count("document-topics-out")) { - ogzstream documents_out(vm["document-topics-out"].as().c_str()); - - int document_id=0; - map all_terms; - for (Corpus::const_iterator corpusIt=contexts_corpus.begin(); - corpusIt != contexts_corpus.end(); ++corpusIt, ++document_id) { - vector unique_terms; - for (Document::const_iterator docIt=corpusIt->begin(); - docIt != corpusIt->end(); ++docIt) { - if (unique_terms.empty() || *docIt != unique_terms.back()) - unique_terms.push_back(*docIt); - // increment this terms frequency - pair::iterator,bool> insert_result = all_terms.insert(make_pair(*docIt,1)); - if (!insert_result.second) - all_terms[*docIt] = all_terms[*docIt] + 1; - //insert_result.first++; - } - documents_out << contexts_corpus.key(document_id) << '\t'; - documents_out << model.max(document_id).first << " " << corpusIt->size() << " ||| "; - for (std::vector::const_iterator termIt=unique_terms.begin(); - termIt != unique_terms.end(); ++termIt) { - if (termIt != unique_terms.begin()) - documents_out << " ||| "; - vector strings = contexts_corpus.context2string(*termIt); - copy(strings.begin(), strings.end(),ostream_iterator(documents_out, " ")); - std::pair maxinfo = model.max(document_id, *termIt); - documents_out << "||| C=" << maxinfo.first << " P=" << maxinfo.second; - - } - documents_out <().c_str()); - default_topics << model.max_topic() <::const_iterator termIt=all_terms.begin(); termIt != all_terms.end(); ++termIt) { - vector strings = contexts_corpus.context2string(termIt->first); - default_topics << model.max(-1, termIt->first).first << " ||| " << termIt->second << " ||| "; - copy(strings.begin(), strings.end(),ostream_iterator(default_topics, " ")); - default_topics <().c_str()); - model.print_topic_terms(topics_out); - topics_out.close(); - } - - cout <