diff options
Diffstat (limited to 'gi/pyp-topics')
| -rw-r--r-- | gi/pyp-topics/src/Makefile.am | 5 | ||||
| -rw-r--r-- | gi/pyp-topics/src/contexts_corpus.cc | 56 | ||||
| -rw-r--r-- | gi/pyp-topics/src/contexts_corpus.hh | 39 | ||||
| -rw-r--r-- | gi/pyp-topics/src/contexts_lexer.h | 22 | ||||
| -rw-r--r-- | gi/pyp-topics/src/contexts_lexer.l | 110 | ||||
| -rw-r--r-- | gi/pyp-topics/src/corpus.cc | 7 | ||||
| -rw-r--r-- | gi/pyp-topics/src/corpus.hh | 3 | ||||
| -rw-r--r-- | gi/pyp-topics/src/pyp-topics.hh | 7 | ||||
| -rw-r--r-- | gi/pyp-topics/src/train.cc | 43 | 
9 files changed, 272 insertions, 20 deletions
| diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am index 51b47294..3d62a334 100644 --- a/gi/pyp-topics/src/Makefile.am +++ b/gi/pyp-topics/src/Makefile.am @@ -1,6 +1,9 @@  bin_PROGRAMS = pyp-topics-train -pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc +contexts_lexer.cc: contexts_lexer.l +	$(LEX) -s -CF -8 -o$@ $< + +pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc  pyp_topics_train_LDADD = -lz  AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc new file mode 100644 index 00000000..0b3ec644 --- /dev/null +++ b/gi/pyp-topics/src/contexts_corpus.cc @@ -0,0 +1,56 @@ +#include <sstream> +#include <iostream> +#include <set> + +#include "contexts_corpus.hh" +#include "gzstream.hh" +#include "contexts_lexer.h" + +using namespace std; + +////////////////////////////////////////////////// +// ContextsCorpus +////////////////////////////////////////////////// + +void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) { +  assert(new_contexts.contexts.size() == new_contexts.counts.size()); + +  ContextsCorpus* corpus_ptr = static_cast<ContextsCorpus*>(extra); +  Document* doc(new Document()); + +  //std::cout << "READ: " << new_contexts.phrase << "\t"; + +  for (int i=0; i < new_contexts.contexts.size(); ++i) { +    std::string context_str = ""; +    for (ContextsLexer::Context::const_iterator it=new_contexts.contexts[i].begin(); +         it != new_contexts.contexts[i].end(); ++it) { +      //std::cout << *it << " "; +      if (it != new_contexts.contexts[i].begin()) +        context_str += "__"; +      context_str += *it; +    } + +    WordID id = corpus_ptr->m_dict.Convert(context_str); +    int count = new_contexts.counts[i]; +    for (int i=0; i<count; ++i) +      doc->push_back(id); +    corpus_ptr->m_num_terms += count; + +    //std::cout << context_str << " (" << id << ") ||| C=" << count << " ||| "; +  } +  //std::cout << std::endl; + +  corpus_ptr->m_documents.push_back(doc); +} + +unsigned ContextsCorpus::read_contexts(const std::string &filename) { +  m_num_terms = 0; +  m_num_types = 0; + +  igzstream in(filename.c_str()); +  ContextsLexer::ReadContexts(&in, read_callback, this); + +  m_num_types = m_dict.max(); + +  return m_documents.size(); +} diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh new file mode 100644 index 00000000..e680cef5 --- /dev/null +++ b/gi/pyp-topics/src/contexts_corpus.hh @@ -0,0 +1,39 @@ +#ifndef _CONTEXTS_CORPUS_HH +#define _CONTEXTS_CORPUS_HH + +#include <vector> +#include <string> +#include <map> + +#include <boost/ptr_container/ptr_vector.hpp> + +#include "corpus.hh" +#include "contexts_lexer.h" +#include "../../../decoder/dict.h" + +//////////////////////////////////////////////////////////////// +// ContextsCorpus +//////////////////////////////////////////////////////////////// + +class ContextsCorpus : public Corpus { +  friend void read_callback(const ContextsLexer::PhraseContextsType&, void*); + +public: +    typedef boost::ptr_vector<Document>::const_iterator const_iterator; + +public: +    ContextsCorpus() {} +    virtual ~ContextsCorpus() {} + +    unsigned read_contexts(const std::string &filename); + +    TermBackoffPtr backoff_index() { +      return m_backoff; +    } + +private: +    TermBackoffPtr m_backoff; +    Dict m_dict; +}; + +#endif // _CONTEXTS_CORPUS_HH diff --git a/gi/pyp-topics/src/contexts_lexer.h b/gi/pyp-topics/src/contexts_lexer.h new file mode 100644 index 00000000..f9a1b21c --- /dev/null +++ b/gi/pyp-topics/src/contexts_lexer.h @@ -0,0 +1,22 @@ +#ifndef _CONTEXTS_LEXER_H_ +#define _CONTEXTS_LEXER_H_  + +#include <iostream> +#include <vector> +#include <string> + +#include "../../../decoder/dict.h"  + +struct ContextsLexer { +  typedef std::vector<std::string> Context; +  struct PhraseContextsType { +    std::string          phrase; +    std::vector<Context> contexts; +    std::vector<int>     counts; +  }; + +  typedef void (*ContextsCallback)(const PhraseContextsType& new_contexts, void* extra); +  static void ReadContexts(std::istream* in, ContextsCallback func, void* extra); +}; + +#endif diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l new file mode 100644 index 00000000..61189a73 --- /dev/null +++ b/gi/pyp-topics/src/contexts_lexer.l @@ -0,0 +1,110 @@ +%{ +#include "contexts_lexer.h" + +#include <string> +#include <iostream> +#include <sstream> +#include <cstring> +#include <cassert> + +int lex_line = 0; +std::istream* contextslex_stream = NULL; +ContextsLexer::ContextsCallback contexts_callback = NULL; +void* contexts_callback_extra = NULL; + +#undef YY_INPUT +#define YY_INPUT(buf, result, max_size) (result = contextslex_stream->read(buf, max_size).gcount()) + +#define YY_SKIP_YYWRAP 1 +int num_phrases = 0; +int yywrap() { return 1; } + +#define MAX_TOKEN_SIZE 255 +std::string contextslex_tmp_token(MAX_TOKEN_SIZE, '\0'); +ContextsLexer::PhraseContextsType current_contexts; + +#define MAX_CONTEXT_SIZE 255 +//std::string tmp_context[MAX_CONTEXT_SIZE]; +ContextsLexer::Context tmp_context; + + +void contextslex_reset() { +  current_contexts.phrase.clear(); +  current_contexts.contexts.clear(); +  current_contexts.counts.clear(); +  tmp_context.clear(); +} + +%} + +INT [\-+]?[0-9]+|inf|[\-+]inf + +%x CONTEXT COUNT COUNT_END +%% + +<INITIAL>[^\t]+	{  +    contextslex_reset(); +    current_contexts.phrase.assign(yytext, yyleng); +    BEGIN(CONTEXT); +  } +<INITIAL>\t	{  +    ;  +  } + +<INITIAL,CONTEXT,COUNT>\n	{ +    std::cerr << "ERROR: contexts_lexer.l: unexpected newline while trying to read phrase|context|count." << std::endl; +    abort(); +  } + +<CONTEXT>\|\|\|	{ +    current_contexts.contexts.push_back(tmp_context); +    tmp_context.clear(); +		BEGIN(COUNT); +	} +<CONTEXT>[^ \t]+	{  +		contextslex_tmp_token.assign(yytext, yyleng); +    tmp_context.push_back(contextslex_tmp_token); +  } +<CONTEXT>[ \t]+	{ ; } + +<COUNT>[ \t]+	{ ; } +<COUNT>C={INT} {  +		current_contexts.counts.push_back(atoi(yytext+2)); +    BEGIN(COUNT_END); +  } +<COUNT>.	{  +    std::cerr << "ERROR: contexts_lexer.l: unexpected content while reading count." << std::endl; +    abort(); +  } + +<COUNT_END>[ \t]+  { ; } +<COUNT_END>\|\|\|	{ +		BEGIN(CONTEXT); +  } +<COUNT_END>\n { +    //std::cerr << "READ:" << current_contexts.phrase << " with " << current_contexts.contexts.size()  +    //  << " contexts, and " << current_contexts.counts.size() << " counts." << std::endl; +		contexts_callback(current_contexts, contexts_callback_extra); +    current_contexts.phrase.clear(); +    current_contexts.contexts.clear(); +    current_contexts.counts.clear(); +		BEGIN(INITIAL); +  } +<COUNT_END>.  {  +		contextslex_tmp_token.assign(yytext, yyleng); +    std::cerr << "ERROR: contexts_lexer.l: unexpected content while looking for ||| closing count." << std::endl; +    abort(); +  } + +%% + +#include "../../../decoder/filelib.h"  + +void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) { +  lex_line = 1; +  contextslex_stream = in; +  contexts_callback_extra = extra, +  contexts_callback = func; +  yylex(); +} + diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc index 93910ea3..24b93a03 100644 --- a/gi/pyp-topics/src/corpus.cc +++ b/gi/pyp-topics/src/corpus.cc @@ -22,7 +22,7 @@ unsigned Corpus::read(const std::string &filename) {    string buf;    int token; -  unsigned count=0; +  unsigned doc_count=0;    while (getline(in, buf)) {      Document* doc(new Document());      istringstream ss(buf); @@ -39,14 +39,15 @@ unsigned Corpus::read(const std::string &filename) {      }      m_documents.push_back(doc); -    count++; +    doc_count++;    }    m_num_types = seen_types.size(); -  return count; +  return doc_count;  } +  //////////////////////////////////////////////////  // TestCorpus  ////////////////////////////////////////////////// diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh index c1b0ceda..243f7e2c 100644 --- a/gi/pyp-topics/src/corpus.hh +++ b/gi/pyp-topics/src/corpus.hh @@ -5,6 +5,7 @@  #include <string>  #include <map> +#include <boost/shared_ptr.hpp>  #include <boost/ptr_container/ptr_vector.hpp>  //////////////////////////////////////////////////////////////// @@ -100,4 +101,6 @@ protected:      int m_backoff_order;      std::vector<int> m_terms_at_order;  }; +typedef boost::shared_ptr<TermBackoff> TermBackoffPtr; +  #endif // _CORPUS_HH diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh index 4036985a..92d6f292 100644 --- a/gi/pyp-topics/src/pyp-topics.hh +++ b/gi/pyp-topics/src/pyp-topics.hh @@ -15,7 +15,7 @@ public:    typedef long double F;  public: -  PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1), m_backoff(0) {} +  PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1) {}    void sample(const Corpus& corpus, int samples);    int sample(const DocumentId& doc, const Term& term); @@ -27,6 +27,9 @@ public:      m_word_pyps.clear();      m_word_pyps.resize(m_backoff->order(), PYPs());    } +  void set_backoff(TermBackoffPtr backoff) { +    m_backoff = backoff; +  }    F prob(const Term& term, int topic, int level=0) const;    void decrement(const Term& term, int topic, int level=0); @@ -46,7 +49,7 @@ private:    PYPs m_document_pyps;    std::vector<PYPs> m_word_pyps; -  std::auto_ptr<TermBackoff> m_backoff; +  TermBackoffPtr m_backoff;  };  #endif // PYP_TOPICS_HH diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc index 0d107f11..01ada182 100644 --- a/gi/pyp-topics/src/train.cc +++ b/gi/pyp-topics/src/train.cc @@ -10,6 +10,7 @@  // Local  #include "pyp-topics.hh"  #include "corpus.hh" +#include "contexts_corpus.hh"  #include "gzstream.hh"  #include "mt19937ar.h" @@ -38,6 +39,7 @@ int main(int argc, char **argv)    options_description generic("Allowed options");    generic.add_options()      ("documents,d", value<string>(), "file containing the documents") +    ("contexts", value<string>(), "file containing the documents in phrase contexts format")      ("topics,t", value<int>()->default_value(50), "number of topics")      ("document-topics-out,o", value<string>(), "file to write the document topics to")      ("topic-words-out,w", value<string>(), "file to write the topic word distribution to") @@ -56,42 +58,55 @@ int main(int argc, char **argv)    }    notify(vm);    //////////////////////////////////////////////////////////////////////////////////////////// - -  if (vm.count("help")) {  -    cout << cmdline_options << "\n";  +  if (vm.count("contexts") > 0 && vm.count("documents") > 0) { +    cerr << "Only one of --documents or --contexts must be specified." << std::endl;       return 1;     } -  if (vm.count("documents") == 0) { +  if (vm.count("documents") == 0 && vm.count("contexts") == 0) {      cerr << "Please specify a file containing the documents." << endl;      cout << cmdline_options << "\n";       return 1;    } +  if (vm.count("help")) {  +    cout << cmdline_options << "\n";  +    return 1;  +  } +    // seed the random number generator    //mt_init_genrand(time(0)); +  PYPTopics model(vm["topics"].as<int>()); +    // read the data -  Corpus corpus; -  corpus.read(vm["documents"].as<string>()); +  boost::shared_ptr<Corpus> corpus; +  if (vm.count("documents") == 0 && vm.count("contexts") == 0) { +    corpus.reset(new Corpus); +    corpus->read(vm["documents"].as<string>()); -  // run the sampler -  PYPTopics model(vm["topics"].as<int>()); +    // read the backoff dictionary +    if (vm.count("backoff-paths")) +      model.set_backoff(vm["backoff-paths"].as<string>()); -  // read the backoff dictionary -  if (vm.count("backoff-paths")) -    model.set_backoff(vm["backoff-paths"].as<string>()); +  } +  else { +    boost::shared_ptr<ContextsCorpus> contexts_corpus(new ContextsCorpus); +    contexts_corpus->read_contexts(vm["contexts"].as<string>()); +    corpus = contexts_corpus; +    model.set_backoff(contexts_corpus->backoff_index()); +  }    // train the sampler -  model.sample(corpus, vm["samples"].as<int>()); +  model.sample(*corpus, vm["samples"].as<int>());    if (vm.count("document-topics-out")) {      ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());      //model.print_document_topics(documents_out);      int document_id=0; -    for (Corpus::const_iterator corpusIt=corpus.begin();  -         corpusIt != corpus.end(); ++corpusIt, ++document_id) { +    for (Corpus::const_iterator corpusIt=corpus->begin();  +         corpusIt != corpus->end(); ++corpusIt, ++document_id) {        std::vector<int> unique_terms;        for (Document::const_iterator docIt=corpusIt->begin();             docIt != corpusIt->end(); ++docIt) { | 
