From 1070f63e3978b9b26df46ad80fe1f40f2ce83a23 Mon Sep 17 00:00:00 2001
From: "philblunsom@gmail.com" <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Mon, 28 Jun 2010 15:01:17 +0000
Subject: Added contexts_corpus for reading text data files.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@36 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pyp-topics/src/Makefile.am        |   5 +-
 gi/pyp-topics/src/contexts_corpus.cc |  56 ++++++++++++++++++
 gi/pyp-topics/src/contexts_corpus.hh |  39 +++++++++++++
 gi/pyp-topics/src/contexts_lexer.h   |  22 +++++++
 gi/pyp-topics/src/contexts_lexer.l   | 110 +++++++++++++++++++++++++++++++++++
 gi/pyp-topics/src/corpus.cc          |   7 ++-
 gi/pyp-topics/src/corpus.hh          |   3 +
 gi/pyp-topics/src/pyp-topics.hh      |   7 ++-
 gi/pyp-topics/src/train.cc           |  43 +++++++++-----
 9 files changed, 272 insertions(+), 20 deletions(-)
 create mode 100644 gi/pyp-topics/src/contexts_corpus.cc
 create mode 100644 gi/pyp-topics/src/contexts_corpus.hh
 create mode 100644 gi/pyp-topics/src/contexts_lexer.h
 create mode 100644 gi/pyp-topics/src/contexts_lexer.l

diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am
index 51b47294..3d62a334 100644
--- a/gi/pyp-topics/src/Makefile.am
+++ b/gi/pyp-topics/src/Makefile.am
@@ -1,6 +1,9 @@
 bin_PROGRAMS = pyp-topics-train
 
-pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc
+contexts_lexer.cc: contexts_lexer.l
+	$(LEX) -s -CF -8 -o$@ $<
+
+pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc
 pyp_topics_train_LDADD = -lz
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
new file mode 100644
index 00000000..0b3ec644
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -0,0 +1,56 @@
+#include <sstream>
+#include <iostream>
+#include <cassert>
+
+#include "contexts_corpus.hh"
+#include "gzstream.hh"
+#include "contexts_lexer.h"
+
+using namespace std;
+
+//////////////////////////////////////////////////
+// ContextsCorpus
+//////////////////////////////////////////////////
+
+void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
+  assert(new_contexts.contexts.size() == new_contexts.counts.size());
+
+  ContextsCorpus* corpus_ptr = static_cast<ContextsCorpus*>(extra);
+  Document* doc(new Document());
+
+  //std::cout << "READ: " << new_contexts.phrase << "\t";
+
+  for (int i=0; i < new_contexts.contexts.size(); ++i) {
+    std::string context_str = "";
+    for (ContextsLexer::Context::const_iterator it=new_contexts.contexts[i].begin();
+         it != new_contexts.contexts[i].end(); ++it) {
+      //std::cout << *it << " ";
+      if (it != new_contexts.contexts[i].begin())
+        context_str += "__";
+      context_str += *it;
+    }
+
+    WordID id = corpus_ptr->m_dict.Convert(context_str);
+    int count = new_contexts.counts[i];
+    for (int j=0; j<count; ++j)
+      doc->push_back(id);
+    corpus_ptr->m_num_terms += count;
+
+    //std::cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
+  }
+  //std::cout << std::endl;
+
+  corpus_ptr->m_documents.push_back(doc);
+}
+
+unsigned ContextsCorpus::read_contexts(const std::string &filename) {
+  m_num_terms = 0;
+  m_num_types = 0;
+
+  igzstream in(filename.c_str());
+  ContextsLexer::ReadContexts(&in, read_callback, this);
+
+  m_num_types = m_dict.max();
+
+  return m_documents.size();
+}
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
new file mode 100644
index 00000000..e680cef5
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -0,0 +1,39 @@
+#ifndef _CONTEXTS_CORPUS_HH
+#define _CONTEXTS_CORPUS_HH
+
+#include <vector>
+#include <string>
+#include <map>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+#include "corpus.hh"
+#include "contexts_lexer.h"
+#include "../../../decoder/dict.h"
+
+////////////////////////////////////////////////////////////////
+// ContextsCorpus
+////////////////////////////////////////////////////////////////
+
+class ContextsCorpus : public Corpus {
+  friend void read_callback(const ContextsLexer::PhraseContextsType&, void*);
+
+public:
+  typedef boost::ptr_vector<Document>::const_iterator const_iterator;
+
+public:
+  ContextsCorpus() {}
+  virtual ~ContextsCorpus() {}
+
+  unsigned read_contexts(const std::string &filename);
+
+  TermBackoffPtr backoff_index() {
+    return m_backoff;
+  }
+
+private:
+  TermBackoffPtr m_backoff;
+  Dict m_dict;
+};
+
+#endif // _CONTEXTS_CORPUS_HH
diff --git a/gi/pyp-topics/src/contexts_lexer.h b/gi/pyp-topics/src/contexts_lexer.h
new file mode 100644
index 00000000..f9a1b21c
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_lexer.h
@@ -0,0 +1,22 @@
+#ifndef _CONTEXTS_LEXER_H_
+#define _CONTEXTS_LEXER_H_
+
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "../../../decoder/dict.h"
+
+struct ContextsLexer {
+  typedef std::vector<std::string> Context;
+  struct PhraseContextsType {
+    std::string phrase;
+    std::vector<Context> contexts;
+    std::vector<int>     counts;
+  };
+
+  typedef void (*ContextsCallback)(const PhraseContextsType& new_contexts, void* extra);
+  static void ReadContexts(std::istream* in, ContextsCallback func, void* extra);
+};
+
+#endif
diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l
new file mode 100644
index 00000000..61189a73
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_lexer.l
@@ -0,0 +1,110 @@
+%{
+#include "contexts_lexer.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cstdlib>
+
+int lex_line = 0;
+std::istream* contextslex_stream = NULL;
+ContextsLexer::ContextsCallback contexts_callback = NULL;
+void* contexts_callback_extra = NULL;
+
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = contextslex_stream->read(buf, max_size).gcount())
+
+#define YY_SKIP_YYWRAP 1
+int num_phrases = 0;
+int yywrap() { return 1; }
+
+#define MAX_TOKEN_SIZE 255
+std::string contextslex_tmp_token(MAX_TOKEN_SIZE, '\0');
+ContextsLexer::PhraseContextsType current_contexts;
+
+#define MAX_CONTEXT_SIZE 255
+//std::string tmp_context[MAX_CONTEXT_SIZE];
+ContextsLexer::Context tmp_context;
+
+
+void contextslex_reset() {
+  current_contexts.phrase.clear();
+  current_contexts.contexts.clear();
+  current_contexts.counts.clear();
+  tmp_context.clear();
+}
+
+%}
+
+INT [\-+]?[0-9]+|inf|[\-+]inf
+
+%x CONTEXT COUNT COUNT_END
+%%
+
+<INITIAL>[^\t]+ {
+		contextslex_reset();
+		current_contexts.phrase.assign(yytext, yyleng);
+		BEGIN(CONTEXT);
+	}
+<INITIAL>\t {
+		;
+	}
+<INITIAL,CONTEXT,COUNT>\n {
+		std::cerr << "ERROR: contexts_lexer.l: unexpected newline while trying to read phrase|context|count." << std::endl;
+		abort();
+	}
+
+<CONTEXT>\|\|\| {
+		current_contexts.contexts.push_back(tmp_context);
+		tmp_context.clear();
+		BEGIN(COUNT);
+	}
+<CONTEXT>[^ \t]+ {
+		contextslex_tmp_token.assign(yytext, yyleng);
+		tmp_context.push_back(contextslex_tmp_token);
+	}
+<CONTEXT>[ \t]+ { ; }
+
+<COUNT>[ \t]+ { ; }
+<COUNT>C={INT} {
+		current_contexts.counts.push_back(atoi(yytext+2));
+		BEGIN(COUNT_END);
+	}
+<COUNT>. {
+		std::cerr << "ERROR: contexts_lexer.l: unexpected content while reading count."
+		          << std::endl;
+		abort();
+	}
+
+<COUNT_END>[ \t]+ { ; }
+<COUNT_END>\|\|\| {
+		BEGIN(CONTEXT);
+	}
+<COUNT_END>\n {
+		//std::cerr << "READ:" << current_contexts.phrase << " with " << current_contexts.contexts.size()
+		//          << " contexts, and " << current_contexts.counts.size() << " counts." << std::endl;
+		contexts_callback(current_contexts, contexts_callback_extra);
+		current_contexts.phrase.clear();
+		current_contexts.contexts.clear();
+		current_contexts.counts.clear();
+		BEGIN(INITIAL);
+	}
+<COUNT_END>. {
+		contextslex_tmp_token.assign(yytext, yyleng);
+		std::cerr << "ERROR: contexts_lexer.l: unexpected content while looking for ||| closing count." << std::endl;
+		abort();
+	}
+
+%%
+
+#include "../../../decoder/filelib.h"
+
+void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) {
+  lex_line = 1;
+  contextslex_stream = in;
+  contexts_callback_extra = extra;
+  contexts_callback = func;
+  yylex();
+}
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc
index 93910ea3..24b93a03 100644
--- a/gi/pyp-topics/src/corpus.cc
+++ b/gi/pyp-topics/src/corpus.cc
@@ -22,7 +22,7 @@ unsigned Corpus::read(const std::string &filename) {
   string buf;
   int token;
-  unsigned count=0;
+  unsigned doc_count=0;
   while (getline(in, buf)) {
     Document* doc(new Document());
     istringstream ss(buf);
@@ -39,14 +39,15 @@
     }
     m_documents.push_back(doc);
-    count++;
+    doc_count++;
   }
 
   m_num_types = seen_types.size();
 
-  return count;
+  return doc_count;
 }
 
+
 //////////////////////////////////////////////////
 // TestCorpus
 //////////////////////////////////////////////////
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
index c1b0ceda..243f7e2c 100644
--- a/gi/pyp-topics/src/corpus.hh
+++ b/gi/pyp-topics/src/corpus.hh
@@ -5,6 +5,7 @@
 #include <vector>
 #include <string>
 
+#include <boost/shared_ptr.hpp>
 #include <boost/ptr_container/ptr_vector.hpp>
 
 ////////////////////////////////////////////////////////////////
@@ -100,4 +101,6 @@ protected:
   int m_backoff_order;
   std::vector<int> m_terms_at_order;
 };
+typedef boost::shared_ptr<TermBackoff> TermBackoffPtr;
+
 #endif // _CORPUS_HH
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index 4036985a..92d6f292 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -15,7 +15,7 @@ public:
   typedef long double F;
 
 public:
-  PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1), m_backoff(0) {}
+  PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1) {}
 
   void sample(const Corpus& corpus, int samples);
   int sample(const DocumentId& doc, const Term& term);
@@ -27,6 +27,9 @@ public:
     m_word_pyps.clear();
     m_word_pyps.resize(m_backoff->order(), PYPs());
   }
+  void set_backoff(TermBackoffPtr backoff) {
+    m_backoff = backoff;
+  }
 
   F prob(const Term& term, int topic, int level=0) const;
   void decrement(const Term& term, int topic, int level=0);
@@ -46,7 +49,7 @@ private:
   PYPs m_document_pyps;
   std::vector<PYPs> m_word_pyps;
 
-  std::auto_ptr<TermBackoff> m_backoff;
+  TermBackoffPtr m_backoff;
 };
 
 #endif // PYP_TOPICS_HH
diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc
index 0d107f11..01ada182 100644
--- a/gi/pyp-topics/src/train.cc
+++ b/gi/pyp-topics/src/train.cc
@@ -10,6 +10,7 @@
 // Local
 #include "pyp-topics.hh"
 #include "corpus.hh"
+#include "contexts_corpus.hh"
 #include "gzstream.hh"
 #include "mt19937ar.h"
@@ -38,6 +39,7 @@ int main(int argc, char **argv)
     options_description generic("Allowed options");
     generic.add_options()
       ("documents,d", value<string>(), "file containing the documents")
+      ("contexts", value<string>(), "file containing the documents in phrase contexts format")
       ("topics,t", value<int>()->default_value(50), "number of topics")
       ("document-topics-out,o", value<string>(), "file to write the document topics to")
       ("topic-words-out,w", value<string>(), "file to write the topic word distribution to")
@@ -56,42 +58,55 @@ int main(int argc, char **argv)
     }
     notify(vm);
     ////////////////////////////////////////////////////////////////////////////////////////////
 
-    if (vm.count("help")) {
-      cout << cmdline_options << "\n";
+    if (vm.count("contexts") > 0 && vm.count("documents") > 0) {
+      cerr << "Only one of --documents or --contexts may be specified." << std::endl;
       return 1;
     }
 
-    if (vm.count("documents") == 0) {
+    if (vm.count("documents") == 0 && vm.count("contexts") == 0) {
       cerr << "Please specify a file containing the documents." << endl;
       cout << cmdline_options << "\n";
       return 1;
     }
+
+    if (vm.count("help")) {
+      cout << cmdline_options << "\n";
+      return 1;
+    }
 
     // seed the random number generator
     //mt_init_genrand(time(0));
 
+    PYPTopics model(vm["topics"].as<int>());
+
     // read the data
-    Corpus corpus;
-    corpus.read(vm["documents"].as<string>());
+    boost::shared_ptr<Corpus> corpus;
+    if (vm.count("documents")) {
+      corpus.reset(new Corpus);
+      corpus->read(vm["documents"].as<string>());
 
-    // run the sampler
-    PYPTopics model(vm["topics"].as<int>());
+      // read the backoff dictionary
+      if (vm.count("backoff-paths"))
+        model.set_backoff(vm["backoff-paths"].as<string>());
 
-    // read the backoff dictionary
-    if (vm.count("backoff-paths"))
-      model.set_backoff(vm["backoff-paths"].as<string>());
+    }
+    else {
+      boost::shared_ptr<ContextsCorpus> contexts_corpus(new ContextsCorpus);
+      contexts_corpus->read_contexts(vm["contexts"].as<string>());
+      corpus = contexts_corpus;
+      model.set_backoff(contexts_corpus->backoff_index());
+    }
 
     // train the sampler
-    model.sample(corpus, vm["samples"].as<int>());
+    model.sample(*corpus, vm["samples"].as<int>());
 
     if (vm.count("document-topics-out")) {
       ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
       //model.print_document_topics(documents_out);
 
       int document_id=0;
-      for (Corpus::const_iterator corpusIt=corpus.begin();
-           corpusIt != corpus.end(); ++corpusIt, ++document_id) {
+      for (Corpus::const_iterator corpusIt=corpus->begin();
+           corpusIt != corpus->end(); ++corpusIt, ++document_id) {
         std::vector<int> unique_terms;
         for (Document::const_iterator docIt=corpusIt->begin();
              docIt != corpusIt->end(); ++docIt) {
-- 
cgit v1.2.3
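
Addendum (usage sketch): the new lexer reads one phrase per line, in the form of the phrase itself, a tab, then an alternating "|||"-separated sequence of context token lists and C=<n> counts, as encoded in the flex rules above. The driver below is a sketch and not part of this commit; the sample input line is invented, and the program only exists to show what ContextsLexer::ReadContexts hands to its callback. It would be compiled and linked together with the flex-generated contexts_lexer.cc and its dict.h dependency.

#include <iostream>
#include <sstream>

#include "contexts_lexer.h"

// Invoked once per completed input line with the parsed phrase and its
// parallel context/count vectors.
static void print_callback(const ContextsLexer::PhraseContextsType& pc, void* /*extra*/) {
  std::cout << pc.phrase << ": " << pc.contexts.size() << " contexts, "
            << pc.counts.size() << " counts" << std::endl;
}

int main() {
  // One line in the expected format: phrase<TAB>tokens ||| C=2 ||| tokens ||| C=1
  std::istringstream in("the dog\tsaw the ||| C=2 ||| a big ||| C=1\n");
  ContextsLexer::ReadContexts(&in, print_callback, NULL);  // extra pointer unused here
  return 0;
}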