author     philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-28 15:01:17 +0000
committer  philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-06-28 15:01:17 +0000
commit     1070f63e3978b9b26df46ad80fe1f40f2ce83a23 (patch)
tree       f87f4ddfec6fa1a98d03988bdf6f5d4a372f2cc9
parent     f7e25929adebd260f7e7d0bf12e05a37abbe1779 (diff)
Added contexts_corpus for reading text data files.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@36 ec762483-ff6d-05da-a07a-a48fb63a330f
-rw-r--r--  gi/pyp-topics/src/Makefile.am          5
-rw-r--r--  gi/pyp-topics/src/contexts_corpus.cc  57
-rw-r--r--  gi/pyp-topics/src/contexts_corpus.hh  39
-rw-r--r--  gi/pyp-topics/src/contexts_lexer.h    22
-rw-r--r--  gi/pyp-topics/src/contexts_lexer.l   110
-rw-r--r--  gi/pyp-topics/src/corpus.cc            7
-rw-r--r--  gi/pyp-topics/src/corpus.hh            3
-rw-r--r--  gi/pyp-topics/src/pyp-topics.hh        7
-rw-r--r--  gi/pyp-topics/src/train.cc            43
9 files changed, 273 insertions(+), 20 deletions(-)
diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am
index 51b47294..3d62a334 100644
--- a/gi/pyp-topics/src/Makefile.am
+++ b/gi/pyp-topics/src/Makefile.am
@@ -1,6 +1,9 @@
bin_PROGRAMS = pyp-topics-train
-pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc
+contexts_lexer.cc: contexts_lexer.l
+ $(LEX) -s -CF -8 -o$@ $<
+
+pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc
pyp_topics_train_LDADD = -lz
AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops
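
A note on the new flex rule: -s makes the generated scanner treat input that matches no rule as an error rather than silently echoing it, -CF selects the full, uncompressed table representation (a larger but faster scanner), and -8 forces an 8-bit scanner so arbitrary bytes in the data are handled. The generated contexts_lexer.cc is then compiled into pyp-topics-train via pyp_topics_train_SOURCES.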
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
new file mode 100644
index 00000000..0b3ec644
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -0,0 +1,57 @@
+#include <sstream>
+#include <iostream>
+#include <set>
+#include <cassert>  // assert() is used in read_callback below
+
+#include "contexts_corpus.hh"
+#include "gzstream.hh"
+#include "contexts_lexer.h"
+
+using namespace std;
+
+//////////////////////////////////////////////////
+// ContextsCorpus
+//////////////////////////////////////////////////
+
+void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
+ assert(new_contexts.contexts.size() == new_contexts.counts.size());
+
+ ContextsCorpus* corpus_ptr = static_cast<ContextsCorpus*>(extra);
+ Document* doc(new Document());
+
+ //std::cout << "READ: " << new_contexts.phrase << "\t";
+
+ for (int i=0; i < new_contexts.contexts.size(); ++i) {
+ std::string context_str = "";
+ for (ContextsLexer::Context::const_iterator it=new_contexts.contexts[i].begin();
+ it != new_contexts.contexts[i].end(); ++it) {
+ //std::cout << *it << " ";
+ if (it != new_contexts.contexts[i].begin())
+ context_str += "__";
+ context_str += *it;
+ }
+
+ WordID id = corpus_ptr->m_dict.Convert(context_str);
+ int count = new_contexts.counts[i];
+    for (int j=0; j<count; ++j)
+ doc->push_back(id);
+ corpus_ptr->m_num_terms += count;
+
+ //std::cout << context_str << " (" << id << ") ||| C=" << count << " ||| ";
+ }
+ //std::cout << std::endl;
+
+ corpus_ptr->m_documents.push_back(doc);
+}
+
+unsigned ContextsCorpus::read_contexts(const std::string &filename) {
+ m_num_terms = 0;
+ m_num_types = 0;
+
+ igzstream in(filename.c_str());
+ ContextsLexer::ReadContexts(&in, read_callback, this);
+
+ m_num_types = m_dict.max();
+
+ return m_documents.size();
+}
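
For orientation, a minimal driver of the new class might look as follows; this is a sketch, with main() and the file name being hypothetical, while ContextsCorpus and read_contexts come from this commit:

    #include <iostream>
    #include "contexts_corpus.hh"

    int main() {
      ContextsCorpus corpus;
      // read_contexts opens the file with igzstream, so gzipped input works directly.
      unsigned num_docs = corpus.read_contexts("phrases.contexts.gz");
      std::cout << "read " << num_docs << " documents" << std::endl;
      return 0;
    }

Note that read_callback flattens each context into a single dictionary term by joining its tokens with "__", so a document here is a bag of context strings for one phrase.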
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
new file mode 100644
index 00000000..e680cef5
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -0,0 +1,39 @@
+#ifndef _CONTEXTS_CORPUS_HH
+#define _CONTEXTS_CORPUS_HH
+
+#include <vector>
+#include <string>
+#include <map>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+#include "corpus.hh"
+#include "contexts_lexer.h"
+#include "../../../decoder/dict.h"
+
+////////////////////////////////////////////////////////////////
+// ContextsCorpus
+////////////////////////////////////////////////////////////////
+
+class ContextsCorpus : public Corpus {
+ friend void read_callback(const ContextsLexer::PhraseContextsType&, void*);
+
+public:
+ typedef boost::ptr_vector<Document>::const_iterator const_iterator;
+
+public:
+ ContextsCorpus() {}
+ virtual ~ContextsCorpus() {}
+
+ unsigned read_contexts(const std::string &filename);
+
+ TermBackoffPtr backoff_index() {
+ return m_backoff;
+ }
+
+private:
+ TermBackoffPtr m_backoff;
+ Dict m_dict;
+};
+
+#endif // _CONTEXTS_CORPUS_HH
diff --git a/gi/pyp-topics/src/contexts_lexer.h b/gi/pyp-topics/src/contexts_lexer.h
new file mode 100644
index 00000000..f9a1b21c
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_lexer.h
@@ -0,0 +1,22 @@
+#ifndef _CONTEXTS_LEXER_H_
+#define _CONTEXTS_LEXER_H_
+
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "../../../decoder/dict.h"
+
+struct ContextsLexer {
+ typedef std::vector<std::string> Context;
+ struct PhraseContextsType {
+ std::string phrase;
+ std::vector<Context> contexts;
+ std::vector<int> counts;
+ };
+
+ typedef void (*ContextsCallback)(const PhraseContextsType& new_contexts, void* extra);
+ static void ReadContexts(std::istream* in, ContextsCallback func, void* extra);
+};
+
+#endif
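
The interface is deliberately C-style: ReadContexts drives the scanner and hands each parsed phrase entry to a plain function pointer, threading caller state through the void* extra argument. A minimal sketch of a custom consumer (the function and file names are hypothetical; the types and ReadContexts come from this commit):

    #include <iostream>
    #include "contexts_lexer.h"
    #include "gzstream.hh"

    static void count_contexts(const ContextsLexer::PhraseContextsType& pc, void* extra) {
      int* total = static_cast<int*>(extra);  // caller state arrives via 'extra'
      *total += static_cast<int>(pc.contexts.size());
      std::cerr << pc.phrase << ": " << pc.contexts.size() << " contexts" << std::endl;
    }

    int main() {
      int total = 0;
      igzstream in("phrases.contexts.gz");  // hypothetical input file
      ContextsLexer::ReadContexts(&in, count_contexts, &total);
      std::cerr << "total contexts: " << total << std::endl;
      return 0;
    }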
diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l
new file mode 100644
index 00000000..61189a73
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_lexer.l
@@ -0,0 +1,110 @@
+%{
+#include "contexts_lexer.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cassert>
+
+int lex_line = 0;
+std::istream* contextslex_stream = NULL;
+ContextsLexer::ContextsCallback contexts_callback = NULL;
+void* contexts_callback_extra = NULL;
+
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = contextslex_stream->read(buf, max_size).gcount())
+
+#define YY_SKIP_YYWRAP 1
+int num_phrases = 0;
+int yywrap() { return 1; }
+
+#define MAX_TOKEN_SIZE 255
+std::string contextslex_tmp_token(MAX_TOKEN_SIZE, '\0');
+ContextsLexer::PhraseContextsType current_contexts;
+
+#define MAX_CONTEXT_SIZE 255
+//std::string tmp_context[MAX_CONTEXT_SIZE];
+ContextsLexer::Context tmp_context;
+
+
+void contextslex_reset() {
+ current_contexts.phrase.clear();
+ current_contexts.contexts.clear();
+ current_contexts.counts.clear();
+ tmp_context.clear();
+}
+
+%}
+
+INT [\-+]?[0-9]+|inf|[\-+]inf
+
+%x CONTEXT COUNT COUNT_END
+%%
+
+<INITIAL>[^\t]+ {
+ contextslex_reset();
+ current_contexts.phrase.assign(yytext, yyleng);
+ BEGIN(CONTEXT);
+ }
+<INITIAL>\t {
+ ;
+ }
+
+<INITIAL,CONTEXT,COUNT>\n {
+ std::cerr << "ERROR: contexts_lexer.l: unexpected newline while trying to read phrase|context|count." << std::endl;
+ abort();
+ }
+
+<CONTEXT>\|\|\| {
+ current_contexts.contexts.push_back(tmp_context);
+ tmp_context.clear();
+ BEGIN(COUNT);
+ }
+<CONTEXT>[^ \t]+ {
+ contextslex_tmp_token.assign(yytext, yyleng);
+ tmp_context.push_back(contextslex_tmp_token);
+ }
+<CONTEXT>[ \t]+ { ; }
+
+<COUNT>[ \t]+ { ; }
+<COUNT>C={INT} {
+ current_contexts.counts.push_back(atoi(yytext+2));
+ BEGIN(COUNT_END);
+ }
+<COUNT>. {
+ std::cerr << "ERROR: contexts_lexer.l: unexpected content while reading count." << std::endl;
+ abort();
+ }
+
+<COUNT_END>[ \t]+ { ; }
+<COUNT_END>\|\|\| {
+ BEGIN(CONTEXT);
+ }
+<COUNT_END>\n {
+ //std::cerr << "READ:" << current_contexts.phrase << " with " << current_contexts.contexts.size()
+ // << " contexts, and " << current_contexts.counts.size() << " counts." << std::endl;
+ contexts_callback(current_contexts, contexts_callback_extra);
+ current_contexts.phrase.clear();
+ current_contexts.contexts.clear();
+ current_contexts.counts.clear();
+ BEGIN(INITIAL);
+ }
+<COUNT_END>. {
+ contextslex_tmp_token.assign(yytext, yyleng);
+ std::cerr << "ERROR: contexts_lexer.l: unexpected content while looking for ||| closing count." << std::endl;
+ abort();
+ }
+
+%%
+
+#include "../../../decoder/filelib.h"
+
+void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) {
+ lex_line = 1;
+ contextslex_stream = in;
+  contexts_callback_extra = extra;
+  contexts_callback = func;
+ yylex();
+}
+
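
Reading the input format off the scanner states: INITIAL consumes everything up to the first tab as the phrase, CONTEXT collects whitespace-separated tokens until a ||| delimiter, COUNT expects a C=<int> field, and COUNT_END either returns to CONTEXT on another ||| or fires the callback on a newline. A well-formed line therefore looks like this (illustrative data; <TAB> marks a literal tab):

    the house<TAB>in the ||| C=3 ||| on the ||| C=1

Here "the house" is the phrase, the context (in, the) was seen 3 times and (on, the) once; read_callback in contexts_corpus.cc interns them as the terms in__the and on__the.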
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc
index 93910ea3..24b93a03 100644
--- a/gi/pyp-topics/src/corpus.cc
+++ b/gi/pyp-topics/src/corpus.cc
@@ -22,7 +22,7 @@ unsigned Corpus::read(const std::string &filename) {
string buf;
int token;
- unsigned count=0;
+ unsigned doc_count=0;
while (getline(in, buf)) {
Document* doc(new Document());
istringstream ss(buf);
@@ -39,14 +39,15 @@ unsigned Corpus::read(const std::string &filename) {
}
m_documents.push_back(doc);
- count++;
+ doc_count++;
}
m_num_types = seen_types.size();
- return count;
+ return doc_count;
}
+
//////////////////////////////////////////////////
// TestCorpus
//////////////////////////////////////////////////
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
index c1b0ceda..243f7e2c 100644
--- a/gi/pyp-topics/src/corpus.hh
+++ b/gi/pyp-topics/src/corpus.hh
@@ -5,6 +5,7 @@
#include <string>
#include <map>
+#include <boost/shared_ptr.hpp>
#include <boost/ptr_container/ptr_vector.hpp>
////////////////////////////////////////////////////////////////
@@ -100,4 +101,6 @@ protected:
int m_backoff_order;
std::vector<int> m_terms_at_order;
};
+typedef boost::shared_ptr<TermBackoff> TermBackoffPtr;
+
#endif // _CORPUS_HH
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index 4036985a..92d6f292 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -15,7 +15,7 @@ public:
typedef long double F;
public:
- PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1), m_backoff(0) {}
+ PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1) {}
void sample(const Corpus& corpus, int samples);
int sample(const DocumentId& doc, const Term& term);
@@ -27,6 +27,9 @@ public:
m_word_pyps.clear();
m_word_pyps.resize(m_backoff->order(), PYPs());
}
+ void set_backoff(TermBackoffPtr backoff) {
+ m_backoff = backoff;
+ }
F prob(const Term& term, int topic, int level=0) const;
void decrement(const Term& term, int topic, int level=0);
@@ -46,7 +49,7 @@ private:
PYPs m_document_pyps;
std::vector<PYPs> m_word_pyps;
- std::auto_ptr<TermBackoff> m_backoff;
+ TermBackoffPtr m_backoff;
};
#endif // PYP_TOPICS_HH
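
The switch from std::auto_ptr to the shared TermBackoffPtr (typedef'd in corpus.hh above) means the backoff table no longer has a single owner: the new set_backoff(TermBackoffPtr) overload lets a ContextsCorpus hand its backoff_index() to the model while both sides keep a reference, which auto_ptr's ownership-transferring copies would not allow.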
diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc
index 0d107f11..01ada182 100644
--- a/gi/pyp-topics/src/train.cc
+++ b/gi/pyp-topics/src/train.cc
@@ -10,6 +10,7 @@
// Local
#include "pyp-topics.hh"
#include "corpus.hh"
+#include "contexts_corpus.hh"
#include "gzstream.hh"
#include "mt19937ar.h"
@@ -38,6 +39,7 @@ int main(int argc, char **argv)
options_description generic("Allowed options");
generic.add_options()
("documents,d", value<string>(), "file containing the documents")
+ ("contexts", value<string>(), "file containing the documents in phrase contexts format")
("topics,t", value<int>()->default_value(50), "number of topics")
("document-topics-out,o", value<string>(), "file to write the document topics to")
("topic-words-out,w", value<string>(), "file to write the topic word distribution to")
@@ -56,42 +58,55 @@ int main(int argc, char **argv)
}
notify(vm);
////////////////////////////////////////////////////////////////////////////////////////////
-
- if (vm.count("help")) {
- cout << cmdline_options << "\n";
+ if (vm.count("contexts") > 0 && vm.count("documents") > 0) {
+ cerr << "Only one of --documents or --contexts must be specified." << std::endl;
return 1;
}
- if (vm.count("documents") == 0) {
+ if (vm.count("documents") == 0 && vm.count("contexts") == 0) {
cerr << "Please specify a file containing the documents." << endl;
cout << cmdline_options << "\n";
return 1;
}
+ if (vm.count("help")) {
+ cout << cmdline_options << "\n";
+ return 1;
+ }
+
// seed the random number generator
//mt_init_genrand(time(0));
+ PYPTopics model(vm["topics"].as<int>());
+
// read the data
- Corpus corpus;
- corpus.read(vm["documents"].as<string>());
+ boost::shared_ptr<Corpus> corpus;
+ if (vm.count("documents") == 0 && vm.count("contexts") == 0) {
+ corpus.reset(new Corpus);
+ corpus->read(vm["documents"].as<string>());
- // run the sampler
- PYPTopics model(vm["topics"].as<int>());
+ // read the backoff dictionary
+ if (vm.count("backoff-paths"))
+ model.set_backoff(vm["backoff-paths"].as<string>());
- // read the backoff dictionary
- if (vm.count("backoff-paths"))
- model.set_backoff(vm["backoff-paths"].as<string>());
+ }
+ else {
+ boost::shared_ptr<ContextsCorpus> contexts_corpus(new ContextsCorpus);
+ contexts_corpus->read_contexts(vm["contexts"].as<string>());
+ corpus = contexts_corpus;
+ model.set_backoff(contexts_corpus->backoff_index());
+ }
// train the sampler
- model.sample(corpus, vm["samples"].as<int>());
+ model.sample(*corpus, vm["samples"].as<int>());
if (vm.count("document-topics-out")) {
ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
//model.print_document_topics(documents_out);
int document_id=0;
- for (Corpus::const_iterator corpusIt=corpus.begin();
- corpusIt != corpus.end(); ++corpusIt, ++document_id) {
+ for (Corpus::const_iterator corpusIt=corpus->begin();
+ corpusIt != corpus->end(); ++corpusIt, ++document_id) {
std::vector<int> unique_terms;
for (Document::const_iterator docIt=corpusIt->begin();
docIt != corpusIt->end(); ++docIt) {