Added contexts_corpus for reading text data files.

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@36 ec762483-ff6d-05da-a07a-a48fb63a330f
author: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 15:01:17 +0000
committer: philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-06-28 15:01:17 +0000
commit: 1d089b02eff4fa8837faecf99021f624d8845e5d (patch)
tree: b6e3d20094514749c37485e154117871cdc8696f
parent: 088725c4708e83343154d1bed9dee18286446eaf (diff)
9 files changed, 272 insertions, 20 deletions
diff --git a/gi/pyp-topics/src/Makefile.am b/gi/pyp-topics/src/Makefile.am
index 51b47294..3d62a334 100644
--- a/gi/pyp-topics/src/Makefile.am
+++ b/gi/pyp-topics/src/Makefile.am
@@ -1,6 +1,9 @@
 bin_PROGRAMS = pyp-topics-train
 
-pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc
+contexts_lexer.cc: contexts_lexer.l
+	$(LEX) -s -CF -8 -o$@ $<
+
+pyp_topics_train_SOURCES = corpus.cc gammadist.c gzstream.cc mt19937ar.c pyp-topics.cc train.cc contexts_lexer.cc contexts_corpus.cc
 pyp_topics_train_LDADD = -lz
 
 AM_CPPFLAGS = -W -Wall -Wno-sign-compare -funroll-loops
diff --git a/gi/pyp-topics/src/contexts_corpus.cc b/gi/pyp-topics/src/contexts_corpus.cc
new file mode 100644
index 00000000..0b3ec644
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.cc
@@ -0,0 +1,56 @@
+#include <sstream>
+#include <iostream>
+#include <set>
+
+#include "contexts_corpus.hh"
+#include "gzstream.hh"
+#include "contexts_lexer.h"
+
+using namespace std;
+
+//////////////////////////////////////////////////
+// ContextsCorpus
+//////////////////////////////////////////////////
+
+// Lexer callback: turns one parsed phrase record into a Document and appends
+// it to the ContextsCorpus passed through `extra`.
+//
+//   new_contexts  contexts read for one phrase plus their counts; the two
+//                 vectors must have the same length (asserted below).
+//   extra         must point to the ContextsCorpus being filled.
+//
+// The tokens of each context are joined with "__" and converted to a WordID
+// via the corpus dictionary. That id is appended to the document once per
+// occurrence and m_num_terms grows by the same amount.
+void read_callback(const ContextsLexer::PhraseContextsType& new_contexts, void* extra) {
+  assert(new_contexts.contexts.size() == new_contexts.counts.size());
+
+  ContextsCorpus* corpus_ptr = static_cast<ContextsCorpus*>(extra);
+  Document* doc(new Document());
+
+  for (size_t c = 0; c < new_contexts.contexts.size(); ++c) {
+    const ContextsLexer::Context& context = new_contexts.contexts[c];
+
+    std::string context_str;
+    for (ContextsLexer::Context::const_iterator it = context.begin();
+         it != context.end(); ++it) {
+      if (it != context.begin())
+        context_str += "__";
+      context_str += *it;
+    }
+
+    // The context is registered with the dictionary even when its count
+    // turns out not to be positive.
+    WordID id = corpus_ptr->m_dict.Convert(context_str);
+
+    // The lexer's count pattern admits a sign and "inf" (which atoi reads as
+    // 0). A non-positive count contributes no tokens, so it must not be
+    // allowed to shrink m_num_terms either.
+    const int count = new_contexts.counts[c];
+    if (count <= 0)
+      continue;
+
+    for (int n = 0; n < count; ++n)
+      doc->push_back(id);
+    corpus_ptr->m_num_terms += count;
+  }
+
+  corpus_ptr->m_documents.push_back(doc);
+}
+
+// Loads a phrase-contexts file into this corpus.
+//
+// Zeroes the term and type counters, opens `filename` through igzstream and
+// feeds it to ContextsLexer::ReadContexts with read_callback as the handler
+// and `this` as its extra argument. Afterwards m_num_types is taken from
+// m_dict.max(). Returns m_documents.size().
+unsigned ContextsCorpus::read_contexts(const std::string &filename) {
+  m_num_types = 0;
+  m_num_terms = 0;
+
+  igzstream contexts_in(filename.c_str());
+  ContextsLexer::ReadContexts(&contexts_in, read_callback, this);
+
+  m_num_types = m_dict.max();
+  return m_documents.size();
+}
diff --git a/gi/pyp-topics/src/contexts_corpus.hh b/gi/pyp-topics/src/contexts_corpus.hh
new file mode 100644
index 00000000..e680cef5
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_corpus.hh
@@ -0,0 +1,39 @@
+#ifndef _CONTEXTS_CORPUS_HH
+#define _CONTEXTS_CORPUS_HH
+
+#include <vector>
+#include <string>
+#include <map>
+
+#include <boost/ptr_container/ptr_vector.hpp>
+
+#include "corpus.hh"
+#include "contexts_lexer.h"
+#include "../../../decoder/dict.h"
+
+////////////////////////////////////////////////////////////////
+// ContextsCorpus
+////////////////////////////////////////////////////////////////
+
+// Corpus subclass that is filled from a phrase-contexts file by
+// read_contexts(); context strings are mapped to ids through m_dict.
+class ContextsCorpus : public Corpus {
+public:
+    typedef boost::ptr_vector<Document>::const_iterator const_iterator;
+
+    ContextsCorpus() {}
+    virtual ~ContextsCorpus() {}
+
+    // Reads the contexts file `filename`; implemented in contexts_corpus.cc.
+    unsigned read_contexts(const std::string &filename);
+
+    // Hands out the backoff pointer stored in this corpus.
+    TermBackoffPtr backoff_index() { return m_backoff; }
+
+private:
+    // The lexer callback writes this object's members directly.
+    friend void read_callback(const ContextsLexer::PhraseContextsType&, void*);
+
+    TermBackoffPtr m_backoff;
+    Dict m_dict;
+};
+
+#endif // _CONTEXTS_CORPUS_HH
diff --git a/gi/pyp-topics/src/contexts_lexer.h b/gi/pyp-topics/src/contexts_lexer.h
new file mode 100644
index 00000000..f9a1b21c
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_lexer.h
@@ -0,0 +1,22 @@
+#ifndef _CONTEXTS_LEXER_H_
+#define _CONTEXTS_LEXER_H_ 
+
+#include <iostream>
+#include <vector>
+#include <string>
+
+#include "../../../decoder/dict.h" 
+
+// Entry point and data types for the phrase-contexts scanner.
+struct ContextsLexer {
+  // A single context: its tokens in reading order.
+  typedef std::vector<std::string> Context;
+
+  // What the scanner collects for one phrase. The scanner appends one entry
+  // to `counts` for each entry it appends to `contexts`.
+  struct PhraseContextsType {
+    std::string phrase;
+    std::vector<Context> contexts;
+    std::vector<int> counts;
+  };
+
+  // Handler invoked with a collected record and the caller's opaque pointer.
+  typedef void (*ContextsCallback)(const PhraseContextsType& new_contexts, void* extra);
+
+  // Scans `in`, passing `extra` through to `func` on each invocation.
+  static void ReadContexts(std::istream* in, ContextsCallback func, void* extra);
+};
+
+#endif
diff --git a/gi/pyp-topics/src/contexts_lexer.l b/gi/pyp-topics/src/contexts_lexer.l
new file mode 100644
index 00000000..61189a73
--- /dev/null
+++ b/gi/pyp-topics/src/contexts_lexer.l
@@ -0,0 +1,110 @@
+%{
+#include "contexts_lexer.h"
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <cstring>
+#include <cassert>
+
+// Line counter; ReadContexts sets it to 1.
+// NOTE(review): no scanner rule in this file updates it afterwards.
+int lex_line = 0;
+// Input stream, record callback and the callback's opaque argument. All three
+// are installed by ContextsLexer::ReadContexts before yylex() runs.
+std::istream* contextslex_stream = NULL;
+ContextsLexer::ContextsCallback contexts_callback = NULL;
+void* contexts_callback_extra = NULL;
+
+// Replace the scanner's input macro so it pulls bytes from contextslex_stream.
+#undef YY_INPUT
+#define YY_INPUT(buf, result, max_size) (result = contextslex_stream->read(buf, max_size).gcount())
+
+#define YY_SKIP_YYWRAP 1
+// NOTE(review): num_phrases is not referenced anywhere else in this file.
+int num_phrases = 0;
+// Returning 1 tells the scanner there is no further input after end of stream.
+int yywrap() { return 1; }
+
+// Scratch string reused for each context token; it is constructed holding
+// MAX_TOKEN_SIZE NUL characters and overwritten with assign() per token.
+#define MAX_TOKEN_SIZE 255
+std::string contextslex_tmp_token(MAX_TOKEN_SIZE, '\0');
+// Record currently being assembled; passed to contexts_callback at end of line.
+ContextsLexer::PhraseContextsType current_contexts;
+
+// NOTE(review): MAX_CONTEXT_SIZE is only mentioned by the commented-out array below.
+#define MAX_CONTEXT_SIZE 255
+//std::string tmp_context[MAX_CONTEXT_SIZE];
+// Tokens of the context being read; copied into current_contexts.contexts at "|||".
+ContextsLexer::Context tmp_context;
+
+
+// Drops everything gathered for the phrase in progress: the partially read
+// context as well as the phrase text, contexts and counts of the record.
+void contextslex_reset() {
+  tmp_context.clear();
+  current_contexts.counts.clear();
+  current_contexts.contexts.clear();
+  current_contexts.phrase.clear();
+}
+
+%}
+
+INT [\-+]?[0-9]+|inf|[\-+]inf
+
+%x CONTEXT COUNT COUNT_END
+%%
+
+<INITIAL>[^\t]+ {
+    // Start of a record: the text up to the first tab is the phrase.
+    contextslex_reset();
+    current_contexts.phrase.assign(yytext, yyleng);
+    BEGIN(CONTEXT);
+  }
+<INITIAL>\t {
+    ;
+  }
+
+<INITIAL,CONTEXT,COUNT>\n {
+    std::cerr << "ERROR: contexts_lexer.l: unexpected newline while trying to read phrase|context|count." << std::endl;
+    abort();
+  }
+
+<CONTEXT>\|\|\| {
+    // "|||" closes the context; its count follows.
+    current_contexts.contexts.push_back(tmp_context);
+    tmp_context.clear();
+    BEGIN(COUNT);
+  }
+<CONTEXT>[^ \t]+ {
+    // One token of the current context.
+    contextslex_tmp_token.assign(yytext, yyleng);
+    tmp_context.push_back(contextslex_tmp_token);
+  }
+<CONTEXT>[ \t]+ { ; }
+
+<COUNT>[ \t]+ { ; }
+<COUNT>C={INT} {
+    // Skip the leading "C=" and parse the number.
+    current_contexts.counts.push_back(atoi(yytext+2));
+    BEGIN(COUNT_END);
+  }
+<COUNT>. {
+    std::cerr << "ERROR: contexts_lexer.l: unexpected content while reading count." << std::endl;
+    abort();
+  }
+
+<COUNT_END>[ \t]+ { ; }
+<COUNT_END>\|\|\| {
+    // Another context/count pair follows on the same line.
+    BEGIN(CONTEXT);
+  }
+<COUNT_END>\n {
+    // End of record: deliver it, then clear the state for the next line.
+    contexts_callback(current_contexts, contexts_callback_extra);
+    contextslex_reset();
+    BEGIN(INITIAL);
+  }
+<COUNT_END><<EOF>> {
+    // The input ended directly after a count, i.e. the last line has no
+    // trailing newline. The record is complete, so deliver it instead of
+    // silently dropping it, and leave the scanner in its initial state.
+    contexts_callback(current_contexts, contexts_callback_extra);
+    contextslex_reset();
+    BEGIN(INITIAL);
+    yyterminate();
+  }
+<COUNT_END>. {
+    std::cerr << "ERROR: contexts_lexer.l: unexpected content while looking for ||| closing count." << std::endl;
+    abort();
+  }
+
+%%
+
+#include "../../../decoder/filelib.h" 
+
+// Scans the whole of `in` with the generated scanner.
+//
+//   in     stream the scanner reads from (via the YY_INPUT override).
+//   func   callback the scanner invokes for each record it completes.
+//   extra  opaque pointer forwarded unchanged to `func`.
+//
+// The arguments are stored in file-level globals read by the scanner rules,
+// and lex_line is set back to 1 before scanning starts.
+void ContextsLexer::ReadContexts(std::istream* in, ContextsLexer::ContextsCallback func, void* extra) {
+  lex_line = 1;
+  contextslex_stream = in;
+  contexts_callback_extra = extra;
+  contexts_callback = func;
+  yylex();
+}
+
diff --git a/gi/pyp-topics/src/corpus.cc b/gi/pyp-topics/src/corpus.cc
index 93910ea3..24b93a03 100644
--- a/gi/pyp-topics/src/corpus.cc
+++ b/gi/pyp-topics/src/corpus.cc
@@ -22,7 +22,7 @@ unsigned Corpus::read(const std::string &filename) {
 
   string buf;
   int token;
-  unsigned count=0;
+  unsigned doc_count=0;
   while (getline(in, buf)) {
     Document* doc(new Document());
     istringstream ss(buf);
@@ -39,14 +39,15 @@ unsigned Corpus::read(const std::string &filename) {
     }
 
     m_documents.push_back(doc);
-    count++;
+    doc_count++;
   }
 
   m_num_types = seen_types.size();
 
-  return count;
+  return doc_count;
 }
 
+
 //////////////////////////////////////////////////
 // TestCorpus
 //////////////////////////////////////////////////
diff --git a/gi/pyp-topics/src/corpus.hh b/gi/pyp-topics/src/corpus.hh
index c1b0ceda..243f7e2c 100644
--- a/gi/pyp-topics/src/corpus.hh
+++ b/gi/pyp-topics/src/corpus.hh
@@ -5,6 +5,7 @@
 #include <string>
 #include <map>
 
+#include <boost/shared_ptr.hpp>
 #include <boost/ptr_container/ptr_vector.hpp>
 
 ////////////////////////////////////////////////////////////////
@@ -100,4 +101,6 @@ protected:
     int m_backoff_order;
     std::vector<int> m_terms_at_order;
 };
+typedef boost::shared_ptr<TermBackoff> TermBackoffPtr;
+
 #endif // _CORPUS_HH
diff --git a/gi/pyp-topics/src/pyp-topics.hh b/gi/pyp-topics/src/pyp-topics.hh
index 4036985a..92d6f292 100644
--- a/gi/pyp-topics/src/pyp-topics.hh
+++ b/gi/pyp-topics/src/pyp-topics.hh
@@ -15,7 +15,7 @@ public:
   typedef long double F;
 
 public:
-  PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1), m_backoff(0) {}
+  PYPTopics(int num_topics) : m_num_topics(num_topics), m_word_pyps(1) {}
 
   void sample(const Corpus& corpus, int samples);
   int sample(const DocumentId& doc, const Term& term);
@@ -27,6 +27,9 @@ public:
     m_word_pyps.clear();
     m_word_pyps.resize(m_backoff->order(), PYPs());
   }
+  void set_backoff(TermBackoffPtr backoff) {
+    m_backoff = backoff;
+  }
 
   F prob(const Term& term, int topic, int level=0) const;
   void decrement(const Term& term, int topic, int level=0);
@@ -46,7 +49,7 @@ private:
   PYPs m_document_pyps;
   std::vector<PYPs> m_word_pyps;
 
-  std::auto_ptr<TermBackoff> m_backoff;
+  TermBackoffPtr m_backoff;
 };
 
 #endif // PYP_TOPICS_HH
diff --git a/gi/pyp-topics/src/train.cc b/gi/pyp-topics/src/train.cc
index 0d107f11..01ada182 100644
--- a/gi/pyp-topics/src/train.cc
+++ b/gi/pyp-topics/src/train.cc
@@ -10,6 +10,7 @@
 // Local
 #include "pyp-topics.hh"
 #include "corpus.hh"
+#include "contexts_corpus.hh"
 #include "gzstream.hh"
 #include "mt19937ar.h"
 
@@ -38,6 +39,7 @@ int main(int argc, char **argv)
   options_description generic("Allowed options");
   generic.add_options()
     ("documents,d", value<string>(), "file containing the documents")
+    ("contexts", value<string>(), "file containing the documents in phrase contexts format")
     ("topics,t", value<int>()->default_value(50), "number of topics")
     ("document-topics-out,o", value<string>(), "file to write the document topics to")
     ("topic-words-out,w", value<string>(), "file to write the topic word distribution to")
@@ -56,42 +58,55 @@ int main(int argc, char **argv)
   }
   notify(vm);
   ////////////////////////////////////////////////////////////////////////////////////////////
-
-  if (vm.count("help")) { 
-    cout << cmdline_options << "\n"; 
+  if (vm.count("contexts") > 0 && vm.count("documents") > 0) {
+    cerr << "Only one of --documents or --contexts must be specified." << std::endl; 
     return 1; 
   }
 
-  if (vm.count("documents") == 0) {
+  if (vm.count("documents") == 0 && vm.count("contexts") == 0) {
     cerr << "Please specify a file containing the documents." << endl;
     cout << cmdline_options << "\n"; 
     return 1;
   }
 
+  if (vm.count("help")) { 
+    cout << cmdline_options << "\n"; 
+    return 1; 
+  }
+
   // seed the random number generator
   //mt_init_genrand(time(0));
 
+  PYPTopics model(vm["topics"].as<int>());
+
   // read the data
-  Corpus corpus;
-  corpus.read(vm["documents"].as<string>());
+  boost::shared_ptr<Corpus> corpus;
+  // The option checks above guarantee that exactly one of --documents and
+  // --contexts was given, so pick the loader by whichever is present.
+  if (vm.count("documents") > 0) {
+    corpus.reset(new Corpus);
+    corpus->read(vm["documents"].as<string>());
 
-  // run the sampler
-  PYPTopics model(vm["topics"].as<int>());
+    // read the backoff dictionary
+    if (vm.count("backoff-paths"))
+      model.set_backoff(vm["backoff-paths"].as<string>());
 
-  // read the backoff dictionary
-  if (vm.count("backoff-paths"))
-    model.set_backoff(vm["backoff-paths"].as<string>());
+  }
+  else {
+    boost::shared_ptr<ContextsCorpus> contexts_corpus(new ContextsCorpus);
+    contexts_corpus->read_contexts(vm["contexts"].as<string>());
+    corpus = contexts_corpus;
+    model.set_backoff(contexts_corpus->backoff_index());
+  }
 
   // train the sampler
-  model.sample(corpus, vm["samples"].as<int>());
+  model.sample(*corpus, vm["samples"].as<int>());
 
   if (vm.count("document-topics-out")) {
     ogzstream documents_out(vm["document-topics-out"].as<string>().c_str());
     //model.print_document_topics(documents_out);
 
     int document_id=0;
-    for (Corpus::const_iterator corpusIt=corpus.begin(); 
-         corpusIt != corpus.end(); ++corpusIt, ++document_id) {
+    for (Corpus::const_iterator corpusIt=corpus->begin(); 
+         corpusIt != corpus->end(); ++corpusIt, ++document_id) {
       std::vector<int> unique_terms;
       for (Document::const_iterator docIt=corpusIt->begin();
            docIt != corpusIt->end(); ++docIt) {
author	philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-28 15:01:17 +0000
committer	philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-06-28 15:01:17 +0000
commit	1d089b02eff4fa8837faecf99021f624d8845e5d (patch)
tree	b6e3d20094514749c37485e154117871cdc8696f
parent	088725c4708e83343154d1bed9dee18286446eaf (diff)