#include <sstream>
#include <iostream>
#include <set>

#include "corpus.hh"
#include "gzstream.hh"

using namespace std;

//////////////////////////////////////////////////
// Corpus
//////////////////////////////////////////////////

// Default constructor: starts with both the term counter and the type counter
// at zero.
Corpus::Corpus()
  : m_num_terms(0),
    m_num_types(0)
{
}

unsigned Corpus::read(const std::string &filename) {
  m_num_terms = 0;
  m_num_types = 0;
  std::set<int> seen_types;

  igzstream in(filename.c_str());

  string buf;
  int token;
  unsigned doc_count=0;
  while (getline(in, buf)) {
    Document* doc(new Document());
    istringstream ss(buf);

    ss >> token; // the number of unique terms

    char delimeter;
    int count;
    while(ss >> token >> delimeter >> count) {
      for (int i=0; i<count; ++i)
        doc->push_back(token);
      m_num_terms += count;
      seen_types.insert(token);
    }

    m_documents.push_back(doc);
    doc_count++;
  }

  m_num_types = seen_types.size();

  return doc_count;
}


//////////////////////////////////////////////////
// TestCorpus
//////////////////////////////////////////////////

// Default constructor: performs no explicit initialisation.
TestCorpus::TestCorpus()
{
}

void TestCorpus::read(const std::string &filename) {
  igzstream in(filename.c_str());

  string buf;
  Term term;
  DocumentId doc;
  char delimeter;
  while (getline(in, buf)) {
    DocumentTerms* line(new DocumentTerms());
    istringstream ss(buf);

    while(ss >> doc >> delimeter >> term)
      line->push_back(DocumentTerm(doc, term));

    m_lines.push_back(line);
  }
}

//////////////////////////////////////////////////
// TermBackoff
//////////////////////////////////////////////////

// Loads a term -> backoff-term table from `filename` (opened through
// igzstream).
//
// Layout, as parsed below:
//   line 1:    <num_terms> <backoff_order> <count_1> ... <count_backoff_order>
//   line 2...: <term> <backoff>
//
// m_dict is resized to num_terms entries (new entries are set to -1), one
// count per backoff order is appended to m_terms_at_order, and m_dict[term]
// is set to <backoff> for every well-formed body line.
//
// If the header line is missing or cannot be parsed, a message is written to
// std::cerr and the function returns without touching m_dict. Blank or
// malformed body lines are skipped. A term outside [0, num_terms) trips the
// asserts in debug builds and is skipped when asserts are compiled out, so
// m_dict is never written out of bounds.
void TermBackoff::read(const std::string &filename) {
  igzstream in(filename.c_str());

  std::string buf;
  if (!std::getline(in, buf)) {
    std::cerr << "TermBackoff::read: unable to read header line from '"
              << filename << "'" << std::endl;
    return;
  }

  std::istringstream header(buf);
  int num_terms = 0;
  if (!(header >> num_terms >> m_backoff_order) || num_terms < 0) {
    // Without a valid term count the size of m_dict is unknown; resizing with
    // an unparsed value would be undefined behaviour.
    std::cerr << "TermBackoff::read: malformed header line in '"
              << filename << "'" << std::endl;
    return;
  }

  m_dict.resize(num_terms, -1);
  for (int i = 0; i < m_backoff_order; ++i) {
    // Start from zero so that a header with too few counts yields 0 for the
    // missing orders rather than an indeterminate value, while still adding
    // exactly one entry per order.
    int count = 0;
    header >> count;
    m_terms_at_order.push_back(count);
  }

  while (std::getline(in, buf)) {
    std::istringstream line(buf);
    Term term, backoff;
    if (!(line >> term >> backoff))
      continue;  // blank or malformed line: nothing trustworthy to store

    assert(term < num_terms);
    assert(term >= 0);
    // The asserts vanish under NDEBUG; keep the bounds check at runtime.
    if (term < 0 || term >= num_terms)
      continue;

    m_dict[term] = backoff;
  }
}