From 4ab84a0be28fdb6c0c421fe5ba5e09cfa298f2d1 Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Mon, 28 Jan 2013 11:56:31 +0000 Subject: Initial working commit. --- extractor/data_array.h | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 extractor/data_array.h (limited to 'extractor/data_array.h') diff --git a/extractor/data_array.h b/extractor/data_array.h new file mode 100644 index 00000000..6d3e99d5 --- /dev/null +++ b/extractor/data_array.h @@ -0,0 +1,71 @@ +#ifndef _DATA_ARRAY_H_ +#define _DATA_ARRAY_H_ + +#include +#include +#include + +#include + +namespace fs = boost::filesystem; +using namespace std; +using namespace tr1; + +enum Side { + SOURCE, + TARGET +}; + +class DataArray { + public: + static int END_OF_FILE; + static int END_OF_LINE; + static string END_OF_FILE_STR; + static string END_OF_LINE_STR; + + DataArray(); + + DataArray(const string& filename); + + DataArray(const string& filename, const Side& side); + + virtual ~DataArray(); + + virtual const vector& GetData() const; + + virtual int AtIndex(int index) const; + + virtual int GetSize() const; + + virtual int GetVocabularySize() const; + + virtual bool HasWord(const string& word) const; + + virtual int GetWordId(const string& word) const; + + string GetWord(int word_id) const; + + int GetNumSentences() const; + + int GetSentenceStart(int position) const; + + virtual int GetSentenceId(int position) const; + + void WriteBinary(const fs::path& filepath) const; + + void WriteBinary(FILE* file) const; + + private: + void InitializeDataArray(); + void CreateDataArray(const vector& lines); + + unordered_map word2id; + vector id2word; + vector data; + // TODO(pauldb): We only need sentence_id for the source language. Maybe we + // can save some memory here. + vector sentence_id; + vector sentence_start; +}; + +#endif -- cgit v1.2.3