diff options
Diffstat (limited to 'extractor/data_array.h')
-rw-r--r-- | extractor/data_array.h | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h new file mode 100644 index 00000000..6d3e99d5 --- /dev/null +++ b/extractor/data_array.h @@ -0,0 +1,71 @@ +#ifndef _DATA_ARRAY_H_ +#define _DATA_ARRAY_H_ + +#include <string> +#include <tr1/unordered_map> +#include <vector> + +#include <boost/filesystem.hpp> + +namespace fs = boost::filesystem; +using namespace std; +using namespace tr1; + +enum Side { + SOURCE, + TARGET +}; + +class DataArray { + public: + static int END_OF_FILE; + static int END_OF_LINE; + static string END_OF_FILE_STR; + static string END_OF_LINE_STR; + + DataArray(); + + DataArray(const string& filename); + + DataArray(const string& filename, const Side& side); + + virtual ~DataArray(); + + virtual const vector<int>& GetData() const; + + virtual int AtIndex(int index) const; + + virtual int GetSize() const; + + virtual int GetVocabularySize() const; + + virtual bool HasWord(const string& word) const; + + virtual int GetWordId(const string& word) const; + + string GetWord(int word_id) const; + + int GetNumSentences() const; + + int GetSentenceStart(int position) const; + + virtual int GetSentenceId(int position) const; + + void WriteBinary(const fs::path& filepath) const; + + void WriteBinary(FILE* file) const; + + private: + void InitializeDataArray(); + void CreateDataArray(const vector<string>& lines); + + unordered_map<string, int> word2id; + vector<string> id2word; + vector<int> data; + // TODO(pauldb): We only need sentence_id for the source language. Maybe we + // can save some memory here. + vector<int> sentence_id; + vector<int> sentence_start; +}; + +#endif |