diff options
Diffstat (limited to 'extractor/data_array.h')
| -rw-r--r-- | extractor/data_array.h | 71 | 
1 files changed, 71 insertions, 0 deletions
| diff --git a/extractor/data_array.h b/extractor/data_array.h new file mode 100644 index 00000000..6d3e99d5 --- /dev/null +++ b/extractor/data_array.h @@ -0,0 +1,71 @@ +#ifndef _DATA_ARRAY_H_ +#define _DATA_ARRAY_H_ + +#include <string> +#include <tr1/unordered_map> +#include <vector> + +#include <boost/filesystem.hpp> + +namespace fs = boost::filesystem; +using namespace std; +using namespace tr1; + +enum Side { +  SOURCE, +  TARGET +}; + +class DataArray { + public: +  static int END_OF_FILE; +  static int END_OF_LINE; +  static string END_OF_FILE_STR; +  static string END_OF_LINE_STR; + +  DataArray(); + +  DataArray(const string& filename); + +  DataArray(const string& filename, const Side& side); + +  virtual ~DataArray(); + +  virtual const vector<int>& GetData() const; + +  virtual int AtIndex(int index) const; + +  virtual int GetSize() const; + +  virtual int GetVocabularySize() const; + +  virtual bool HasWord(const string& word) const; + +  virtual int GetWordId(const string& word) const; + +  string GetWord(int word_id) const; + +  int GetNumSentences() const; + +  int GetSentenceStart(int position) const; + +  virtual int GetSentenceId(int position) const; + +  void WriteBinary(const fs::path& filepath) const; + +  void WriteBinary(FILE* file) const; + + private: +  void InitializeDataArray(); +  void CreateDataArray(const vector<string>& lines); + +  unordered_map<string, int> word2id; +  vector<string> id2word; +  vector<int> data; +  // TODO(pauldb): We only need sentence_id for the source language. Maybe we +  // can save some memory here. +  vector<int> sentence_id; +  vector<int> sentence_start; +}; + +#endif | 
