#ifndef _DATA_ARRAY_H_ #define _DATA_ARRAY_H_ #include #include #include #include namespace fs = boost::filesystem; using namespace std; namespace extractor { enum Side { SOURCE, TARGET }; // Note: This class has features for both the source and target data arrays. // Maybe we can save some memory by having more specific implementations (e.g. // sentence_id is only needed for the source data array). class DataArray { public: static int NULL_WORD; static int END_OF_LINE; static string NULL_WORD_STR; static string END_OF_LINE_STR; DataArray(const string& filename); DataArray(const string& filename, const Side& side); virtual ~DataArray(); virtual const vector& GetData() const; virtual int AtIndex(int index) const; virtual string GetWordAtIndex(int index) const; virtual int GetSize() const; virtual int GetVocabularySize() const; virtual bool HasWord(const string& word) const; virtual int GetWordId(const string& word) const; virtual string GetWord(int word_id) const; virtual int GetNumSentences() const; virtual int GetSentenceStart(int position) const; virtual int GetSentenceLength(int sentence_id) const; virtual int GetSentenceId(int position) const; void WriteBinary(const fs::path& filepath) const; void WriteBinary(FILE* file) const; protected: DataArray(); private: void InitializeDataArray(); void CreateDataArray(const vector& lines); unordered_map word2id; vector id2word; vector data; vector sentence_id; vector sentence_start; }; } // namespace extractor #endif