diff options
author | Paul Baltescu <pauldb89@gmail.com> | 2013-01-28 11:56:31 +0000 |
---|---|---|
committer | Paul Baltescu <pauldb89@gmail.com> | 2013-01-28 11:56:31 +0000 |
commit | 5530575ae0ad939e17f08d6bd49978acea388ab7 (patch) | |
tree | 4620a276c1c827d824e285148f4f4a5bf781ebfe /extractor/data_array.h | |
parent | ce6937f136a38af93d9a5cd9628acc712da95543 (diff) |
Initial working commit.
Diffstat (limited to 'extractor/data_array.h')
-rw-r--r-- | extractor/data_array.h | 71 |
1 files changed, 71 insertions, 0 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h new file mode 100644 index 00000000..6d3e99d5 --- /dev/null +++ b/extractor/data_array.h @@ -0,0 +1,71 @@ +#ifndef _DATA_ARRAY_H_ +#define _DATA_ARRAY_H_ + +#include <string> +#include <tr1/unordered_map> +#include <vector> + +#include <boost/filesystem.hpp> + +namespace fs = boost::filesystem; +using namespace std; +using namespace tr1; + +enum Side { + SOURCE, + TARGET +}; + +class DataArray { + public: + static int END_OF_FILE; + static int END_OF_LINE; + static string END_OF_FILE_STR; + static string END_OF_LINE_STR; + + DataArray(); + + DataArray(const string& filename); + + DataArray(const string& filename, const Side& side); + + virtual ~DataArray(); + + virtual const vector<int>& GetData() const; + + virtual int AtIndex(int index) const; + + virtual int GetSize() const; + + virtual int GetVocabularySize() const; + + virtual bool HasWord(const string& word) const; + + virtual int GetWordId(const string& word) const; + + string GetWord(int word_id) const; + + int GetNumSentences() const; + + int GetSentenceStart(int position) const; + + virtual int GetSentenceId(int position) const; + + void WriteBinary(const fs::path& filepath) const; + + void WriteBinary(FILE* file) const; + + private: + void InitializeDataArray(); + void CreateDataArray(const vector<string>& lines); + + unordered_map<string, int> word2id; + vector<string> id2word; + vector<int> data; + // TODO(pauldb): We only need sentence_id for the source language. Maybe we + // can save some memory here. + vector<int> sentence_id; + vector<int> sentence_start; +}; + +#endif |