diff options
Diffstat (limited to 'extractor/data_array.h')
-rw-r--r-- | extractor/data_array.h | 36 |
1 files changed, 33 insertions, 3 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h index a26bbecf..978a6931 100644 --- a/extractor/data_array.h +++ b/extractor/data_array.h @@ -17,9 +17,19 @@ enum Side { TARGET }; -// Note: This class has features for both the source and target data arrays. -// Maybe we can save some memory by having more specific implementations (e.g. -// sentence_id is only needed for the source data array). +/** + * Data structure storing information about a single side of a parallel corpus. + * + * Each word is mapped to a unique integer (word_id). The data structure holds + * the corpus in the numberized format, together with the hash table mapping + * words to word_ids. It also holds additional information such as the starting + * index for each sentence and, for each token, the index of the sentence it + * belongs to. + * + * Note: This class has features for both the source and target data arrays. + * Maybe we can save some memory by having more specific implementations (not + * likely to save a lot of memory tough). + */ class DataArray { public: static int NULL_WORD; @@ -27,45 +37,65 @@ class DataArray { static string NULL_WORD_STR; static string END_OF_LINE_STR; + // Reads data array from text file. DataArray(const string& filename); + // Reads data array from bitext file where the sentences are separated by |||. DataArray(const string& filename, const Side& side); virtual ~DataArray(); + // Returns a vector containing the word ids. virtual const vector<int>& GetData() const; + // Returns the word id at the specified position. virtual int AtIndex(int index) const; + // Returns the original word at the specified position. virtual string GetWordAtIndex(int index) const; + // Returns the size of the data array. virtual int GetSize() const; + // Returns the number of distinct words in the data array. virtual int GetVocabularySize() const; + // Returns whether a word has ever been observed in the data array. virtual bool HasWord(const string& word) const; + // Returns the word id for a given word or -1 if it the word has never been + // observed. virtual int GetWordId(const string& word) const; + // Returns the word corresponding to a particular word id. virtual string GetWord(int word_id) const; + // Returns the number of sentences in the data. virtual int GetNumSentences() const; + // Returns the index where the sentence containing the given position starts. virtual int GetSentenceStart(int position) const; + // Returns the length of the sentence. virtual int GetSentenceLength(int sentence_id) const; + // Returns the number of the sentence containing the given position. virtual int GetSentenceId(int position) const; + // Writes data array to file in binary format. void WriteBinary(const fs::path& filepath) const; + // Writes data array to file in binary format. void WriteBinary(FILE* file) const; protected: DataArray(); private: + // Sets up specific constants. void InitializeDataArray(); + + // Constructs the data array. void CreateDataArray(const vector<string>& lines); unordered_map<string, int> word2id; |