diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
commit | 6d347f1ce078dede3da0e1498f75e357351c6543 (patch) | |
tree | 8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/data_array.h | |
parent | d11b76def6899790161c47a73018146311356d8b (diff) | |
parent | 5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff) |
merge paul's extractor code
Diffstat (limited to 'extractor/data_array.h')
-rw-r--r-- | extractor/data_array.h | 110 |
1 files changed, 110 insertions, 0 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h new file mode 100644 index 00000000..978a6931 --- /dev/null +++ b/extractor/data_array.h @@ -0,0 +1,110 @@ +#ifndef _DATA_ARRAY_H_ +#define _DATA_ARRAY_H_ + +#include <string> +#include <unordered_map> +#include <vector> + +#include <boost/filesystem.hpp> + +namespace fs = boost::filesystem; +using namespace std; + +namespace extractor { + +enum Side { + SOURCE, + TARGET +}; + +/** + * Data structure storing information about a single side of a parallel corpus. + * + * Each word is mapped to a unique integer (word_id). The data structure holds + * the corpus in the numberized format, together with the hash table mapping + * words to word_ids. It also holds additional information such as the starting + * index for each sentence and, for each token, the index of the sentence it + * belongs to. + * + * Note: This class has features for both the source and target data arrays. + * Maybe we can save some memory by having more specific implementations (not + * likely to save a lot of memory tough). + */ +class DataArray { + public: + static int NULL_WORD; + static int END_OF_LINE; + static string NULL_WORD_STR; + static string END_OF_LINE_STR; + + // Reads data array from text file. + DataArray(const string& filename); + + // Reads data array from bitext file where the sentences are separated by |||. + DataArray(const string& filename, const Side& side); + + virtual ~DataArray(); + + // Returns a vector containing the word ids. + virtual const vector<int>& GetData() const; + + // Returns the word id at the specified position. + virtual int AtIndex(int index) const; + + // Returns the original word at the specified position. + virtual string GetWordAtIndex(int index) const; + + // Returns the size of the data array. + virtual int GetSize() const; + + // Returns the number of distinct words in the data array. + virtual int GetVocabularySize() const; + + // Returns whether a word has ever been observed in the data array. + virtual bool HasWord(const string& word) const; + + // Returns the word id for a given word or -1 if it the word has never been + // observed. + virtual int GetWordId(const string& word) const; + + // Returns the word corresponding to a particular word id. + virtual string GetWord(int word_id) const; + + // Returns the number of sentences in the data. + virtual int GetNumSentences() const; + + // Returns the index where the sentence containing the given position starts. + virtual int GetSentenceStart(int position) const; + + // Returns the length of the sentence. + virtual int GetSentenceLength(int sentence_id) const; + + // Returns the number of the sentence containing the given position. + virtual int GetSentenceId(int position) const; + + // Writes data array to file in binary format. + void WriteBinary(const fs::path& filepath) const; + + // Writes data array to file in binary format. + void WriteBinary(FILE* file) const; + + protected: + DataArray(); + + private: + // Sets up specific constants. + void InitializeDataArray(); + + // Constructs the data array. + void CreateDataArray(const vector<string>& lines); + + unordered_map<string, int> word2id; + vector<string> id2word; + vector<int> data; + vector<int> sentence_id; + vector<int> sentence_start; +}; + +} // namespace extractor + +#endif |