Added comments. Hooray!

author: Paul Baltescu <pauldb89@gmail.com> 2013-03-10 01:01:01 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-03-10 01:01:01 +0000
commit: 6d43674e6b224281e43ccefc87224a7ba2fbb99a (patch)
tree: 36e0128b60755e47d217825fca13ccfb3f701158 /extractor/data_array.h
parent: 1b9ca189fd0549bd6d969edf618f92ea59184b12 (diff)
1 files changed, 33 insertions, 3 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h
index a26bbecf..978a6931 100644
--- a/extractor/data_array.h
+++ b/extractor/data_array.h
@@ -17,9 +17,19 @@ enum Side {
   TARGET
 };
 
-// Note: This class has features for both the source and target data arrays.
-// Maybe we can save some memory by having more specific implementations (e.g.
-// sentence_id is only needed for the source data array).
+/**
+ * Data structure storing information about a single side of a parallel corpus.
+ *
+ * Each word is mapped to a unique integer (word_id). The data structure holds
+ * the corpus in the numberized format, together with the hash table mapping
+ * words to word_ids. It also holds additional information such as the starting
+ * index for each sentence and, for each token, the index of the sentence it
+ * belongs to.
+ *
+ * Note: This class has features for both the source and target data arrays.
+ * Maybe we can save some memory by having more specific implementations (not
+ * likely to save a lot of memory though).
+ */
 class DataArray {
  public:
   static int NULL_WORD;
@@ -27,45 +37,65 @@ class DataArray {
   static string NULL_WORD_STR;
   static string END_OF_LINE_STR;
 
+  // Reads data array from text file.
   DataArray(const string& filename);
 
+  // Reads data array from bitext file where the sentences are separated by |||.
   DataArray(const string& filename, const Side& side);
 
   virtual ~DataArray();
 
+  // Returns a vector containing the word ids.
   virtual const vector<int>& GetData() const;
 
+  // Returns the word id at the specified position.
   virtual int AtIndex(int index) const;
 
+  // Returns the original word at the specified position.
   virtual string GetWordAtIndex(int index) const;
 
+  // Returns the size of the data array.
   virtual int GetSize() const;
 
+  // Returns the number of distinct words in the data array.
   virtual int GetVocabularySize() const;
 
+  // Returns whether a word has ever been observed in the data array.
   virtual bool HasWord(const string& word) const;
 
+  // Returns the word id for a given word or -1 if the word has never been
+  // observed.
   virtual int GetWordId(const string& word) const;
 
+  // Returns the word corresponding to a particular word id.
   virtual string GetWord(int word_id) const;
 
+  // Returns the number of sentences in the data.
   virtual int GetNumSentences() const;
 
+  // Returns the index where the sentence containing the given position starts.
   virtual int GetSentenceStart(int position) const;
 
+  // Returns the length of the sentence with the given sentence id.
   virtual int GetSentenceLength(int sentence_id) const;
 
+  // Returns the number of the sentence containing the given position.
   virtual int GetSentenceId(int position) const;
 
+  // Writes data array in binary format to the file at the given path.
   void WriteBinary(const fs::path& filepath) const;
 
+  // Writes data array in binary format to the given FILE* handle.
   void WriteBinary(FILE* file) const;
 
  protected:
   DataArray();
 
  private:
+  // Sets up specific constants.
   void InitializeDataArray();
+
+  // Constructs the data array.
   void CreateDataArray(const vector<string>& lines);
 
   unordered_map<string, int> word2id;
author	Paul Baltescu <pauldb89@gmail.com>	2013-03-10 01:01:01 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-03-10 01:01:01 +0000
commit	6d43674e6b224281e43ccefc87224a7ba2fbb99a (patch)
tree	36e0128b60755e47d217825fca13ccfb3f701158 /extractor/data_array.h
parent	1b9ca189fd0549bd6d969edf618f92ea59184b12 (diff)