diff options
Diffstat (limited to 'extractor')
-rw-r--r-- | extractor/data_array.h | 6 | ||||
-rw-r--r-- | extractor/data_array_test.cc | 30 |
2 files changed, 31 insertions, 5 deletions
diff --git a/extractor/data_array.h b/extractor/data_array.h
index 7c120b3c..96950789 100644
--- a/extractor/data_array.h
+++ b/extractor/data_array.h
@@ -15,6 +15,9 @@ enum Side {
   TARGET
 };
 
+// TODO: This class has features for both the source and target data arrays.
+// Maybe we can save some memory by having more specific implementations (e.g.
+// sentence_id is only needed for the source data array).
 class DataArray {
  public:
   static int NULL_WORD;
@@ -48,7 +51,6 @@ class DataArray {
 
   virtual int GetSentenceStart(int position) const;
 
-  //TODO(pauldb): Add unit tests.
   virtual int GetSentenceLength(int sentence_id) const;
 
   virtual int GetSentenceId(int position) const;
@@ -67,8 +69,6 @@ class DataArray {
   unordered_map<string, int> word2id;
   vector<string> id2word;
   vector<int> data;
-  // TODO(pauldb): We only need sentence_id for the source language. Maybe we
-  // can save some memory here.
   vector<int> sentence_id;
   vector<int> sentence_start;
 };
diff --git a/extractor/data_array_test.cc b/extractor/data_array_test.cc
index 772ba10e..ba5ce09e 100644
--- a/extractor/data_array_test.cc
+++ b/extractor/data_array_test.cc
@@ -26,18 +26,28 @@ class DataArrayTest : public Test {
 };
 
 TEST_F(DataArrayTest, TestGetData) {
-  vector<int> expected_source_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1};
+  vector<int> expected_source_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 5, 1};
+  vector<string> expected_source_words = {
+      "ana", "are", "mere", ".", "__END_OF_LINE__",
+      "ana", "bea", "mult", "lapte", ".", "__END_OF_LINE__"
+  };
   EXPECT_EQ(expected_source_data, source_data->GetData());
   EXPECT_EQ(expected_source_data.size(), source_data->GetSize());
   for (size_t i = 0; i < expected_source_data.size(); ++i) {
     EXPECT_EQ(expected_source_data[i], source_data->AtIndex(i));
+    EXPECT_EQ(expected_source_words[i], source_data->GetWordAtIndex(i));
   }
 
-  vector<int> expected_target_data{2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1};
+  vector<int> expected_target_data = {2, 3, 4, 5, 1, 2, 6, 7, 8, 9, 10, 5, 1};
+  vector<string> expected_target_words = {
+      "anna", "has", "apples", ".", "__END_OF_LINE__",
+      "anna", "drinks", "a", "lot", "of", "milk", ".", "__END_OF_LINE__"
+  };
   EXPECT_EQ(expected_target_data, target_data->GetData());
   EXPECT_EQ(expected_target_data.size(), target_data->GetSize());
   for (size_t i = 0; i < expected_target_data.size(); ++i) {
     EXPECT_EQ(expected_target_data[i], target_data->AtIndex(i));
+    EXPECT_EQ(expected_target_words[i], target_data->GetWordAtIndex(i));
   }
 }
 
@@ -61,10 +71,26 @@ TEST_F(DataArrayTest, TestSentenceData) {
   EXPECT_EQ(5, source_data->GetSentenceStart(1));
   EXPECT_EQ(11, source_data->GetSentenceStart(2));
 
+  EXPECT_EQ(4, source_data->GetSentenceLength(0));
+  EXPECT_EQ(5, source_data->GetSentenceLength(1));
+
+  vector<int> expected_source_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1};
+  for (size_t i = 0; i < expected_source_ids.size(); ++i) {
+    EXPECT_EQ(expected_source_ids[i], source_data->GetSentenceId(i));
+  }
+
   EXPECT_EQ(2, target_data->GetNumSentences());
   EXPECT_EQ(0, target_data->GetSentenceStart(0));
   EXPECT_EQ(5, target_data->GetSentenceStart(1));
   EXPECT_EQ(13, target_data->GetSentenceStart(2));
+
+  EXPECT_EQ(4, target_data->GetSentenceLength(0));
+  EXPECT_EQ(7, target_data->GetSentenceLength(1));
+
+  vector<int> expected_target_ids = {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1};
+  for (size_t i = 0; i < expected_target_ids.size(); ++i) {
+    EXPECT_EQ(expected_target_ids[i], target_data->GetSentenceId(i));
+  }
 }
 
 } // namespace