summary refs log tree commit diff
path: root/extractor/translation_table.cc
diff options
context:
space:
mode:
author Paul Baltescu <pauldb89@gmail.com> 2013-02-14 23:17:15 +0000
committer Paul Baltescu <pauldb89@gmail.com> 2013-02-14 23:17:15 +0000
commit 63b30ed9c8510da8c8e2f6a456576424fddacc0e (patch)
tree 1b5278fb5a4480b7f7a965bb6de8f6f9e9c4d333 /extractor/translation_table.cc
parent 0a53f7eca74c165b5ce1c238f1999ddf1febea55 (diff)
Working version of the grammar extractor.
Diffstat (limited to 'extractor/translation_table.cc')
-rw-r--r-- extractor/translation_table.cc 47
1 files changed, 35 insertions, 12 deletions
diff --git a/extractor/translation_table.cc b/extractor/translation_table.cc
index 10f1b9ed..a48c0657 100644
--- a/extractor/translation_table.cc
+++ b/extractor/translation_table.cc
@@ -9,7 +9,6 @@
#include "data_array.h"
using namespace std;
-using namespace tr1;
TranslationTable::TranslationTable(shared_ptr<DataArray> source_data_array,
shared_ptr<DataArray> target_data_array,
@@ -20,14 +19,15 @@ TranslationTable::TranslationTable(shared_ptr<DataArray> source_data_array,
unordered_map<int, int> source_links_count;
unordered_map<int, int> target_links_count;
- unordered_map<pair<int, int>, int, PairHash > links_count;
+ unordered_map<pair<int, int>, int, PairHash> links_count;
for (size_t i = 0; i < source_data_array->GetNumSentences(); ++i) {
- const vector<pair<int, int> >& links = alignment->GetLinks(i);
+ vector<pair<int, int> > links = alignment->GetLinks(i);
int source_start = source_data_array->GetSentenceStart(i);
- int next_source_start = source_data_array->GetSentenceStart(i + 1);
int target_start = target_data_array->GetSentenceStart(i);
- int next_target_start = target_data_array->GetSentenceStart(i + 1);
+ // Ignore END_OF_LINE markers.
+ int next_source_start = source_data_array->GetSentenceStart(i + 1) - 1;
+ int next_target_start = target_data_array->GetSentenceStart(i + 1) - 1;
vector<int> source_sentence(source_data.begin() + source_start,
source_data.begin() + next_source_start);
vector<int> target_sentence(target_data.begin() + target_start,
@@ -38,15 +38,23 @@ TranslationTable::TranslationTable(shared_ptr<DataArray> source_data_array,
for (pair<int, int> link: links) {
source_linked_words[link.first] = 1;
target_linked_words[link.second] = 1;
- int source_word = source_sentence[link.first];
- int target_word = target_sentence[link.second];
+ IncreaseLinksCount(source_links_count, target_links_count, links_count,
+ source_sentence[link.first], target_sentence[link.second]);
+ }
- ++source_links_count[source_word];
- ++target_links_count[target_word];
- ++links_count[make_pair(source_word, target_word)];
+ for (size_t i = 0; i < source_sentence.size(); ++i) {
+ if (!source_linked_words[i]) {
+ IncreaseLinksCount(source_links_count, target_links_count, links_count,
+ source_sentence[i], DataArray::NULL_WORD);
+ }
}
- // TODO(pauldb): Something seems wrong here. No NULL word?
+ for (size_t i = 0; i < target_sentence.size(); ++i) {
+ if (!target_linked_words[i]) {
+ IncreaseLinksCount(source_links_count, target_links_count, links_count,
+ DataArray::NULL_WORD, target_sentence[i]);
+ }
+ }
}
for (pair<pair<int, int>, int> link_count: links_count) {
@@ -58,6 +66,21 @@ TranslationTable::TranslationTable(shared_ptr<DataArray> source_data_array,
}
}
+TranslationTable::TranslationTable() {}
+
+TranslationTable::~TranslationTable() {}
+
+void TranslationTable::IncreaseLinksCount(
+ unordered_map<int, int>& source_links_count,
+ unordered_map<int, int>& target_links_count,
+ unordered_map<pair<int, int>, int, PairHash>& links_count,
+ int source_word_id,
+ int target_word_id) const {
+ ++source_links_count[source_word_id];
+ ++target_links_count[target_word_id];
+ ++links_count[make_pair(source_word_id, target_word_id)];
+}
+
double TranslationTable::GetTargetGivenSourceScore(
const string& source_word, const string& target_word) {
if (!source_data_array->HasWord(source_word) ||
@@ -73,7 +96,7 @@ double TranslationTable::GetTargetGivenSourceScore(
double TranslationTable::GetSourceGivenTargetScore(
const string& source_word, const string& target_word) {
if (!source_data_array->HasWord(source_word) ||
- !target_data_array->HasWord(target_word) == 0) {
+ !target_data_array->HasWord(target_word)) {
return -1;
}