#include "translation_table.h"

// NOTE(review): this copy of the file had lost every `<...>` span (system
// include targets and template arguments). They are reconstructed below from
// how the values are used in this file -- confirm against version control.
#include <cstdio>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

#include <boost/functional/hash.hpp>

#include "alignment.h"
#include "data_array.h"

using namespace std;

namespace extractor {

// Builds the lexical translation table from a word-aligned parallel corpus.
//
// For every sentence pair, each alignment link (source position, target
// position) contributes one co-occurrence count for the corresponding word
// pair. Words that take part in no link are counted as aligned to
// DataArray::NULL_WORD. The counts are then normalised into two relative
// frequencies per word pair:
//   first  = count(s, t) / count(s)
//   second = count(s, t) / count(t)
TranslationTable::TranslationTable(shared_ptr<DataArray> source_data_array,
                                   shared_ptr<DataArray> target_data_array,
                                   shared_ptr<Alignment> alignment) :
    source_data_array(source_data_array),
    target_data_array(target_data_array) {
  const vector<int>& source_data = source_data_array->GetData();
  const vector<int>& target_data = target_data_array->GetData();

  unordered_map<int, int> source_links_count;
  unordered_map<int, int> target_links_count;
  unordered_map<pair<int, int>, int, PairHash> links_count;

  for (size_t i = 0; i < source_data_array->GetNumSentences(); ++i) {
    const vector<pair<int, int> > links = alignment->GetLinks(i);
    const int source_start = source_data_array->GetSentenceStart(i);
    const int target_start = target_data_array->GetSentenceStart(i);
    // Ignore END_OF_LINE markers: stop one position before the start of the
    // next sentence.
    const int next_source_start =
        source_data_array->GetSentenceStart(i + 1) - 1;
    const int next_target_start =
        target_data_array->GetSentenceStart(i + 1) - 1;

    const vector<int> source_sentence(source_data.begin() + source_start,
                                      source_data.begin() + next_source_start);
    const vector<int> target_sentence(target_data.begin() + target_start,
                                      target_data.begin() + next_target_start);

    // Per-position flags: non-zero once the word has appeared in a link.
    vector<int> source_linked_words(source_sentence.size());
    vector<int> target_linked_words(target_sentence.size());

    for (const pair<int, int>& link: links) {
      source_linked_words[link.first] = 1;
      target_linked_words[link.second] = 1;
      IncreaseLinksCount(source_links_count, target_links_count, links_count,
                         source_sentence[link.first],
                         target_sentence[link.second]);
    }

    // Unaligned source words are paired with the NULL word.
    for (size_t j = 0; j < source_sentence.size(); ++j) {
      if (!source_linked_words[j]) {
        IncreaseLinksCount(source_links_count, target_links_count, links_count,
                           source_sentence[j], DataArray::NULL_WORD);
      }
    }

    // Unaligned target words are paired with the NULL word.
    for (size_t j = 0; j < target_sentence.size(); ++j) {
      if (!target_linked_words[j]) {
        IncreaseLinksCount(source_links_count, target_links_count, links_count,
                           DataArray::NULL_WORD, target_sentence[j]);
      }
    }
  }

  // Turn raw co-occurrence counts into the two relative frequencies. Every
  // word in links_count was also counted in the per-side maps, so the
  // denominators are never zero.
  for (const auto& link_count: links_count) {
    const int source_word = link_count.first.first;
    const int target_word = link_count.first.second;
    const double score1 =
        1.0 * link_count.second / source_links_count[source_word];
    const double score2 =
        1.0 * link_count.second / target_links_count[target_word];
    translation_probabilities[link_count.first] = make_pair(score1, score2);
  }
}

// Creates an empty table (no data arrays, no probabilities).
TranslationTable::TranslationTable() {}

TranslationTable::~TranslationTable() {}

// Records one co-occurrence of (source_word_id, target_word_id): bumps the
// per-source-word total, the per-target-word total and the joint count.
void TranslationTable::IncreaseLinksCount(
    unordered_map<int, int>& source_links_count,
    unordered_map<int, int>& target_links_count,
    unordered_map<pair<int, int>, int, PairHash>& links_count,
    int source_word_id,
    int target_word_id) const {
  ++source_links_count[source_word_id];
  ++target_links_count[target_word_id];
  ++links_count[make_pair(source_word_id, target_word_id)];
}

// Returns the first stored score (joint count / source word count) for the
// word pair.
//
// Returns -1 if either word is unknown to its data array, and 0 if both words
// are known but the pair was never counted. The lookup uses find() rather
// than operator[] so that querying an unseen pair does not insert a zero
// entry into the table (which WriteBinary would otherwise serialize).
double TranslationTable::GetTargetGivenSourceScore(
    const string& source_word, const string& target_word) {
  if (!source_data_array->HasWord(source_word) ||
      !target_data_array->HasWord(target_word)) {
    return -1;
  }

  const int source_id = source_data_array->GetWordId(source_word);
  const int target_id = target_data_array->GetWordId(target_word);
  const auto entry =
      translation_probabilities.find(make_pair(source_id, target_id));
  if (entry == translation_probabilities.end()) {
    return 0;
  }
  return entry->second.first;
}

// Returns the second stored score (joint count / target word count) for the
// word pair.
//
// Returns -1 if either word is unknown to its data array, and 0 if both words
// are known but the pair was never counted. Like the sibling getter, the
// lookup does not modify the table.
double TranslationTable::GetSourceGivenTargetScore(
    const string& source_word, const string& target_word) {
  if (!source_data_array->HasWord(source_word) ||
      !target_data_array->HasWord(target_word)) {
    return -1;
  }

  const int source_id = source_data_array->GetWordId(source_word);
  const int target_id = target_data_array->GetWordId(target_word);
  const auto entry =
      translation_probabilities.find(make_pair(source_id, target_id));
  if (entry == translation_probabilities.end()) {
    return 0;
  }
  return entry->second.second;
}

// Dumps the table to filepath as raw bytes: an int entry count followed, for
// each entry, by the key pair and then the score pair, each written with
// sizeof() of the in-memory object (so the format is platform dependent).
//
// Failures to open, write or close the file are reported on stderr; the
// function then returns without throwing.
void TranslationTable::WriteBinary(const fs::path& filepath) const {
  const string path = filepath.string();
  // "wb": the payload is binary, so no newline translation must take place.
  FILE* file = fopen(path.c_str(), "wb");
  if (file == nullptr) {
    fprintf(stderr, "TranslationTable::WriteBinary: cannot open %s\n",
            path.c_str());
    return;
  }

  // The on-disk header is an int, matching the original format.
  const int size = static_cast<int>(translation_probabilities.size());
  bool ok = fwrite(&size, sizeof(int), 1, file) == 1;
  for (const auto& entry: translation_probabilities) {
    if (!ok) {
      break;
    }
    ok = fwrite(&entry.first, sizeof(entry.first), 1, file) == 1 &&
         fwrite(&entry.second, sizeof(entry.second), 1, file) == 1;
  }

  // Always close the stream: this flushes buffered data and releases the
  // handle, which the previous implementation leaked.
  if (fclose(file) != 0) {
    ok = false;
  }
  if (!ok) {
    fprintf(stderr, "TranslationTable::WriteBinary: error writing %s\n",
            path.c_str());
  }
}

} // namespace extractor