From 63b30ed9c8510da8c8e2f6a456576424fddacc0e Mon Sep 17 00:00:00 2001 From: Paul Baltescu Date: Thu, 14 Feb 2013 23:17:15 +0000 Subject: Working version of the grammar extractor. --- extractor/target_phrase_extractor.cc | 144 +++++++++++++++++++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 extractor/target_phrase_extractor.cc (limited to 'extractor/target_phrase_extractor.cc') diff --git a/extractor/target_phrase_extractor.cc b/extractor/target_phrase_extractor.cc new file mode 100644 index 00000000..ac583953 --- /dev/null +++ b/extractor/target_phrase_extractor.cc @@ -0,0 +1,144 @@ +#include "target_phrase_extractor.h" + +#include + +#include "alignment.h" +#include "data_array.h" +#include "phrase.h" +#include "phrase_builder.h" +#include "rule_extractor_helper.h" +#include "vocabulary.h" + +using namespace std; + +TargetPhraseExtractor::TargetPhraseExtractor( + shared_ptr target_data_array, + shared_ptr alignment, + shared_ptr phrase_builder, + shared_ptr helper, + shared_ptr vocabulary, + int max_rule_span, + bool require_tight_phrases) : + target_data_array(target_data_array), + alignment(alignment), + phrase_builder(phrase_builder), + helper(helper), + vocabulary(vocabulary), + max_rule_span(max_rule_span), + require_tight_phrases(require_tight_phrases) {} + +TargetPhraseExtractor::TargetPhraseExtractor() {} + +TargetPhraseExtractor::~TargetPhraseExtractor() {} + +vector > TargetPhraseExtractor::ExtractPhrases( + const vector >& target_gaps, const vector& target_low, + int target_phrase_low, int target_phrase_high, + const unordered_map& source_indexes, int sentence_id) const { + int target_sent_len = target_data_array->GetSentenceLength(sentence_id); + + vector target_gap_order = helper->GetGapOrder(target_gaps); + + int target_x_low = target_phrase_low, target_x_high = target_phrase_high; + if (!require_tight_phrases) { + while (target_x_low > 0 && + target_phrase_high - target_x_low < max_rule_span && + target_low[target_x_low - 1] == -1) { + --target_x_low; + } + while (target_x_high < target_sent_len && + target_x_high - target_phrase_low < max_rule_span && + target_low[target_x_high] == -1) { + ++target_x_high; + } + } + + vector > gaps(target_gaps.size()); + for (size_t i = 0; i < gaps.size(); ++i) { + gaps[i] = target_gaps[target_gap_order[i]]; + if (!require_tight_phrases) { + while (gaps[i].first > target_x_low && + target_low[gaps[i].first - 1] == -1) { + --gaps[i].first; + } + while (gaps[i].second < target_x_high && + target_low[gaps[i].second] == -1) { + ++gaps[i].second; + } + } + } + + vector > ranges(2 * gaps.size() + 2); + ranges.front() = make_pair(target_x_low, target_phrase_low); + ranges.back() = make_pair(target_phrase_high, target_x_high); + for (size_t i = 0; i < gaps.size(); ++i) { + int j = target_gap_order[i]; + ranges[i * 2 + 1] = make_pair(gaps[i].first, target_gaps[j].first); + ranges[i * 2 + 2] = make_pair(target_gaps[j].second, gaps[i].second); + } + + vector > target_phrases; + vector subpatterns(ranges.size()); + GeneratePhrases(target_phrases, ranges, 0, subpatterns, target_gap_order, + target_phrase_low, target_phrase_high, source_indexes, + sentence_id); + return target_phrases; +} + +void TargetPhraseExtractor::GeneratePhrases( + vector >& target_phrases, + const vector >& ranges, int index, vector& subpatterns, + const vector& target_gap_order, int target_phrase_low, + int target_phrase_high, const unordered_map& source_indexes, + int sentence_id) const { + if (index >= ranges.size()) { + if (subpatterns.back() - subpatterns.front() > max_rule_span) { + return; + } + + vector symbols; + unordered_map target_indexes; + + int target_sent_start = target_data_array->GetSentenceStart(sentence_id); + for (size_t i = 0; i * 2 < subpatterns.size(); ++i) { + for (size_t j = subpatterns[i * 2]; j < subpatterns[i * 2 + 1]; ++j) { + target_indexes[j] = symbols.size(); + string target_word = target_data_array->GetWordAtIndex( + target_sent_start + j); + symbols.push_back(vocabulary->GetTerminalIndex(target_word)); + } + if (i < target_gap_order.size()) { + symbols.push_back(vocabulary->GetNonterminalIndex( + target_gap_order[i] + 1)); + } + } + + vector > links = alignment->GetLinks(sentence_id); + vector > alignment; + for (pair link: links) { + if (target_indexes.count(link.second)) { + alignment.push_back(make_pair(source_indexes.find(link.first)->second, + target_indexes[link.second])); + } + } + + Phrase target_phrase = phrase_builder->Build(symbols); + target_phrases.push_back(make_pair(target_phrase, alignment)); + return; + } + + subpatterns[index] = ranges[index].first; + if (index > 0) { + subpatterns[index] = max(subpatterns[index], subpatterns[index - 1]); + } + while (subpatterns[index] <= ranges[index].second) { + subpatterns[index + 1] = max(subpatterns[index], ranges[index + 1].first); + while (subpatterns[index + 1] <= ranges[index + 1].second) { + GeneratePhrases(target_phrases, ranges, index + 2, subpatterns, + target_gap_order, target_phrase_low, target_phrase_high, + source_indexes, sentence_id); + ++subpatterns[index + 1]; + } + ++subpatterns[index]; + } +} -- cgit v1.2.3