| author    | Paul Baltescu <pauldb89@gmail.com> | 2013-02-14 23:17:15 +0000 |
|-----------|------------------------------------|---------------------------|
| committer | Paul Baltescu <pauldb89@gmail.com> | 2013-02-14 23:17:15 +0000 |
| commit    | 9a026ba2db8fa7723374109e6a4a8dcaff8733cd (patch) | |
| tree      | 34a60703a53ada76e7213da5940e86d6f476f1e4 /extractor/rule_extractor.cc | |
| parent    | 252fb164c208ec8f3005f8a652eb3b48c0644e3d (diff) | |
Working version of the grammar extractor.
Diffstat (limited to 'extractor/rule_extractor.cc')
| -rw-r--r-- | extractor/rule_extractor.cc | 618 |
1 file changed, 127 insertions(+), 491 deletions(-)
diff --git a/extractor/rule_extractor.cc b/extractor/rule_extractor.cc
index 9460020f..92343241 100644
--- a/extractor/rule_extractor.cc
+++ b/extractor/rule_extractor.cc
@@ -1,7 +1,6 @@
 #include "rule_extractor.h"
 
 #include <map>
-#include <tr1/unordered_set>
 
 #include "alignment.h"
 #include "data_array.h"
@@ -9,11 +8,11 @@
 #include "phrase_builder.h"
 #include "phrase_location.h"
 #include "rule.h"
+#include "rule_extractor_helper.h"
 #include "scorer.h"
-#include "vocabulary.h"
+#include "target_phrase_extractor.h"
 
 using namespace std;
-using namespace tr1;
 
 RuleExtractor::RuleExtractor(
     shared_ptr<DataArray> source_data_array,
@@ -29,20 +28,50 @@ RuleExtractor::RuleExtractor(
     bool require_aligned_terminal,
     bool require_aligned_chunks,
     bool require_tight_phrases) :
-    source_data_array(source_data_array), target_data_array(target_data_array),
-    alignment(alignment),
+    source_data_array(source_data_array),
     phrase_builder(phrase_builder),
     scorer(scorer),
-    vocabulary(vocabulary),
    max_rule_span(max_rule_span),
    min_gap_size(min_gap_size),
    max_nonterminals(max_nonterminals),
    max_rule_symbols(max_rule_symbols),
-    require_aligned_terminal(require_aligned_terminal),
-    require_aligned_chunks(require_aligned_chunks),
+    require_tight_phrases(require_tight_phrases) {
+  helper = make_shared<RuleExtractorHelper>(
+      source_data_array, target_data_array, alignment, max_rule_span,
+      max_rule_symbols, require_aligned_terminal, require_aligned_chunks,
+      require_tight_phrases);
+  target_phrase_extractor = make_shared<TargetPhraseExtractor>(
+      target_data_array, alignment, phrase_builder, helper, vocabulary,
+      max_rule_span, require_tight_phrases);
+}
+
+RuleExtractor::RuleExtractor(
+    shared_ptr<DataArray> source_data_array,
+    shared_ptr<PhraseBuilder> phrase_builder,
+    shared_ptr<Scorer> scorer,
+    shared_ptr<TargetPhraseExtractor> target_phrase_extractor,
+    shared_ptr<RuleExtractorHelper> helper,
+    int max_rule_span,
+    int min_gap_size,
+    int max_nonterminals,
+    int max_rule_symbols,
+    bool require_tight_phrases) :
+    source_data_array(source_data_array),
+    phrase_builder(phrase_builder),
+    scorer(scorer),
+    target_phrase_extractor(target_phrase_extractor),
+    helper(helper),
+    max_rule_span(max_rule_span),
+    min_gap_size(min_gap_size),
+    max_nonterminals(max_nonterminals),
+    max_rule_symbols(max_rule_symbols),
     require_tight_phrases(require_tight_phrases) {}
 
+RuleExtractor::RuleExtractor() {}
+
+RuleExtractor::~RuleExtractor() {}
+
 vector<Rule> RuleExtractor::ExtractRules(const Phrase& phrase,
     const PhraseLocation& location) const {
   int num_subpatterns = location.num_subpatterns;
@@ -60,6 +89,7 @@ vector<Rule> RuleExtractor::ExtractRules(const Phrase& phrase,
     }
   }
 
+  int num_samples = matchings.size() / num_subpatterns;
   vector<Rule> rules;
   for (auto source_phrase_entry: alignments_counter) {
     Phrase source_phrase = source_phrase_entry.first;
@@ -77,7 +107,7 @@ vector<Rule> RuleExtractor::ExtractRules(const Phrase& phrase,
     }
 
     FeatureContext context(source_phrase, target_phrase,
-        source_phrase_counter[source_phrase], num_locations);
+        source_phrase_counter[source_phrase], num_locations, num_samples);
     vector<double> scores = scorer->Score(context);
     rules.push_back(Rule(source_phrase, target_phrase, scores,
                          most_frequent_alignment));
@@ -93,7 +123,8 @@ vector<Extract> RuleExtractor::ExtractAlignments(
   int source_sent_start = source_data_array->GetSentenceStart(sentence_id);
 
   vector<int> source_low, source_high, target_low, target_high;
-  GetLinksSpans(source_low, source_high, target_low, target_high, sentence_id);
+  helper->GetLinksSpans(source_low, source_high, target_low, target_high,
+                        sentence_id);
 
   int num_subpatterns = matching.size();
   vector<int> chunklen(num_subpatterns);
@@ -101,39 +132,44 @@ vector<Extract> RuleExtractor::ExtractAlignments(
     chunklen[i] = phrase.GetChunkLen(i);
   }
 
-  if (!CheckAlignedTerminals(matching, chunklen, source_low) ||
-      !CheckTightPhrases(matching, chunklen, source_low)) {
+  if (!helper->CheckAlignedTerminals(matching, chunklen, source_low) ||
+      !helper->CheckTightPhrases(matching, chunklen, source_low)) {
     return extracts;
  }
 
  int source_back_low = -1, source_back_high = -1;
  int source_phrase_low = matching[0] - source_sent_start;
-  int source_phrase_high = matching.back() + chunklen.back() - source_sent_start;
+  int source_phrase_high = matching.back() + chunklen.back() -
+      source_sent_start;
  int target_phrase_low = -1, target_phrase_high = -1;
-  if (!FindFixPoint(source_phrase_low, source_phrase_high, source_low,
-                    source_high, target_phrase_low, target_phrase_high,
-                    target_low, target_high, source_back_low, source_back_high,
-                    sentence_id, min_gap_size, 0,
-                    max_nonterminals - matching.size() + 1, 1, 1, false)) {
+  if (!helper->FindFixPoint(source_phrase_low, source_phrase_high, source_low,
+                            source_high, target_phrase_low, target_phrase_high,
+                            target_low, target_high, source_back_low,
+                            source_back_high, sentence_id, min_gap_size, 0,
+                            max_nonterminals - matching.size() + 1, true, true,
+                            false)) {
    return extracts;
  }
 
  bool met_constraints = true;
  int num_symbols = phrase.GetNumSymbols();
  vector<pair<int, int> > source_gaps, target_gaps;
-  if (!CheckGaps(source_gaps, target_gaps, matching, chunklen, source_low,
-                 source_high, target_low, target_high, source_phrase_low,
-                 source_phrase_high, source_back_low, source_back_high,
-                 num_symbols, met_constraints)) {
+  if (!helper->GetGaps(source_gaps, target_gaps, matching, chunklen, source_low,
+                       source_high, target_low, target_high, source_phrase_low,
+                       source_phrase_high, source_back_low, source_back_high,
+                       num_symbols, met_constraints)) {
    return extracts;
  }
 
-  bool start_x = source_back_low != source_phrase_low;
-  bool end_x = source_back_high != source_phrase_high;
-  Phrase source_phrase = phrase_builder->Extend(phrase, start_x, end_x);
+  bool starts_with_x = source_back_low != source_phrase_low;
+  bool ends_with_x = source_back_high != source_phrase_high;
+  Phrase source_phrase = phrase_builder->Extend(
+      phrase, starts_with_x, ends_with_x);
+  unordered_map<int, int> source_indexes = helper->GetSourceIndexes(
+      matching, chunklen, starts_with_x);
  if (met_constraints) {
-    AddExtracts(extracts, source_phrase, target_gaps, target_low,
-                target_phrase_low, target_phrase_high, sentence_id);
+    AddExtracts(extracts, source_phrase, source_indexes, target_gaps,
+                target_low, target_phrase_low, target_phrase_high, sentence_id);
  }
 
  if (source_gaps.size() >= max_nonterminals ||
@@ -145,317 +181,24 @@ vector<Extract> RuleExtractor::ExtractAlignments(
 
  for (int i = 0; i < 2; ++i) {
    for (int j = 1 - i; j < 2; ++j) {
-      AddNonterminalExtremities(extracts, source_phrase, source_phrase_low,
-          source_phrase_high, source_back_low, source_back_high, source_low,
-          source_high, target_low, target_high, target_gaps, sentence_id, i, j);
+      AddNonterminalExtremities(extracts, matching, chunklen, source_phrase,
+          source_back_low, source_back_high, source_low, source_high,
+          target_low, target_high, target_gaps, sentence_id, starts_with_x,
+          ends_with_x, i, j);
    }
  }
 
  return extracts;
 }
 
-void RuleExtractor::GetLinksSpans(
-    vector<int>& source_low, vector<int>& source_high,
-    vector<int>& target_low, vector<int>& target_high, int sentence_id) const {
-  // Ignore end of line markers.
-  int source_sent_len = source_data_array->GetSentenceStart(sentence_id + 1) -
-      source_data_array->GetSentenceStart(sentence_id) - 1;
-  int target_sent_len = target_data_array->GetSentenceStart(sentence_id + 1) -
-      target_data_array->GetSentenceStart(sentence_id) - 1;
-  source_low = vector<int>(source_sent_len, -1);
-  source_high = vector<int>(source_sent_len, -1);
-
-  // TODO(pauldb): Adam Lopez claims this part is really inefficient. See if we
-  // can speed it up.
-  target_low = vector<int>(target_sent_len, -1);
-  target_high = vector<int>(target_sent_len, -1);
-  const vector<pair<int, int> >& links = alignment->GetLinks(sentence_id);
-  for (auto link: links) {
-    if (source_low[link.first] == -1 || source_low[link.first] > link.second) {
-      source_low[link.first] = link.second;
-    }
-    source_high[link.first] = max(source_high[link.first], link.second + 1);
-
-    if (target_low[link.second] == -1 || target_low[link.second] > link.first) {
-      target_low[link.second] = link.first;
-    }
-    target_high[link.second] = max(target_high[link.second], link.first + 1);
-  }
-}
-
-bool RuleExtractor::CheckAlignedTerminals(const vector<int>& matching,
-                                          const vector<int>& chunklen,
-                                          const vector<int>& source_low) const {
-  if (!require_aligned_terminal) {
-    return true;
-  }
-
-  int sentence_id = source_data_array->GetSentenceId(matching[0]);
-  int source_sent_start = source_data_array->GetSentenceStart(sentence_id);
-
-  int num_aligned_chunks = 0;
-  for (size_t i = 0; i < chunklen.size(); ++i) {
-    for (size_t j = 0; j < chunklen[i]; ++j) {
-      int sent_index = matching[i] - source_sent_start + j;
-      if (source_low[sent_index] != -1) {
-        ++num_aligned_chunks;
-        break;
-      }
-    }
-  }
-
-  if (num_aligned_chunks == 0) {
-    return false;
-  }
-
-  return !require_aligned_chunks || num_aligned_chunks == chunklen.size();
-}
-
-bool RuleExtractor::CheckTightPhrases(const vector<int>& matching,
-                                      const vector<int>& chunklen,
-                                      const vector<int>& source_low) const {
-  if (!require_tight_phrases) {
-    return true;
-  }
-
-  int sentence_id = source_data_array->GetSentenceId(matching[0]);
-  int source_sent_start = source_data_array->GetSentenceStart(sentence_id);
-  for (size_t i = 0; i + 1 < chunklen.size(); ++i) {
-    int gap_start = matching[i] + chunklen[i] - source_sent_start;
-    int gap_end = matching[i + 1] - 1 - source_sent_start;
-    if (source_low[gap_start] == -1 || source_low[gap_end] == -1) {
-      return false;
-    }
-  }
-
-  return true;
-}
-
-bool RuleExtractor::FindFixPoint(
-    int source_phrase_low, int source_phrase_high,
-    const vector<int>& source_low, const vector<int>& source_high,
-    int& target_phrase_low, int& target_phrase_high,
-    const vector<int>& target_low, const vector<int>& target_high,
-    int& source_back_low, int& source_back_high, int sentence_id,
-    int min_source_gap_size, int min_target_gap_size,
-    int max_new_x, int max_low_x, int max_high_x,
-    bool allow_arbitrary_expansion) const {
-  int source_sent_len = source_data_array->GetSentenceStart(sentence_id + 1) -
-      source_data_array->GetSentenceStart(sentence_id) - 1;
-  int target_sent_len = target_data_array->GetSentenceStart(sentence_id + 1) -
-      target_data_array->GetSentenceStart(sentence_id) - 1;
-
-  int prev_target_low = target_phrase_low;
-  int prev_target_high = target_phrase_high;
-  FindProjection(source_phrase_low, source_phrase_high, source_low,
-                 source_high, target_phrase_low, target_phrase_high);
-
-  if (target_phrase_low == -1) {
-    // TODO(pauldb): Low priority corner case inherited from Adam's code:
-    // If w is unaligned, but we don't require aligned terminals, returning an
-    // error here prevents the extraction of the allowed rule
-    // X -> X_1 w X_2 / X_1 X_2
-    return false;
-  }
-
-  if (prev_target_low != -1 && target_phrase_low != prev_target_low) {
-    if (prev_target_low - target_phrase_low < min_target_gap_size) {
-      target_phrase_low = prev_target_low - min_target_gap_size;
-      if (target_phrase_low < 0) {
-        return false;
-      }
-    }
-  }
-
-  if (prev_target_high != -1 && target_phrase_high != prev_target_high) {
-    if (target_phrase_high - prev_target_high < min_target_gap_size) {
-      target_phrase_high = prev_target_high + min_target_gap_size;
-      if (target_phrase_high > target_sent_len) {
-        return false;
-      }
-    }
-  }
-
-  if (target_phrase_high - target_phrase_low > max_rule_span) {
-    return false;
-  }
-
-  source_back_low = source_back_high = -1;
-  FindProjection(target_phrase_low, target_phrase_high, target_low, target_high,
-                 source_back_low, source_back_high);
-  int new_x = 0, new_low_x = 0, new_high_x = 0;
-
-  while (true) {
-    source_back_low = min(source_back_low, source_phrase_low);
-    source_back_high = max(source_back_high, source_phrase_high);
-
-    if (source_back_low == source_phrase_low &&
-        source_back_high == source_phrase_high) {
-      return true;
-    }
-
-    if (new_low_x >= max_low_x && source_back_low < source_phrase_low) {
-      // Extension on the left side not allowed.
-      return false;
-    }
-    if (new_high_x >= max_high_x && source_back_high > source_phrase_high) {
-      // Extension on the right side not allowed.
-      return false;
-    }
-
-    // Extend left side.
-    if (source_back_low < source_phrase_low) {
-      if (new_x >= max_new_x) {
-        return false;
-      }
-      ++new_x; ++new_low_x;
-      if (source_phrase_low - source_back_low < min_source_gap_size) {
-        source_back_low = source_phrase_low - min_source_gap_size;
-        if (source_back_low < 0) {
-          return false;
-        }
-      }
-    }
-
-    // Extend right side.
-    if (source_back_high > source_phrase_high) {
-      if (new_x >= max_new_x) {
-        return false;
-      }
-      ++new_x; ++new_high_x;
-      if (source_back_high - source_phrase_high < min_source_gap_size) {
-        source_back_high = source_phrase_high + min_source_gap_size;
-        if (source_back_high > source_sent_len) {
-          return false;
-        }
-      }
-    }
-
-    if (source_back_high - source_back_low > max_rule_span) {
-      // Rule span too wide.
-      return false;
-    }
-
-    prev_target_low = target_phrase_low;
-    prev_target_high = target_phrase_high;
-    FindProjection(source_back_low, source_phrase_low, source_low, source_high,
-                   target_phrase_low, target_phrase_high);
-    FindProjection(source_phrase_high, source_back_high, source_low,
-                   source_high, target_phrase_low, target_phrase_high);
-    if (prev_target_low == target_phrase_low &&
-        prev_target_high == target_phrase_high) {
-      return true;
-    }
-
-    if (!allow_arbitrary_expansion) {
-      // Arbitrary expansion not allowed.
-      return false;
-    }
-    if (target_phrase_high - target_phrase_low > max_rule_span) {
-      // Target side too wide.
-      return false;
-    }
-
-    source_phrase_low = source_back_low;
-    source_phrase_high = source_back_high;
-    FindProjection(target_phrase_low, prev_target_low, target_low, target_high,
-                   source_back_low, source_back_high);
-    FindProjection(prev_target_high, target_phrase_high, target_low,
-                   target_high, source_back_low, source_back_high);
-  }
-
-  return false;
-}
-
-void RuleExtractor::FindProjection(
-    int source_phrase_low, int source_phrase_high,
-    const vector<int>& source_low, const vector<int>& source_high,
-    int& target_phrase_low, int& target_phrase_high) const {
-  for (size_t i = source_phrase_low; i < source_phrase_high; ++i) {
-    if (source_low[i] != -1) {
-      if (target_phrase_low == -1 || source_low[i] < target_phrase_low) {
-        target_phrase_low = source_low[i];
-      }
-      target_phrase_high = max(target_phrase_high, source_high[i]);
-    }
-  }
-}
-
-bool RuleExtractor::CheckGaps(
-    vector<pair<int, int> >& source_gaps, vector<pair<int, int> >& target_gaps,
-    const vector<int>& matching, const vector<int>& chunklen,
-    const vector<int>& source_low, const vector<int>& source_high,
-    const vector<int>& target_low, const vector<int>& target_high,
-    int source_phrase_low, int source_phrase_high, int source_back_low,
-    int source_back_high, int& num_symbols, bool& met_constraints) const {
-  int sentence_id = source_data_array->GetSentenceId(matching[0]);
-  int source_sent_start = source_data_array->GetSentenceStart(sentence_id);
-
-  if (source_back_low < source_phrase_low) {
-    source_gaps.push_back(make_pair(source_back_low, source_phrase_low));
-    if (num_symbols >= max_rule_symbols) {
-      // Source side contains too many symbols.
-      return false;
-    }
-    ++num_symbols;
-    if (require_tight_phrases && (source_low[source_back_low] == -1 ||
-        source_low[source_phrase_low - 1] == -1)) {
-      // Inside edges of preceding gap are not tight.
-      return false;
-    }
-  } else if (require_tight_phrases && source_low[source_phrase_low] == -1) {
-    // This is not a hard error. We can't extract this phrase, but we might
-    // still be able to extract a superphrase.
-    met_constraints = false;
-  }
-
-  for (size_t i = 0; i + 1 < chunklen.size(); ++i) {
-    int gap_start = matching[i] + chunklen[i] - source_sent_start;
-    int gap_end = matching[i + 1] - source_sent_start;
-    source_gaps.push_back(make_pair(gap_start, gap_end));
-  }
-
-  if (source_phrase_high < source_back_high) {
-    source_gaps.push_back(make_pair(source_phrase_high, source_back_high));
-    if (num_symbols >= max_rule_symbols) {
-      // Source side contains too many symbols.
-      return false;
-    }
-    ++num_symbols;
-    if (require_tight_phrases && (source_low[source_phrase_high] == -1 ||
-        source_low[source_back_high - 1] == -1)) {
-      // Inside edges of following gap are not tight.
-      return false;
-    }
-  } else if (require_tight_phrases &&
-             source_low[source_phrase_high - 1] == -1) {
-    // This is not a hard error. We can't extract this phrase, but we might
-    // still be able to extract a superphrase.
-    met_constraints = false;
-  }
-
-  target_gaps.resize(source_gaps.size(), make_pair(-1, -1));
-  for (size_t i = 0; i < source_gaps.size(); ++i) {
-    if (!FindFixPoint(source_gaps[i].first, source_gaps[i].second, source_low,
-                      source_high, target_gaps[i].first, target_gaps[i].second,
-                      target_low, target_high, source_gaps[i].first,
-                      source_gaps[i].second, sentence_id, 0, 0, 0, 0, 0,
-                      false)) {
-      // Gap fails integrity check.
-      return false;
-    }
-  }
-
-  return true;
-}
-
 void RuleExtractor::AddExtracts(
     vector<Extract>& extracts, const Phrase& source_phrase,
+    const unordered_map<int, int>& source_indexes,
     const vector<pair<int, int> >& target_gaps, const vector<int>& target_low,
     int target_phrase_low, int target_phrase_high, int sentence_id) const {
-  vector<pair<Phrase, PhraseAlignment> > target_phrases = ExtractTargetPhrases(
+  auto target_phrases = target_phrase_extractor->ExtractPhrases(
      target_gaps, target_low, target_phrase_low, target_phrase_high,
-      sentence_id);
+      source_indexes, sentence_id);
 
  if (target_phrases.size() > 0) {
    double pairs_count = 1.0 / target_phrases.size();
@@ -466,147 +209,29 @@ void RuleExtractor::AddExtracts(
  }
 }
 
-vector<pair<Phrase, PhraseAlignment> > RuleExtractor::ExtractTargetPhrases(
-    const vector<pair<int, int> >& target_gaps, const vector<int>& target_low,
-    int target_phrase_low, int target_phrase_high, int sentence_id) const {
-  int target_sent_len = target_data_array->GetSentenceStart(sentence_id + 1) -
-      target_data_array->GetSentenceStart(sentence_id) - 1;
-
-  vector<int> target_gap_order(target_gaps.size());
-  for (size_t i = 0; i < target_gap_order.size(); ++i) {
-    for (size_t j = 0; j < i; ++j) {
-      if (target_gaps[target_gap_order[j]] < target_gaps[i]) {
-        ++target_gap_order[i];
-      } else {
-        ++target_gap_order[j];
-      }
-    }
-  }
-
-  int target_x_low = target_phrase_low, target_x_high = target_phrase_high;
-  if (!require_tight_phrases) {
-    while (target_x_low > 0 &&
-           target_phrase_high - target_x_low < max_rule_span &&
-           target_low[target_x_low - 1] == -1) {
-      --target_x_low;
-    }
-    while (target_x_high + 1 < target_sent_len &&
-           target_x_high - target_phrase_low < max_rule_span &&
-           target_low[target_x_high + 1] == -1) {
-      ++target_x_high;
-    }
-  }
-
-  vector<pair<int, int> > gaps(target_gaps.size());
-  for (size_t i = 0; i < gaps.size(); ++i) {
-    gaps[i] = target_gaps[target_gap_order[i]];
-    if (!require_tight_phrases) {
-      while (gaps[i].first > target_x_low &&
-             target_low[gaps[i].first] == -1) {
-        --gaps[i].first;
-      }
-      while (gaps[i].second < target_x_high &&
-             target_low[gaps[i].second] == -1) {
-        ++gaps[i].second;
-      }
-    }
-  }
-
-  vector<pair<int, int> > ranges(2 * gaps.size() + 2);
-  ranges.front() = make_pair(target_x_low, target_phrase_low);
-  ranges.back() = make_pair(target_phrase_high, target_x_high);
-  for (size_t i = 0; i < gaps.size(); ++i) {
-    ranges[i * 2 + 1] = make_pair(gaps[i].first, target_gaps[i].first);
-    ranges[i * 2 + 2] = make_pair(target_gaps[i].second, gaps[i].second);
-  }
-
-  vector<pair<Phrase, PhraseAlignment> > target_phrases;
-  vector<int> subpatterns(ranges.size());
-  GeneratePhrases(target_phrases, ranges, 0, subpatterns, target_gap_order,
-                  target_phrase_low, target_phrase_high, sentence_id);
-  return target_phrases;
-}
-
-void RuleExtractor::GeneratePhrases(
-    vector<pair<Phrase, PhraseAlignment> >& target_phrases,
-    const vector<pair<int, int> >& ranges, int index, vector<int>& subpatterns,
-    const vector<int>& target_gap_order, int target_phrase_low,
-    int target_phrase_high, int sentence_id) const {
-  if (index >= ranges.size()) {
-    if (subpatterns.back() - subpatterns.front() > max_rule_span) {
+void RuleExtractor::AddNonterminalExtremities(
+    vector<Extract>& extracts, const vector<int>& matching,
+    const vector<int>& chunklen, const Phrase& source_phrase,
+    int source_back_low, int source_back_high, const vector<int>& source_low,
+    const vector<int>& source_high, const vector<int>& target_low,
+    const vector<int>& target_high, vector<pair<int, int> > target_gaps,
+    int sentence_id, int starts_with_x, int ends_with_x, int extend_left,
+    int extend_right) const {
+  int source_x_low = source_back_low, source_x_high = source_back_high;
+
+  if (require_tight_phrases) {
+    if (source_low[source_back_low - extend_left] == -1 ||
+        source_low[source_back_high + extend_right - 1] == -1) {
      return;
    }
-
-    vector<int> symbols;
-    unordered_set<int> target_indexes;
-    int offset = 1;
-    if (subpatterns.front() != target_phrase_low) {
-      offset = 2;
-      symbols.push_back(vocabulary->GetNonterminalIndex(1));
-    }
-
-    int target_sent_start = target_data_array->GetSentenceStart(sentence_id);
-    for (size_t i = 0; i * 2 < subpatterns.size(); ++i) {
-      for (size_t j = subpatterns[i * 2]; j < subpatterns[i * 2 + 1]; ++j) {
-        symbols.push_back(target_data_array->AtIndex(target_sent_start + j));
-        target_indexes.insert(j);
-      }
-      if (i < target_gap_order.size()) {
-        symbols.push_back(vocabulary->GetNonterminalIndex(
-            target_gap_order[i] + offset));
-      }
-    }
-
-    if (subpatterns.back() != target_phrase_high) {
-      symbols.push_back(target_gap_order.size() + offset);
-    }
-
-    const vector<pair<int, int> >& links = alignment->GetLinks(sentence_id);
-    vector<pair<int, int> > alignment;
-    for (pair<int, int> link: links) {
-      if (target_indexes.count(link.second)) {
-        alignment.push_back(link);
-      }
-    }
-
-    target_phrases.push_back(make_pair(phrase_builder->Build(symbols),
-                                       alignment));
-    return;
-  }
-
-  subpatterns[index] = ranges[index].first;
-  if (index > 0) {
-    subpatterns[index] = max(subpatterns[index], subpatterns[index - 1]);
  }
-  while (subpatterns[index] <= ranges[index].second) {
-    subpatterns[index + 1] = max(subpatterns[index], ranges[index + 1].first);
-    while (subpatterns[index + 1] <= ranges[index + 1].second) {
-      GeneratePhrases(target_phrases, ranges, index + 2, subpatterns,
-                      target_gap_order, target_phrase_low, target_phrase_high,
-                      sentence_id);
-      ++subpatterns[index + 1];
-    }
-    ++subpatterns[index];
-  }
-}
 
-void RuleExtractor::AddNonterminalExtremities(
-    vector<Extract>& extracts, const Phrase& source_phrase,
-    int source_phrase_low, int source_phrase_high, int source_back_low,
-    int source_back_high, const vector<int>& source_low,
-    const vector<int>& source_high, const vector<int>& target_low,
-    const vector<int>& target_high, const vector<pair<int, int> >& target_gaps,
-    int sentence_id, int extend_left, int extend_right) const {
-  int source_x_low = source_phrase_low, source_x_high = source_phrase_high;
  if (extend_left) {
-    if (source_back_low != source_phrase_low ||
-        source_phrase_low < min_gap_size ||
-        (require_tight_phrases && (source_low[source_phrase_low - 1] == -1 ||
-        source_low[source_back_high - 1] == -1))) {
+    if (starts_with_x || source_back_low < min_gap_size) {
      return;
    }
-    source_x_low = source_phrase_low - min_gap_size;
+    source_x_low = source_back_low - min_gap_size;
    if (require_tight_phrases) {
      while (source_x_low >= 0 && source_low[source_x_low] == -1) {
        --source_x_low;
@@ -618,15 +243,11 @@ void RuleExtractor::AddNonterminalExtremities(
  }
 
  if (extend_right) {
-    int source_sent_len = source_data_array->GetSentenceStart(sentence_id + 1) -
-        source_data_array->GetSentenceStart(sentence_id) - 1;
-    if (source_back_high != source_phrase_high ||
-        source_phrase_high + min_gap_size > source_sent_len ||
-        (require_tight_phrases && (source_low[source_phrase_low] == -1 ||
-        source_low[source_phrase_high] == -1))) {
+    int source_sent_len = source_data_array->GetSentenceLength(sentence_id);
+    if (ends_with_x || source_back_high + min_gap_size > source_sent_len) {
      return;
    }
-    source_x_high = source_phrase_high + min_gap_size;
+    source_x_high = source_back_high + min_gap_size;
    if (require_tight_phrases) {
      while (source_x_high <= source_sent_len &&
             source_low[source_x_high - 1] == -1) {
        ++source_x_high;
@@ -639,41 +260,56 @@ void RuleExtractor::AddNonterminalExtremities(
    }
  }
 
+  int new_nonterminals = extend_left + extend_right;
  if (source_x_high - source_x_low > max_rule_span ||
-      target_gaps.size() + extend_left + extend_right > max_nonterminals) {
+      target_gaps.size() + new_nonterminals > max_nonterminals ||
+      source_phrase.GetNumSymbols() + new_nonterminals > max_rule_symbols) {
    return;
  }
 
  int target_x_low = -1, target_x_high = -1;
-  if (!FindFixPoint(source_x_low, source_x_high, source_low, source_high,
-                    target_x_low, target_x_high, target_low, target_high,
-                    source_x_low, source_x_high, sentence_id, 1, 1,
-                    extend_left + extend_right, extend_left, extend_right,
-                    true)) {
+  if (!helper->FindFixPoint(source_x_low, source_x_high, source_low,
+                            source_high, target_x_low, target_x_high,
+                            target_low, target_high, source_x_low,
+                            source_x_high, sentence_id, 1, 1,
+                            new_nonterminals, extend_left, extend_right,
+                            true)) {
    return;
  }
 
-  int source_gap_low = -1, source_gap_high = -1, target_gap_low = -1,
-      target_gap_high = -1;
-  if (extend_left &&
-      ((require_tight_phrases && source_low[source_x_low] == -1) ||
-       !FindFixPoint(source_x_low, source_phrase_low, source_low, source_high,
-                     target_gap_low, target_gap_high, target_low, target_high,
-                     source_gap_low, source_gap_high, sentence_id,
-                     0, 0, 0, 0, 0, false))) {
-    return;
+  if (extend_left) {
+    int source_gap_low = -1, source_gap_high = -1;
+    int target_gap_low = -1, target_gap_high = -1;
+    if ((require_tight_phrases && source_low[source_x_low] == -1) ||
+        !helper->FindFixPoint(source_x_low, source_back_low, source_low,
+                              source_high, target_gap_low, target_gap_high,
+                              target_low, target_high, source_gap_low,
+                              source_gap_high, sentence_id, 0, 0, 0, false,
+                              false, false)) {
+      return;
+    }
+    target_gaps.insert(target_gaps.begin(),
+                       make_pair(target_gap_low, target_gap_high));
  }
-  if (extend_right &&
-      ((require_tight_phrases && source_low[source_x_high - 1] == -1) ||
-       !FindFixPoint(source_phrase_high, source_x_high, source_low, source_high,
-                     target_gap_low, target_gap_high, target_low, target_high,
-                     source_gap_low, source_gap_high, sentence_id,
-                     0, 0, 0, 0, 0, false))) {
-    return;
+
+  if (extend_right) {
+    int target_gap_low = -1, target_gap_high = -1;
+    int source_gap_low = -1, source_gap_high = -1;
+    if ((require_tight_phrases && source_low[source_x_high - 1] == -1) ||
+        !helper->FindFixPoint(source_back_high, source_x_high, source_low,
+                              source_high, target_gap_low, target_gap_high,
+                              target_low, target_high, source_gap_low,
+                              source_gap_high, sentence_id, 0, 0, 0, false,
+                              false, false)) {
+      return;
+    }
+    target_gaps.push_back(make_pair(target_gap_low, target_gap_high));
  }
 
  Phrase new_source_phrase = phrase_builder->Extend(source_phrase, extend_left,
                                                    extend_right);
-  AddExtracts(extracts, new_source_phrase, target_gaps, target_low,
-              target_x_low, target_x_high, sentence_id);
+  unordered_map<int, int> source_indexes = helper->GetSourceIndexes(
+      matching, chunklen, extend_left || starts_with_x);
+  AddExtracts(extracts, new_source_phrase, source_indexes, target_gaps,
+              target_low, target_x_low, target_x_high, sentence_id);
 }
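The commit is a refactoring: the alignment-span machinery (GetLinksSpans, CheckAlignedTerminals, CheckTightPhrases, FindFixPoint, GetGaps) moves into a RuleExtractorHelper, and target-side phrase generation (formerly ExtractTargetPhrases/GeneratePhrases) moves into a TargetPhraseExtractor. RuleExtractor gains a second constructor that accepts these collaborators ready-made, presumably the seam for substituting test doubles. The sketch below is illustrative, not part of the commit: MakeRuleExtractor is a hypothetical helper, and the numeric limits are placeholder values rather than the project's defaults; constructor signatures follow the diff.

```cpp
// Illustrative wiring of the refactored classes (assumes the cdec extractor
// headers). The first block mirrors what the production constructor now does
// internally; the final call uses the new injection constructor.
#include <memory>

#include "alignment.h"
#include "data_array.h"
#include "phrase_builder.h"
#include "rule_extractor.h"
#include "rule_extractor_helper.h"
#include "scorer.h"
#include "target_phrase_extractor.h"
#include "vocabulary.h"

using namespace std;

shared_ptr<RuleExtractor> MakeRuleExtractor(
    shared_ptr<DataArray> source_data_array,
    shared_ptr<DataArray> target_data_array,
    shared_ptr<Alignment> alignment,
    shared_ptr<PhraseBuilder> phrase_builder,
    shared_ptr<Scorer> scorer,
    shared_ptr<Vocabulary> vocabulary) {
  // Placeholder limits (assumptions, not the project's defaults).
  int max_rule_span = 15, min_gap_size = 1;
  int max_nonterminals = 2, max_rule_symbols = 5;
  bool require_tight_phrases = true;
  // Same collaborators the first constructor builds internally.
  auto helper = make_shared<RuleExtractorHelper>(
      source_data_array, target_data_array, alignment, max_rule_span,
      max_rule_symbols, /* require_aligned_terminal= */ true,
      /* require_aligned_chunks= */ true, require_tight_phrases);
  auto target_phrase_extractor = make_shared<TargetPhraseExtractor>(
      target_data_array, alignment, phrase_builder, helper, vocabulary,
      max_rule_span, require_tight_phrases);
  // The new injection constructor: a test can pass mocks here instead.
  return make_shared<RuleExtractor>(
      source_data_array, phrase_builder, scorer, target_phrase_extractor,
      helper, max_rule_span, min_gap_size, max_nonterminals,
      max_rule_symbols, require_tight_phrases);
}
```

After the split, RuleExtractor itself keeps only the sampling/orchestration logic (ExtractRules, ExtractAlignments, AddExtracts, AddNonterminalExtremities), which likely also explains the new empty default constructor and destructor stubs added in the diff.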