// Declarations for rule extraction: the Extract value type and the
// RuleExtractor class. Only declarations are visible in this header; the
// member function definitions live elsewhere.
//
// NOTE(review): this copy of the header appears to have lost everything that
// was written between angle brackets: two #include directives have no header
// name, and every vector / shared_ptr below is missing its template
// arguments. Restore them from the authoritative copy before compiling; the
// comments below do not guess at the missing types unless marked as a guess.
// NOTE(review): identifiers beginning with an underscore followed by an
// uppercase letter are reserved to the implementation; consider renaming the
// guard macro.
#ifndef _RULE_EXTRACTOR_H_
#define _RULE_EXTRACTOR_H_

#include  // NOTE(review): header name missing (presumably a standard header such as <memory> or <vector> -- confirm).
#include  // NOTE(review): header name missing (presumably a standard header such as <memory> or <vector> -- confirm).
#include "phrase.h"

// NOTE(review): a using-directive at header scope leaks into every file that
// includes this header.
using namespace std;

// Forward declarations; these types are only named in declarations below.
class Alignment;
class DataArray;
class PhraseBuilder;
class PhraseLocation;
class Rule;
class Scorer;
class Vocabulary;

// NOTE(review): template arguments missing. The stray '>' suggests a nested
// template (presumably a vector of pairs -- confirm against the original).
typedef vector > PhraseAlignment;

// Plain value type bundling a source phrase, a target phrase, a count and a
// PhraseAlignment.
struct Extract {
  // Copies each argument into the member of the same name.
  Extract(const Phrase& source_phrase, const Phrase& target_phrase, double pairs_count, const PhraseAlignment& alignment) :
      source_phrase(source_phrase), target_phrase(target_phrase), pairs_count(pairs_count), alignment(alignment) {}

  Phrase source_phrase;
  Phrase target_phrase;
  double pairs_count;
  PhraseAlignment alignment;
};

// Exposes a single public operation, ExtractRules; every other member
// function is a private const helper. What each helper computes cannot be
// determined from this header alone.
class RuleExtractor {
 public:
  // Takes six shared_ptr collaborators followed by integer limits and boolean
  // switches; the data members at the bottom of the class carry the same
  // names.
  // NOTE(review): the shared_ptr element types are missing; presumably they
  // correspond to the forward-declared classes above -- confirm.
  // NOTE(review): parameter "alingment" looks like a typo for "alignment".
  RuleExtractor(shared_ptr source_data_array, shared_ptr target_data_array, shared_ptr alingment, shared_ptr phrase_builder, shared_ptr scorer, shared_ptr vocabulary, int min_gap_size, int max_rule_span, int max_nonterminals, int max_rule_symbols, bool require_aligned_terminal, bool require_aligned_chunks, bool require_tight_phrases);

  // Returns a vector (element type missing here; presumably Rule -- confirm)
  // for the given phrase and location. Const: does not modify the extractor.
  vector ExtractRules(const Phrase& phrase, const PhraseLocation& location) const;

 private:
  // Returns a vector (element type missing -- TODO confirm) for a phrase and
  // a "matching" vector.
  vector ExtractAlignments(const Phrase& phrase, const vector& matching) const;

  // The four vectors are non-const references, i.e. out or in/out parameters
  // for the sentence identified by sentence_id.
  void GetLinksSpans(vector& source_low, vector& source_high, vector& target_low, vector& target_high, int sentence_id) const;

  // Predicate over read-only inputs; presumably backs the
  // require_aligned_terminal switch -- confirm in the implementation.
  bool CheckAlignedTerminals(const vector& matching, const vector& chunklen, const vector& source_low) const;

  // Predicate over read-only inputs; presumably backs the
  // require_tight_phrases switch -- confirm in the implementation.
  bool CheckTightPhrases(const vector& matching, const vector& chunklen, const vector& source_low) const;

  // Returns a bool and may write target_phrase_start, target_phrase_end,
  // source_back_low and source_back_high (non-const int references); all
  // other parameters are read-only.
  bool FindFixPoint(
      int source_phrase_start, int source_phrase_end, const vector& source_low, const vector& source_high, int& target_phrase_start, int& target_phrase_end, const vector& target_low, const vector& target_high, int& source_back_low, int& source_back_high, int sentence_id, int min_source_gap_size, int min_target_gap_size, int max_new_x, int max_low_x, int max_high_x, bool allow_arbitrary_expansion) const;

  // Writes its result through target_phrase_low and target_phrase_end
  // (non-const int references).
  void FindProjection(
      int source_phrase_start, int source_phrase_end, const vector& source_low,
      const vector& source_high, int& target_phrase_low, int& target_phrase_end) const;

  // Returns a bool; source_gaps, target_gaps, num_symbols and met_constraints
  // are non-const references and may be written.
  bool CheckGaps(
      vector >& source_gaps, vector >& target_gaps, const vector& matching, const vector& chunklen, const vector& source_low, const vector& source_high, const vector& target_low, const vector& target_high, int source_phrase_low, int source_phrase_high, int source_back_low, int source_back_high, int& num_symbols, bool& met_constraints) const;

  // "extracts" is a non-const reference and so may be modified; presumably it
  // is appended to (element type presumably Extract -- confirm).
  void AddExtracts(
      vector& extracts, const Phrase& source_phrase, const vector >& target_gaps, const vector& target_low, int target_phrase_low, int target_phrase_high, int sentence_id) const;

  // Returns a vector of a nested template type (arguments missing -- TODO
  // confirm); all inputs are read-only.
  vector > ExtractTargetPhrases(
      const vector >& target_gaps, const vector& target_low, int target_phrase_low, int target_phrase_high, int sentence_id) const;

  // target_phrases and subpatterns are non-const references; the "index"
  // parameter suggests a recursive or stepwise walk over "ranges" -- confirm
  // in the implementation.
  void GeneratePhrases(
      vector >& target_phrases, const vector >& ranges, int index, vector& subpatterns, const vector& target_gap_order, int target_phrase_low, int target_phrase_high, int sentence_id) const;

  // "extracts" is a non-const reference and so may be modified; every other
  // parameter is read-only or passed by value.
  void AddNonterminalExtremities(
      vector& extracts, const Phrase& source_phrase, int source_phrase_low, int source_phrase_high, int source_back_low, int source_back_high, const vector& source_low, const vector& source_high, const vector& target_low, const vector& target_high, const vector >& target_gaps, int sentence_id, int extend_left, int extend_right) const;

  // Collaborators; names match the constructor parameters (except for the
  // "alingment" spelling there).
  shared_ptr source_data_array;
  shared_ptr target_data_array;
  shared_ptr alignment;
  shared_ptr phrase_builder;
  shared_ptr scorer;
  shared_ptr vocabulary;
  // NOTE(review): max_rule_span is declared before min_gap_size, the reverse
  // of the constructor's parameter order. Members are initialized in
  // declaration order, so check the constructor's initializer list.
  int max_rule_span;
  int min_gap_size;
  int max_nonterminals;
  int max_rule_symbols;
  bool require_aligned_terminal;
  bool require_aligned_chunks;
  bool require_tight_phrases;
};

#endif