diff options
Diffstat (limited to 'extractor/rule_extractor.h')
-rw-r--r-- | extractor/rule_extractor.h | 16 |
1 files changed, 16 insertions, 0 deletions
diff --git a/extractor/rule_extractor.h b/extractor/rule_extractor.h index 8b6daeea..bfec0225 100644 --- a/extractor/rule_extractor.h +++ b/extractor/rule_extractor.h @@ -22,6 +22,10 @@ class RuleExtractorHelper; class Scorer; class TargetPhraseExtractor; +/** + * Structure containing data about the occurrences of a source-target phrase pair + * in the parallel corpus. + */ struct Extract { Extract(const Phrase& source_phrase, const Phrase& target_phrase, double pairs_count, const PhraseAlignment& alignment) : @@ -34,6 +38,9 @@ struct Extract { PhraseAlignment alignment; }; +/** + * Component for extracting SCFG rules. + */ class RuleExtractor { public: RuleExtractor(shared_ptr<DataArray> source_data_array, @@ -64,6 +71,8 @@ class RuleExtractor { virtual ~RuleExtractor(); + // Extracts SCFG rules given a source phrase and a set of its occurrences + // in the source data. virtual vector<Rule> ExtractRules(const Phrase& phrase, const PhraseLocation& location) const; @@ -71,15 +80,22 @@ class RuleExtractor { RuleExtractor(); private: + // Finds all target phrases that can be aligned with the source phrase for a + // particular occurrence in the data. vector<Extract> ExtractAlignments(const Phrase& phrase, const vector<int>& matching) const; + // Extracts all target phrases for a given occurrence of the source phrase in + // the data. Constructs a vector of Extracts using these target phrases. void AddExtracts( vector<Extract>& extracts, const Phrase& source_phrase, const unordered_map<int, int>& source_indexes, const vector<pair<int, int> >& target_gaps, const vector<int>& target_low, int target_phrase_low, int target_phrase_high, int sentence_id) const; + // Adds a leading and/or trailing nonterminal to the source phrase and + // extracts target phrases that can be aligned with the extended source + // phrase. void AddNonterminalExtremities( vector<Extract>& extracts, const vector<int>& matching, const vector<int>& chunklen, const Phrase& source_phrase, |