diff options
Diffstat (limited to 'extractor/rule_factory.h')
-rw-r--r-- | extractor/rule_factory.h | 20 |
1 files changed, 20 insertions, 0 deletions
diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h index d8dc2ccc..52e8712a 100644 --- a/extractor/rule_factory.h +++ b/extractor/rule_factory.h @@ -25,6 +25,17 @@ class State; class SuffixArray; class Vocabulary; +/** + * Component containing most of the logic for extracting SCFG rules for a given + * sentence. + * + * Given a sentence (as a vector of word ids), this class constructs all the + * possible source phrases starting from this sentence. For each source phrase, + * it finds all its occurrences in the source data and samples some of these + * occurrences to extract aligned source-target phrase pairs. A trie cache is + * used to avoid unnecessary computations if a source phrase can be constructed + * more than once (e.g. some words occur more than once in the sentence). + */ class HieroCachingRuleFactory { public: HieroCachingRuleFactory( @@ -58,21 +69,30 @@ class HieroCachingRuleFactory { virtual ~HieroCachingRuleFactory(); + // Constructs SCFG rules for a given sentence. + // (See class description for more details.) virtual Grammar GetGrammar(const vector<int>& word_ids); protected: HieroCachingRuleFactory(); private: + // Checks if the phrase (if previously encountered) or its prefix have any + // occurrences in the source data. bool CannotHaveMatchings(shared_ptr<TrieNode> node, int word_id); + // Checks if the phrase has previously been analyzed. bool RequiresLookup(shared_ptr<TrieNode> node, int word_id); + // Creates a new state in the trie that corresponds to adding a trailing + // nonterminal to the current phrase. void AddTrailingNonterminal(vector<int> symbols, const Phrase& prefix, const shared_ptr<TrieNode>& prefix_node, bool starts_with_x); + // Extends the current state by possibly adding a nonterminal followed by a + // terminal. vector<State> ExtendState(const vector<int>& word_ids, const State& state, vector<int> symbols, |