summaryrefslogtreecommitdiff
path: root/extractor/rule_factory.h
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/rule_factory.h')
-rw-r--r--extractor/rule_factory.h20
1 files changed, 20 insertions, 0 deletions
diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h
index d8dc2ccc..52e8712a 100644
--- a/extractor/rule_factory.h
+++ b/extractor/rule_factory.h
@@ -25,6 +25,17 @@ class State;
class SuffixArray;
class Vocabulary;
+/**
+ * Component containing most of the logic for extracting SCFG rules for a given
+ * sentence.
+ *
+ * Given a sentence (as a vector of word ids), this class constructs all the
+ * possible source phrases starting from this sentence. For each source phrase,
+ * it finds all its occurrences in the source data and samples some of these
+ * occurrences to extract aligned source-target phrase pairs. A trie cache is
+ * used to avoid unnecessary computations if a source phrase can be constructed
+ * more than once (e.g. when a word is repeated in the sentence).
+ */
class HieroCachingRuleFactory {
public:
HieroCachingRuleFactory(
@@ -58,21 +69,30 @@ class HieroCachingRuleFactory {
virtual ~HieroCachingRuleFactory();
+ // Constructs SCFG rules for a given sentence.
+ // (See class description for more details.)
virtual Grammar GetGrammar(const vector<int>& word_ids);
protected:
HieroCachingRuleFactory();
private:
+ // Checks if the phrase (if previously encountered) or its prefix has any
+ // occurrences in the source data.
bool CannotHaveMatchings(shared_ptr<TrieNode> node, int word_id);
+ // Checks if the phrase has previously been analyzed.
bool RequiresLookup(shared_ptr<TrieNode> node, int word_id);
+ // Creates a new state in the trie that corresponds to adding a trailing
+ // nonterminal to the current phrase.
void AddTrailingNonterminal(vector<int> symbols,
const Phrase& prefix,
const shared_ptr<TrieNode>& prefix_node,
bool starts_with_x);
+ // Extends the current state by possibly adding a nonterminal followed by a
+ // terminal.
vector<State> ExtendState(const vector<int>& word_ids,
const State& state,
vector<int> symbols,