summaryrefslogtreecommitdiff
path: root/extractor/rule_factory.cc
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/rule_factory.cc')
-rw-r--r--extractor/rule_factory.cc13
1 files changed, 13 insertions, 0 deletions
diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc
index fbc62e50..8c30fb9e 100644
--- a/extractor/rule_factory.cc
+++ b/extractor/rule_factory.cc
@@ -152,12 +152,18 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) {
} else {
PhraseLocation phrase_location;
if (next_phrase.Arity() > 0) {
+ // For phrases containing a nonterminal, we use either the occurrences
+ // of the prefix or the suffix to determine the occurrences of the
+ // phrase.
Clock::time_point intersect_start = Clock::now();
phrase_location = fast_intersector->Intersect(
node->matchings, next_suffix_link->matchings, next_phrase);
Clock::time_point intersect_stop = Clock::now();
total_intersect_time += GetDuration(intersect_start, intersect_stop);
} else {
+ // For phrases not containing any nonterminals, we simply query the
+ // suffix array using the suffix array range of the prefix as a
+ // starting point.
Clock::time_point lookup_start = Clock::now();
phrase_location = matchings_finder->Find(
node->matchings,
@@ -170,9 +176,12 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) {
if (phrase_location.IsEmpty()) {
continue;
}
+
+ // Create new trie node to store data about the current phrase.
next_node = make_shared<TrieNode>(
next_suffix_link, next_phrase, phrase_location);
}
+ // Add the new trie node to the trie cache.
node->AddChild(word_id, next_node);
// Automatically adds a trailing non terminal if allowed. Simply copy the
@@ -182,6 +191,7 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) {
Clock::time_point extract_start = Clock::now();
if (!state.starts_with_x) {
+ // Extract rules for the sampled set of occurrences.
PhraseLocation sample = sampler->Sample(next_node->matchings);
vector<Rule> new_rules =
rule_extractor->ExtractRules(next_phrase, sample);
@@ -193,6 +203,7 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) {
next_node = node->GetChild(word_id);
}
+ // Create more states (phrases) to be analyzed.
vector<State> new_states = ExtendState(word_ids, state, phrase, next_phrase,
next_node);
for (State new_state: new_states) {
@@ -262,6 +273,7 @@ vector<State> HieroCachingRuleFactory::ExtendState(
return new_states;
}
+ // New state for adding the next symbol.
new_states.push_back(State(state.start, state.end + 1, symbols,
state.subpatterns_start, node, state.starts_with_x));
@@ -272,6 +284,7 @@ vector<State> HieroCachingRuleFactory::ExtendState(
return new_states;
}
+ // New states for adding a nonterminal followed by a new symbol.
int var_id = vocabulary->GetNonterminalIndex(phrase.Arity() + 1);
symbols.push_back(var_id);
vector<int> subpatterns_start = state.subpatterns_start;