diff options
author | Paul Baltescu <pauldb89@gmail.com> | 2013-03-10 01:01:01 +0000 |
---|---|---|
committer | Paul Baltescu <pauldb89@gmail.com> | 2013-03-10 01:01:01 +0000 |
commit | e6181c89ab8f29d8bd0fc6a3a8a359cb50c2304c (patch) | |
tree | c05eaae595c711605e6ccb596b8b634756a95c5b /extractor/fast_intersector.h | |
parent | 65a67c6921ee6da0477531224effe38559739455 (diff) |
Added comments. Hooray!
Diffstat (limited to 'extractor/fast_intersector.h')
-rw-r--r-- | extractor/fast_intersector.h | 27 |
1 files changed, 27 insertions, 0 deletions
diff --git a/extractor/fast_intersector.h b/extractor/fast_intersector.h index 32c88a30..f950a2a9 100644 --- a/extractor/fast_intersector.h +++ b/extractor/fast_intersector.h @@ -20,6 +20,18 @@ class Precomputation; class SuffixArray; class Vocabulary; +/** + * Component for searching the training data for occurrences of source phrases + * containing nonterminals + * + * Given a source phrase containing a nonterminal, we first query the + * precomputed index containing frequent collocations. If the phrase is not + * frequent enough, we extend the matchings of either its prefix or its suffix, + * depending on which operation seems to require less computations. + * + * Note: This method for intersecting phrase locations is faster than both + * mergers (linear or Baeza Yates) described in Adam Lopez' dissertation. + */ class FastIntersector { public: FastIntersector(shared_ptr<SuffixArray> suffix_array, @@ -30,6 +42,8 @@ class FastIntersector { virtual ~FastIntersector(); + // Finds the locations of a phrase given the locations of its prefix and + // suffix. virtual PhraseLocation Intersect(PhraseLocation& prefix_location, PhraseLocation& suffix_location, const Phrase& phrase); @@ -38,23 +52,36 @@ class FastIntersector { FastIntersector(); private: + // Uses the vocabulary to convert the phrase from the numberized format + // specified by the source data array to the numberized format given by the + // vocabulary. vector<int> ConvertPhrase(const vector<int>& old_phrase); + // Estimates the number of computations needed if the prefix/suffix is + // extended. If the last/first symbol is separated from the rest of the phrase + // by a nonterminal, then for each occurrence of the prefix/suffix we need to + // check max_rule_span positions. Otherwise, we only need to check a single + // position for each occurrence. int EstimateNumOperations(const PhraseLocation& phrase_location, bool has_margin_x) const; + // Uses the occurrences of the prefix to find the occurrences of the phrase. PhraseLocation ExtendPrefixPhraseLocation(PhraseLocation& prefix_location, const Phrase& phrase, bool prefix_ends_with_x, int next_symbol) const; + // Uses the occurrences of the suffix to find the occurrences of the phrase. PhraseLocation ExtendSuffixPhraseLocation(PhraseLocation& suffix_location, const Phrase& phrase, bool suffix_starts_with_x, int prev_symbol) const; + // Extends the prefix/suffix location to a list of subpatterns positions if it + // represents a suffix array range. void ExtendPhraseLocation(PhraseLocation& location) const; + // Returns the range in which the search should be performed. pair<int, int> GetSearchRange(bool has_marginal_x) const; shared_ptr<SuffixArray> suffix_array; |