Added comments. Hooray!

author: Paul Baltescu <pauldb89@gmail.com> 2013-03-10 01:01:01 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-03-10 01:01:01 +0000
commit: e6181c89ab8f29d8bd0fc6a3a8a359cb50c2304c (patch)
tree: c05eaae595c711605e6ccb596b8b634756a95c5b /extractor/fast_intersector.h
parent: 65a67c6921ee6da0477531224effe38559739455 (diff)
1 files changed, 27 insertions, 0 deletions
diff --git a/extractor/fast_intersector.h b/extractor/fast_intersector.h
index 32c88a30..f950a2a9 100644
--- a/extractor/fast_intersector.h
+++ b/extractor/fast_intersector.h
@@ -20,6 +20,18 @@ class Precomputation;
 class SuffixArray;
 class Vocabulary;
 
+/**
+ * Component for searching the training data for occurrences of source phrases
+ * containing nonterminals
+ *
+ * Given a source phrase containing a nonterminal, we first query the
+ * precomputed index containing frequent collocations. If the phrase is not
+ * frequent enough, we extend the matchings of either its prefix or its suffix,
+ * depending on which operation seems to require less computations.
+ *
+ * Note: This method for intersecting phrase locations is faster than both
+ * mergers (linear or Baeza Yates) described in Adam Lopez' dissertation.
+ */
 class FastIntersector {
  public:
   FastIntersector(shared_ptr<SuffixArray> suffix_array,
@@ -30,6 +42,8 @@ class FastIntersector {
 
   virtual ~FastIntersector();
 
+  // Finds the locations of a phrase given the locations of its prefix and
+  // suffix.
   virtual PhraseLocation Intersect(PhraseLocation& prefix_location,
                                    PhraseLocation& suffix_location,
                                    const Phrase& phrase);
@@ -38,23 +52,36 @@ class FastIntersector {
   FastIntersector();
 
  private:
+  // Uses the vocabulary to convert the phrase from the numberized format
+  // specified by the source data array to the numberized format given by the
+  // vocabulary.
   vector<int> ConvertPhrase(const vector<int>& old_phrase);
 
+  // Estimates the number of computations needed if the prefix/suffix is
+  // extended. If the last/first symbol is separated from the rest of the phrase
+  // by a nonterminal, then for each occurrence of the prefix/suffix we need to
+  // check max_rule_span positions. Otherwise, we only need to check a single
+  // position for each occurrence.
   int EstimateNumOperations(const PhraseLocation& phrase_location,
                             bool has_margin_x) const;
 
+  // Uses the occurrences of the prefix to find the occurrences of the phrase.
   PhraseLocation ExtendPrefixPhraseLocation(PhraseLocation& prefix_location,
                                             const Phrase& phrase,
                                             bool prefix_ends_with_x,
                                             int next_symbol) const;
 
+  // Uses the occurrences of the suffix to find the occurrences of the phrase.
   PhraseLocation ExtendSuffixPhraseLocation(PhraseLocation& suffix_location,
                                             const Phrase& phrase,
                                             bool suffix_starts_with_x,
                                             int prev_symbol) const;
 
+  // Extends the prefix/suffix location to a list of subpatterns positions if it
+  // represents a suffix array range.
   void ExtendPhraseLocation(PhraseLocation& location) const;
 
+  // Returns the range in which the search should be performed.
   pair<int, int> GetSearchRange(bool has_marginal_x) const;
 
   shared_ptr<SuffixArray> suffix_array;
author	Paul Baltescu <pauldb89@gmail.com>	2013-03-10 01:01:01 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-03-10 01:01:01 +0000
commit	e6181c89ab8f29d8bd0fc6a3a8a359cb50c2304c (patch)
tree	c05eaae595c711605e6ccb596b8b634756a95c5b /extractor/fast_intersector.h
parent	65a67c6921ee6da0477531224effe38559739455 (diff)