merge paul's extractor code

author: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-04-23 19:35:18 -0400
committer: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-04-23 19:35:18 -0400
commit: 6d347f1ce078dede3da0e1498f75e357351c6543 (patch)
tree: 8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/fast_intersector.h
parent: d11b76def6899790161c47a73018146311356d8b (diff)
parent: 5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff)
1 files changed, 96 insertions, 0 deletions
diff --git a/extractor/fast_intersector.h b/extractor/fast_intersector.h
new file mode 100644
index 00000000..f950a2a9
--- /dev/null
+++ b/extractor/fast_intersector.h
@@ -0,0 +1,96 @@
+#ifndef _FAST_INTERSECTOR_H_
+#define _FAST_INTERSECTOR_H_
+
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include <boost/functional/hash.hpp>
+
+using namespace std;
+
+namespace extractor {
+
+typedef boost::hash<vector<int> > VectorHash;
+typedef unordered_map<vector<int>, vector<int>, VectorHash> Index;
+
+class Phrase;
+class PhraseLocation;
+class Precomputation;
+class SuffixArray;
+class Vocabulary;
+
+/**
+ * Component for searching the training data for occurrences of source phrases
+ * containing nonterminals
+ *
+ * Given a source phrase containing a nonterminal, we first query the
+ * precomputed index containing frequent collocations. If the phrase is not
+ * frequent enough, we extend the matchings of either its prefix or its suffix,
+ * depending on which operation seems to require less computations.
+ *
+ * Note: This method for intersecting phrase locations is faster than both
+ * mergers (linear or Baeza Yates) described in Adam Lopez' dissertation.
+ */
+class FastIntersector {
+ public:
+  FastIntersector(shared_ptr<SuffixArray> suffix_array,
+                  shared_ptr<Precomputation> precomputation,
+                  shared_ptr<Vocabulary> vocabulary,
+                  int max_rule_span,
+                  int min_gap_size);
+
+  virtual ~FastIntersector();
+
+  // Finds the locations of a phrase given the locations of its prefix and
+  // suffix.
+  virtual PhraseLocation Intersect(PhraseLocation& prefix_location,
+                                   PhraseLocation& suffix_location,
+                                   const Phrase& phrase);
+
+ protected:
+  FastIntersector();
+
+ private:
+  // Uses the vocabulary to convert the phrase from the numberized format
+  // specified by the source data array to the numberized format given by the
+  // vocabulary.
+  vector<int> ConvertPhrase(const vector<int>& old_phrase);
+
+  // Estimates the number of computations needed if the prefix/suffix is
+  // extended. If the last/first symbol is separated from the rest of the phrase
+  // by a nonterminal, then for each occurrence of the prefix/suffix we need to
+  // check max_rule_span positions. Otherwise, we only need to check a single
+  // position for each occurrence.
+  int EstimateNumOperations(const PhraseLocation& phrase_location,
+                            bool has_margin_x) const;
+
+  // Uses the occurrences of the prefix to find the occurrences of the phrase.
+  PhraseLocation ExtendPrefixPhraseLocation(PhraseLocation& prefix_location,
+                                            const Phrase& phrase,
+                                            bool prefix_ends_with_x,
+                                            int next_symbol) const;
+
+  // Uses the occurrences of the suffix to find the occurrences of the phrase.
+  PhraseLocation ExtendSuffixPhraseLocation(PhraseLocation& suffix_location,
+                                            const Phrase& phrase,
+                                            bool suffix_starts_with_x,
+                                            int prev_symbol) const;
+
+  // Extends the prefix/suffix location to a list of subpatterns positions if it
+  // represents a suffix array range.
+  void ExtendPhraseLocation(PhraseLocation& location) const;
+
+  // Returns the range in which the search should be performed.
+  pair<int, int> GetSearchRange(bool has_marginal_x) const;
+
+  shared_ptr<SuffixArray> suffix_array;
+  shared_ptr<Vocabulary> vocabulary;
+  int max_rule_span;
+  int min_gap_size;
+  Index collocations;
+};
+
+} // namespace extractor
+
+#endif
author	Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>	2013-04-23 19:35:18 -0400
committer	Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>	2013-04-23 19:35:18 -0400
commit	6d347f1ce078dede3da0e1498f75e357351c6543 (patch)
tree	8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/fast_intersector.h
parent	d11b76def6899790161c47a73018146311356d8b (diff)
parent	5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff)