summaryrefslogtreecommitdiff
path: root/extractor/grammar_extractor.h
diff options
context:
space:
mode:
authorChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
committerChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
commit6d347f1ce078dede3da0e1498f75e357351c6543 (patch)
tree8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/grammar_extractor.h
parentd11b76def6899790161c47a73018146311356d8b (diff)
parent5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff)
merge paul's extractor code
Diffstat (limited to 'extractor/grammar_extractor.h')
-rw-r--r--extractor/grammar_extractor.h62
1 files changed, 62 insertions, 0 deletions
diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h
new file mode 100644
index 00000000..b36ceeb9
--- /dev/null
+++ b/extractor/grammar_extractor.h
@@ -0,0 +1,62 @@
+#ifndef _GRAMMAR_EXTRACTOR_H_
+#define _GRAMMAR_EXTRACTOR_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace extractor {
+
+class Alignment;
+class DataArray;
+class Grammar;
+class HieroCachingRuleFactory;
+class Precomputation;
+class Rule;
+class Scorer;
+class SuffixArray;
+class Vocabulary;
+
+/**
+ * Wraps all the logic for extracting synchronous context-free grammars
+ * (SCFGs); GetGrammar() is the only public operation besides construction.
+ */
+class GrammarExtractor {
+ public:
+ GrammarExtractor(
+ shared_ptr<SuffixArray> source_suffix_array,
+ shared_ptr<DataArray> target_data_array,
+ shared_ptr<Alignment> alignment,
+ shared_ptr<Precomputation> precomputation,
+ shared_ptr<Scorer> scorer,
+ int min_gap_size,
+ int max_rule_span,
+ int max_nonterminals,
+ int max_rule_symbols,
+ int max_samples,
+ bool require_tight_phrases);
+
+ // For testing only: takes the vocabulary and the rule factory directly.
+ GrammarExtractor(shared_ptr<Vocabulary> vocabulary,
+ shared_ptr<HieroCachingRuleFactory> rule_factory);
+
+ // Converts the sentence to a vector of word ids, then uses the rule factory
+ // to extract the SCFG rules which may be used to decode the sentence.
+ Grammar GetGrammar(const string& sentence);
+
+ private:
+ // Splits the sentence into a vector of words.
+ vector<string> TokenizeSentence(const string& sentence);
+
+ // Maps each word to its word id.
+ vector<int> AnnotateWords(const vector<string>& words);
+
+ shared_ptr<Vocabulary> vocabulary;
+ shared_ptr<HieroCachingRuleFactory> rule_factory;
+};
+
+} // namespace extractor
+
+#endif