Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2013-05-02 09:09:59 +0200
committer: Patrick Simianer <p@simianer.de> 2013-05-02 09:09:59 +0200
commit: 9e50f0237413180fba11b500c9dce5c600e3c157 (patch)
tree: 556fc31d231353c853a864afffddd43dc525549a /extractor/grammar_extractor.h
parent: d18024a41cbc1b54db88d499571349a6234b6db8 (diff)
parent: 14ed53426726202813a8e82d706b44266f015fe1 (diff)
1 files changed, 62 insertions, 0 deletions
diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h
new file mode 100644
index 00000000..b36ceeb9
--- /dev/null
+++ b/extractor/grammar_extractor.h
@@ -0,0 +1,62 @@
+#ifndef _GRAMMAR_EXTRACTOR_H_
+#define _GRAMMAR_EXTRACTOR_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+using namespace std;
+
+namespace extractor {
+
+class Alignment;
+class DataArray;
+class Grammar;
+class HieroCachingRuleFactory;
+class Precomputation;
+class Rule;
+class Scorer;
+class SuffixArray;
+class Vocabulary;
+
+/**
+ * Wraps all the logic for extracting synchronous context-free grammars
+ * (SCFGs); GetGrammar() is the public entry point.
+ */
+class GrammarExtractor {
+ public:
+  GrammarExtractor(
+      shared_ptr<SuffixArray> source_suffix_array,
+      shared_ptr<DataArray> target_data_array,
+      shared_ptr<Alignment> alignment,
+      shared_ptr<Precomputation> precomputation,
+      shared_ptr<Scorer> scorer,
+      int min_gap_size,
+      int max_rule_span,
+      int max_nonterminals,
+      int max_rule_symbols,
+      int max_samples,
+      bool require_tight_phrases);
+
+  // For testing only: takes the vocabulary and rule factory directly.
+  GrammarExtractor(shared_ptr<Vocabulary> vocabulary,
+                   shared_ptr<HieroCachingRuleFactory> rule_factory);
+
+  // Converts the sentence to a vector of word ids and uses the RuleFactory to
+  // extract the SCFG rules which may be used to decode the sentence.
+  Grammar GetGrammar(const string& sentence);
+
+ private:
+  // Splits the sentence into a vector of words.
+  vector<string> TokenizeSentence(const string& sentence);
+
+  // Maps the words to word ids.
+  vector<int> AnnotateWords(const vector<string>& words);
+
+  shared_ptr<Vocabulary> vocabulary;
+  shared_ptr<HieroCachingRuleFactory> rule_factory;
+};
+
+} // namespace extractor
+
+#endif
author	Patrick Simianer <p@simianer.de>	2013-05-02 09:09:59 +0200
committer	Patrick Simianer <p@simianer.de>	2013-05-02 09:09:59 +0200
commit	9e50f0237413180fba11b500c9dce5c600e3c157 (patch)
tree	556fc31d231353c853a864afffddd43dc525549a /extractor/grammar_extractor.h
parent	d18024a41cbc1b54db88d499571349a6234b6db8 (diff)
parent	14ed53426726202813a8e82d706b44266f015fe1 (diff)