summaryrefslogtreecommitdiff
path: root/extractor/grammar_extractor.cc
diff options
context:
space:
mode:
authorChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
committerChris Dyer <cdyer@allegro.clab.cs.cmu.edu>2013-04-23 19:35:18 -0400
commit6d347f1ce078dede3da0e1498f75e357351c6543 (patch)
tree8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/grammar_extractor.cc
parentd11b76def6899790161c47a73018146311356d8b (diff)
parent5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff)
merge paul's extractor code
Diffstat (limited to 'extractor/grammar_extractor.cc')
-rw-r--r--extractor/grammar_extractor.cc62
1 files changed, 62 insertions, 0 deletions
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc
new file mode 100644
index 00000000..8050ce7b
--- /dev/null
+++ b/extractor/grammar_extractor.cc
@@ -0,0 +1,62 @@
+#include "grammar_extractor.h"
+
+#include <iterator>
+#include <sstream>
+#include <vector>
+
+#include "grammar.h"
+#include "rule.h"
+#include "rule_factory.h"
+#include "vocabulary.h"
+
+using namespace std;
+
+namespace extractor {
+
+GrammarExtractor::GrammarExtractor(
+ shared_ptr<SuffixArray> source_suffix_array,
+ shared_ptr<DataArray> target_data_array,
+ shared_ptr<Alignment> alignment, shared_ptr<Precomputation> precomputation,
+ shared_ptr<Scorer> scorer, int min_gap_size, int max_rule_span,
+ int max_nonterminals, int max_rule_symbols, int max_samples,
+ bool require_tight_phrases) :
+ vocabulary(make_shared<Vocabulary>()),
+ rule_factory(make_shared<HieroCachingRuleFactory>(
+ source_suffix_array, target_data_array, alignment, vocabulary,
+ precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals,
+ max_rule_symbols, max_samples, require_tight_phrases)) {}
+
+GrammarExtractor::GrammarExtractor(
+ shared_ptr<Vocabulary> vocabulary,
+ shared_ptr<HieroCachingRuleFactory> rule_factory) :
+ vocabulary(vocabulary),
+ rule_factory(rule_factory) {}
+
+Grammar GrammarExtractor::GetGrammar(const string& sentence) {
+ vector<string> words = TokenizeSentence(sentence);
+ vector<int> word_ids = AnnotateWords(words);
+ return rule_factory->GetGrammar(word_ids);
+}
+
+vector<string> GrammarExtractor::TokenizeSentence(const string& sentence) {
+ vector<string> result;
+ result.push_back("<s>");
+
+ istringstream buffer(sentence);
+ copy(istream_iterator<string>(buffer),
+ istream_iterator<string>(),
+ back_inserter(result));
+
+ result.push_back("</s>");
+ return result;
+}
+
+vector<int> GrammarExtractor::AnnotateWords(const vector<string>& words) {
+ vector<int> result;
+ for (string word: words) {
+ result.push_back(vocabulary->GetTerminalIndex(word));
+ }
+ return result;
+}
+
+} // namespace extractor