summary refs log tree commit diff
path: root/extractor/grammar_extractor.cc
diff options
context:
space:
mode:
Diffstat (limited to 'extractor/grammar_extractor.cc')
-rw-r--r-- extractor/grammar_extractor.cc 45
1 files changed, 26 insertions, 19 deletions
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc
index 15268165..2f008026 100644
--- a/extractor/grammar_extractor.cc
+++ b/extractor/grammar_extractor.cc
@@ -10,19 +10,6 @@
using namespace std;
-vector<string> Tokenize(const string& sentence) {
- vector<string> result;
- result.push_back("<s>");
-
- istringstream buffer(sentence);
- copy(istream_iterator<string>(buffer),
- istream_iterator<string>(),
- back_inserter(result));
-
- result.push_back("</s>");
- return result;
-}
-
GrammarExtractor::GrammarExtractor(
shared_ptr<SuffixArray> source_suffix_array,
shared_ptr<DataArray> target_data_array,
@@ -31,15 +18,35 @@ GrammarExtractor::GrammarExtractor(
int max_nonterminals, int max_rule_symbols, int max_samples,
bool use_baeza_yates, bool require_tight_phrases) :
vocabulary(make_shared<Vocabulary>()),
- rule_factory(source_suffix_array, target_data_array, alignment,
- vocabulary, precomputation, scorer, min_gap_size, max_rule_span,
- max_nonterminals, max_rule_symbols, max_samples, use_baeza_yates,
- require_tight_phrases) {}
+ rule_factory(make_shared<HieroCachingRuleFactory>(
+ source_suffix_array, target_data_array, alignment, vocabulary,
+ precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals,
+ max_rule_symbols, max_samples, use_baeza_yates,
+ require_tight_phrases)) {}
+
+GrammarExtractor::GrammarExtractor(
+ shared_ptr<Vocabulary> vocabulary,
+ shared_ptr<HieroCachingRuleFactory> rule_factory) :
+ vocabulary(vocabulary),
+ rule_factory(rule_factory) {}
Grammar GrammarExtractor::GetGrammar(const string& sentence) {
- vector<string> words = Tokenize(sentence);
+ vector<string> words = TokenizeSentence(sentence);
vector<int> word_ids = AnnotateWords(words);
- return rule_factory.GetGrammar(word_ids);
+ return rule_factory->GetGrammar(word_ids);
+}
+
+vector<string> GrammarExtractor::TokenizeSentence(const string& sentence) {
+ vector<string> result;
+ result.push_back("<s>");
+
+ istringstream buffer(sentence);
+ copy(istream_iterator<string>(buffer),
+ istream_iterator<string>(),
+ back_inserter(result));
+
+ result.push_back("</s>");
+ return result;
}
vector<int> GrammarExtractor::AnnotateWords(const vector<string>& words) {