diff options
Diffstat (limited to 'extractor/grammar_extractor.cc')
-rw-r--r-- | extractor/grammar_extractor.cc | 45 |
1 files changed, 26 insertions, 19 deletions
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index 15268165..2f008026 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -10,19 +10,6 @@ using namespace std; -vector<string> Tokenize(const string& sentence) { - vector<string> result; - result.push_back("<s>"); - - istringstream buffer(sentence); - copy(istream_iterator<string>(buffer), - istream_iterator<string>(), - back_inserter(result)); - - result.push_back("</s>"); - return result; -} - GrammarExtractor::GrammarExtractor( shared_ptr<SuffixArray> source_suffix_array, shared_ptr<DataArray> target_data_array, @@ -31,15 +18,35 @@ GrammarExtractor::GrammarExtractor( int max_nonterminals, int max_rule_symbols, int max_samples, bool use_baeza_yates, bool require_tight_phrases) : vocabulary(make_shared<Vocabulary>()), - rule_factory(source_suffix_array, target_data_array, alignment, - vocabulary, precomputation, scorer, min_gap_size, max_rule_span, - max_nonterminals, max_rule_symbols, max_samples, use_baeza_yates, - require_tight_phrases) {} + rule_factory(make_shared<HieroCachingRuleFactory>( + source_suffix_array, target_data_array, alignment, vocabulary, + precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals, + max_rule_symbols, max_samples, use_baeza_yates, + require_tight_phrases)) {} + +GrammarExtractor::GrammarExtractor( + shared_ptr<Vocabulary> vocabulary, + shared_ptr<HieroCachingRuleFactory> rule_factory) : + vocabulary(vocabulary), + rule_factory(rule_factory) {} Grammar GrammarExtractor::GetGrammar(const string& sentence) { - vector<string> words = Tokenize(sentence); + vector<string> words = TokenizeSentence(sentence); vector<int> word_ids = AnnotateWords(words); - return rule_factory.GetGrammar(word_ids); + return rule_factory->GetGrammar(word_ids); +} + +vector<string> GrammarExtractor::TokenizeSentence(const string& sentence) { + vector<string> result; + result.push_back("<s>"); + + istringstream buffer(sentence); + copy(istream_iterator<string>(buffer), + istream_iterator<string>(), + back_inserter(result)); + + result.push_back("</s>"); + return result; } vector<int> GrammarExtractor::AnnotateWords(const vector<string>& words) { |