diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
commit | 6d347f1ce078dede3da0e1498f75e357351c6543 (patch) | |
tree | 8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/grammar_extractor.cc | |
parent | d11b76def6899790161c47a73018146311356d8b (diff) | |
parent | 5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff) |
merge paul's extractor code
Diffstat (limited to 'extractor/grammar_extractor.cc')
-rw-r--r-- | extractor/grammar_extractor.cc | 62 |
1 files changed, 62 insertions, 0 deletions
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc new file mode 100644 index 00000000..8050ce7b --- /dev/null +++ b/extractor/grammar_extractor.cc @@ -0,0 +1,62 @@ +#include "grammar_extractor.h" + +#include <iterator> +#include <sstream> +#include <vector> + +#include "grammar.h" +#include "rule.h" +#include "rule_factory.h" +#include "vocabulary.h" + +using namespace std; + +namespace extractor { + +GrammarExtractor::GrammarExtractor( + shared_ptr<SuffixArray> source_suffix_array, + shared_ptr<DataArray> target_data_array, + shared_ptr<Alignment> alignment, shared_ptr<Precomputation> precomputation, + shared_ptr<Scorer> scorer, int min_gap_size, int max_rule_span, + int max_nonterminals, int max_rule_symbols, int max_samples, + bool require_tight_phrases) : + vocabulary(make_shared<Vocabulary>()), + rule_factory(make_shared<HieroCachingRuleFactory>( + source_suffix_array, target_data_array, alignment, vocabulary, + precomputation, scorer, min_gap_size, max_rule_span, max_nonterminals, + max_rule_symbols, max_samples, require_tight_phrases)) {} + +GrammarExtractor::GrammarExtractor( + shared_ptr<Vocabulary> vocabulary, + shared_ptr<HieroCachingRuleFactory> rule_factory) : + vocabulary(vocabulary), + rule_factory(rule_factory) {} + +Grammar GrammarExtractor::GetGrammar(const string& sentence) { + vector<string> words = TokenizeSentence(sentence); + vector<int> word_ids = AnnotateWords(words); + return rule_factory->GetGrammar(word_ids); +} + +vector<string> GrammarExtractor::TokenizeSentence(const string& sentence) { + vector<string> result; + result.push_back("<s>"); + + istringstream buffer(sentence); + copy(istream_iterator<string>(buffer), + istream_iterator<string>(), + back_inserter(result)); + + result.push_back("</s>"); + return result; +} + +vector<int> GrammarExtractor::AnnotateWords(const vector<string>& words) { + vector<int> result; + for (string word: words) { + result.push_back(vocabulary->GetTerminalIndex(word)); + } + return result; +} + +} // namespace extractor |