diff options
Diffstat (limited to 'extractor/grammar_extractor.cc')
-rw-r--r-- | extractor/grammar_extractor.cc | 45 |
1 files changed, 45 insertions, 0 deletions
diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc new file mode 100644 index 00000000..3014c2e9 --- /dev/null +++ b/extractor/grammar_extractor.cc @@ -0,0 +1,45 @@ +#include "grammar_extractor.h" + +#include <iterator> +#include <sstream> +#include <vector> + +using namespace std; + +vector<string> Tokenize(const string& sentence) { + vector<string> result; + result.push_back("<s>"); + + istringstream buffer(sentence); + copy(istream_iterator<string>(buffer), + istream_iterator<string>(), + back_inserter(result)); + + result.push_back("</s>"); + return result; +} + +GrammarExtractor::GrammarExtractor( + shared_ptr<SuffixArray> source_suffix_array, + shared_ptr<DataArray> target_data_array, + const Alignment& alignment, const Precomputation& precomputation, + int min_gap_size, int max_rule_span, int max_nonterminals, + int max_rule_symbols, bool use_baeza_yates) : + vocabulary(make_shared<Vocabulary>()), + rule_factory(source_suffix_array, target_data_array, alignment, + vocabulary, precomputation, min_gap_size, max_rule_span, + max_nonterminals, max_rule_symbols, use_baeza_yates) {} + +void GrammarExtractor::GetGrammar(const string& sentence) { + vector<string> words = Tokenize(sentence); + vector<int> word_ids = AnnotateWords(words); + rule_factory.GetGrammar(word_ids); +} + +vector<int> GrammarExtractor::AnnotateWords(const vector<string>& words) { + vector<int> result; + for (string word: words) { + result.push_back(vocabulary->GetTerminalIndex(word)); + } + return result; +} |