From 5530575ae0ad939e17f08d6bd49978acea388ab7 Mon Sep 17 00:00:00 2001
From: Paul Baltescu
Date: Mon, 28 Jan 2013 11:56:31 +0000
Subject: Initial working commit.

---
 extractor/grammar_extractor.cc | 45 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 45 insertions(+)
 create mode 100644 extractor/grammar_extractor.cc

(limited to 'extractor/grammar_extractor.cc')

diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc
new file mode 100644
index 00000000..3014c2e9
--- /dev/null
+++ b/extractor/grammar_extractor.cc
@@ -0,0 +1,45 @@
+#include "grammar_extractor.h"
+
+#include <iterator>
+#include <sstream>
+#include <vector>
+
+using namespace std;
+
+vector<string> Tokenize(const string& sentence) {
+  vector<string> result;
+  result.push_back("<s>");
+
+  istringstream buffer(sentence);
+  copy(istream_iterator<string>(buffer),
+       istream_iterator<string>(),
+       back_inserter(result));
+
+  result.push_back("</s>");
+  return result;
+}
+
+GrammarExtractor::GrammarExtractor(
+    shared_ptr<SuffixArray> source_suffix_array,
+    shared_ptr<DataArray> target_data_array,
+    const Alignment& alignment, const Precomputation& precomputation,
+    int min_gap_size, int max_rule_span, int max_nonterminals,
+    int max_rule_symbols, bool use_baeza_yates) :
+    vocabulary(make_shared<Vocabulary>()),
+    rule_factory(source_suffix_array, target_data_array, alignment,
+                 vocabulary, precomputation, min_gap_size, max_rule_span,
+                 max_nonterminals, max_rule_symbols, use_baeza_yates) {}
+
+void GrammarExtractor::GetGrammar(const string& sentence) {
+  vector<string> words = Tokenize(sentence);
+  vector<int> word_ids = AnnotateWords(words);
+  rule_factory.GetGrammar(word_ids);
+}
+
+vector<int> GrammarExtractor::AnnotateWords(const vector<string>& words) {
+  vector<int> result;
+  for (string word: words) {
+    result.push_back(vocabulary->GetTerminalIndex(word));
+  }
+  return result;
+}
--
cgit v1.2.3