diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2013-04-23 19:35:18 -0400 |
commit | 6d347f1ce078dede3da0e1498f75e357351c6543 (patch) | |
tree | 8e872b8747c530e741e55e25e9917c1bd8b32c5b /extractor/grammar_extractor.h | |
parent | d11b76def6899790161c47a73018146311356d8b (diff) | |
parent | 5e9605b65202f4e5fc59843b197d88c4774f0ac8 (diff) |
merge paul's extractor code
Diffstat (limited to 'extractor/grammar_extractor.h')
-rw-r--r-- | extractor/grammar_extractor.h | 62 |
1 files changed, 62 insertions, 0 deletions
#ifndef EXTRACTOR_GRAMMAR_EXTRACTOR_H_
#define EXTRACTOR_GRAMMAR_EXTRACTOR_H_

#include <memory>
#include <string>
#include <vector>

// NOTE(review): a header-scope using-directive leaks into every translation
// unit that includes this file. It is retained only because includers may
// currently depend on it; every declaration below is fully qualified, so the
// directive can be removed once those includers have been checked.
using namespace std;

namespace extractor {

// Forward declarations: this header only names these types through
// std::shared_ptr or in function declarations, so full definitions are not
// required here.
class Alignment;
class DataArray;
class Grammar;
class HieroCachingRuleFactory;
class Precomputation;
class Rule;
class Scorer;
class SuffixArray;
class Vocabulary;

/**
 * Class wrapping all the logic for extracting the synchronous context free
 * grammars (SCFGs).
 *
 * An instance holds shared ownership of a Vocabulary and of a
 * HieroCachingRuleFactory; GetGrammar() is the single public entry point.
 */
class GrammarExtractor {
 public:
  /**
   * Builds an extractor from the individual extraction components.
   *
   * The numeric limits (min_gap_size, max_rule_span, max_nonterminals,
   * max_rule_symbols, max_samples) and the require_tight_phrases flag are
   * extraction options; their exact semantics are defined by the
   * implementation, which is not visible in this header (presumably they are
   * forwarded to the rule factory -- TODO confirm against the .cc file).
   */
  GrammarExtractor(
      std::shared_ptr<SuffixArray> source_suffix_array,
      std::shared_ptr<DataArray> target_data_array,
      std::shared_ptr<Alignment> alignment,
      std::shared_ptr<Precomputation> precomputation,
      std::shared_ptr<Scorer> scorer,
      int min_gap_size,
      int max_rule_span,
      int max_nonterminals,
      int max_rule_symbols,
      int max_samples,
      bool require_tight_phrases);

  // For testing only: takes the vocabulary and rule factory directly, the two
  // collaborators this class stores as members.
  GrammarExtractor(std::shared_ptr<Vocabulary> vocabulary,
                   std::shared_ptr<HieroCachingRuleFactory> rule_factory);

  // Converts the sentence to a vector of word ids and uses the RuleFactory to
  // extract the SCFG rules which may be used to decode the sentence.
  Grammar GetGrammar(const std::string& sentence);

 private:
  // Splits the sentence into a vector of words.
  std::vector<std::string> TokenizeSentence(const std::string& sentence);

  // Maps the words to word ids.
  std::vector<int> AnnotateWords(const std::vector<std::string>& words);

  // Declaration order is part of the class layout and of the member
  // initialization order; do not reorder.
  std::shared_ptr<Vocabulary> vocabulary;
  std::shared_ptr<HieroCachingRuleFactory> rule_factory;
};

}  // namespace extractor

#endif  // EXTRACTOR_GRAMMAR_EXTRACTOR_H_