diff options
author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-11-05 15:29:46 +0100 |
---|---|---|
committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-11-05 15:29:46 +0100 |
commit | 1db70a45d59946560fbd5db6487b55a8674ef973 (patch) | |
tree | 172585dafe4d1462f22d8200e733d52dddb55b1e /extools/extract.h | |
parent | 4dd5216d3afa9ab72b150e250a3c30a5f223ce53 (diff) | |
parent | 6bbf03ac46bd57400aa9e65a321a304a234af935 (diff) |
merge upstream/master
Diffstat (limited to 'extools/extract.h')
-rw-r--r-- | extools/extract.h | 94 |
1 files changed, 0 insertions, 94 deletions
diff --git a/extools/extract.h b/extools/extract.h deleted file mode 100644 index e9ea5e65..00000000 --- a/extools/extract.h +++ /dev/null @@ -1,94 +0,0 @@ -#ifndef _EXTRACT_H_ -#define _EXTRACT_H_ - -#include <iostream> -#include <utility> -#include <vector> -#include <boost/tuple/tuple.hpp> -#include "array2d.h" -#include "wordid.h" -#include "sparse_vector.h" - -struct AnnotatedParallelSentence; - -// usually represents a consistent phrase, which may -// be annotated with a type (cat) -// inside the rule extractor, this class is also used to represent a word -// in a partial rule. -struct ParallelSpan { - // i1 = i of f side - // i2 = j of f side - // j1 = i of e side - // j2 = j of e side - short i1,i2,j1,j2; - // cat is set by AnnotatePhrasesWithCategoryTypes, otherwise it's 0 - WordID cat; // category type of span (also overloaded by RuleItem class - // to be a word ID) - ParallelSpan() : i1(-1), i2(-1), j1(-1), j2(-1), cat() {} - // used by Rule class to represent a terminal symbol: - explicit ParallelSpan(WordID w) : i1(-1), i2(-1), j1(-1), j2(-1), cat(w) {} - ParallelSpan(int pi1, int pi2, int pj1, int pj2) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat() {} - ParallelSpan(int pi1, int pi2, int pj1, int pj2, WordID c) : i1(pi1), i2(pi2), j1(pj1), j2(pj2), cat(c) {} - - // ParallelSpan is used in the Rule class where it is - // overloaded to also represent terminal symbols - inline bool IsVariable() const { return i1 != -1; } -}; - -// rule extraction logic lives here. this has no data, it's just got -// static member functions. -struct Extract { - // RuleObserver's CountRule is called for each rule extracted - // implement CountRuleImpl to do things like count the rules, - // write them to a file, etc. - struct RuleObserver { - RuleObserver() : count() {} - virtual void CountRule(WordID lhs, - const std::vector<WordID>& rhs_f, - const std::vector<WordID>& rhs_e, - const std::vector<std::pair<short, short> >& fe_terminal_alignments) { - ++count; - CountRuleImpl(lhs, rhs_f, rhs_e, fe_terminal_alignments); - } - virtual ~RuleObserver(); - - protected: - virtual void CountRuleImpl(WordID lhs, - const std::vector<WordID>& rhs_f, - const std::vector<WordID>& rhs_e, - const std::vector<std::pair<short, short> >& fe_terminal_alignments) = 0; - private: - int count; - }; - - // given a set of "tight" phrases and the aligned sentence they were - // extracted from, "loosen" them - static void LoosenPhraseBounds(const AnnotatedParallelSentence& sentence, - const int max_base_phrase_size, - std::vector<ParallelSpan>* phrases); - - // extract all consistent phrase pairs, up to size max_base_phrase_size - // (on the source side). these phrases will be "tight". - static void ExtractBasePhrases(const int max_base_phrase_size, - const AnnotatedParallelSentence& sentence, - std::vector<ParallelSpan>* phrases); - - // this uses the TARGET span (i,j) to annotate phrases, will copy - // phrases if there is more than one annotation. - static void AnnotatePhrasesWithCategoryTypes(const WordID default_cat, - const std::map< boost::tuple<short,short,short,short>, std::vector<WordID> > &types, - std::vector<ParallelSpan>* phrases); - - // use the Chiang (2007) extraction logic to extract consistent subphrases - // observer->CountRule is called once for each rule extracted - static void ExtractConsistentRules(const AnnotatedParallelSentence& sentence, - const std::vector<ParallelSpan>& phrases, - const int max_vars, - const int max_syms, - const bool permit_adjacent_nonterminals, - const bool require_aligned_terminal, - RuleObserver* observer, - std::vector<WordID>* all_cats); -}; - -#endif |