diff options
-rw-r--r-- | decoder/Makefile.am | 5 | ||||
-rw-r--r-- | decoder/cdec_ff.cc | 25 | ||||
-rw-r--r-- | decoder/ff_parse_match.cc | 4 | ||||
-rw-r--r-- | decoder/ff_soft_syntax.cc | 34 | ||||
-rw-r--r-- | decoder/ff_soft_syntax.h | 16 | ||||
-rw-r--r-- | decoder/ff_soft_syntax_mindist.cc (renamed from decoder/ff_soft_syntax2.cc) | 43 | ||||
-rw-r--r-- | decoder/ff_soft_syntax_mindist.h (renamed from decoder/ff_soft_syntax2.h) | 16 | ||||
-rw-r--r-- | decoder/ff_source_syntax.cc | 37 | ||||
-rw-r--r-- | decoder/ff_source_syntax.h | 10 | ||||
-rw-r--r-- | decoder/ff_source_syntax2.cc | 25 | ||||
-rw-r--r-- | decoder/ff_source_syntax2.h | 5 | ||||
-rw-r--r-- | decoder/ff_source_syntax2_p.cc | 166 | ||||
-rw-r--r-- | decoder/ff_source_syntax2_p.h | 25 | ||||
-rw-r--r-- | decoder/ff_source_syntax_p.cc | 245 | ||||
-rw-r--r-- | decoder/ff_source_syntax_p.h | 42 |
15 files changed, 110 insertions, 588 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 914faaea..e7ebe840 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -62,7 +62,6 @@ libcdec_a_SOURCES = \ ff_ruleshape.h \ ff_sample_fsa.h \ ff_source_path.h \ - ff_source_syntax.h \ ff_spans.h \ ff_tagger.h \ ff_wordalign.h \ @@ -145,11 +144,9 @@ libcdec_a_SOURCES = \ ff_source_path.cc \ ff_parse_match.cc \ ff_soft_syntax.cc \ - ff_soft_syntax2.cc \ + ff_soft_syntax_mindist.cc \ ff_source_syntax.cc \ - ff_source_syntax_p.cc \ ff_source_syntax2.cc \ - ff_source_syntax2_p.cc \ ff_bleu.cc \ ff_factory.cc \ incremental.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index e7b31f50..a36a0f5f 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -15,17 +15,11 @@ #include "ff_ruleshape.h" #include "ff_bleu.h" #include "ff_soft_syntax.h" -#include "ff_soft_syntax2.h" +#include "ff_soft_syntax_mindist.h" #include "ff_source_path.h" - - #include "ff_parse_match.h" #include "ff_source_syntax.h" -#include "ff_source_syntax_p.h" #include "ff_source_syntax2.h" -#include "ff_source_syntax2_p.h" - - #include "ff_register.h" #include "ff_charset.h" #include "ff_wordset.h" @@ -58,23 +52,12 @@ void register_feature_functions() { ff_registry.Register("NgramFeatures", new FFFactory<NgramDetector>()); ff_registry.Register("RuleContextFeatures", new FFFactory<RuleContextFeatures>()); ff_registry.Register("RuleIdentityFeatures", new FFFactory<RuleIdentityFeatures>()); - - ff_registry.Register("ParseMatchFeatures", new FFFactory<ParseMatchFeatures>); - - ff_registry.Register("SoftSyntacticFeatures", new FFFactory<SoftSyntacticFeatures>); - ff_registry.Register("SoftSyntacticFeatures2", new FFFactory<SoftSyntacticFeatures2>); - + ff_registry.Register("SoftSyntaxFeatures", new FFFactory<SoftSyntaxFeatures>); + ff_registry.Register("SoftSyntaxFeaturesMindist", new FFFactory<SoftSyntaxFeaturesMindist>); ff_registry.Register("SourceSyntaxFeatures", new FFFactory<SourceSyntaxFeatures>); - ff_registry.Register("SourceSyntaxFeatures2", new FFFactory<SourceSyntaxFeatures2>); - ff_registry.Register("SourceSpanSizeFeatures", new FFFactory<SourceSpanSizeFeatures>); - - //ff_registry.Register("PSourceSyntaxFeatures", new FFFactory<PSourceSyntaxFeatures>); - //ff_registry.Register("PSourceSpanSizeFeatures", new FFFactory<PSourceSpanSizeFeatures>); - //ff_registry.Register("PSourceSyntaxFeatures2", new FFFactory<PSourceSyntaxFeatures2>); - - + ff_registry.Register("SourceSyntaxFeatures2", new FFFactory<SourceSyntaxFeatures2>); ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory<CMR2008ReorderingFeatures>()); ff_registry.Register("RuleSourceBigramFeatures", new FFFactory<RuleSourceBigramFeatures>()); ff_registry.Register("RuleTargetBigramFeatures", new FFFactory<RuleTargetBigramFeatures>()); diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index 94634b27..7c79302b 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -13,10 +13,6 @@ using namespace std; // implements the parse match features as described in Vilar et al. (2008) // source trees must be represented in Penn Treebank format, e.g. // (S (NP John) (VP (V left))) -// -// Annotate source sentences with <seg id="..." grammar="..." src_tree="(S ...)">...</seg>" -// Note: You need to escape quite a lot of stuff in all your models! -// struct ParseMatchFeaturesImpl { ParseMatchFeaturesImpl(const string& param) { diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index d84f2e6d..a3d26135 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -13,16 +13,15 @@ using namespace std; -// Implements the soft syntactic features described in +// Implements the soft syntactic features described in // Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". // Source trees must be represented in Penn Treebank format, // e.g. (S (NP John) (VP (V left))). -struct SoftSyntacticFeaturesImpl { - SoftSyntacticFeaturesImpl(const string& param) { +struct SoftSyntaxFeaturesImpl { + SoftSyntaxFeaturesImpl(const string& param) { vector<string> labels = SplitOnWhitespace(param); - //for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; + //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; } for (unsigned int i = 0; i < labels.size(); i++) { string label = labels.at(i); pair<string, string> feat_label; @@ -34,10 +33,8 @@ struct SoftSyntacticFeaturesImpl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -99,7 +96,7 @@ struct SoftSyntacticFeaturesImpl { const WordID lhs = src_tree(i,j); string lhs_str = TD::Convert(lhs); //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; + //cerr << "RULE :"<< rule << endl; int& fid_ef = fids_ef(i,j)[&rule]; for (unsigned int i = 0; i < feat_labels.size(); i++) { ostringstream os; @@ -126,7 +123,7 @@ struct SoftSyntacticFeaturesImpl { fid_ef = FD::Convert(os.str()); if (lhs_str.compare(label) == 0) { if (fid_ef > 0) { - //cerr << "Feature: " << os.str() << endl; + //cerr << "Feature: " << os.str() << endl; feats->set_value(fid_ef, 1.0); } } @@ -147,8 +144,8 @@ struct SoftSyntacticFeaturesImpl { } } break; - case '-': - //cerr << "-" << endl; + case '-': + //cerr << "-" << endl; if (lhs_str.compare(label) != 0) { os << "SYN:" << label << "_cross"; fid_ef = FD::Convert(os.str()); @@ -167,22 +164,22 @@ struct SoftSyntacticFeaturesImpl { return lhs; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized vector<pair<string, string> > feat_labels; }; -SoftSyntacticFeatures::SoftSyntacticFeatures(const string& param) : +SoftSyntaxFeatures::SoftSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeaturesImpl(param); + impl = new SoftSyntaxFeaturesImpl(param); } -SoftSyntacticFeatures::~SoftSyntacticFeatures() { +SoftSyntaxFeatures::~SoftSyntaxFeatures() { delete impl; impl = NULL; } -void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, +void SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector<const void*>& ant_contexts, SparseVector<double>* features, @@ -196,6 +193,7 @@ void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); } -void SoftSyntacticFeatures::PrepareForInput(const SentenceMetadata& smeta) { +void SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_soft_syntax.h b/decoder/ff_soft_syntax.h index 79352f49..e71825d5 100644 --- a/decoder/ff_soft_syntax.h +++ b/decoder/ff_soft_syntax.h @@ -1,15 +1,15 @@ -#ifndef _FF_SOFTSYNTAX_H_ -#define _FF_SOFTSYNTAX_H_ +#ifndef _FF_SOFT_SYNTAX_H_ +#define _FF_SOFT_SYNTAX_H_ #include "ff.h" #include "hg.h" -struct SoftSyntacticFeaturesImpl; +struct SoftSyntaxFeaturesImpl; -class SoftSyntacticFeatures : public FeatureFunction { +class SoftSyntaxFeatures : public FeatureFunction { public: - SoftSyntacticFeatures(const std::string& param); - ~SoftSyntacticFeatures(); + SoftSyntaxFeatures(const std::string& param); + ~SoftSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -19,9 +19,9 @@ class SoftSyntacticFeatures : public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: - SoftSyntacticFeaturesImpl* impl; + SoftSyntaxFeaturesImpl* impl; }; - #endif + diff --git a/decoder/ff_soft_syntax2.cc b/decoder/ff_soft_syntax_mindist.cc index 121bc39b..3f531986 100644 --- a/decoder/ff_soft_syntax2.cc +++ b/decoder/ff_soft_syntax_mindist.cc @@ -1,4 +1,4 @@ -#include "ff_soft_syntax2.h" +#include "ff_soft_syntax_mindist.h" #include <cstdio> #include <sstream> @@ -13,16 +13,18 @@ using namespace std; -// Implements the soft syntactic features described in +// Implements the soft syntactic features described in // Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". // Source trees must be represented in Penn Treebank format, // e.g. (S (NP John) (VP (V left))). +// +// This variant accepts fuzzy matches, choosing the constituent with +// minimum distance. -struct SoftSyntacticFeatures2Impl { - SoftSyntacticFeatures2Impl(const string& param) { +struct SoftSyntaxFeaturesMindistImpl { + SoftSyntaxFeaturesMindistImpl(const string& param) { vector<string> labels = SplitOnWhitespace(param); - //for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; + //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; } for (unsigned int i = 0; i < labels.size(); i++) { string label = labels.at(i); pair<string, string> feat_label; @@ -30,14 +32,12 @@ struct SoftSyntacticFeatures2Impl { feat_label.second = label.at(label.size() - 1); feat_labels.push_back(feat_label); } - } + } void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -99,14 +99,14 @@ struct SoftSyntacticFeatures2Impl { const WordID lhs = src_tree(i,j); string lhs_str = TD::Convert(lhs); //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; + //cerr << "RULE :"<< rule << endl; int& fid_ef = fids_ef(i,j)[&rule]; string lhs_to_str = TD::Convert(lhs); int min_dist; string min_dist_label; if (lhs_to_str.compare("XX") != 0) { min_dist = 0; - min_dist_label = lhs_to_str; + min_dist_label = lhs_to_str; } else { int ok = 0; @@ -128,7 +128,7 @@ struct SoftSyntacticFeatures2Impl { min_dist_label = (TD::Convert(src_tree(l_rem, r_rem))); break; } - } + } } if (ok) break; } @@ -180,8 +180,8 @@ struct SoftSyntacticFeatures2Impl { } } break; - case '-': - //cerr << "-" << endl; + case '-': + //cerr << "-" << endl; if (min_dist_label.compare(label) != 0) { os << "SYN:" << label << "_cross"; fid_ef = FD::Convert(os.str()); @@ -200,22 +200,22 @@ struct SoftSyntacticFeatures2Impl { return lhs; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized vector<pair<string, string> > feat_labels; }; -SoftSyntacticFeatures2::SoftSyntacticFeatures2(const string& param) : +SoftSyntaxFeaturesMindist::SoftSyntaxFeaturesMindist(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeatures2Impl(param); + impl = new SoftSyntaxFeaturesMindistImpl(param); } -SoftSyntacticFeatures2::~SoftSyntacticFeatures2() { +SoftSyntaxFeaturesMindist::~SoftSyntaxFeaturesMindist() { delete impl; impl = NULL; } -void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, +void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector<const void*>& ant_contexts, SparseVector<double>* features, @@ -229,6 +229,7 @@ void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); } -void SoftSyntacticFeatures2::PrepareForInput(const SentenceMetadata& smeta) { +void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_soft_syntax2.h b/decoder/ff_soft_syntax_mindist.h index 4de91d86..bf938b38 100644 --- a/decoder/ff_soft_syntax2.h +++ b/decoder/ff_soft_syntax_mindist.h @@ -1,15 +1,15 @@ -#ifndef _FF_SOFTSYNTAX2_H_ -#define _FF_SOFTSYNTAX2_H_ +#ifndef _FF_SOFT_SYNTAX_MINDIST_H_ +#define _FF_SOFT_SYNTAX_MINDIST_H_ #include "ff.h" #include "hg.h" -struct SoftSyntacticFeatures2Impl; +struct SoftSyntaxFeaturesMindistImpl; -class SoftSyntacticFeatures2 : public FeatureFunction { +class SoftSyntaxFeaturesMindist : public FeatureFunction { public: - SoftSyntacticFeatures2(const std::string& param); - ~SoftSyntacticFeatures2(); + SoftSyntaxFeaturesMindist(const std::string& param); + ~SoftSyntaxFeaturesMindist(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -19,9 +19,9 @@ class SoftSyntacticFeatures2 : public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: - SoftSyntacticFeatures2Impl* impl; + SoftSyntaxFeaturesMindistImpl* impl; }; - #endif + diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index a1997695..34e7ab69 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -2,8 +2,8 @@ #include <sstream> #include <stack> +#include <tr1/unordered_set> -#include "hg.h" #include "sentence_metadata.h" #include "array2d.h" #include "filelib.h" @@ -24,6 +24,17 @@ inline int SpanSizeTransform(unsigned span_size) { struct SourceSyntaxFeaturesImpl { SourceSyntaxFeaturesImpl() {} + SourceSyntaxFeaturesImpl(const string& param) { + if (!(param.compare("") == 0)) { + string triggered_features_fn = param; + ReadFile triggered_features(triggered_features_fn); + string in; + while(getline(*triggered_features, in)) { + feature_filter.insert(FD::Convert(in)); + } + } + } + void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); //fids_cat.clear(); @@ -118,21 +129,28 @@ struct SourceSyntaxFeaturesImpl { } fid_ef = FD::Convert(os.str()); } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0) - feats->set_value(fid_ef, 1.0); + if (fid_ef > 0) { + if (feature_filter.size()>0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + feats->set_value(fid_ef, 1.0); + } + } else { + feats->set_value(fid_ef, 1.0); + } + } + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - // mutable Array2D<int> fids_cat; // this tends to overfit baddly - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + // mutable Array2D<int> fids_cat; // this tends to overfit baddly + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + tr1::unordered_set<int> feature_filter; }; SourceSyntaxFeatures::SourceSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SourceSyntaxFeaturesImpl; + impl = new SourceSyntaxFeaturesImpl(param); } SourceSyntaxFeatures::~SourceSyntaxFeatures() { @@ -230,4 +248,3 @@ void SourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSourceLength()); } - diff --git a/decoder/ff_source_syntax.h b/decoder/ff_source_syntax.h index a8c7150a..bdd638c1 100644 --- a/decoder/ff_source_syntax.h +++ b/decoder/ff_source_syntax.h @@ -1,7 +1,8 @@ -#ifndef _FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ +#ifndef _FF_SOURCE_SYNTAX_H_ +#define _FF_SOURCE_SYNTAX_H_ #include "ff.h" +#include "hg.h" struct SourceSyntaxFeaturesImpl; @@ -11,7 +12,7 @@ class SourceSyntaxFeatures : public FeatureFunction { ~SourceSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector<const void*>& ant_contexts, SparseVector<double>* features, SparseVector<double>* estimated_features, @@ -28,7 +29,7 @@ class SourceSpanSizeFeatures : public FeatureFunction { ~SourceSpanSizeFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector<const void*>& ant_contexts, SparseVector<double>* features, SparseVector<double>* estimated_features, @@ -39,3 +40,4 @@ class SourceSpanSizeFeatures : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2.cc b/decoder/ff_source_syntax2.cc index 08ece917..63736342 100644 --- a/decoder/ff_source_syntax2.cc +++ b/decoder/ff_source_syntax2.cc @@ -17,7 +17,7 @@ using namespace std; struct SourceSyntaxFeatures2Impl { SourceSyntaxFeatures2Impl(const string& param) { - if (!(param.compare("") == 0)) { + if (param.compare("") != 0) { string triggered_features_fn = param; ReadFile triggered_features(triggered_features_fn); string in; @@ -29,10 +29,8 @@ struct SourceSyntaxFeatures2Impl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -40,7 +38,7 @@ struct SourceSyntaxFeatures2Impl { void ParseTreeString(const string& tree, unsigned src_len) { //cerr << "TREE: " << tree << endl; - stack<pair<int, WordID> > stk; // first = i, second = category + stack<pair<int, WordID> > stk; // first = i, second = category pair<int, WordID> cur_cat; cur_cat.first = -1; unsigned i = 0; unsigned p = 0; @@ -100,7 +98,7 @@ struct SourceSyntaxFeatures2Impl { if (k > 0 && fj <= 0) os << '_'; if (fj <= 0) { os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { + }/*else { os << TD::Convert(fj); }*/ } @@ -116,16 +114,22 @@ struct SourceSyntaxFeatures2Impl { fid_ef = FD::Convert(os.str()); //cerr << "FEATURE: " << os.str() << endl; //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.find(fid_ef) != feature_filter.end()) { - cerr << "SYN-Feature was trigger more than once on training set." << endl; + if (feature_filter.size() > 0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + //cerr << "SYN-Feature was trigger more than once on training set." << endl; + feats->set_value(fid_ef, 1.0); + } + //else cerr << "SYN-Feature was triggered less than once on training set." << endli; + } + else { feats->set_value(fid_ef, 1.0); } - else cerr << "SYN-Feature was triggered less than once on training set." << endl; + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized tr1::unordered_set<int> feature_filter; }; @@ -157,3 +161,4 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_source_syntax2.h b/decoder/ff_source_syntax2.h index b6b7dc3d..f606c2bf 100644 --- a/decoder/ff_source_syntax2.h +++ b/decoder/ff_source_syntax2.h @@ -1,5 +1,5 @@ -#ifndef _FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ +#ifndef _FF_SOURCE_SYNTAX2_H_ +#define _FF_SOURCE_SYNTAX2_H_ #include "ff.h" #include "hg.h" @@ -23,3 +23,4 @@ class SourceSyntaxFeatures2 : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2_p.cc b/decoder/ff_source_syntax2_p.cc deleted file mode 100644 index dfa791ea..00000000 --- a/decoder/ff_source_syntax2_p.cc +++ /dev/null @@ -1,166 +0,0 @@ -#include "ff_source_syntax2_p.h" - -#include <sstream> -#include <stack> -#include <string> -#include <tr1/unordered_set> - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. -// (S (NP John) (VP (V left))) - -struct PSourceSyntaxFeatures2Impl { - PSourceSyntaxFeatures2Impl(const string& param) { - if (param.compare("") != 0) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - /*cerr << "find(\"One\") == " << boolalpha << (table.find("One") != table.end()) << endl; - cerr << "find(\"Three\") == " << boolalpha << (table.find("Three") != table.end()) << endl;*/ - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - //cerr << "TREE: " << tree << endl; - stack<pair<int, WordID> > stk; // first = i, second = category - pair<int, WordID> cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - ostringstream os; - os << "SYN:" << TD::Convert(lhs); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - int fj = rule.f_[k]; - if (k > 0 && fj <= 0) os << '_'; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { - os << TD::Convert(fj); - }*/ - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - //cerr << "FEATURE: " << os.str() << endl; - //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.size() > 0) { - if (feature_filter.find(fid_ef) != feature_filter.end()) { - //cerr << "SYN-Feature was trigger more than once on training set." << endl; - feats->set_value(fid_ef, 1.0); - } - //else cerr << "SYN-Feature was triggered less than once on training set." << endli; - } - else { - feats->set_value(fid_ef, 1.0); - } - return lhs; - } - - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized - tr1::unordered_set<int> feature_filter; - -}; - -PSourceSyntaxFeatures2::PSourceSyntaxFeatures2(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeatures2Impl(param); -} - -PSourceSyntaxFeatures2::~PSourceSyntaxFeatures2() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast<const WordID*>(ant_contexts[i]); - - *static_cast<WordID*>(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} diff --git a/decoder/ff_source_syntax2_p.h b/decoder/ff_source_syntax2_p.h deleted file mode 100644 index d56ecab0..00000000 --- a/decoder/ff_source_syntax2_p.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeatures2Impl; - -class PSourceSyntaxFeatures2 : public FeatureFunction { - public: - PSourceSyntaxFeatures2(const std::string& param); - ~PSourceSyntaxFeatures2(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeatures2Impl* impl; -}; - -#endif diff --git a/decoder/ff_source_syntax_p.cc b/decoder/ff_source_syntax_p.cc deleted file mode 100644 index cd081544..00000000 --- a/decoder/ff_source_syntax_p.cc +++ /dev/null @@ -1,245 +0,0 @@ -#include "ff_source_syntax_p.h" - -#include <sstream> -#include <stack> -#include <tr1/unordered_set> - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. -// (S (NP John) (VP (V left))) - -// log transform to make long spans cluster together -// but preserve differences -inline int SpanSizeTransform(unsigned span_size) { - if (!span_size) return 0; - return static_cast<int>(log(span_size+1) / log(1.39)) - 1; -} - -struct PSourceSyntaxFeaturesImpl { - PSourceSyntaxFeaturesImpl() {} - - PSourceSyntaxFeaturesImpl(const string& param) { - if (!(param.compare("") == 0)) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - stack<pair<int, WordID> > stk; // first = i, second = category - pair<int, WordID> cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - // cur_cat.second spans from cur_cat.first to i - // cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; - // NOTE: unary rule chains end up being labeled with the top-most category - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - } - // cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - //int& fid_cat = fids_cat(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - if (fid_ef <= 0) { - ostringstream os; - //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); - //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); - //fid_cat = FD::Convert(os2.str()); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0 && (feature_filter.find(fid_ef) != feature_filter.end())) - feats->set_value(fid_ef, 1.0); - return lhs; - } - - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - // mutable Array2D<int> fids_cat; // this tends to overfit baddly - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized - tr1::unordered_set<int> feature_filter; -}; - -PSourceSyntaxFeatures::PSourceSyntaxFeatures(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeaturesImpl(param); -} - -PSourceSyntaxFeatures::~PSourceSyntaxFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast<const WordID*>(ant_contexts[i]); - - *static_cast<WordID*>(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} - -struct PSourceSpanSizeFeaturesImpl { - PSourceSpanSizeFeaturesImpl() {} - - void InitializeGrids(unsigned src_len) { - fids.clear(); - fids.resize(src_len, src_len + 1); - } - - int FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) { - if (rule.Arity() > 0) { - int& fid = fids(i,j)[&rule]; - if (fid <= 0) { - ostringstream os; - os << "SSS:"; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(-fj) << ants[ntc++] << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid = FD::Convert(os.str()); - } - if (fid > 0) - feats->set_value(fid, 1.0); - } - return SpanSizeTransform(j - i); - } - - mutable Array2D<map<const TRule*, int> > fids; -}; - -PSourceSpanSizeFeatures::PSourceSpanSizeFeatures(const string& param) : - FeatureFunction(sizeof(char)) { - impl = new PSourceSpanSizeFeaturesImpl; -} - -PSourceSpanSizeFeatures::~PSourceSpanSizeFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSpanSizeFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const { - int ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast<const char*>(ant_contexts[i]); - - *static_cast<char*>(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSourceLength()); -} - - diff --git a/decoder/ff_source_syntax_p.h b/decoder/ff_source_syntax_p.h deleted file mode 100644 index 2dd9094a..00000000 --- a/decoder/ff_source_syntax_p.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeaturesImpl; - -class PSourceSyntaxFeatures : public FeatureFunction { - public: - PSourceSyntaxFeatures(const std::string& param); - ~PSourceSyntaxFeatures(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeaturesImpl* impl; -}; - -struct PSourceSpanSizeFeaturesImpl; -class PSourceSpanSizeFeatures : public FeatureFunction { - public: - PSourceSpanSizeFeatures(const std::string& param); - ~PSourceSpanSizeFeatures(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSpanSizeFeaturesImpl* impl; -}; - -#endif |