From a9171fa0aa0ad6d7611fe079ecee464bc5f78231 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 3 Nov 2013 21:56:06 +0100 Subject: cleaned up parsematch features --- decoder/ff_parse_match.cc | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'decoder/ff_parse_match.cc') diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index ed556b91..94634b27 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -13,6 +13,10 @@ using namespace std; // implements the parse match features as described in Vilar et al. (2008) // source trees must be represented in Penn Treebank format, e.g. // (S (NP John) (VP (V left))) +// +// Annotate source sentences with ..." +// Note: You need to escape quite a lot of stuff in all your models! +// struct ParseMatchFeaturesImpl { ParseMatchFeaturesImpl(const string& param) { @@ -42,10 +46,8 @@ struct ParseMatchFeaturesImpl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -112,7 +114,7 @@ struct ParseMatchFeaturesImpl { int fid_ef = FD::Convert("PM"); int min_dist; // minimal distance to next syntactic constituent of this rule's LHS int summed_min_dists; // minimal distances of LHS and NTs summed up - if (TD::Convert(lhs).compare("XX") != 0) + if (TD::Convert(lhs).compare("XX") != 0) min_dist= 0; // compute the distance to the next syntactical constituent else { @@ -131,7 +133,7 @@ struct ParseMatchFeaturesImpl { ok = 1; break; } - // check if removing k words from the rule span will + // check if removing k words from the rule span will // lead to a syntactical constituent else { //cerr << "Hilfe...!" << endl; @@ -144,7 +146,7 @@ struct ParseMatchFeaturesImpl { ok = 1; break; } - } + } } if (ok) break; } @@ -183,9 +185,9 @@ struct ParseMatchFeaturesImpl { return min_dist; } - Array2D src_tree; // src_tree(i,j) NT = type + Array2D src_tree; // src_tree(i,j) NT = type unsigned int src_sent_len; - mutable Array2D > fids_ef; // fires for fully lexicalized + mutable Array2D > fids_ef; // fires for fully lexicalized int scoring_method; }; @@ -216,3 +218,4 @@ void ParseMatchFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, void ParseMatchFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + -- cgit v1.2.3 From 5cba65baf55b821cbc22b0ee0e3ae8dc9946ca0f Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 5 Nov 2013 16:29:03 +0100 Subject: cleaning up syntax features --- decoder/Makefile.am | 5 +- decoder/cdec_ff.cc | 25 +--- decoder/ff_parse_match.cc | 4 - decoder/ff_soft_syntax.cc | 34 +++--- decoder/ff_soft_syntax.h | 16 +-- decoder/ff_soft_syntax2.cc | 234 ------------------------------------ decoder/ff_soft_syntax2.h | 27 ----- decoder/ff_soft_syntax_mindist.cc | 235 ++++++++++++++++++++++++++++++++++++ decoder/ff_soft_syntax_mindist.h | 27 +++++ decoder/ff_source_syntax.cc | 37 ++++-- decoder/ff_source_syntax.h | 10 +- decoder/ff_source_syntax2.cc | 25 ++-- decoder/ff_source_syntax2.h | 5 +- decoder/ff_source_syntax2_p.cc | 166 -------------------------- decoder/ff_source_syntax2_p.h | 25 ---- decoder/ff_source_syntax_p.cc | 245 -------------------------------------- decoder/ff_source_syntax_p.h | 42 ------- 17 files changed, 342 insertions(+), 820 deletions(-) delete mode 100644 decoder/ff_soft_syntax2.cc delete mode 100644 decoder/ff_soft_syntax2.h create mode 100644 decoder/ff_soft_syntax_mindist.cc create mode 100644 decoder/ff_soft_syntax_mindist.h delete mode 100644 decoder/ff_source_syntax2_p.cc delete mode 100644 decoder/ff_source_syntax2_p.h delete mode 100644 decoder/ff_source_syntax_p.cc delete mode 100644 decoder/ff_source_syntax_p.h (limited to 'decoder/ff_parse_match.cc') diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 914faaea..e7ebe840 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -62,7 +62,6 @@ libcdec_a_SOURCES = \ ff_ruleshape.h \ ff_sample_fsa.h \ ff_source_path.h \ - ff_source_syntax.h \ ff_spans.h \ ff_tagger.h \ ff_wordalign.h \ @@ -145,11 +144,9 @@ libcdec_a_SOURCES = \ ff_source_path.cc \ ff_parse_match.cc \ ff_soft_syntax.cc \ - ff_soft_syntax2.cc \ + ff_soft_syntax_mindist.cc \ ff_source_syntax.cc \ - ff_source_syntax_p.cc \ ff_source_syntax2.cc \ - ff_source_syntax2_p.cc \ ff_bleu.cc \ ff_factory.cc \ incremental.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index e7b31f50..a36a0f5f 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -15,17 +15,11 @@ #include "ff_ruleshape.h" #include "ff_bleu.h" #include "ff_soft_syntax.h" -#include "ff_soft_syntax2.h" +#include "ff_soft_syntax_mindist.h" #include "ff_source_path.h" - - #include "ff_parse_match.h" #include "ff_source_syntax.h" -#include "ff_source_syntax_p.h" #include "ff_source_syntax2.h" -#include "ff_source_syntax2_p.h" - - #include "ff_register.h" #include "ff_charset.h" #include "ff_wordset.h" @@ -58,23 +52,12 @@ void register_feature_functions() { ff_registry.Register("NgramFeatures", new FFFactory()); ff_registry.Register("RuleContextFeatures", new FFFactory()); ff_registry.Register("RuleIdentityFeatures", new FFFactory()); - - ff_registry.Register("ParseMatchFeatures", new FFFactory); - - ff_registry.Register("SoftSyntacticFeatures", new FFFactory); - ff_registry.Register("SoftSyntacticFeatures2", new FFFactory); - + ff_registry.Register("SoftSyntaxFeatures", new FFFactory); + ff_registry.Register("SoftSyntaxFeaturesMindist", new FFFactory); ff_registry.Register("SourceSyntaxFeatures", new FFFactory); - ff_registry.Register("SourceSyntaxFeatures2", new FFFactory); - ff_registry.Register("SourceSpanSizeFeatures", new FFFactory); - - //ff_registry.Register("PSourceSyntaxFeatures", new FFFactory); - //ff_registry.Register("PSourceSpanSizeFeatures", new FFFactory); - //ff_registry.Register("PSourceSyntaxFeatures2", new FFFactory); - - + ff_registry.Register("SourceSyntaxFeatures2", new FFFactory); ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory()); ff_registry.Register("RuleSourceBigramFeatures", new FFFactory()); ff_registry.Register("RuleTargetBigramFeatures", new FFFactory()); diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index 94634b27..7c79302b 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -13,10 +13,6 @@ using namespace std; // implements the parse match features as described in Vilar et al. (2008) // source trees must be represented in Penn Treebank format, e.g. // (S (NP John) (VP (V left))) -// -// Annotate source sentences with ..." -// Note: You need to escape quite a lot of stuff in all your models! -// struct ParseMatchFeaturesImpl { ParseMatchFeaturesImpl(const string& param) { diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index d84f2e6d..a3d26135 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -13,16 +13,15 @@ using namespace std; -// Implements the soft syntactic features described in +// Implements the soft syntactic features described in // Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". // Source trees must be represented in Penn Treebank format, // e.g. (S (NP John) (VP (V left))). -struct SoftSyntacticFeaturesImpl { - SoftSyntacticFeaturesImpl(const string& param) { +struct SoftSyntaxFeaturesImpl { + SoftSyntaxFeaturesImpl(const string& param) { vector labels = SplitOnWhitespace(param); - //for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; + //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; } for (unsigned int i = 0; i < labels.size(); i++) { string label = labels.at(i); pair feat_label; @@ -34,10 +33,8 @@ struct SoftSyntacticFeaturesImpl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -99,7 +96,7 @@ struct SoftSyntacticFeaturesImpl { const WordID lhs = src_tree(i,j); string lhs_str = TD::Convert(lhs); //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; + //cerr << "RULE :"<< rule << endl; int& fid_ef = fids_ef(i,j)[&rule]; for (unsigned int i = 0; i < feat_labels.size(); i++) { ostringstream os; @@ -126,7 +123,7 @@ struct SoftSyntacticFeaturesImpl { fid_ef = FD::Convert(os.str()); if (lhs_str.compare(label) == 0) { if (fid_ef > 0) { - //cerr << "Feature: " << os.str() << endl; + //cerr << "Feature: " << os.str() << endl; feats->set_value(fid_ef, 1.0); } } @@ -147,8 +144,8 @@ struct SoftSyntacticFeaturesImpl { } } break; - case '-': - //cerr << "-" << endl; + case '-': + //cerr << "-" << endl; if (lhs_str.compare(label) != 0) { os << "SYN:" << label << "_cross"; fid_ef = FD::Convert(os.str()); @@ -167,22 +164,22 @@ struct SoftSyntacticFeaturesImpl { return lhs; } - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized + Array2D src_tree; // src_tree(i,j) NT = type + mutable Array2D > fids_ef; // fires for fully lexicalized vector > feat_labels; }; -SoftSyntacticFeatures::SoftSyntacticFeatures(const string& param) : +SoftSyntaxFeatures::SoftSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeaturesImpl(param); + impl = new SoftSyntaxFeaturesImpl(param); } -SoftSyntacticFeatures::~SoftSyntacticFeatures() { +SoftSyntaxFeatures::~SoftSyntaxFeatures() { delete impl; impl = NULL; } -void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, +void SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector& ant_contexts, SparseVector* features, @@ -196,6 +193,7 @@ void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); } -void SoftSyntacticFeatures::PrepareForInput(const SentenceMetadata& smeta) { +void SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_soft_syntax.h b/decoder/ff_soft_syntax.h index 79352f49..e71825d5 100644 --- a/decoder/ff_soft_syntax.h +++ b/decoder/ff_soft_syntax.h @@ -1,15 +1,15 @@ -#ifndef _FF_SOFTSYNTAX_H_ -#define _FF_SOFTSYNTAX_H_ +#ifndef _FF_SOFT_SYNTAX_H_ +#define _FF_SOFT_SYNTAX_H_ #include "ff.h" #include "hg.h" -struct SoftSyntacticFeaturesImpl; +struct SoftSyntaxFeaturesImpl; -class SoftSyntacticFeatures : public FeatureFunction { +class SoftSyntaxFeatures : public FeatureFunction { public: - SoftSyntacticFeatures(const std::string& param); - ~SoftSyntacticFeatures(); + SoftSyntaxFeatures(const std::string& param); + ~SoftSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -19,9 +19,9 @@ class SoftSyntacticFeatures : public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: - SoftSyntacticFeaturesImpl* impl; + SoftSyntaxFeaturesImpl* impl; }; - #endif + diff --git a/decoder/ff_soft_syntax2.cc b/decoder/ff_soft_syntax2.cc deleted file mode 100644 index 121bc39b..00000000 --- a/decoder/ff_soft_syntax2.cc +++ /dev/null @@ -1,234 +0,0 @@ -#include "ff_soft_syntax2.h" - -#include -#include -#include -#include -#include - -#include "sentence_metadata.h" -#include "stringlib.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// Implements the soft syntactic features described in -// Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". -// Source trees must be represented in Penn Treebank format, -// e.g. (S (NP John) (VP (V left))). - -struct SoftSyntacticFeatures2Impl { - SoftSyntacticFeatures2Impl(const string& param) { - vector labels = SplitOnWhitespace(param); - //for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; - for (unsigned int i = 0; i < labels.size(); i++) { - string label = labels.at(i); - pair feat_label; - feat_label.first = label.substr(0, label.size() - 1); - feat_label.second = label.at(label.size() - 1); - feat_labels.push_back(feat_label); - } - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - stack > stk; // first = i, second = category - pair cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - //cerr << "String " << tree << endl; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - //cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - // cur_cat.second spans from cur_cat.first to i - //cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; - // NOTE: unary rule chains end up being labeled with the top-most category - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - string lhs_str = TD::Convert(lhs); - //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; - int& fid_ef = fids_ef(i,j)[&rule]; - string lhs_to_str = TD::Convert(lhs); - int min_dist; - string min_dist_label; - if (lhs_to_str.compare("XX") != 0) { - min_dist = 0; - min_dist_label = lhs_to_str; - } - else { - int ok = 0; - for (unsigned int k = 1; k < (j - i); k++) { - min_dist = k; - for (unsigned int l = 0; l <= k; l++) { - int l_add = i-l; - int r_add = j+(k-l); - if ((l_add < src_tree.width() && r_add < src_tree.height()) && (TD::Convert(src_tree(l_add, r_add)).compare("XX") != 0)) { - ok = 1; - min_dist_label = (TD::Convert(src_tree(l_add, r_add))); - break; - } - else { - int l_rem= i+l; - int r_rem = j-(k-l); - if ((l_rem < src_tree.width() && r_rem < src_tree.height()) && TD::Convert(src_tree(l_rem, r_rem)).compare("XX") != 0) { - ok = 1; - min_dist_label = (TD::Convert(src_tree(l_rem, r_rem))); - break; - } - } - } - if (ok) break; - } - } - //cerr << "SPAN: " << i << " " << j << endl; - //cerr << "MINDIST: " << min_dist << endl; - //cerr << "MINDISTLABEL: " << min_dist_label << endl; - for (unsigned int i = 0; i < feat_labels.size(); i++) { - ostringstream os; - string label = feat_labels.at(i).first; - //cerr << "This Label: " << label << endl; - char feat_type = (char) feat_labels.at(i).second.c_str()[0]; - //cerr << "feat_type: " << feat_type << endl; - switch(feat_type) { - case '2': - if (min_dist_label.compare(label) == 0) { - if (min_dist == 0) { - os << "SYN:" << label << "_conform"; - } - else { - os << "SYN:" << label << "_cross"; - } - fid_ef = FD::Convert(os.str()); - //cerr << "Feature :" << os.str() << endl; - feats->set_value(fid_ef, 1.0); - } - break; - case '_': - os << "SYN:" << label; - fid_ef = FD::Convert(os.str()); - if (min_dist_label.compare(label) == 0) { - //cerr << "Feature: " << os.str() << endl; - if (min_dist == 0) { - feats->set_value(fid_ef, 1.0); - } - else { - //cerr << "Feature: " << os.str() << endl; - feats->set_value(fid_ef, -1.0); - } - } - break; - case '+': - if (min_dist_label.compare(label) == 0) { - os << "SYN:" << label << "_conform"; - fid_ef = FD::Convert(os.str()); - if (min_dist == 0) { - //cerr << "Feature: " << os.str() << endl; - feats->set_value(fid_ef, 1.0); - } - } - break; - case '-': - //cerr << "-" << endl; - if (min_dist_label.compare(label) != 0) { - os << "SYN:" << label << "_cross"; - fid_ef = FD::Convert(os.str()); - if (min_dist > 0) { - //cerr << "Feature :" << os.str() << endl; - feats->set_value(fid_ef, 1.0); - } - } - break; - os.clear(); - os.str(""); - } - //cerr << "FEATURE: " << os.str() << endl; - //cerr << endl; - } - return lhs; - } - - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized - vector > feat_labels; -}; - -SoftSyntacticFeatures2::SoftSyntacticFeatures2(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeatures2Impl(param); -} - -SoftSyntacticFeatures2::~SoftSyntacticFeatures2() { - delete impl; - impl = NULL; -} - -void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void SoftSyntacticFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} diff --git a/decoder/ff_soft_syntax2.h b/decoder/ff_soft_syntax2.h deleted file mode 100644 index 4de91d86..00000000 --- a/decoder/ff_soft_syntax2.h +++ /dev/null @@ -1,27 +0,0 @@ -#ifndef _FF_SOFTSYNTAX2_H_ -#define _FF_SOFTSYNTAX2_H_ - -#include "ff.h" -#include "hg.h" - -struct SoftSyntacticFeatures2Impl; - -class SoftSyntacticFeatures2 : public FeatureFunction { - public: - SoftSyntacticFeatures2(const std::string& param); - ~SoftSyntacticFeatures2(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - SoftSyntacticFeatures2Impl* impl; -}; - - - -#endif diff --git a/decoder/ff_soft_syntax_mindist.cc b/decoder/ff_soft_syntax_mindist.cc new file mode 100644 index 00000000..3f531986 --- /dev/null +++ b/decoder/ff_soft_syntax_mindist.cc @@ -0,0 +1,235 @@ +#include "ff_soft_syntax_mindist.h" + +#include +#include +#include +#include +#include + +#include "sentence_metadata.h" +#include "stringlib.h" +#include "array2d.h" +#include "filelib.h" + +using namespace std; + +// Implements the soft syntactic features described in +// Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". +// Source trees must be represented in Penn Treebank format, +// e.g. (S (NP John) (VP (V left))). +// +// This variant accepts fuzzy matches, choosing the constituent with +// minimum distance. + +struct SoftSyntaxFeaturesMindistImpl { + SoftSyntaxFeaturesMindistImpl(const string& param) { + vector labels = SplitOnWhitespace(param); + //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; } + for (unsigned int i = 0; i < labels.size(); i++) { + string label = labels.at(i); + pair feat_label; + feat_label.first = label.substr(0, label.size() - 1); + feat_label.second = label.at(label.size() - 1); + feat_labels.push_back(feat_label); + } + } + + void InitializeGrids(const string& tree, unsigned src_len) { + assert(tree.size() > 0); + fids_ef.clear(); + src_tree.clear(); + fids_ef.resize(src_len, src_len + 1); + src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); + ParseTreeString(tree, src_len); + } + + void ParseTreeString(const string& tree, unsigned src_len) { + stack > stk; // first = i, second = category + pair cur_cat; cur_cat.first = -1; + unsigned i = 0; + unsigned p = 0; + //cerr << "String " << tree << endl; + while(p < tree.size()) { + const char cur = tree[p]; + if (cur == '(') { + stk.push(cur_cat); + ++p; + unsigned k = p + 1; + while (k < tree.size() && tree[k] != ' ') { ++k; } + cur_cat.first = i; + cur_cat.second = TD::Convert(tree.substr(p, k - p)); + //cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; + p = k + 1; + } else if (cur == ')') { + unsigned k = p; + while (k < tree.size() && tree[k] == ')') { ++k; } + const unsigned num_closes = k - p; + for (unsigned ci = 0; ci < num_closes; ++ci) { + // cur_cat.second spans from cur_cat.first to i + //cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; + // NOTE: unary rule chains end up being labeled with the top-most category + src_tree(cur_cat.first, i) = cur_cat.second; + cur_cat = stk.top(); + stk.pop(); + } + p = k; + while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } + } else if (cur == ' ' || cur == '\t') { + cerr << "Unexpected whitespace in: " << tree << endl; + abort(); + } else { // terminal symbol + unsigned k = p + 1; + do { + while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } + // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; + ++i; + assert(i <= src_len); + while (k < tree.size() && tree[k] == ' ') { ++k; } + p = k; + } while (p < tree.size() && tree[p] != ')'); + } + } + //cerr << "i=" << i << " src_len=" << src_len << endl; + assert(i == src_len); // make sure tree specified in src_tree is + // the same length as the source sentence + } + + WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { + //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; + const WordID lhs = src_tree(i,j); + string lhs_str = TD::Convert(lhs); + //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; + //cerr << "RULE :"<< rule << endl; + int& fid_ef = fids_ef(i,j)[&rule]; + string lhs_to_str = TD::Convert(lhs); + int min_dist; + string min_dist_label; + if (lhs_to_str.compare("XX") != 0) { + min_dist = 0; + min_dist_label = lhs_to_str; + } + else { + int ok = 0; + for (unsigned int k = 1; k < (j - i); k++) { + min_dist = k; + for (unsigned int l = 0; l <= k; l++) { + int l_add = i-l; + int r_add = j+(k-l); + if ((l_add < src_tree.width() && r_add < src_tree.height()) && (TD::Convert(src_tree(l_add, r_add)).compare("XX") != 0)) { + ok = 1; + min_dist_label = (TD::Convert(src_tree(l_add, r_add))); + break; + } + else { + int l_rem= i+l; + int r_rem = j-(k-l); + if ((l_rem < src_tree.width() && r_rem < src_tree.height()) && TD::Convert(src_tree(l_rem, r_rem)).compare("XX") != 0) { + ok = 1; + min_dist_label = (TD::Convert(src_tree(l_rem, r_rem))); + break; + } + } + } + if (ok) break; + } + } + //cerr << "SPAN: " << i << " " << j << endl; + //cerr << "MINDIST: " << min_dist << endl; + //cerr << "MINDISTLABEL: " << min_dist_label << endl; + for (unsigned int i = 0; i < feat_labels.size(); i++) { + ostringstream os; + string label = feat_labels.at(i).first; + //cerr << "This Label: " << label << endl; + char feat_type = (char) feat_labels.at(i).second.c_str()[0]; + //cerr << "feat_type: " << feat_type << endl; + switch(feat_type) { + case '2': + if (min_dist_label.compare(label) == 0) { + if (min_dist == 0) { + os << "SYN:" << label << "_conform"; + } + else { + os << "SYN:" << label << "_cross"; + } + fid_ef = FD::Convert(os.str()); + //cerr << "Feature :" << os.str() << endl; + feats->set_value(fid_ef, 1.0); + } + break; + case '_': + os << "SYN:" << label; + fid_ef = FD::Convert(os.str()); + if (min_dist_label.compare(label) == 0) { + //cerr << "Feature: " << os.str() << endl; + if (min_dist == 0) { + feats->set_value(fid_ef, 1.0); + } + else { + //cerr << "Feature: " << os.str() << endl; + feats->set_value(fid_ef, -1.0); + } + } + break; + case '+': + if (min_dist_label.compare(label) == 0) { + os << "SYN:" << label << "_conform"; + fid_ef = FD::Convert(os.str()); + if (min_dist == 0) { + //cerr << "Feature: " << os.str() << endl; + feats->set_value(fid_ef, 1.0); + } + } + break; + case '-': + //cerr << "-" << endl; + if (min_dist_label.compare(label) != 0) { + os << "SYN:" << label << "_cross"; + fid_ef = FD::Convert(os.str()); + if (min_dist > 0) { + //cerr << "Feature :" << os.str() << endl; + feats->set_value(fid_ef, 1.0); + } + } + break; + os.clear(); + os.str(""); + } + //cerr << "FEATURE: " << os.str() << endl; + //cerr << endl; + } + return lhs; + } + + Array2D src_tree; // src_tree(i,j) NT = type + mutable Array2D > fids_ef; // fires for fully lexicalized + vector > feat_labels; +}; + +SoftSyntaxFeaturesMindist::SoftSyntaxFeaturesMindist(const string& param) : + FeatureFunction(sizeof(WordID)) { + impl = new SoftSyntaxFeaturesMindistImpl(param); +} + +SoftSyntaxFeaturesMindist::~SoftSyntaxFeaturesMindist() { + delete impl; + impl = NULL; +} + +void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const { + WordID ants[8]; + for (unsigned i = 0; i < ant_contexts.size(); ++i) + ants[i] = *static_cast(ant_contexts[i]); + + *static_cast(context) = + impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); +} + +void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) { + impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); +} + diff --git a/decoder/ff_soft_syntax_mindist.h b/decoder/ff_soft_syntax_mindist.h new file mode 100644 index 00000000..bf938b38 --- /dev/null +++ b/decoder/ff_soft_syntax_mindist.h @@ -0,0 +1,27 @@ +#ifndef _FF_SOFT_SYNTAX_MINDIST_H_ +#define _FF_SOFT_SYNTAX_MINDIST_H_ + +#include "ff.h" +#include "hg.h" + +struct SoftSyntaxFeaturesMindistImpl; + +class SoftSyntaxFeaturesMindist : public FeatureFunction { + public: + SoftSyntaxFeaturesMindist(const std::string& param); + ~SoftSyntaxFeaturesMindist(); + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector& ant_contexts, + SparseVector* features, + SparseVector* estimated_features, + void* context) const; + virtual void PrepareForInput(const SentenceMetadata& smeta); + private: + SoftSyntaxFeaturesMindistImpl* impl; +}; + + +#endif + diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index a1997695..34e7ab69 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -2,8 +2,8 @@ #include #include +#include -#include "hg.h" #include "sentence_metadata.h" #include "array2d.h" #include "filelib.h" @@ -24,6 +24,17 @@ inline int SpanSizeTransform(unsigned span_size) { struct SourceSyntaxFeaturesImpl { SourceSyntaxFeaturesImpl() {} + SourceSyntaxFeaturesImpl(const string& param) { + if (!(param.compare("") == 0)) { + string triggered_features_fn = param; + ReadFile triggered_features(triggered_features_fn); + string in; + while(getline(*triggered_features, in)) { + feature_filter.insert(FD::Convert(in)); + } + } + } + void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); //fids_cat.clear(); @@ -118,21 +129,28 @@ struct SourceSyntaxFeaturesImpl { } fid_ef = FD::Convert(os.str()); } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0) - feats->set_value(fid_ef, 1.0); + if (fid_ef > 0) { + if (feature_filter.size()>0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + feats->set_value(fid_ef, 1.0); + } + } else { + feats->set_value(fid_ef, 1.0); + } + } + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D src_tree; // src_tree(i,j) NT = type - // mutable Array2D fids_cat; // this tends to overfit baddly - mutable Array2D > fids_ef; // fires for fully lexicalized + Array2D src_tree; // src_tree(i,j) NT = type + // mutable Array2D fids_cat; // this tends to overfit baddly + mutable Array2D > fids_ef; // fires for fully lexicalized + tr1::unordered_set feature_filter; }; SourceSyntaxFeatures::SourceSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SourceSyntaxFeaturesImpl; + impl = new SourceSyntaxFeaturesImpl(param); } SourceSyntaxFeatures::~SourceSyntaxFeatures() { @@ -230,4 +248,3 @@ void SourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSourceLength()); } - diff --git a/decoder/ff_source_syntax.h b/decoder/ff_source_syntax.h index a8c7150a..bdd638c1 100644 --- a/decoder/ff_source_syntax.h +++ b/decoder/ff_source_syntax.h @@ -1,7 +1,8 @@ -#ifndef _FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ +#ifndef _FF_SOURCE_SYNTAX_H_ +#define _FF_SOURCE_SYNTAX_H_ #include "ff.h" +#include "hg.h" struct SourceSyntaxFeaturesImpl; @@ -11,7 +12,7 @@ class SourceSyntaxFeatures : public FeatureFunction { ~SourceSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector& ant_contexts, SparseVector* features, SparseVector* estimated_features, @@ -28,7 +29,7 @@ class SourceSpanSizeFeatures : public FeatureFunction { ~SourceSpanSizeFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector& ant_contexts, SparseVector* features, SparseVector* estimated_features, @@ -39,3 +40,4 @@ class SourceSpanSizeFeatures : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2.cc b/decoder/ff_source_syntax2.cc index 08ece917..63736342 100644 --- a/decoder/ff_source_syntax2.cc +++ b/decoder/ff_source_syntax2.cc @@ -17,7 +17,7 @@ using namespace std; struct SourceSyntaxFeatures2Impl { SourceSyntaxFeatures2Impl(const string& param) { - if (!(param.compare("") == 0)) { + if (param.compare("") != 0) { string triggered_features_fn = param; ReadFile triggered_features(triggered_features_fn); string in; @@ -29,10 +29,8 @@ struct SourceSyntaxFeatures2Impl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -40,7 +38,7 @@ struct SourceSyntaxFeatures2Impl { void ParseTreeString(const string& tree, unsigned src_len) { //cerr << "TREE: " << tree << endl; - stack > stk; // first = i, second = category + stack > stk; // first = i, second = category pair cur_cat; cur_cat.first = -1; unsigned i = 0; unsigned p = 0; @@ -100,7 +98,7 @@ struct SourceSyntaxFeatures2Impl { if (k > 0 && fj <= 0) os << '_'; if (fj <= 0) { os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { + }/*else { os << TD::Convert(fj); }*/ } @@ -116,16 +114,22 @@ struct SourceSyntaxFeatures2Impl { fid_ef = FD::Convert(os.str()); //cerr << "FEATURE: " << os.str() << endl; //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.find(fid_ef) != feature_filter.end()) { - cerr << "SYN-Feature was trigger more than once on training set." << endl; + if (feature_filter.size() > 0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + //cerr << "SYN-Feature was trigger more than once on training set." << endl; + feats->set_value(fid_ef, 1.0); + } + //else cerr << "SYN-Feature was triggered less than once on training set." << endli; + } + else { feats->set_value(fid_ef, 1.0); } - else cerr << "SYN-Feature was triggered less than once on training set." << endl; + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized + Array2D src_tree; // src_tree(i,j) NT = type + mutable Array2D > fids_ef; // fires for fully lexicalized tr1::unordered_set feature_filter; }; @@ -157,3 +161,4 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); } + diff --git a/decoder/ff_source_syntax2.h b/decoder/ff_source_syntax2.h index b6b7dc3d..f606c2bf 100644 --- a/decoder/ff_source_syntax2.h +++ b/decoder/ff_source_syntax2.h @@ -1,5 +1,5 @@ -#ifndef _FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ +#ifndef _FF_SOURCE_SYNTAX2_H_ +#define _FF_SOURCE_SYNTAX2_H_ #include "ff.h" #include "hg.h" @@ -23,3 +23,4 @@ class SourceSyntaxFeatures2 : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2_p.cc b/decoder/ff_source_syntax2_p.cc deleted file mode 100644 index dfa791ea..00000000 --- a/decoder/ff_source_syntax2_p.cc +++ /dev/null @@ -1,166 +0,0 @@ -#include "ff_source_syntax2_p.h" - -#include -#include -#include -#include - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. -// (S (NP John) (VP (V left))) - -struct PSourceSyntaxFeatures2Impl { - PSourceSyntaxFeatures2Impl(const string& param) { - if (param.compare("") != 0) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - /*cerr << "find(\"One\") == " << boolalpha << (table.find("One") != table.end()) << endl; - cerr << "find(\"Three\") == " << boolalpha << (table.find("Three") != table.end()) << endl;*/ - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - //cerr << "TREE: " << tree << endl; - stack > stk; // first = i, second = category - pair cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - ostringstream os; - os << "SYN:" << TD::Convert(lhs); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - int fj = rule.f_[k]; - if (k > 0 && fj <= 0) os << '_'; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { - os << TD::Convert(fj); - }*/ - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - //cerr << "FEATURE: " << os.str() << endl; - //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.size() > 0) { - if (feature_filter.find(fid_ef) != feature_filter.end()) { - //cerr << "SYN-Feature was trigger more than once on training set." << endl; - feats->set_value(fid_ef, 1.0); - } - //else cerr << "SYN-Feature was triggered less than once on training set." << endli; - } - else { - feats->set_value(fid_ef, 1.0); - } - return lhs; - } - - Array2D src_tree; // src_tree(i,j) NT = type - mutable Array2D > fids_ef; // fires for fully lexicalized - tr1::unordered_set feature_filter; - -}; - -PSourceSyntaxFeatures2::PSourceSyntaxFeatures2(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeatures2Impl(param); -} - -PSourceSyntaxFeatures2::~PSourceSyntaxFeatures2() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} diff --git a/decoder/ff_source_syntax2_p.h b/decoder/ff_source_syntax2_p.h deleted file mode 100644 index d56ecab0..00000000 --- a/decoder/ff_source_syntax2_p.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef _FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeatures2Impl; - -class PSourceSyntaxFeatures2 : public FeatureFunction { - public: - PSourceSyntaxFeatures2(const std::string& param); - ~PSourceSyntaxFeatures2(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeatures2Impl* impl; -}; - -#endif diff --git a/decoder/ff_source_syntax_p.cc b/decoder/ff_source_syntax_p.cc deleted file mode 100644 index cd081544..00000000 --- a/decoder/ff_source_syntax_p.cc +++ /dev/null @@ -1,245 +0,0 @@ -#include "ff_source_syntax_p.h" - -#include -#include -#include - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. -// (S (NP John) (VP (V left))) - -// log transform to make long spans cluster together -// but preserve differences -inline int SpanSizeTransform(unsigned span_size) { - if (!span_size) return 0; - return static_cast(log(span_size+1) / log(1.39)) - 1; -} - -struct PSourceSyntaxFeaturesImpl { - PSourceSyntaxFeaturesImpl() {} - - PSourceSyntaxFeaturesImpl(const string& param) { - if (!(param.compare("") == 0)) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - stack > stk; // first = i, second = category - pair cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - // cur_cat.second spans from cur_cat.first to i - // cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; - // NOTE: unary rule chains end up being labeled with the top-most category - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - } - // cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - //int& fid_cat = fids_cat(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - if (fid_ef <= 0) { - ostringstream os; - //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); - //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); - //fid_cat = FD::Convert(os2.str()); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0 && (feature_filter.find(fid_ef) != feature_filter.end())) - feats->set_value(fid_ef, 1.0); - return lhs; - } - - Array2D src_tree; // src_tree(i,j) NT = type - // mutable Array2D fids_cat; // this tends to overfit baddly - mutable Array2D > fids_ef; // fires for fully lexicalized - tr1::unordered_set feature_filter; -}; - -PSourceSyntaxFeatures::PSourceSyntaxFeatures(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeaturesImpl(param); -} - -PSourceSyntaxFeatures::~PSourceSyntaxFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} - -struct PSourceSpanSizeFeaturesImpl { - PSourceSpanSizeFeaturesImpl() {} - - void InitializeGrids(unsigned src_len) { - fids.clear(); - fids.resize(src_len, src_len + 1); - } - - int FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector* feats) { - if (rule.Arity() > 0) { - int& fid = fids(i,j)[&rule]; - if (fid <= 0) { - ostringstream os; - os << "SSS:"; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(-fj) << ants[ntc++] << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid = FD::Convert(os.str()); - } - if (fid > 0) - feats->set_value(fid, 1.0); - } - return SpanSizeTransform(j - i); - } - - mutable Array2D > fids; -}; - -PSourceSpanSizeFeatures::PSourceSpanSizeFeatures(const string& param) : - FeatureFunction(sizeof(char)) { - impl = new PSourceSpanSizeFeaturesImpl; -} - -PSourceSpanSizeFeatures::~PSourceSpanSizeFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSpanSizeFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const { - int ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast(ant_contexts[i]); - - *static_cast(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSourceLength()); -} - - diff --git a/decoder/ff_source_syntax_p.h b/decoder/ff_source_syntax_p.h deleted file mode 100644 index 2dd9094a..00000000 --- a/decoder/ff_source_syntax_p.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeaturesImpl; - -class PSourceSyntaxFeatures : public FeatureFunction { - public: - PSourceSyntaxFeatures(const std::string& param); - ~PSourceSyntaxFeatures(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeaturesImpl* impl; -}; - -struct PSourceSpanSizeFeaturesImpl; -class PSourceSpanSizeFeatures : public FeatureFunction { - public: - PSourceSpanSizeFeatures(const std::string& param); - ~PSourceSpanSizeFeatures(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, - SparseVector* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSpanSizeFeaturesImpl* impl; -}; - -#endif -- cgit v1.2.3 From decd2c4b1d4fb42a73a3217f347ea8f317e50869 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 5 Nov 2013 18:15:18 +0100 Subject: syntax features now read trees from files -- no more escaping! --- decoder/ff_parse_match.cc | 5 ++++- decoder/ff_soft_syntax.cc | 15 +++++++++------ decoder/ff_soft_syntax_mindist.cc | 15 +++++++++------ decoder/ff_source_syntax.cc | 7 +++++-- decoder/ff_source_syntax2.cc | 7 +++++-- utils/filelib.h | 5 ++++- 6 files changed, 36 insertions(+), 18 deletions(-) (limited to 'decoder/ff_parse_match.cc') diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index 7c79302b..58026975 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -212,6 +212,9 @@ void ParseMatchFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void ParseMatchFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index a3d26135..23fe87bd 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -107,10 +107,10 @@ struct SoftSyntaxFeaturesImpl { switch(feat_type) { case '2': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; } else { - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { @@ -119,7 +119,7 @@ struct SoftSyntaxFeaturesImpl { } break; case '_': - os << "SYN:" << label; + os << "SOFT:" << label; fid_ef = FD::Convert(os.str()); if (lhs_str.compare(label) == 0) { if (fid_ef > 0) { @@ -136,7 +136,7 @@ struct SoftSyntaxFeaturesImpl { break; case '+': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { //cerr << "Feature: " << os.str() << endl; @@ -147,7 +147,7 @@ struct SoftSyntaxFeaturesImpl { case '-': //cerr << "-" << endl; if (lhs_str.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { //cerr << "Feature :" << os.str() << endl; @@ -194,6 +194,9 @@ void SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_soft_syntax_mindist.cc b/decoder/ff_soft_syntax_mindist.cc index 3f531986..a23f70f8 100644 --- a/decoder/ff_soft_syntax_mindist.cc +++ b/decoder/ff_soft_syntax_mindist.cc @@ -146,10 +146,10 @@ struct SoftSyntaxFeaturesMindistImpl { case '2': if (min_dist_label.compare(label) == 0) { if (min_dist == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; } else { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); //cerr << "Feature :" << os.str() << endl; @@ -157,7 +157,7 @@ struct SoftSyntaxFeaturesMindistImpl { } break; case '_': - os << "SYN:" << label; + os << "SOFTM:" << label; fid_ef = FD::Convert(os.str()); if (min_dist_label.compare(label) == 0) { //cerr << "Feature: " << os.str() << endl; @@ -172,7 +172,7 @@ struct SoftSyntaxFeaturesMindistImpl { break; case '+': if (min_dist_label.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if (min_dist == 0) { //cerr << "Feature: " << os.str() << endl; @@ -183,7 +183,7 @@ struct SoftSyntaxFeaturesMindistImpl { case '-': //cerr << "-" << endl; if (min_dist_label.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (min_dist > 0) { //cerr << "Feature :" << os.str() << endl; @@ -230,6 +230,9 @@ void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& sm } void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index 34e7ab69..4879ca1d 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -104,7 +104,7 @@ struct SourceSyntaxFeaturesImpl { if (fid_ef <= 0) { ostringstream os; //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN:" << TD::Convert(lhs); //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); //fid_cat = FD::Convert(os2.str()); os << ':'; @@ -173,7 +173,10 @@ void SourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } struct SourceSpanSizeFeaturesImpl { diff --git a/decoder/ff_source_syntax2.cc b/decoder/ff_source_syntax2.cc index 63736342..9d0bc33f 100644 --- a/decoder/ff_source_syntax2.cc +++ b/decoder/ff_source_syntax2.cc @@ -90,7 +90,7 @@ struct SourceSyntaxFeatures2Impl { const WordID lhs = src_tree(i,j); int& fid_ef = fids_ef(i,j)[&rule]; ostringstream os; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN2:" << TD::Convert(lhs); os << ':'; unsigned ntc = 0; for (unsigned k = 0; k < rule.f_.size(); ++k) { @@ -159,6 +159,9 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/utils/filelib.h b/utils/filelib.h index b9ea3940..4fa69760 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -75,7 +75,10 @@ class ReadFile : public BaseFile { } } } - + void ReadAll(std::string& s) { + getline(*stream(), s, (char) EOF); + if (s.size() > 0) s.resize(s.size()-1); + } }; class WriteFile : public BaseFile { -- cgit v1.2.3