From 5a23ee2ae792b629e0f52b9c7fdf293de60a0ca1 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 5 Nov 2013 16:29:03 +0100 Subject: cleaning up syntax features --- decoder/ff_source_syntax.cc | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'decoder/ff_source_syntax.cc') diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index a1997695..34e7ab69 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -2,8 +2,8 @@ #include #include +#include -#include "hg.h" #include "sentence_metadata.h" #include "array2d.h" #include "filelib.h" @@ -24,6 +24,17 @@ inline int SpanSizeTransform(unsigned span_size) { struct SourceSyntaxFeaturesImpl { SourceSyntaxFeaturesImpl() {} + SourceSyntaxFeaturesImpl(const string& param) { + if (!(param.compare("") == 0)) { + string triggered_features_fn = param; + ReadFile triggered_features(triggered_features_fn); + string in; + while(getline(*triggered_features, in)) { + feature_filter.insert(FD::Convert(in)); + } + } + } + void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); //fids_cat.clear(); @@ -118,21 +129,28 @@ struct SourceSyntaxFeaturesImpl { } fid_ef = FD::Convert(os.str()); } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0) - feats->set_value(fid_ef, 1.0); + if (fid_ef > 0) { + if (feature_filter.size()>0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + feats->set_value(fid_ef, 1.0); + } + } else { + feats->set_value(fid_ef, 1.0); + } + } + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D src_tree; // src_tree(i,j) NT = type - // mutable Array2D fids_cat; // this tends to overfit baddly - mutable Array2D > fids_ef; // fires for fully lexicalized + Array2D src_tree; // src_tree(i,j) NT = type + // mutable Array2D fids_cat; // this tends to overfit baddly + mutable Array2D > fids_ef; // fires for fully lexicalized + tr1::unordered_set feature_filter; }; SourceSyntaxFeatures::SourceSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SourceSyntaxFeaturesImpl; + impl = new SourceSyntaxFeaturesImpl(param); } SourceSyntaxFeatures::~SourceSyntaxFeatures() { @@ -230,4 +248,3 @@ void SourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSourceLength()); } - -- cgit v1.2.3 From a7a0773b0e299f71409cf4a13d18ea7db5ab3fc1 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 5 Nov 2013 18:15:18 +0100 Subject: syntax features now read trees from files -- no more escaping! --- decoder/ff_parse_match.cc | 5 ++++- decoder/ff_soft_syntax.cc | 15 +++++++++------ decoder/ff_soft_syntax_mindist.cc | 15 +++++++++------ decoder/ff_source_syntax.cc | 7 +++++-- decoder/ff_source_syntax2.cc | 7 +++++-- utils/filelib.h | 5 ++++- 6 files changed, 36 insertions(+), 18 deletions(-) (limited to 'decoder/ff_source_syntax.cc') diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index 7c79302b..58026975 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -212,6 +212,9 @@ void ParseMatchFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void ParseMatchFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index a3d26135..23fe87bd 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -107,10 +107,10 @@ struct SoftSyntaxFeaturesImpl { switch(feat_type) { case '2': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; } else { - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { @@ -119,7 +119,7 @@ struct SoftSyntaxFeaturesImpl { } break; case '_': - os << "SYN:" << label; + os << "SOFT:" << label; fid_ef = FD::Convert(os.str()); if (lhs_str.compare(label) == 0) { if (fid_ef > 0) { @@ -136,7 +136,7 @@ struct SoftSyntaxFeaturesImpl { break; case '+': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { //cerr << "Feature: " << os.str() << endl; @@ -147,7 +147,7 @@ struct SoftSyntaxFeaturesImpl { case '-': //cerr << "-" << endl; if (lhs_str.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { //cerr << "Feature :" << os.str() << endl; @@ -194,6 +194,9 @@ void SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_soft_syntax_mindist.cc b/decoder/ff_soft_syntax_mindist.cc index 3f531986..a23f70f8 100644 --- a/decoder/ff_soft_syntax_mindist.cc +++ b/decoder/ff_soft_syntax_mindist.cc @@ -146,10 +146,10 @@ struct SoftSyntaxFeaturesMindistImpl { case '2': if (min_dist_label.compare(label) == 0) { if (min_dist == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; } else { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); //cerr << "Feature :" << os.str() << endl; @@ -157,7 +157,7 @@ struct SoftSyntaxFeaturesMindistImpl { } break; case '_': - os << "SYN:" << label; + os << "SOFTM:" << label; fid_ef = FD::Convert(os.str()); if (min_dist_label.compare(label) == 0) { //cerr << "Feature: " << os.str() << endl; @@ -172,7 +172,7 @@ struct SoftSyntaxFeaturesMindistImpl { break; case '+': if (min_dist_label.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if (min_dist == 0) { //cerr << "Feature: " << os.str() << endl; @@ -183,7 +183,7 @@ struct SoftSyntaxFeaturesMindistImpl { case '-': //cerr << "-" << endl; if (min_dist_label.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (min_dist > 0) { //cerr << "Feature :" << os.str() << endl; @@ -230,6 +230,9 @@ void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& sm } void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index 34e7ab69..4879ca1d 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -104,7 +104,7 @@ struct SourceSyntaxFeaturesImpl { if (fid_ef <= 0) { ostringstream os; //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN:" << TD::Convert(lhs); //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); //fid_cat = FD::Convert(os2.str()); os << ':'; @@ -173,7 +173,10 @@ void SourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } struct SourceSpanSizeFeaturesImpl { diff --git a/decoder/ff_source_syntax2.cc b/decoder/ff_source_syntax2.cc index 63736342..9d0bc33f 100644 --- a/decoder/ff_source_syntax2.cc +++ b/decoder/ff_source_syntax2.cc @@ -90,7 +90,7 @@ struct SourceSyntaxFeatures2Impl { const WordID lhs = src_tree(i,j); int& fid_ef = fids_ef(i,j)[&rule]; ostringstream os; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN2:" << TD::Convert(lhs); os << ':'; unsigned ntc = 0; for (unsigned k = 0; k < rule.f_.size(); ++k) { @@ -159,6 +159,9 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } diff --git a/utils/filelib.h b/utils/filelib.h index b9ea3940..4fa69760 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -75,7 +75,10 @@ class ReadFile : public BaseFile { } } } - + void ReadAll(std::string& s) { + getline(*stream(), s, (char) EOF); + if (s.size() > 0) s.resize(s.size()-1); + } }; class WriteFile : public BaseFile { -- cgit v1.2.3