From 34b7c1e7c3aa5f9ee780be65effc40726d849303 Mon Sep 17 00:00:00 2001 From: "Wu, Ke" Date: Sat, 1 Nov 2014 14:48:17 -0400 Subject: Junhui's implementation of soft syntax constraint features --- decoder/ff_soft_syn.cc | 687 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 687 insertions(+) create mode 100644 decoder/ff_soft_syn.cc (limited to 'decoder/ff_soft_syn.cc') diff --git a/decoder/ff_soft_syn.cc b/decoder/ff_soft_syn.cc new file mode 100644 index 00000000..da840008 --- /dev/null +++ b/decoder/ff_soft_syn.cc @@ -0,0 +1,687 @@ +/* + * ff_soft_syn.cc + * + */ +#include "ff_soft_syn.h" +#include "stringlib.h" +#include "hg.h" +#include "sentence_metadata.h" +#include "tree.h" +#include "synutils.h" + +#include +#include +#include + +using namespace std; + +typedef HASH_MAP > MapFeatures; + +/* + * Note: + * In BOLT experiments, we need to merged some sequence words into one term + *(like from "1999 nian 1 yue 10 ri" to "1999_nian_1_yue_10_ri") due to some + *reasons; + * but in the parse file, we still use the parse tree before merging any + *words; + * therefore, the words in source sentence and parse tree diverse and we + *need to map a word in merged sentence into its original index; + * a word in source sentence maps 1 or more words in parse tree + * the index map info is stored at variable index_map_; + * if the index_map_ is NULL, indicating the word index in source sentence + *and parse tree is always same. + * + */ + +struct SoftSynFeatureImpl { + SoftSynFeatureImpl(const string& params) { + parsed_tree_ = NULL; + index_map_ = NULL; + + map_features_ = NULL; + } + + ~SoftSynFeatureImpl() { FreeSentenceVariables(); } + + static int ReserveStateSize() { return 2 * sizeof(uint16_t); } + + void InitializeInputSentence(const std::string& parse_file, + const std::string& index_map_file) { + FreeSentenceVariables(); + parsed_tree_ = ReadParseTree(parse_file); + + if (index_map_file != "") ReadIndexMap(index_map_file); + + // we can do the features "off-line" + map_features_ = new MapFeatures(); + InitializeFeatures(map_features_); + } + + void ReadIndexMap(const std::string& index_map_file) { + STxtFileReader* reader = new STxtFileReader(index_map_file.c_str()); + char szLine[10001]; + szLine[0] = '\0'; + reader->fnReadNextLine(szLine, NULL); + vector terms; + SplitOnWhitespace(string(szLine), &terms); + + index_map_ = new short int[terms.size() + 1]; + int ix = 0; + size_t i; + for (i = 0; i < terms.size(); i++) { + index_map_[i] = ix; + ix += atoi(terms[i].c_str()); + } + index_map_[i] = ix; + assert(parsed_tree_ == NULL || ix == parsed_tree_->m_vecTerminals.size()); + delete reader; + } + + void MapIndex(short int begin, short int end, short int& mapped_begin, + short int& mapped_end) { + if (index_map_ == NULL) { + mapped_begin = begin; + mapped_end = end; + return; + } + + mapped_begin = index_map_[begin]; + mapped_end = index_map_[end + 1] - 1; + } + + /* + * ff_const_reorder.cc::ConstReorderFeatureImpl also defines this function + */ + void FindConsts(const SParsedTree* tree, int begin, int end, + vector& consts) { + STreeItem* item; + item = tree->m_vecTerminals[begin]->m_ptParent; + while (true) { + while (item->m_ptParent != NULL && + item->m_ptParent->m_iBegin == item->m_iBegin && + item->m_ptParent->m_iEnd <= end) + item = item->m_ptParent; + + if (item->m_ptParent == NULL && item->m_vecChildren.size() == 1 && + strcmp(item->m_pszTerm, "ROOT") == 0) + item = item->m_vecChildren[0]; // we automatically add a "ROOT" node at + // the top, skip it if necessary. + + consts.push_back(item); + if (item->m_iEnd < end) + item = tree->m_vecTerminals[item->m_iEnd + 1]->m_ptParent; + else + break; + } + } + + /* + * according to Marton & Resnik (2008) + * a span cann't have both X+ style and X= style features + * a constituent XP is crossed only if the span not only covers parts of XP's + *content, but also covers one or more words outside XP + * a span may have X+, Y+ + * + * (note, we refer X* features to X= features in Marton & Resnik (2008)) + */ + void GenerateSoftFeature(int begin, int end, const SParsedTree* tree, + vector& vecFeature) { + vector vecNode; + FindConsts(tree, begin, end, vecNode); + + if (vecNode.size() == 1) { + // match to one constituent + string feature_name = string(vecNode[0]->m_pszTerm) + string("*"); + vecFeature.push_back(feature_name); + } else { + // match to multiple constituents, find the lowest common parent (lcp) + STreeItem* lcp = vecNode[0]; + while (lcp->m_iEnd < end) lcp = lcp->m_ptParent; + + for (size_t i = 0; i < vecNode.size(); i++) { + STreeItem* item = vecNode[i]; + + while (item != lcp) { + if (item->m_iBegin < begin || item->m_iEnd > end) { + // item is crossed + string feature_name = string(item->m_pszTerm) + string("+"); + vecFeature.push_back(feature_name); + } + if (item->m_iBrotherIndex > 0 && + item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] + ->m_iBegin >= begin && + item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] + ->m_iEnd <= end) + break; // we don't want to collect crossed constituents twice + item = item->m_ptParent; + } + } + } + } + + void GenerateSoftFeatureFromFlattenedTree(int begin, int end, + const SParsedTree* tree, + vector& vecFeature) { + vector vecNode; + FindConsts(tree, begin, end, vecNode); + + if (vecNode.size() == 1) { + // match to one constituent + string feature_name = string(vecNode[0]->m_pszTerm) + string("*"); + vecFeature.push_back(feature_name); + } else { + // match to multiple constituents, see if they have a common parent + size_t i = 0; + for (i = 1; i < vecNode.size(); i++) { + if (vecNode[i]->m_ptParent != vecNode[0]->m_ptParent) break; + } + if (i == vecNode.size()) { + // they share a common parent + string feature_name = + string(vecNode[0]->m_ptParent->m_pszTerm) + string("&"); + vecFeature.push_back(feature_name); + } else { + // they don't share a common parent, find the lowest common parent (lcp) + STreeItem* lcp = vecNode[0]; + while (lcp->m_iEnd < end) lcp = lcp->m_ptParent; + + for (size_t i = 0; i < vecNode.size(); i++) { + STreeItem* item = vecNode[i]; + + while (item != lcp) { + if (item->m_iBegin < begin || item->m_iEnd > end) { + // item is crossed + string feature_name = string(item->m_pszTerm) + string("+"); + vecFeature.push_back(feature_name); + } + if (item->m_iBrotherIndex > 0 && + item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] + ->m_iBegin >= begin && + item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] + ->m_iEnd <= end) + break; // we don't want to collect crossed constituents twice + item = item->m_ptParent; + } + } + } + } + } + + void SetSoftSynFeature(const Hypergraph::Edge& edge, + SparseVector* features, + const vector& ant_states, void* state) { + vector vec_pos; + if (parsed_tree_ == NULL) return; + + uint16_t* remnant = reinterpret_cast(state); + + short int mapped_begin, mapped_end; + MapIndex(edge.i_, edge.j_ - 1, mapped_begin, mapped_end); + + remnant[0] = mapped_begin; + remnant[1] = mapped_end; + + for (size_t i = 0; i < edge.tail_nodes_.size(); i++) { + const uint16_t* astate = reinterpret_cast(ant_states[i]); + vec_pos.push_back(astate[0]); + vec_pos.push_back(astate[1]); + } + + // soft feature for the whole span + const vector vecFeature = + GenerateSoftFeature(mapped_begin, mapped_end, map_features_); + for (size_t i = 0; i < vecFeature.size(); i++) { + int f_id = FD::Convert(vecFeature[i]); + if (f_id) features->set_value(f_id, 1); + } + } + + private: + const vector& GenerateSoftFeature(int begin, int end, + MapFeatures* map_features) { + string key; + GenerateKey(begin, end, key); + MapFeatures::const_iterator iter = (*map_features).find(key); + assert(iter != map_features->end()); + return iter->second; + } + + void Byte_to_Char(unsigned char* str, int n) { + str[0] = (n & 255); + str[1] = n / 256; + } + + void GenerateKey(int begin, int end, string& key) { + unsigned char szTerm[1001]; + Byte_to_Char(szTerm, begin); + Byte_to_Char(szTerm + 2, end); + szTerm[4] = '\0'; + key = string(szTerm, szTerm + 4); + } + + void InitializeFeatures(MapFeatures* map_features) { + if (parsed_tree_ == NULL) return; + + for (size_t i = 0; i < parsed_tree_->m_vecTerminals.size(); i++) + for (size_t j = i; j < parsed_tree_->m_vecTerminals.size(); j++) { + vector vecFeature; + GenerateSoftFeature(i, j, parsed_tree_, vecFeature); + string key; + GenerateKey(i, j, key); + (*map_features)[key] = vecFeature; + } + } + + void FreeSentenceVariables() { + if (parsed_tree_ != NULL) delete parsed_tree_; + if (index_map_ != NULL) delete[] index_map_; + index_map_ = NULL; + + if (map_features_ != NULL) delete map_features_; + } + + SParsedTree* ReadParseTree(const std::string& parse_file) { + SParseReader* reader = new SParseReader(parse_file.c_str(), false); + SParsedTree* tree = reader->fnReadNextParseTree(); + // assert(tree != NULL); + delete reader; + return tree; + } + + private: + SParsedTree* parsed_tree_; + + short int* index_map_; + + MapFeatures* map_features_; +}; + +SoftSynFeature::SoftSynFeature(std::string param) { + pimpl_ = new SoftSynFeatureImpl(param); + SetStateSize(SoftSynFeatureImpl::ReserveStateSize()); + name_ = "SoftSynFeature"; +} + +SoftSynFeature::~SoftSynFeature() { delete pimpl_; } + +void SoftSynFeature::PrepareForInput(const SentenceMetadata& smeta) { + string parse_file = smeta.GetSGMLValue("parse"); + assert(parse_file != ""); + + string indexmap_file = smeta.GetSGMLValue("index-map"); + + pimpl_->InitializeInputSentence(parse_file, indexmap_file); +} + +void SoftSynFeature::TraversalFeaturesImpl( + const SentenceMetadata& /* smeta */, const Hypergraph::Edge& edge, + const vector& ant_states, SparseVector* features, + SparseVector* estimated_features, void* state) const { + pimpl_->SetSoftSynFeature(edge, features, ant_states, state); +} + +string SoftSynFeature::usage(bool /*param*/, bool /*verbose*/) { + return "SoftSynFeature"; +} + +boost::shared_ptr CreateSoftSynFeatureModel( + std::string param) { + SoftSynFeature* ret = new SoftSynFeature(param); + return boost::shared_ptr(ret); +} + +boost::shared_ptr SoftSynFeatureFactory::Create( + std::string param) const { + return CreateSoftSynFeatureModel(param); +} + +std::string SoftSynFeatureFactory::usage(bool params, bool verbose) const { + return SoftSynFeature::usage(params, verbose); +} + +typedef HASH_MAP MapDouble; +typedef HASH_MAP MapDoubleFeatures; + +/* + * Note: + * In BOLT experiments, we need to merged some sequence words into one term + *(like from "1999 nian 1 yue 10 ri" to "1999_nian_1_yue_10_ri") due to some + *reasons; + * but in the parse file, we still use the parse tree before merging any + *words; + * therefore, the words in source sentence and parse tree diverse and we + *need to map a word in merged sentence into its original index; + * a word in source sentence maps 1 or more words in parse tree + * the index map info is stored at variable index_map_; + * if the index_map_ is NULL, indicating the word index in source sentence + *and parse tree is always same. + * + */ + +struct SoftKBestSynFeatureImpl { + SoftKBestSynFeatureImpl(const string& params) { + index_map_ = NULL; + + map_features_ = NULL; + } + + ~SoftKBestSynFeatureImpl() { FreeSentenceVariables(); } + + static int ReserveStateSize() { return 2 * sizeof(uint16_t); } + + void InitializeInputSentence(const std::string& parse_file, + const std::string& index_map_file) { + FreeSentenceVariables(); + ReadParseTree(parse_file, vec_parsed_tree_, vec_tree_prob_); + + if (index_map_file != "") ReadIndexMap(index_map_file); + + // we can do the features "off-line" + map_features_ = new MapDoubleFeatures(); + InitializeFeatures(map_features_); + } + + void SetSoftKBestSynFeature(const Hypergraph::Edge& edge, + SparseVector* features, + const vector& ant_states, + void* state) { + vector vec_pos; + if (vec_parsed_tree_.size() == 0) return; + + uint16_t* remnant = reinterpret_cast(state); + + short int mapped_begin, mapped_end; + MapIndex(edge.i_, edge.j_ - 1, mapped_begin, mapped_end); + + remnant[0] = mapped_begin; + remnant[1] = mapped_end; + + for (size_t i = 0; i < edge.tail_nodes_.size(); i++) { + const uint16_t* astate = reinterpret_cast(ant_states[i]); + vec_pos.push_back(astate[0]); + vec_pos.push_back(astate[1]); + } + + // soft feature for the whole span + const MapDouble* pMapFeature = + GenerateSoftFeature(mapped_begin, mapped_end, map_features_); + for (MapDouble::const_iterator iter = pMapFeature->begin(); + iter != pMapFeature->end(); iter++) { + int f_id = FD::Convert(iter->first); + if (f_id) features->set_value(f_id, iter->second); + } + } + + private: + void ReadIndexMap(const std::string& index_map_file) { + STxtFileReader* reader = new STxtFileReader(index_map_file.c_str()); + char szLine[10001]; + szLine[0] = '\0'; + reader->fnReadNextLine(szLine, NULL); + vector terms; + SplitOnWhitespace(string(szLine), &terms); + + index_map_ = new short int[terms.size() + 1]; + int ix = 0; + size_t i; + for (i = 0; i < terms.size(); i++) { + index_map_[i] = ix; + ix += atoi(terms[i].c_str()); + } + index_map_[i] = ix; + assert(vec_parsed_tree_.size() == 0 || + ix == vec_parsed_tree_[0]->m_vecTerminals.size()); + delete reader; + } + + void MapIndex(short int begin, short int end, short int& mapped_begin, + short int& mapped_end) { + if (index_map_ == NULL) { + mapped_begin = begin; + mapped_end = end; + return; + } + + mapped_begin = index_map_[begin]; + mapped_end = index_map_[end + 1] - 1; + } + + /* + * ff_const_reorder.cc::ConstReorderFeatureImpl also defines this function + */ + void FindConsts(const SParsedTree* tree, int begin, int end, + vector& consts) { + STreeItem* item; + item = tree->m_vecTerminals[begin]->m_ptParent; + while (true) { + while (item->m_ptParent != NULL && + item->m_ptParent->m_iBegin == item->m_iBegin && + item->m_ptParent->m_iEnd <= end) + item = item->m_ptParent; + + if (item->m_ptParent == NULL && item->m_vecChildren.size() == 1 && + strcmp(item->m_pszTerm, "ROOT") == 0) + item = item->m_vecChildren[0]; // we automatically add a "ROOT" node at + // the top, skip it if necessary. + + consts.push_back(item); + if (item->m_iEnd < end) + item = tree->m_vecTerminals[item->m_iEnd + 1]->m_ptParent; + else + break; + } + } + + /* + * according to Marton & Resnik (2008) + * a span cann't have both X+ style and X= style features + * a constituent XP is crossed only if the span not only covers parts of XP's + *content, but also covers one or more words outside XP + * a span may have X+, Y+ + * + * (note, we refer X* features to X= features in Marton & Resnik (2008)) + */ + void GenerateSoftFeature(int begin, int end, + const vector& vec_tree, + const vector& vec_prob, + MapDouble* pMapFeature) { + + for (size_t i = 0; i < vec_tree.size(); i++) { + const SParsedTree* tree = vec_tree[i]; + vector vecNode; + FindConsts(tree, begin, end, vecNode); + + if (vecNode.size() == 1) { + // match to one constituent + string feature_name = string(vecNode[0]->m_pszTerm) + string("*"); + MapDouble::iterator iter = pMapFeature->find(feature_name); + if (iter != pMapFeature->end()) { + iter->second += vec_prob[i]; + } else + (*pMapFeature)[feature_name] = vec_prob[i]; + } else { + // match to multiple constituents, find the lowest common parent (lcp) + STreeItem* lcp = vecNode[0]; + while (lcp->m_iEnd < end) lcp = lcp->m_ptParent; + + for (size_t j = 0; j < vecNode.size(); j++) { + STreeItem* item = vecNode[j]; + + while (item != lcp) { + if (item->m_iBegin < begin || item->m_iEnd > end) { + // item is crossed + string feature_name = string(item->m_pszTerm) + string("+"); + MapDouble::iterator iter = pMapFeature->find(feature_name); + if (iter != pMapFeature->end()) { + iter->second += vec_prob[i]; + } else + (*pMapFeature)[feature_name] = vec_prob[i]; + } + if (item->m_iBrotherIndex > 0 && + item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] + ->m_iBegin >= begin && + item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] + ->m_iEnd <= end) + break; // we don't want to collect crossed constituents twice + item = item->m_ptParent; + } + } + } + } + } + + const MapDouble* GenerateSoftFeature(int begin, int end, + MapDoubleFeatures* map_features) { + string key; + GenerateKey(begin, end, key); + MapDoubleFeatures::const_iterator iter = (*map_features).find(key); + assert(iter != map_features->end()); + return iter->second; + } + + void Byte_to_Char(unsigned char* str, int n) { + str[0] = (n & 255); + str[1] = n / 256; + } + + void GenerateKey(int begin, int end, string& key) { + unsigned char szTerm[1001]; + Byte_to_Char(szTerm, begin); + Byte_to_Char(szTerm + 2, end); + szTerm[4] = '\0'; + key = string(szTerm, szTerm + 4); + } + + void InitializeFeatures(MapDoubleFeatures* map_features) { + if (vec_parsed_tree_.size() == 0) return; + + const SParsedTree* pTree = vec_parsed_tree_[0]; + + vector vec_prob; + vec_prob.reserve(vec_tree_prob_.size()); + double tmp = 0.0; + for (size_t i = 0; i < vec_tree_prob_.size(); i++) { + vec_prob.push_back(pow(10, vec_tree_prob_[i] - vec_tree_prob_[0])); + tmp += vec_prob[i]; + } + for (size_t i = 0; i < vec_prob.size(); i++) vec_prob[i] /= tmp; + + for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) + for (size_t j = i; j < pTree->m_vecTerminals.size(); j++) { + MapDouble* pMap = new MapDouble(); + GenerateSoftFeature(i, j, vec_parsed_tree_, vec_prob, pMap); + string key; + GenerateKey(i, j, key); + (*map_features)[key] = pMap; + } + } + + void FreeSentenceVariables() { + for (size_t i = 0; i < vec_parsed_tree_.size(); i++) { + if (vec_parsed_tree_[i] != NULL) delete vec_parsed_tree_[i]; + } + vec_parsed_tree_.clear(); + vec_tree_prob_.clear(); + if (index_map_ != NULL) delete[] index_map_; + index_map_ = NULL; + + if (map_features_ != NULL) { + for (MapDoubleFeatures::iterator iter = map_features_->begin(); + iter != map_features_->end(); iter++) + delete iter->second; + delete map_features_; + } + } + + void ReadParseTree(const std::string& parse_file, + vector& vec_tree, vector& vec_prob) { + STxtFileReader* reader = new STxtFileReader(parse_file.c_str()); + SParsedTree* tree; + string line; + while (reader->fnReadNextLine(line)) { + const char* p = strchr(line.c_str(), ' '); + assert(p != NULL); + string strProb = line.substr(0, line.find(' ')); + tree = SParsedTree::fnConvertFromString(p + 1); + tree->fnSetSpanInfo(); + tree->fnSetHeadWord(); + vec_tree.push_back(tree); + if (strProb == string("-Infinity")) { + vec_prob.push_back(-99.0); + break; + } else { + vec_prob.push_back(atof(strProb.c_str())); + } + } + delete reader; + } + + void ReadParseTree2(const std::string& parse_file, + vector& vec_tree, + vector& vec_prob) { + SParseReader* reader = new SParseReader(parse_file.c_str(), false); + double prob; + SParsedTree* tree; + while ((tree = reader->fnReadNextParseTreeWithProb(&prob)) != NULL) { + vec_tree.push_back(tree); + if (std::isinf(prob)) { + vec_prob.push_back(-99); + break; + } else + vec_prob.push_back(prob); + } + // assert(tree != NULL); + delete reader; + } + + private: + vector vec_parsed_tree_; + vector vec_tree_prob_; + + short int* index_map_; + + MapDoubleFeatures* map_features_; +}; + +SoftKBestSynFeature::SoftKBestSynFeature(std::string param) { + pimpl_ = new SoftKBestSynFeatureImpl(param); + SetStateSize(SoftKBestSynFeatureImpl::ReserveStateSize()); + name_ = "SoftKBestSynFeature"; +} + +SoftKBestSynFeature::~SoftKBestSynFeature() { delete pimpl_; } + +void SoftKBestSynFeature::PrepareForInput(const SentenceMetadata& smeta) { + string parse_file = smeta.GetSGMLValue("kbestparse"); + assert(parse_file != ""); + + string indexmap_file = smeta.GetSGMLValue("index-map"); + + pimpl_->InitializeInputSentence(parse_file, indexmap_file); +} + +void SoftKBestSynFeature::TraversalFeaturesImpl( + const SentenceMetadata& /* smeta */, const Hypergraph::Edge& edge, + const vector& ant_states, SparseVector* features, + SparseVector* estimated_features, void* state) const { + pimpl_->SetSoftKBestSynFeature(edge, features, ant_states, state); +} + +string SoftKBestSynFeature::usage(bool /*param*/, bool /*verbose*/) { + return "SoftKBestSynFeature"; +} + +boost::shared_ptr CreateSoftKBestSynFeatureModel( + std::string param) { + SoftKBestSynFeature* ret = new SoftKBestSynFeature(param); + return boost::shared_ptr(ret); +} + +boost::shared_ptr SoftKBestSynFeatureFactory::Create( + std::string param) const { + return CreateSoftKBestSynFeatureModel(param); +} + +std::string SoftKBestSynFeatureFactory::usage(bool params, bool verbose) const { + return SoftKBestSynFeature::usage(params, verbose); +} -- cgit v1.2.3 From 68fb48bc94d9eccc6b43960a26ea72aed69bf789 Mon Sep 17 00:00:00 2001 From: "Wu, Ke" Date: Sat, 6 Dec 2014 10:44:08 -0500 Subject: Remove synutils.h in ff_soft_syn.cc --- decoder/ff_soft_syn.cc | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) (limited to 'decoder/ff_soft_syn.cc') diff --git a/decoder/ff_soft_syn.cc b/decoder/ff_soft_syn.cc index da840008..9d5ed08e 100644 --- a/decoder/ff_soft_syn.cc +++ b/decoder/ff_soft_syn.cc @@ -3,11 +3,12 @@ * */ #include "ff_soft_syn.h" + +#include "filelib.h" #include "stringlib.h" #include "hg.h" #include "sentence_metadata.h" #include "tree.h" -#include "synutils.h" #include #include @@ -34,7 +35,7 @@ typedef HASH_MAP > MapFeatures; */ struct SoftSynFeatureImpl { - SoftSynFeatureImpl(const string& params) { + SoftSynFeatureImpl(const string& /*params*/) { parsed_tree_ = NULL; index_map_ = NULL; @@ -58,12 +59,13 @@ struct SoftSynFeatureImpl { } void ReadIndexMap(const std::string& index_map_file) { - STxtFileReader* reader = new STxtFileReader(index_map_file.c_str()); - char szLine[10001]; - szLine[0] = '\0'; - reader->fnReadNextLine(szLine, NULL); vector terms; - SplitOnWhitespace(string(szLine), &terms); + { + ReadFile file(index_map_file); + string line; + assert(getline(*file.stream(), line)); + SplitOnWhitespace(line, &terms); + } index_map_ = new short int[terms.size() + 1]; int ix = 0; @@ -74,7 +76,6 @@ struct SoftSynFeatureImpl { } index_map_[i] = ix; assert(parsed_tree_ == NULL || ix == parsed_tree_->m_vecTerminals.size()); - delete reader; } void MapIndex(short int begin, short int end, short int& mapped_begin, @@ -316,7 +317,7 @@ void SoftSynFeature::PrepareForInput(const SentenceMetadata& smeta) { void SoftSynFeature::TraversalFeaturesImpl( const SentenceMetadata& /* smeta */, const Hypergraph::Edge& edge, const vector& ant_states, SparseVector* features, - SparseVector* estimated_features, void* state) const { + SparseVector* /*estimated_features*/, void* state) const { pimpl_->SetSoftSynFeature(edge, features, ant_states, state); } @@ -359,7 +360,7 @@ typedef HASH_MAP MapDoubleFeatures; */ struct SoftKBestSynFeatureImpl { - SoftKBestSynFeatureImpl(const string& params) { + SoftKBestSynFeatureImpl(const string& /*params*/) { index_map_ = NULL; map_features_ = NULL; @@ -414,12 +415,13 @@ struct SoftKBestSynFeatureImpl { private: void ReadIndexMap(const std::string& index_map_file) { - STxtFileReader* reader = new STxtFileReader(index_map_file.c_str()); - char szLine[10001]; - szLine[0] = '\0'; - reader->fnReadNextLine(szLine, NULL); vector terms; - SplitOnWhitespace(string(szLine), &terms); + { + ReadFile file(index_map_file); + string line; + assert(getline(*file.stream(), line)); + SplitOnWhitespace(line, &terms); + } index_map_ = new short int[terms.size() + 1]; int ix = 0; @@ -431,7 +433,6 @@ struct SoftKBestSynFeatureImpl { index_map_[i] = ix; assert(vec_parsed_tree_.size() == 0 || ix == vec_parsed_tree_[0]->m_vecTerminals.size()); - delete reader; } void MapIndex(short int begin, short int end, short int& mapped_begin, @@ -595,10 +596,10 @@ struct SoftKBestSynFeatureImpl { void ReadParseTree(const std::string& parse_file, vector& vec_tree, vector& vec_prob) { - STxtFileReader* reader = new STxtFileReader(parse_file.c_str()); + ReadFile in(parse_file); SParsedTree* tree; string line; - while (reader->fnReadNextLine(line)) { + while (getline(*in.stream(), line)) { const char* p = strchr(line.c_str(), ' '); assert(p != NULL); string strProb = line.substr(0, line.find(' ')); @@ -613,7 +614,6 @@ struct SoftKBestSynFeatureImpl { vec_prob.push_back(atof(strProb.c_str())); } } - delete reader; } void ReadParseTree2(const std::string& parse_file, @@ -663,7 +663,7 @@ void SoftKBestSynFeature::PrepareForInput(const SentenceMetadata& smeta) { void SoftKBestSynFeature::TraversalFeaturesImpl( const SentenceMetadata& /* smeta */, const Hypergraph::Edge& edge, const vector& ant_states, SparseVector* features, - SparseVector* estimated_features, void* state) const { + SparseVector* /*estimated_features*/, void* state) const { pimpl_->SetSoftKBestSynFeature(edge, features, ant_states, state); } -- cgit v1.2.3 From e84e4b882d1515a373a6f564fda3b78bd647b73b Mon Sep 17 00:00:00 2001 From: "Wu, Ke" Date: Sat, 6 Dec 2014 12:19:01 -0500 Subject: Use ff_const_reorder_common.h instead of tree.h --- decoder/ff_soft_syn.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'decoder/ff_soft_syn.cc') diff --git a/decoder/ff_soft_syn.cc b/decoder/ff_soft_syn.cc index 9d5ed08e..50c2365e 100644 --- a/decoder/ff_soft_syn.cc +++ b/decoder/ff_soft_syn.cc @@ -8,13 +8,14 @@ #include "stringlib.h" #include "hg.h" #include "sentence_metadata.h" -#include "tree.h" +#include "ff_const_reorder_common.h" #include #include #include using namespace std; +using namespace const_reorder; typedef HASH_MAP > MapFeatures; -- cgit v1.2.3 From cf7cf1e63f034b703adf55ffe72e534854903aee Mon Sep 17 00:00:00 2001 From: "Wu, Ke" Date: Sat, 6 Dec 2014 16:30:44 -0500 Subject: Remove unnecessary state from SoftSynFeature --- decoder/ff_soft_syn.cc | 51 +++++++++----------------------------------------- 1 file changed, 9 insertions(+), 42 deletions(-) (limited to 'decoder/ff_soft_syn.cc') diff --git a/decoder/ff_soft_syn.cc b/decoder/ff_soft_syn.cc index 50c2365e..c108e58c 100644 --- a/decoder/ff_soft_syn.cc +++ b/decoder/ff_soft_syn.cc @@ -45,8 +45,6 @@ struct SoftSynFeatureImpl { ~SoftSynFeatureImpl() { FreeSentenceVariables(); } - static int ReserveStateSize() { return 2 * sizeof(uint16_t); } - void InitializeInputSentence(const std::string& parse_file, const std::string& index_map_file) { FreeSentenceVariables(); @@ -210,25 +208,12 @@ struct SoftSynFeatureImpl { } void SetSoftSynFeature(const Hypergraph::Edge& edge, - SparseVector* features, - const vector& ant_states, void* state) { - vector vec_pos; + SparseVector* features) { if (parsed_tree_ == NULL) return; - uint16_t* remnant = reinterpret_cast(state); - short int mapped_begin, mapped_end; MapIndex(edge.i_, edge.j_ - 1, mapped_begin, mapped_end); - remnant[0] = mapped_begin; - remnant[1] = mapped_end; - - for (size_t i = 0; i < edge.tail_nodes_.size(); i++) { - const uint16_t* astate = reinterpret_cast(ant_states[i]); - vec_pos.push_back(astate[0]); - vec_pos.push_back(astate[1]); - } - // soft feature for the whole span const vector vecFeature = GenerateSoftFeature(mapped_begin, mapped_end, map_features_); @@ -300,7 +285,6 @@ struct SoftSynFeatureImpl { SoftSynFeature::SoftSynFeature(std::string param) { pimpl_ = new SoftSynFeatureImpl(param); - SetStateSize(SoftSynFeatureImpl::ReserveStateSize()); name_ = "SoftSynFeature"; } @@ -316,10 +300,10 @@ void SoftSynFeature::PrepareForInput(const SentenceMetadata& smeta) { } void SoftSynFeature::TraversalFeaturesImpl( - const SentenceMetadata& /* smeta */, const Hypergraph::Edge& edge, - const vector& ant_states, SparseVector* features, - SparseVector* /*estimated_features*/, void* state) const { - pimpl_->SetSoftSynFeature(edge, features, ant_states, state); + const SentenceMetadata& /*smeta*/, const Hypergraph::Edge& edge, + const vector& /*ant_states*/, SparseVector* features, + SparseVector* /*estimated_features*/, void* /*state*/) const { + pimpl_->SetSoftSynFeature(edge, features); } string SoftSynFeature::usage(bool /*param*/, bool /*verbose*/) { @@ -369,8 +353,6 @@ struct SoftKBestSynFeatureImpl { ~SoftKBestSynFeatureImpl() { FreeSentenceVariables(); } - static int ReserveStateSize() { return 2 * sizeof(uint16_t); } - void InitializeInputSentence(const std::string& parse_file, const std::string& index_map_file) { FreeSentenceVariables(); @@ -384,26 +366,12 @@ struct SoftKBestSynFeatureImpl { } void SetSoftKBestSynFeature(const Hypergraph::Edge& edge, - SparseVector* features, - const vector& ant_states, - void* state) { - vector vec_pos; + SparseVector* features) { if (vec_parsed_tree_.size() == 0) return; - uint16_t* remnant = reinterpret_cast(state); - short int mapped_begin, mapped_end; MapIndex(edge.i_, edge.j_ - 1, mapped_begin, mapped_end); - remnant[0] = mapped_begin; - remnant[1] = mapped_end; - - for (size_t i = 0; i < edge.tail_nodes_.size(); i++) { - const uint16_t* astate = reinterpret_cast(ant_states[i]); - vec_pos.push_back(astate[0]); - vec_pos.push_back(astate[1]); - } - // soft feature for the whole span const MapDouble* pMapFeature = GenerateSoftFeature(mapped_begin, mapped_end, map_features_); @@ -646,7 +614,6 @@ struct SoftKBestSynFeatureImpl { SoftKBestSynFeature::SoftKBestSynFeature(std::string param) { pimpl_ = new SoftKBestSynFeatureImpl(param); - SetStateSize(SoftKBestSynFeatureImpl::ReserveStateSize()); name_ = "SoftKBestSynFeature"; } @@ -663,9 +630,9 @@ void SoftKBestSynFeature::PrepareForInput(const SentenceMetadata& smeta) { void SoftKBestSynFeature::TraversalFeaturesImpl( const SentenceMetadata& /* smeta */, const Hypergraph::Edge& edge, - const vector& ant_states, SparseVector* features, - SparseVector* /*estimated_features*/, void* state) const { - pimpl_->SetSoftKBestSynFeature(edge, features, ant_states, state); + const vector& /*ant_states*/, SparseVector* features, + SparseVector* /*estimated_features*/, void* /*state*/) const { + pimpl_->SetSoftKBestSynFeature(edge, features); } string SoftKBestSynFeature::usage(bool /*param*/, bool /*verbose*/) { -- cgit v1.2.3 From 9dff48152f8139c944ef15c658d939060e52b43c Mon Sep 17 00:00:00 2001 From: "Wu, Ke" Date: Sat, 6 Dec 2014 16:32:35 -0500 Subject: Remove unused code --- decoder/ff_soft_syn.cc | 329 ------------------------------------------------- decoder/ff_soft_syn.h | 52 -------- 2 files changed, 381 deletions(-) (limited to 'decoder/ff_soft_syn.cc') diff --git a/decoder/ff_soft_syn.cc b/decoder/ff_soft_syn.cc index c108e58c..970e532b 100644 --- a/decoder/ff_soft_syn.cc +++ b/decoder/ff_soft_syn.cc @@ -324,332 +324,3 @@ boost::shared_ptr SoftSynFeatureFactory::Create( std::string SoftSynFeatureFactory::usage(bool params, bool verbose) const { return SoftSynFeature::usage(params, verbose); } - -typedef HASH_MAP MapDouble; -typedef HASH_MAP MapDoubleFeatures; - -/* - * Note: - * In BOLT experiments, we need to merged some sequence words into one term - *(like from "1999 nian 1 yue 10 ri" to "1999_nian_1_yue_10_ri") due to some - *reasons; - * but in the parse file, we still use the parse tree before merging any - *words; - * therefore, the words in source sentence and parse tree diverse and we - *need to map a word in merged sentence into its original index; - * a word in source sentence maps 1 or more words in parse tree - * the index map info is stored at variable index_map_; - * if the index_map_ is NULL, indicating the word index in source sentence - *and parse tree is always same. - * - */ - -struct SoftKBestSynFeatureImpl { - SoftKBestSynFeatureImpl(const string& /*params*/) { - index_map_ = NULL; - - map_features_ = NULL; - } - - ~SoftKBestSynFeatureImpl() { FreeSentenceVariables(); } - - void InitializeInputSentence(const std::string& parse_file, - const std::string& index_map_file) { - FreeSentenceVariables(); - ReadParseTree(parse_file, vec_parsed_tree_, vec_tree_prob_); - - if (index_map_file != "") ReadIndexMap(index_map_file); - - // we can do the features "off-line" - map_features_ = new MapDoubleFeatures(); - InitializeFeatures(map_features_); - } - - void SetSoftKBestSynFeature(const Hypergraph::Edge& edge, - SparseVector* features) { - if (vec_parsed_tree_.size() == 0) return; - - short int mapped_begin, mapped_end; - MapIndex(edge.i_, edge.j_ - 1, mapped_begin, mapped_end); - - // soft feature for the whole span - const MapDouble* pMapFeature = - GenerateSoftFeature(mapped_begin, mapped_end, map_features_); - for (MapDouble::const_iterator iter = pMapFeature->begin(); - iter != pMapFeature->end(); iter++) { - int f_id = FD::Convert(iter->first); - if (f_id) features->set_value(f_id, iter->second); - } - } - - private: - void ReadIndexMap(const std::string& index_map_file) { - vector terms; - { - ReadFile file(index_map_file); - string line; - assert(getline(*file.stream(), line)); - SplitOnWhitespace(line, &terms); - } - - index_map_ = new short int[terms.size() + 1]; - int ix = 0; - size_t i; - for (i = 0; i < terms.size(); i++) { - index_map_[i] = ix; - ix += atoi(terms[i].c_str()); - } - index_map_[i] = ix; - assert(vec_parsed_tree_.size() == 0 || - ix == vec_parsed_tree_[0]->m_vecTerminals.size()); - } - - void MapIndex(short int begin, short int end, short int& mapped_begin, - short int& mapped_end) { - if (index_map_ == NULL) { - mapped_begin = begin; - mapped_end = end; - return; - } - - mapped_begin = index_map_[begin]; - mapped_end = index_map_[end + 1] - 1; - } - - /* - * ff_const_reorder.cc::ConstReorderFeatureImpl also defines this function - */ - void FindConsts(const SParsedTree* tree, int begin, int end, - vector& consts) { - STreeItem* item; - item = tree->m_vecTerminals[begin]->m_ptParent; - while (true) { - while (item->m_ptParent != NULL && - item->m_ptParent->m_iBegin == item->m_iBegin && - item->m_ptParent->m_iEnd <= end) - item = item->m_ptParent; - - if (item->m_ptParent == NULL && item->m_vecChildren.size() == 1 && - strcmp(item->m_pszTerm, "ROOT") == 0) - item = item->m_vecChildren[0]; // we automatically add a "ROOT" node at - // the top, skip it if necessary. - - consts.push_back(item); - if (item->m_iEnd < end) - item = tree->m_vecTerminals[item->m_iEnd + 1]->m_ptParent; - else - break; - } - } - - /* - * according to Marton & Resnik (2008) - * a span cann't have both X+ style and X= style features - * a constituent XP is crossed only if the span not only covers parts of XP's - *content, but also covers one or more words outside XP - * a span may have X+, Y+ - * - * (note, we refer X* features to X= features in Marton & Resnik (2008)) - */ - void GenerateSoftFeature(int begin, int end, - const vector& vec_tree, - const vector& vec_prob, - MapDouble* pMapFeature) { - - for (size_t i = 0; i < vec_tree.size(); i++) { - const SParsedTree* tree = vec_tree[i]; - vector vecNode; - FindConsts(tree, begin, end, vecNode); - - if (vecNode.size() == 1) { - // match to one constituent - string feature_name = string(vecNode[0]->m_pszTerm) + string("*"); - MapDouble::iterator iter = pMapFeature->find(feature_name); - if (iter != pMapFeature->end()) { - iter->second += vec_prob[i]; - } else - (*pMapFeature)[feature_name] = vec_prob[i]; - } else { - // match to multiple constituents, find the lowest common parent (lcp) - STreeItem* lcp = vecNode[0]; - while (lcp->m_iEnd < end) lcp = lcp->m_ptParent; - - for (size_t j = 0; j < vecNode.size(); j++) { - STreeItem* item = vecNode[j]; - - while (item != lcp) { - if (item->m_iBegin < begin || item->m_iEnd > end) { - // item is crossed - string feature_name = string(item->m_pszTerm) + string("+"); - MapDouble::iterator iter = pMapFeature->find(feature_name); - if (iter != pMapFeature->end()) { - iter->second += vec_prob[i]; - } else - (*pMapFeature)[feature_name] = vec_prob[i]; - } - if (item->m_iBrotherIndex > 0 && - item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] - ->m_iBegin >= begin && - item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] - ->m_iEnd <= end) - break; // we don't want to collect crossed constituents twice - item = item->m_ptParent; - } - } - } - } - } - - const MapDouble* GenerateSoftFeature(int begin, int end, - MapDoubleFeatures* map_features) { - string key; - GenerateKey(begin, end, key); - MapDoubleFeatures::const_iterator iter = (*map_features).find(key); - assert(iter != map_features->end()); - return iter->second; - } - - void Byte_to_Char(unsigned char* str, int n) { - str[0] = (n & 255); - str[1] = n / 256; - } - - void GenerateKey(int begin, int end, string& key) { - unsigned char szTerm[1001]; - Byte_to_Char(szTerm, begin); - Byte_to_Char(szTerm + 2, end); - szTerm[4] = '\0'; - key = string(szTerm, szTerm + 4); - } - - void InitializeFeatures(MapDoubleFeatures* map_features) { - if (vec_parsed_tree_.size() == 0) return; - - const SParsedTree* pTree = vec_parsed_tree_[0]; - - vector vec_prob; - vec_prob.reserve(vec_tree_prob_.size()); - double tmp = 0.0; - for (size_t i = 0; i < vec_tree_prob_.size(); i++) { - vec_prob.push_back(pow(10, vec_tree_prob_[i] - vec_tree_prob_[0])); - tmp += vec_prob[i]; - } - for (size_t i = 0; i < vec_prob.size(); i++) vec_prob[i] /= tmp; - - for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) - for (size_t j = i; j < pTree->m_vecTerminals.size(); j++) { - MapDouble* pMap = new MapDouble(); - GenerateSoftFeature(i, j, vec_parsed_tree_, vec_prob, pMap); - string key; - GenerateKey(i, j, key); - (*map_features)[key] = pMap; - } - } - - void FreeSentenceVariables() { - for (size_t i = 0; i < vec_parsed_tree_.size(); i++) { - if (vec_parsed_tree_[i] != NULL) delete vec_parsed_tree_[i]; - } - vec_parsed_tree_.clear(); - vec_tree_prob_.clear(); - if (index_map_ != NULL) delete[] index_map_; - index_map_ = NULL; - - if (map_features_ != NULL) { - for (MapDoubleFeatures::iterator iter = map_features_->begin(); - iter != map_features_->end(); iter++) - delete iter->second; - delete map_features_; - } - } - - void ReadParseTree(const std::string& parse_file, - vector& vec_tree, vector& vec_prob) { - ReadFile in(parse_file); - SParsedTree* tree; - string line; - while (getline(*in.stream(), line)) { - const char* p = strchr(line.c_str(), ' '); - assert(p != NULL); - string strProb = line.substr(0, line.find(' ')); - tree = SParsedTree::fnConvertFromString(p + 1); - tree->fnSetSpanInfo(); - tree->fnSetHeadWord(); - vec_tree.push_back(tree); - if (strProb == string("-Infinity")) { - vec_prob.push_back(-99.0); - break; - } else { - vec_prob.push_back(atof(strProb.c_str())); - } - } - } - - void ReadParseTree2(const std::string& parse_file, - vector& vec_tree, - vector& vec_prob) { - SParseReader* reader = new SParseReader(parse_file.c_str(), false); - double prob; - SParsedTree* tree; - while ((tree = reader->fnReadNextParseTreeWithProb(&prob)) != NULL) { - vec_tree.push_back(tree); - if (std::isinf(prob)) { - vec_prob.push_back(-99); - break; - } else - vec_prob.push_back(prob); - } - // assert(tree != NULL); - delete reader; - } - - private: - vector vec_parsed_tree_; - vector vec_tree_prob_; - - short int* index_map_; - - MapDoubleFeatures* map_features_; -}; - -SoftKBestSynFeature::SoftKBestSynFeature(std::string param) { - pimpl_ = new SoftKBestSynFeatureImpl(param); - name_ = "SoftKBestSynFeature"; -} - -SoftKBestSynFeature::~SoftKBestSynFeature() { delete pimpl_; } - -void SoftKBestSynFeature::PrepareForInput(const SentenceMetadata& smeta) { - string parse_file = smeta.GetSGMLValue("kbestparse"); - assert(parse_file != ""); - - string indexmap_file = smeta.GetSGMLValue("index-map"); - - pimpl_->InitializeInputSentence(parse_file, indexmap_file); -} - -void SoftKBestSynFeature::TraversalFeaturesImpl( - const SentenceMetadata& /* smeta */, const Hypergraph::Edge& edge, - const vector& /*ant_states*/, SparseVector* features, - SparseVector* /*estimated_features*/, void* /*state*/) const { - pimpl_->SetSoftKBestSynFeature(edge, features); -} - -string SoftKBestSynFeature::usage(bool /*param*/, bool /*verbose*/) { - return "SoftKBestSynFeature"; -} - -boost::shared_ptr CreateSoftKBestSynFeatureModel( - std::string param) { - SoftKBestSynFeature* ret = new SoftKBestSynFeature(param); - return boost::shared_ptr(ret); -} - -boost::shared_ptr SoftKBestSynFeatureFactory::Create( - std::string param) const { - return CreateSoftKBestSynFeatureModel(param); -} - -std::string SoftKBestSynFeatureFactory::usage(bool params, bool verbose) const { - return SoftKBestSynFeature::usage(params, verbose); -} diff --git a/decoder/ff_soft_syn.h b/decoder/ff_soft_syn.h index 21618bc0..df9a6cc8 100644 --- a/decoder/ff_soft_syn.h +++ b/decoder/ff_soft_syn.h @@ -35,56 +35,4 @@ struct SoftSynFeatureFactory : public FactoryBase { std::string usage(bool params, bool verbose) const; }; -struct SoftKBestSynFeatureImpl; - -class SoftKBestSynFeature : public FeatureFunction { - public: - SoftKBestSynFeature(std::string param); - ~SoftKBestSynFeature(); - static std::string usage(bool param, bool verbose); - - protected: - virtual void PrepareForInput(const SentenceMetadata& smeta); - - virtual void TraversalFeaturesImpl( - const SentenceMetadata& smeta, const HG::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, SparseVector* estimated_features, - void* out_context) const; - - private: - SoftKBestSynFeatureImpl* pimpl_; -}; - -struct SoftKBestSynFeatureFactory : public FactoryBase { - FP Create(std::string param) const; - std::string usage(bool params, bool verbose) const; -}; - -struct SoftForestSynFeatureImpl; - -class SoftForestSynFeature : public FeatureFunction { - public: - SoftForestSynFeature(std::string param); - ~SoftForestSynFeature(); - static std::string usage(bool param, bool verbose); - - protected: - virtual void PrepareForInput(const SentenceMetadata& smeta); - - virtual void TraversalFeaturesImpl( - const SentenceMetadata& smeta, const HG::Edge& edge, - const std::vector& ant_contexts, - SparseVector* features, SparseVector* estimated_features, - void* out_context) const; - - private: - SoftForestSynFeatureImpl* pimpl_; -}; - -struct SoftForestSynFeatureFactory : public FactoryBase { - FP Create(std::string param) const; - std::string usage(bool params, bool verbose) const; -}; - #endif /* FF_SOFT_SYN_H_ */ -- cgit v1.2.3