summaryrefslogtreecommitdiff
path: root/decoder/ff_soft_syn.cc
diff options
context:
space:
mode:
Diffstat (limited to 'decoder/ff_soft_syn.cc')
-rw-r--r--decoder/ff_soft_syn.cc326
1 files changed, 326 insertions, 0 deletions
diff --git a/decoder/ff_soft_syn.cc b/decoder/ff_soft_syn.cc
new file mode 100644
index 00000000..970e532b
--- /dev/null
+++ b/decoder/ff_soft_syn.cc
@@ -0,0 +1,326 @@
+/*
+ * ff_soft_syn.cc
+ *
+ */
+#include "ff_soft_syn.h"
+
+#include "filelib.h"
+#include "stringlib.h"
+#include "hg.h"
+#include "sentence_metadata.h"
+#include "ff_const_reorder_common.h"
+
+#include <string>
+#include <vector>
+#include <stdio.h>
+
+using namespace std;
+using namespace const_reorder;
+
+typedef HASH_MAP<std::string, vector<string> > MapFeatures;
+
+/*
+ * Note:
+ * In BOLT experiments, we need to merged some sequence words into one term
+ *(like from "1999 nian 1 yue 10 ri" to "1999_nian_1_yue_10_ri") due to some
+ *reasons;
+ * but in the parse file, we still use the parse tree before merging any
+ *words;
+ * therefore, the words in source sentence and parse tree diverse and we
+ *need to map a word in merged sentence into its original index;
+ * a word in source sentence maps 1 or more words in parse tree
+ * the index map info is stored at variable index_map_;
+ * if the index_map_ is NULL, indicating the word index in source sentence
+ *and parse tree is always same.
+ *
+ */
+
+struct SoftSynFeatureImpl {
+ SoftSynFeatureImpl(const string& /*params*/) {
+ parsed_tree_ = NULL;
+ index_map_ = NULL;
+
+ map_features_ = NULL;
+ }
+
+ ~SoftSynFeatureImpl() { FreeSentenceVariables(); }
+
+ void InitializeInputSentence(const std::string& parse_file,
+ const std::string& index_map_file) {
+ FreeSentenceVariables();
+ parsed_tree_ = ReadParseTree(parse_file);
+
+ if (index_map_file != "") ReadIndexMap(index_map_file);
+
+ // we can do the features "off-line"
+ map_features_ = new MapFeatures();
+ InitializeFeatures(map_features_);
+ }
+
+ void ReadIndexMap(const std::string& index_map_file) {
+ vector<string> terms;
+ {
+ ReadFile file(index_map_file);
+ string line;
+ assert(getline(*file.stream(), line));
+ SplitOnWhitespace(line, &terms);
+ }
+
+ index_map_ = new short int[terms.size() + 1];
+ int ix = 0;
+ size_t i;
+ for (i = 0; i < terms.size(); i++) {
+ index_map_[i] = ix;
+ ix += atoi(terms[i].c_str());
+ }
+ index_map_[i] = ix;
+ assert(parsed_tree_ == NULL || ix == parsed_tree_->m_vecTerminals.size());
+ }
+
+ void MapIndex(short int begin, short int end, short int& mapped_begin,
+ short int& mapped_end) {
+ if (index_map_ == NULL) {
+ mapped_begin = begin;
+ mapped_end = end;
+ return;
+ }
+
+ mapped_begin = index_map_[begin];
+ mapped_end = index_map_[end + 1] - 1;
+ }
+
+ /*
+ * ff_const_reorder.cc::ConstReorderFeatureImpl also defines this function
+ */
+ void FindConsts(const SParsedTree* tree, int begin, int end,
+ vector<STreeItem*>& consts) {
+ STreeItem* item;
+ item = tree->m_vecTerminals[begin]->m_ptParent;
+ while (true) {
+ while (item->m_ptParent != NULL &&
+ item->m_ptParent->m_iBegin == item->m_iBegin &&
+ item->m_ptParent->m_iEnd <= end)
+ item = item->m_ptParent;
+
+ if (item->m_ptParent == NULL && item->m_vecChildren.size() == 1 &&
+ strcmp(item->m_pszTerm, "ROOT") == 0)
+ item = item->m_vecChildren[0]; // we automatically add a "ROOT" node at
+ // the top, skip it if necessary.
+
+ consts.push_back(item);
+ if (item->m_iEnd < end)
+ item = tree->m_vecTerminals[item->m_iEnd + 1]->m_ptParent;
+ else
+ break;
+ }
+ }
+
+ /*
+ * according to Marton & Resnik (2008)
+ * a span cann't have both X+ style and X= style features
+ * a constituent XP is crossed only if the span not only covers parts of XP's
+ *content, but also covers one or more words outside XP
+ * a span may have X+, Y+
+ *
+ * (note, we refer X* features to X= features in Marton & Resnik (2008))
+ */
+ void GenerateSoftFeature(int begin, int end, const SParsedTree* tree,
+ vector<string>& vecFeature) {
+ vector<STreeItem*> vecNode;
+ FindConsts(tree, begin, end, vecNode);
+
+ if (vecNode.size() == 1) {
+ // match to one constituent
+ string feature_name = string(vecNode[0]->m_pszTerm) + string("*");
+ vecFeature.push_back(feature_name);
+ } else {
+ // match to multiple constituents, find the lowest common parent (lcp)
+ STreeItem* lcp = vecNode[0];
+ while (lcp->m_iEnd < end) lcp = lcp->m_ptParent;
+
+ for (size_t i = 0; i < vecNode.size(); i++) {
+ STreeItem* item = vecNode[i];
+
+ while (item != lcp) {
+ if (item->m_iBegin < begin || item->m_iEnd > end) {
+ // item is crossed
+ string feature_name = string(item->m_pszTerm) + string("+");
+ vecFeature.push_back(feature_name);
+ }
+ if (item->m_iBrotherIndex > 0 &&
+ item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1]
+ ->m_iBegin >= begin &&
+ item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1]
+ ->m_iEnd <= end)
+ break; // we don't want to collect crossed constituents twice
+ item = item->m_ptParent;
+ }
+ }
+ }
+ }
+
+ void GenerateSoftFeatureFromFlattenedTree(int begin, int end,
+ const SParsedTree* tree,
+ vector<string>& vecFeature) {
+ vector<STreeItem*> vecNode;
+ FindConsts(tree, begin, end, vecNode);
+
+ if (vecNode.size() == 1) {
+ // match to one constituent
+ string feature_name = string(vecNode[0]->m_pszTerm) + string("*");
+ vecFeature.push_back(feature_name);
+ } else {
+ // match to multiple constituents, see if they have a common parent
+ size_t i = 0;
+ for (i = 1; i < vecNode.size(); i++) {
+ if (vecNode[i]->m_ptParent != vecNode[0]->m_ptParent) break;
+ }
+ if (i == vecNode.size()) {
+ // they share a common parent
+ string feature_name =
+ string(vecNode[0]->m_ptParent->m_pszTerm) + string("&");
+ vecFeature.push_back(feature_name);
+ } else {
+ // they don't share a common parent, find the lowest common parent (lcp)
+ STreeItem* lcp = vecNode[0];
+ while (lcp->m_iEnd < end) lcp = lcp->m_ptParent;
+
+ for (size_t i = 0; i < vecNode.size(); i++) {
+ STreeItem* item = vecNode[i];
+
+ while (item != lcp) {
+ if (item->m_iBegin < begin || item->m_iEnd > end) {
+ // item is crossed
+ string feature_name = string(item->m_pszTerm) + string("+");
+ vecFeature.push_back(feature_name);
+ }
+ if (item->m_iBrotherIndex > 0 &&
+ item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1]
+ ->m_iBegin >= begin &&
+ item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1]
+ ->m_iEnd <= end)
+ break; // we don't want to collect crossed constituents twice
+ item = item->m_ptParent;
+ }
+ }
+ }
+ }
+ }
+
+ void SetSoftSynFeature(const Hypergraph::Edge& edge,
+ SparseVector<double>* features) {
+ if (parsed_tree_ == NULL) return;
+
+ short int mapped_begin, mapped_end;
+ MapIndex(edge.i_, edge.j_ - 1, mapped_begin, mapped_end);
+
+ // soft feature for the whole span
+ const vector<string> vecFeature =
+ GenerateSoftFeature(mapped_begin, mapped_end, map_features_);
+ for (size_t i = 0; i < vecFeature.size(); i++) {
+ int f_id = FD::Convert(vecFeature[i]);
+ if (f_id) features->set_value(f_id, 1);
+ }
+ }
+
+ private:
+ const vector<string>& GenerateSoftFeature(int begin, int end,
+ MapFeatures* map_features) {
+ string key;
+ GenerateKey(begin, end, key);
+ MapFeatures::const_iterator iter = (*map_features).find(key);
+ assert(iter != map_features->end());
+ return iter->second;
+ }
+
+ void Byte_to_Char(unsigned char* str, int n) {
+ str[0] = (n & 255);
+ str[1] = n / 256;
+ }
+
+ void GenerateKey(int begin, int end, string& key) {
+ unsigned char szTerm[1001];
+ Byte_to_Char(szTerm, begin);
+ Byte_to_Char(szTerm + 2, end);
+ szTerm[4] = '\0';
+ key = string(szTerm, szTerm + 4);
+ }
+
+ void InitializeFeatures(MapFeatures* map_features) {
+ if (parsed_tree_ == NULL) return;
+
+ for (size_t i = 0; i < parsed_tree_->m_vecTerminals.size(); i++)
+ for (size_t j = i; j < parsed_tree_->m_vecTerminals.size(); j++) {
+ vector<string> vecFeature;
+ GenerateSoftFeature(i, j, parsed_tree_, vecFeature);
+ string key;
+ GenerateKey(i, j, key);
+ (*map_features)[key] = vecFeature;
+ }
+ }
+
+ void FreeSentenceVariables() {
+ if (parsed_tree_ != NULL) delete parsed_tree_;
+ if (index_map_ != NULL) delete[] index_map_;
+ index_map_ = NULL;
+
+ if (map_features_ != NULL) delete map_features_;
+ }
+
+ SParsedTree* ReadParseTree(const std::string& parse_file) {
+ SParseReader* reader = new SParseReader(parse_file.c_str(), false);
+ SParsedTree* tree = reader->fnReadNextParseTree();
+ // assert(tree != NULL);
+ delete reader;
+ return tree;
+ }
+
+ private:
+ SParsedTree* parsed_tree_;
+
+ short int* index_map_;
+
+ MapFeatures* map_features_;
+};
+
+SoftSynFeature::SoftSynFeature(std::string param) {
+ pimpl_ = new SoftSynFeatureImpl(param);
+ name_ = "SoftSynFeature";
+}
+
+SoftSynFeature::~SoftSynFeature() { delete pimpl_; }
+
+void SoftSynFeature::PrepareForInput(const SentenceMetadata& smeta) {
+ string parse_file = smeta.GetSGMLValue("parse");
+ assert(parse_file != "");
+
+ string indexmap_file = smeta.GetSGMLValue("index-map");
+
+ pimpl_->InitializeInputSentence(parse_file, indexmap_file);
+}
+
+void SoftSynFeature::TraversalFeaturesImpl(
+ const SentenceMetadata& /*smeta*/, const Hypergraph::Edge& edge,
+ const vector<const void*>& /*ant_states*/, SparseVector<double>* features,
+ SparseVector<double>* /*estimated_features*/, void* /*state*/) const {
+ pimpl_->SetSoftSynFeature(edge, features);
+}
+
+string SoftSynFeature::usage(bool /*param*/, bool /*verbose*/) {
+ return "SoftSynFeature";
+}
+
+boost::shared_ptr<FeatureFunction> CreateSoftSynFeatureModel(
+ std::string param) {
+ SoftSynFeature* ret = new SoftSynFeature(param);
+ return boost::shared_ptr<FeatureFunction>(ret);
+}
+
+boost::shared_ptr<FeatureFunction> SoftSynFeatureFactory::Create(
+ std::string param) const {
+ return CreateSoftSynFeatureModel(param);
+}
+
+std::string SoftSynFeatureFactory::usage(bool params, bool verbose) const {
+ return SoftSynFeature::usage(params, verbose);
+}