diff options
Diffstat (limited to 'decoder')
| -rw-r--r-- | decoder/Makefile.am | 2 | ||||
| -rw-r--r-- | decoder/cdec_ff.cc | 2 | ||||
| -rw-r--r-- | decoder/ff_soft_syn.cc | 326 | ||||
| -rw-r--r-- | decoder/ff_soft_syn.h | 38 | 
4 files changed, 368 insertions, 0 deletions
| diff --git a/decoder/Makefile.am b/decoder/Makefile.am index f791b3cc..727e5af5 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -60,6 +60,7 @@ libcdec_a_SOURCES = \    ff_rules.h \    ff_ruleshape.h \    ff_sample_fsa.h \ +  ff_soft_syn.h \    ff_soft_syntax.h \    ff_soft_syntax_mindist.h \    ff_source_path.h \ @@ -123,6 +124,7 @@ libcdec_a_SOURCES = \    ff_parse_match.cc \    ff_rules.cc \    ff_ruleshape.cc \ +  ff_soft_syn.cc \    ff_soft_syntax.cc \    ff_soft_syntax_mindist.cc \    ff_source_path.cc \ diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index 6f7227aa..973a643a 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -15,6 +15,7 @@  #include "ff_rules.h"  #include "ff_ruleshape.h"  #include "ff_bleu.h" +#include "ff_soft_syn.h"  #include "ff_soft_syntax.h"  #include "ff_soft_syntax_mindist.h"  #include "ff_source_path.h" @@ -80,4 +81,5 @@ void register_feature_functions() {    ff_registry.Register("WordSet", new FFFactory<WordSet>);    ff_registry.Register("ConstReorderFeature", new FFFactory<ConstReorderFeature>);    ff_registry.Register("External", new FFFactory<ExternalFeature>); +  ff_registry.Register("SoftSynFeature", new SoftSynFeatureFactory());  } diff --git a/decoder/ff_soft_syn.cc b/decoder/ff_soft_syn.cc new file mode 100644 index 00000000..970e532b --- /dev/null +++ b/decoder/ff_soft_syn.cc @@ -0,0 +1,326 @@ +/* + * ff_soft_syn.cc + * + */ +#include "ff_soft_syn.h" + +#include "filelib.h" +#include "stringlib.h" +#include "hg.h" +#include "sentence_metadata.h" +#include "ff_const_reorder_common.h" + +#include <string> +#include <vector> +#include <stdio.h> + +using namespace std; +using namespace const_reorder; + +typedef HASH_MAP<std::string, vector<string> > MapFeatures; + +/* + * Note: + *      In BOLT experiments, we need to merged some sequence words into one term + *(like from "1999 nian 1 yue 10 ri" to "1999_nian_1_yue_10_ri") due to some + *reasons; + *      but in the parse file, we still use the parse tree before merging any + *words; + *      therefore, the words in source sentence and parse tree diverse and we + *need to map a word in merged sentence into its original index; + *      a word in source sentence maps 1 or more words in parse tree + *      the index map info is stored at variable index_map_; + *      if the index_map_ is NULL, indicating the word index in source sentence + *and parse tree is always same. + * + */ + +struct SoftSynFeatureImpl { +  SoftSynFeatureImpl(const string& /*params*/) { +    parsed_tree_ = NULL; +    index_map_ = NULL; + +    map_features_ = NULL; +  } + +  ~SoftSynFeatureImpl() { FreeSentenceVariables(); } + +  void InitializeInputSentence(const std::string& parse_file, +                               const std::string& index_map_file) { +    FreeSentenceVariables(); +    parsed_tree_ = ReadParseTree(parse_file); + +    if (index_map_file != "") ReadIndexMap(index_map_file); + +    // we can do the features "off-line" +    map_features_ = new MapFeatures(); +    InitializeFeatures(map_features_); +  } + +  void ReadIndexMap(const std::string& index_map_file) { +    vector<string> terms; +    { +      ReadFile file(index_map_file); +      string line; +      assert(getline(*file.stream(), line)); +      SplitOnWhitespace(line, &terms); +    } + +    index_map_ = new short int[terms.size() + 1]; +    int ix = 0; +    size_t i; +    for (i = 0; i < terms.size(); i++) { +      index_map_[i] = ix; +      ix += atoi(terms[i].c_str()); +    } +    index_map_[i] = ix; +    assert(parsed_tree_ == NULL || ix == parsed_tree_->m_vecTerminals.size()); +  } + +  void MapIndex(short int begin, short int end, short int& mapped_begin, +                short int& mapped_end) { +    if (index_map_ == NULL) { +      mapped_begin = begin; +      mapped_end = end; +      return; +    } + +    mapped_begin = index_map_[begin]; +    mapped_end = index_map_[end + 1] - 1; +  } + +  /* +   * ff_const_reorder.cc::ConstReorderFeatureImpl also defines this function +   */ +  void FindConsts(const SParsedTree* tree, int begin, int end, +                  vector<STreeItem*>& consts) { +    STreeItem* item; +    item = tree->m_vecTerminals[begin]->m_ptParent; +    while (true) { +      while (item->m_ptParent != NULL && +             item->m_ptParent->m_iBegin == item->m_iBegin && +             item->m_ptParent->m_iEnd <= end) +        item = item->m_ptParent; + +      if (item->m_ptParent == NULL && item->m_vecChildren.size() == 1 && +          strcmp(item->m_pszTerm, "ROOT") == 0) +        item = item->m_vecChildren[0];  // we automatically add a "ROOT" node at +                                        // the top, skip it if necessary. + +      consts.push_back(item); +      if (item->m_iEnd < end) +        item = tree->m_vecTerminals[item->m_iEnd + 1]->m_ptParent; +      else +        break; +    } +  } + +  /* +   * according to Marton & Resnik (2008) +   * a span cann't have both X+ style and X= style features +   * a constituent XP is crossed only if the span not only covers parts of XP's +   *content, but also covers one or more words outside XP +   * a span may have X+, Y+ +   * +   * (note, we refer X* features to X= features in Marton & Resnik (2008)) +   */ +  void GenerateSoftFeature(int begin, int end, const SParsedTree* tree, +                           vector<string>& vecFeature) { +    vector<STreeItem*> vecNode; +    FindConsts(tree, begin, end, vecNode); + +    if (vecNode.size() == 1) { +      // match to one constituent +      string feature_name = string(vecNode[0]->m_pszTerm) + string("*"); +      vecFeature.push_back(feature_name); +    } else { +      // match to multiple constituents, find the lowest common parent (lcp) +      STreeItem* lcp = vecNode[0]; +      while (lcp->m_iEnd < end) lcp = lcp->m_ptParent; + +      for (size_t i = 0; i < vecNode.size(); i++) { +        STreeItem* item = vecNode[i]; + +        while (item != lcp) { +          if (item->m_iBegin < begin || item->m_iEnd > end) { +            // item is crossed +            string feature_name = string(item->m_pszTerm) + string("+"); +            vecFeature.push_back(feature_name); +          } +          if (item->m_iBrotherIndex > 0 && +              item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] +                      ->m_iBegin >= begin && +              item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] +                      ->m_iEnd <= end) +            break;  // we don't want to collect crossed constituents twice +          item = item->m_ptParent; +        } +      } +    } +  } + +  void GenerateSoftFeatureFromFlattenedTree(int begin, int end, +                                            const SParsedTree* tree, +                                            vector<string>& vecFeature) { +    vector<STreeItem*> vecNode; +    FindConsts(tree, begin, end, vecNode); + +    if (vecNode.size() == 1) { +      // match to one constituent +      string feature_name = string(vecNode[0]->m_pszTerm) + string("*"); +      vecFeature.push_back(feature_name); +    } else { +      // match to multiple constituents, see if they have a common parent +      size_t i = 0; +      for (i = 1; i < vecNode.size(); i++) { +        if (vecNode[i]->m_ptParent != vecNode[0]->m_ptParent) break; +      } +      if (i == vecNode.size()) { +        // they share a common parent +        string feature_name = +            string(vecNode[0]->m_ptParent->m_pszTerm) + string("&"); +        vecFeature.push_back(feature_name); +      } else { +        // they don't share a common parent, find the lowest common parent (lcp) +        STreeItem* lcp = vecNode[0]; +        while (lcp->m_iEnd < end) lcp = lcp->m_ptParent; + +        for (size_t i = 0; i < vecNode.size(); i++) { +          STreeItem* item = vecNode[i]; + +          while (item != lcp) { +            if (item->m_iBegin < begin || item->m_iEnd > end) { +              // item is crossed +              string feature_name = string(item->m_pszTerm) + string("+"); +              vecFeature.push_back(feature_name); +            } +            if (item->m_iBrotherIndex > 0 && +                item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] +                        ->m_iBegin >= begin && +                item->m_ptParent->m_vecChildren[item->m_iBrotherIndex - 1] +                        ->m_iEnd <= end) +              break;  // we don't want to collect crossed constituents twice +            item = item->m_ptParent; +          } +        } +      } +    } +  } + +  void SetSoftSynFeature(const Hypergraph::Edge& edge, +                         SparseVector<double>* features) { +    if (parsed_tree_ == NULL) return; + +    short int mapped_begin, mapped_end; +    MapIndex(edge.i_, edge.j_ - 1, mapped_begin, mapped_end); + +    // soft feature for the whole span +    const vector<string> vecFeature = +        GenerateSoftFeature(mapped_begin, mapped_end, map_features_); +    for (size_t i = 0; i < vecFeature.size(); i++) { +      int f_id = FD::Convert(vecFeature[i]); +      if (f_id) features->set_value(f_id, 1); +    } +  } + + private: +  const vector<string>& GenerateSoftFeature(int begin, int end, +                                            MapFeatures* map_features) { +    string key; +    GenerateKey(begin, end, key); +    MapFeatures::const_iterator iter = (*map_features).find(key); +    assert(iter != map_features->end()); +    return iter->second; +  } + +  void Byte_to_Char(unsigned char* str, int n) { +    str[0] = (n & 255); +    str[1] = n / 256; +  } + +  void GenerateKey(int begin, int end, string& key) { +    unsigned char szTerm[1001]; +    Byte_to_Char(szTerm, begin); +    Byte_to_Char(szTerm + 2, end); +    szTerm[4] = '\0'; +    key = string(szTerm, szTerm + 4); +  } + +  void InitializeFeatures(MapFeatures* map_features) { +    if (parsed_tree_ == NULL) return; + +    for (size_t i = 0; i < parsed_tree_->m_vecTerminals.size(); i++) +      for (size_t j = i; j < parsed_tree_->m_vecTerminals.size(); j++) { +        vector<string> vecFeature; +        GenerateSoftFeature(i, j, parsed_tree_, vecFeature); +        string key; +        GenerateKey(i, j, key); +        (*map_features)[key] = vecFeature; +      } +  } + +  void FreeSentenceVariables() { +    if (parsed_tree_ != NULL) delete parsed_tree_; +    if (index_map_ != NULL) delete[] index_map_; +    index_map_ = NULL; + +    if (map_features_ != NULL) delete map_features_; +  } + +  SParsedTree* ReadParseTree(const std::string& parse_file) { +    SParseReader* reader = new SParseReader(parse_file.c_str(), false); +    SParsedTree* tree = reader->fnReadNextParseTree(); +    // assert(tree != NULL); +    delete reader; +    return tree; +  } + + private: +  SParsedTree* parsed_tree_; + +  short int* index_map_; + +  MapFeatures* map_features_; +}; + +SoftSynFeature::SoftSynFeature(std::string param) { +  pimpl_ = new SoftSynFeatureImpl(param); +  name_ = "SoftSynFeature"; +} + +SoftSynFeature::~SoftSynFeature() { delete pimpl_; } + +void SoftSynFeature::PrepareForInput(const SentenceMetadata& smeta) { +  string parse_file = smeta.GetSGMLValue("parse"); +  assert(parse_file != ""); + +  string indexmap_file = smeta.GetSGMLValue("index-map"); + +  pimpl_->InitializeInputSentence(parse_file, indexmap_file); +} + +void SoftSynFeature::TraversalFeaturesImpl( +    const SentenceMetadata& /*smeta*/, const Hypergraph::Edge& edge, +    const vector<const void*>& /*ant_states*/, SparseVector<double>* features, +    SparseVector<double>* /*estimated_features*/, void* /*state*/) const { +  pimpl_->SetSoftSynFeature(edge, features); +} + +string SoftSynFeature::usage(bool /*param*/, bool /*verbose*/) { +  return "SoftSynFeature"; +} + +boost::shared_ptr<FeatureFunction> CreateSoftSynFeatureModel( +    std::string param) { +  SoftSynFeature* ret = new SoftSynFeature(param); +  return boost::shared_ptr<FeatureFunction>(ret); +} + +boost::shared_ptr<FeatureFunction> SoftSynFeatureFactory::Create( +    std::string param) const { +  return CreateSoftSynFeatureModel(param); +} + +std::string SoftSynFeatureFactory::usage(bool params, bool verbose) const { +  return SoftSynFeature::usage(params, verbose); +} diff --git a/decoder/ff_soft_syn.h b/decoder/ff_soft_syn.h new file mode 100644 index 00000000..df9a6cc8 --- /dev/null +++ b/decoder/ff_soft_syn.h @@ -0,0 +1,38 @@ +/* + * ff_soft_syn.h + * + */ + +#ifndef FF_SOFT_SYN_H_ +#define FF_SOFT_SYN_H_ + +#include "ff_factory.h" +#include "ff.h" + +struct SoftSynFeatureImpl; + +class SoftSynFeature : public FeatureFunction { + public: +  SoftSynFeature(std::string param); +  ~SoftSynFeature(); +  static std::string usage(bool param, bool verbose); + + protected: +  virtual void PrepareForInput(const SentenceMetadata& smeta); + +  virtual void TraversalFeaturesImpl( +      const SentenceMetadata& smeta, const HG::Edge& edge, +      const std::vector<const void*>& ant_contexts, +      SparseVector<double>* features, SparseVector<double>* estimated_features, +      void* out_context) const; + + private: +  SoftSynFeatureImpl* pimpl_; +}; + +struct SoftSynFeatureFactory : public FactoryBase<FeatureFunction> { +  FP Create(std::string param) const; +  std::string usage(bool params, bool verbose) const; +}; + +#endif /* FF_SOFT_SYN_H_ */ | 
