From 5574acc916b1938fca3bf9d41fae6ca170f73b34 Mon Sep 17 00:00:00 2001
From: "jon.h.clark"
Date: Fri, 12 Nov 2010 20:28:09 +0000
Subject: Forgot worset feature

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@717 ec762483-ff6d-05da-a07a-a48fb63a330f
---
Reviewer notes (text in this section is ignored when the patch is applied):
 * The #include targets and template arguments were missing from the copy of
   the patch under review. They are restored here from how each name is used.
   TODO confirm that std::vector<const void*> and SparseVector<double> match
   the virtual TraversalFeaturesImpl declared in ff.h, and that rule_->e_ is
   a std::vector<WordID>.
 * The "index" blob ids below are those of the original commit.

 decoder/ff_wordset.cc |  34 +++++++++++++
 decoder/ff_wordset.h  | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 160 insertions(+)
 create mode 100644 decoder/ff_wordset.cc
 create mode 100644 decoder/ff_wordset.h

(limited to 'decoder')

diff --git a/decoder/ff_wordset.cc b/decoder/ff_wordset.cc
new file mode 100644
index 00000000..44468899
--- /dev/null
+++ b/decoder/ff_wordset.cc
@@ -0,0 +1,34 @@
+#include "ff_wordset.h"
+
+#include "fdict.h"
+#include <vector>
+
+// Scores one hypergraph edge: counts the symbols in the edge's rule_->e_ that
+// are in the vocabulary (default mode) or absent from it (--oov mode) and
+// stores that count as the value of feature fid_.
+//
+// Only `edge` and `features` are used; the remaining parameters are
+// intentionally ignored.
+//
+// NOTE(review): every element of rule_->e_ is looked up as-is. If e_ can hold
+// non-terminal placeholders as well as words, --oov mode counts them as
+// out-of-vocabulary -- confirm that this is intended.
+void WordSet::TraversalFeaturesImpl(const SentenceMetadata& /* smeta */,
+                                    const Hypergraph::Edge& edge,
+                                    const std::vector<const void*>& /* ant_contexts */,
+                                    SparseVector<double>* features,
+                                    SparseVector<double>* /* estimated_features */,
+                                    void* /* context */) const {
+  const std::vector<WordID>& symbols = edge.rule_->e_;
+
+  double addScore = 0.0;
+  for (std::vector<WordID>::const_iterator it = symbols.begin(); it != symbols.end(); ++it) {
+    const bool inVocab = (vocab_.find(*it) != vocab_.end());
+    // Count in-vocabulary symbols in default mode and out-of-vocabulary ones
+    // in --oov mode, i.e. exactly when the two flags disagree.
+    if (inVocab != oovMode_) {
+      addScore += 1.0;
+    }
+  }
+  features->set_value(fid_, addScore);
+}
diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h
new file mode 100644
index 00000000..256d54bb
--- /dev/null
+++ b/decoder/ff_wordset.h
@@ -0,0 +1,126 @@
+#ifndef _FF_WORDSET_H_
+#define _FF_WORDSET_H_
+
+#include "ff.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/unordered_set.hpp>
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// Feature function whose value, for each edge, is the number of symbols in
+// the edge's rule_->e_ that belong to a vocabulary loaded from a file or,
+// with --oov, the number that do not.
+class WordSet : public FeatureFunction {
+ public:
+  // Usage: "WordSet -N featName -v vocab.txt [--oov]"
+  //
+  //   -N featName   name of the feature to emit (required)
+  //   -v vocab.txt  file with one vocabulary entry per line (required)
+  //   --oov         count symbols missing from the vocabulary instead of
+  //                 symbols found in it
+  //
+  // Prints a message to stderr and exits with status 1 if the arguments are
+  // malformed or the vocabulary file cannot be opened.
+  WordSet(const std::string& param) {
+    std::string vocabFile;
+    std::string featName;
+    parseArgs(param, &featName, &vocabFile, &oovMode_);
+
+    fid_ = FD::Convert(featName);
+
+    std::cerr << "Loading vocab for " << param << " from " << vocabFile << std::endl;
+    loadVocab(vocabFile, &vocab_);
+  }
+
+ protected:
+  virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+                                     const Hypergraph::Edge& edge,
+                                     const std::vector<const void*>& ant_contexts,
+                                     SparseVector<double>* features,
+                                     SparseVector<double>* estimated_features,
+                                     void* context) const;
+
+ private:
+  // Reads vocabFile and inserts the id of every non-blank line (after
+  // trimming surrounding whitespace) into *vocab. Exits with status 1 if the
+  // file cannot be opened.
+  static void loadVocab(const std::string& vocabFile, boost::unordered_set<WordID>* vocab) {
+    std::ifstream file(vocabFile.c_str());
+    if (!file.is_open()) {
+      std::cerr << "Unable to open file: " << vocabFile << std::endl;
+      std::exit(1);
+    }
+
+    std::string line;
+    // Loop on getline itself rather than on eof(): eof() stays false after a
+    // read failure that is not end-of-file, so a loop conditioned on it
+    // would never terminate in that case.
+    while (std::getline(file, line)) {
+      boost::trim(line);
+      if (line.empty()) {
+        continue;
+      }
+      vocab->insert(TD::Convert(line));
+    }
+  }
+
+  // Parses the space-separated option string described on the constructor
+  // into *featName, *vocabFile and *oovMode. Exits with status 1 on an
+  // unknown option, an option that lacks its value, or a missing -N / -v.
+  static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode) {
+    // Trim and compress so that leading, trailing or repeated spaces do not
+    // produce empty tokens.
+    const std::string trimmed = boost::trim_copy(args);
+    std::vector<std::string> toks;
+    boost::split(toks, trimmed, boost::is_any_of(" "), boost::token_compress_on);
+
+    *oovMode = false;
+
+    for (std::vector<std::string>::const_iterator it = toks.begin(); it != toks.end(); ++it) {
+      if (it->empty()) {
+        // Splitting an empty argument string still yields one empty token.
+        continue;
+      }
+
+      if (*it == "-v" || *it == "-N") {
+        const std::string flag = *it;
+        // Make sure a value follows before stepping onto it; otherwise a
+        // trailing "-v" or "-N" would dereference toks.end().
+        if (++it == toks.end()) {
+          std::cerr << "Missing value for argument: " << flag << std::endl;
+          std::exit(1);
+        }
+        if (flag == "-v") {
+          *vocabFile = *it;
+        } else {
+          *featName = *it;
+        }
+      } else if (*it == "--oov") {
+        *oovMode = true;
+      } else {
+        std::cerr << "Unrecognized argument: " << *it << std::endl;
+        std::exit(1);
+      }
+    }
+
+    if (featName->empty()) {
+      std::cerr << "featName (-N) not specified for WordSet" << std::endl;
+      std::exit(1);
+    }
+    if (vocabFile->empty()) {
+      std::cerr << "vocabFile (-v) not specified for WordSet" << std::endl;
+      std::exit(1);
+    }
+  }
+
+  int fid_;
+  bool oovMode_;
+  boost::unordered_set<WordID> vocab_;
+};
+
+#endif
-- 
cgit v1.2.3