diff options
author | jon.h.clark <jon.h.clark@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-11-12 20:28:09 +0000 |
---|---|---|
committer | jon.h.clark <jon.h.clark@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-11-12 20:28:09 +0000 |
commit | bc93e6d9e869b9769a5d0273b4292eedf921780e (patch) | |
tree | 052bf9addc9330b8cfab100d6cb98ca6ecfad04e /decoder/ff_wordset.h | |
parent | 2ca345d3612c5a3dcf4af6f320200b4bfecbe1e9 (diff) |
Forgot wordset feature
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@717 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'decoder/ff_wordset.h')
-rw-r--r-- | decoder/ff_wordset.h | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h new file mode 100644 index 00000000..256d54bb --- /dev/null +++ b/decoder/ff_wordset.h @@ -0,0 +1,108 @@ +#ifndef _FF_WORDSET_H_ +#define _FF_WORDSET_H_ + +#include "ff.h" + +#include <boost/unordered/unordered_set.hpp> +#include <boost/algorithm/string.hpp> + +#include <vector> +#include <string> +#include <iostream> +#include <fstream> + +class WordSet : public FeatureFunction { + public: + +// we depend on the order of the initializer list +// to call member constructurs in the proper order +// modify this carefully! +// +// Usage: "WordSet -v vocab.txt [--oov]" + WordSet(const std::string& param) { + std::string vocabFile; + std::string featName; + parseArgs(param, &featName, &vocabFile, &oovMode_); + + fid_ = FD::Convert(featName); + + std::cerr << "Loading vocab for " << param << " from " << vocabFile << std::endl; + loadVocab(vocabFile, &vocab_); + } + + + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector<const void*>& ant_contexts, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* context) const; + private: + + static void loadVocab(const std::string& vocabFile, boost::unordered_set<WordID>* vocab) { + + std::ifstream file; + std::string line; + + file.open(vocabFile.c_str(), std::fstream::in); + if (file.is_open()) { + unsigned lineNum = 0; + while (!file.eof()) { + ++lineNum; + getline(file, line); + boost::trim(line); + if(line.empty()) { + continue; + } + + WordID vocabId = TD::Convert(line); + vocab->insert(vocabId); + } + file.close(); + } else { + std::cerr << "Unable to open file: " << vocabFile; + exit(1); + } + } + + static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode) { + + std::vector<std::string> toks(10); + boost::split(toks, args, boost::is_any_of(" ")); + + *oovMode = false; + + // skip initial feature name + 
for(std::vector<std::string>::const_iterator it = toks.begin(); it != toks.end(); ++it) { + if(*it == "-v") { + *vocabFile = *++it; // copy + + } else if(*it == "-N") { + *featName = *++it; + + } else if(*it == "--oov") { + *oovMode = true; + + } else { + std::cerr << "Unrecognized argument: " << *it << std::endl; + exit(1); + } + } + + if(*featName == "") { + std::cerr << "featName (-N) not specified for WordSet" << std::endl; + exit(1); + } + if(*vocabFile == "") { + std::cerr << "vocabFile (-v) not specified for WordSet" << std::endl; + exit(1); + } + } + + int fid_; + bool oovMode_; + boost::unordered_set<WordID> vocab_; +}; + +#endif |