diff options
-rw-r--r-- | decoder/ff_wordset.cc | 28 | ||||
-rw-r--r-- | decoder/ff_wordset.h | 108 |
2 files changed, 136 insertions, 0 deletions
diff --git a/decoder/ff_wordset.cc b/decoder/ff_wordset.cc new file mode 100644 index 00000000..44468899 --- /dev/null +++ b/decoder/ff_wordset.cc @@ -0,0 +1,28 @@ +#include "ff_wordset.h" + +#include "fdict.h" +#include <sstream> +#include <iostream> + +using namespace std; + +void WordSet::TraversalFeaturesImpl(const SentenceMetadata& /*smeta*/ , + const Hypergraph::Edge& edge, + const vector<const void*>& /* ant_contexts */, + SparseVector<double>* features, + SparseVector<double>* /* estimated_features */, + void* /* context */) const { + + double addScore = 0.0; + for(std::vector<WordID>::const_iterator it = edge.rule_->e_.begin(); it != edge.rule_->e_.end(); ++it) { + + bool inVocab = (vocab_.find(*it) != vocab_.end()); + if(oovMode_ && !inVocab) { + addScore += 1.0; + } else if(!oovMode_ && inVocab) { + addScore += 1.0; + } + } + features->set_value(fid_, addScore); +} + diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h new file mode 100644 index 00000000..256d54bb --- /dev/null +++ b/decoder/ff_wordset.h @@ -0,0 +1,108 @@ +#ifndef _FF_WORDSET_H_ +#define _FF_WORDSET_H_ + +#include "ff.h" + +#include <boost/unordered/unordered_set.hpp> +#include <boost/algorithm/string.hpp> + +#include <vector> +#include <string> +#include <iostream> +#include <fstream> + +class WordSet : public FeatureFunction { + public: + +// we depend on the order of the initializer list +// to call member constructurs in the proper order +// modify this carefully! +// +// Usage: "WordSet -v vocab.txt [--oov]" + WordSet(const std::string& param) { + std::string vocabFile; + std::string featName; + parseArgs(param, &featName, &vocabFile, &oovMode_); + + fid_ = FD::Convert(featName); + + std::cerr << "Loading vocab for " << param << " from " << vocabFile << std::endl; + loadVocab(vocabFile, &vocab_); + } + + + protected: + virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector<const void*>& ant_contexts, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* context) const; + private: + + static void loadVocab(const std::string& vocabFile, boost::unordered_set<WordID>* vocab) { + + std::ifstream file; + std::string line; + + file.open(vocabFile.c_str(), std::fstream::in); + if (file.is_open()) { + unsigned lineNum = 0; + while (!file.eof()) { + ++lineNum; + getline(file, line); + boost::trim(line); + if(line.empty()) { + continue; + } + + WordID vocabId = TD::Convert(line); + vocab->insert(vocabId); + } + file.close(); + } else { + std::cerr << "Unable to open file: " << vocabFile; + exit(1); + } + } + + static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode) { + + std::vector<std::string> toks(10); + boost::split(toks, args, boost::is_any_of(" ")); + + *oovMode = false; + + // skip initial feature name + for(std::vector<std::string>::const_iterator it = toks.begin(); it != toks.end(); ++it) { + if(*it == "-v") { + *vocabFile = *++it; // copy + + } else if(*it == "-N") { + *featName = *++it; + + } else if(*it == "--oov") { + *oovMode = true; + + } else { + std::cerr << "Unrecognized argument: " << *it << std::endl; + exit(1); + } + } + + if(*featName == "") { + std::cerr << "featName (-N) not specified for WordSet" << std::endl; + exit(1); + } + if(*vocabFile == "") { + std::cerr << "vocabFile (-v) not specified for WordSet" << std::endl; + exit(1); + } + } + + int fid_; + bool oovMode_; + boost::unordered_set<WordID> vocab_; +}; + +#endif |