summaryrefslogtreecommitdiff
path: root/decoder
diff options
context:
space:
mode:
Diffstat (limited to 'decoder')
-rw-r--r--decoder/ff_wordset.cc28
-rw-r--r--decoder/ff_wordset.h108
2 files changed, 136 insertions, 0 deletions
diff --git a/decoder/ff_wordset.cc b/decoder/ff_wordset.cc
new file mode 100644
index 00000000..44468899
--- /dev/null
+++ b/decoder/ff_wordset.cc
@@ -0,0 +1,28 @@
+#include "ff_wordset.h"
+
+#include "fdict.h"
+#include <sstream>
+#include <iostream>
+
+using namespace std;
+
+void WordSet::TraversalFeaturesImpl(const SentenceMetadata& /*smeta*/ ,
+ const Hypergraph::Edge& edge,
+ const vector<const void*>& /* ant_contexts */,
+ SparseVector<double>* features,
+ SparseVector<double>* /* estimated_features */,
+ void* /* context */) const {
+
+ double addScore = 0.0;
+ for(std::vector<WordID>::const_iterator it = edge.rule_->e_.begin(); it != edge.rule_->e_.end(); ++it) {
+
+ bool inVocab = (vocab_.find(*it) != vocab_.end());
+ if(oovMode_ && !inVocab) {
+ addScore += 1.0;
+ } else if(!oovMode_ && inVocab) {
+ addScore += 1.0;
+ }
+ }
+ features->set_value(fid_, addScore);
+}
+
diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h
new file mode 100644
index 00000000..256d54bb
--- /dev/null
+++ b/decoder/ff_wordset.h
@@ -0,0 +1,108 @@
+#ifndef _FF_WORDSET_H_
+#define _FF_WORDSET_H_
+
+#include "ff.h"
+
+#include <boost/unordered/unordered_set.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include <fstream>
+
+class WordSet : public FeatureFunction {
+ public:
+
+// we depend on the order of the initializer list
+// to call member constructurs in the proper order
+// modify this carefully!
+//
+// Usage: "WordSet -v vocab.txt [--oov]"
+ WordSet(const std::string& param) {
+ std::string vocabFile;
+ std::string featName;
+ parseArgs(param, &featName, &vocabFile, &oovMode_);
+
+ fid_ = FD::Convert(featName);
+
+ std::cerr << "Loading vocab for " << param << " from " << vocabFile << std::endl;
+ loadVocab(vocabFile, &vocab_);
+ }
+
+
+ protected:
+ virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
+ const Hypergraph::Edge& edge,
+ const std::vector<const void*>& ant_contexts,
+ SparseVector<double>* features,
+ SparseVector<double>* estimated_features,
+ void* context) const;
+ private:
+
+ static void loadVocab(const std::string& vocabFile, boost::unordered_set<WordID>* vocab) {
+
+ std::ifstream file;
+ std::string line;
+
+ file.open(vocabFile.c_str(), std::fstream::in);
+ if (file.is_open()) {
+ unsigned lineNum = 0;
+ while (!file.eof()) {
+ ++lineNum;
+ getline(file, line);
+ boost::trim(line);
+ if(line.empty()) {
+ continue;
+ }
+
+ WordID vocabId = TD::Convert(line);
+ vocab->insert(vocabId);
+ }
+ file.close();
+ } else {
+ std::cerr << "Unable to open file: " << vocabFile;
+ exit(1);
+ }
+ }
+
+ static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode) {
+
+ std::vector<std::string> toks(10);
+ boost::split(toks, args, boost::is_any_of(" "));
+
+ *oovMode = false;
+
+ // skip initial feature name
+ for(std::vector<std::string>::const_iterator it = toks.begin(); it != toks.end(); ++it) {
+ if(*it == "-v") {
+ *vocabFile = *++it; // copy
+
+ } else if(*it == "-N") {
+ *featName = *++it;
+
+ } else if(*it == "--oov") {
+ *oovMode = true;
+
+ } else {
+ std::cerr << "Unrecognized argument: " << *it << std::endl;
+ exit(1);
+ }
+ }
+
+ if(*featName == "") {
+ std::cerr << "featName (-N) not specified for WordSet" << std::endl;
+ exit(1);
+ }
+ if(*vocabFile == "") {
+ std::cerr << "vocabFile (-v) not specified for WordSet" << std::endl;
+ exit(1);
+ }
+ }
+
+ int fid_;
+ bool oovMode_;
+ boost::unordered_set<WordID> vocab_;
+};
+
+#endif