summaryrefslogtreecommitdiff
path: root/decoder/ff_wordset.cc
blob: 9be6f2e0695064e1b6bf39a9a4b45fa94087b9da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
#include "ff_wordset.h"

#include "hg.h"
#include "fdict.h"
#include "filelib.h"
#include <boost/algorithm/string.hpp>
#include <sstream>
#include <iostream>

using namespace std;

void WordSet::parseArgs(const string& args, string* featName, string* vocabFile, bool* oovMode) {
  vector<string> toks(10);
  boost::split(toks, args, boost::is_any_of(" "));

  *oovMode = false;

  // skip initial feature name
  for(vector<string>::const_iterator it = toks.begin(); it != toks.end(); ++it) {
    if(*it == "-v") {
      *vocabFile = *++it; // copy

    } else if(*it == "-N") {
      *featName = *++it;
    } else if(*it == "--oov") {
       *oovMode = true;
    } else {
       cerr << "Unrecognized argument: " << *it << endl;
       exit(1);
    }
  }

  if(*featName == "") {
    cerr << "featName (-N) not specified for WordSet" << endl;
    exit(1);
  }
  if(*vocabFile == "") {
    cerr << "vocabFile (-v) not specified for WordSet" << endl;
    exit(1);
  }
}

void WordSet::loadVocab(const string& vocabFile, unordered_set<WordID>* vocab) {
  ReadFile rf(vocabFile);
  if (!rf) {
    cerr << "Unable to open file: " << vocabFile; 
    abort();
  }
  string line;
  while (getline(*rf.stream(), line)) {
    boost::trim(line);
    if(line.empty()) continue;
    WordID vocabId = TD::Convert(line);
    vocab->insert(vocabId);
  }
}

// Computes the word-set feature for a single hypergraph edge.
//
// Every entry of edge.rule_->e_ is looked up in vocab_. An entry is counted
// when:
//   - oovMode_ is false and the entry IS in vocab_, or
//   - oovMode_ is true  and the entry is NOT in vocab_.
// The resulting count is stored in *features under fid_; the value is set
// even when the count is zero.
//
// The sentence metadata, antecedent contexts, estimated features and context
// arguments are not used.
//
// NOTE(review): the lookup is applied to every entry of e_ without any
// filtering; if e_ can hold ids that are not plain words they would be
// counted as out-of-vocabulary in OOV mode -- confirm against the rule format.
void WordSet::TraversalFeaturesImpl(const SentenceMetadata& /*smeta*/ ,
                                    const Hypergraph::Edge& edge,
                                    const vector<const void*>& /* ant_contexts */,
                                    SparseVector<double>* features,
                                    SparseVector<double>* /* estimated_features */,
                                    void* /* context */) const {
  const vector<WordID>& symbols = edge.rule_->e_;
  double matched = 0.0;
  for(size_t i = 0; i < symbols.size(); ++i) {
    const bool known = (vocab_.find(symbols[i]) != vocab_.end());
    // count known words in normal mode, unknown words in OOV mode
    if(known != oovMode_) {
      matched += 1.0;
    }
  }
  features->set_value(fid_, matched);
}