summaryrefslogtreecommitdiff
path: root/decoder/tagger.cc
blob: 500d20617759676ccfca754969b675dd3c8fbafd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#include "tagger.h"

#include "tdict.h"
#include "hg_io.h"
#include "filelib.h"
#include "hg.h"
#include "wordid.h"
#include "sentence_metadata.h"

using namespace std;

// This is a really simple linear chain tagger.
// You specify a tagset, and it hypothesizes that each word in the
// input can be tagged with any member of the tagset.
// The are a couple sample features implemented in ff_tagger.h/cc
// One thing to note, that while CRFs typically define the label
// sequence as corresponding to the hidden states in a trellis,
// in our model the labels are on edges, but mathematically
// they are identical.
//
// Things to do if you want to make this a "real" tagger:
// - support dictionaries (for each word, limit the tags considered)
// - add latent variables - this is really easy to do

static void ReadTagset(const string& file, vector<WordID>* tags) {
  ReadFile rf(file);
  istream& in(*rf.stream());
  while(in) {
    string tag;
    in >> tag;
    if (tag.empty()) continue;
    tags->push_back(TD::Convert(tag));
  }
  cerr << "Read " << tags->size() << " labels (tags) from " << file << endl;
}

struct TaggerImpl {
  TaggerImpl(const boost::program_options::variables_map& conf) :
      kXCAT(TD::Convert("X")*-1),
      kNULL(TD::Convert("<eps>")),
      kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")),
      kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) {
    if (conf.count("tagger_tagset") == 0) {
      cerr << "Tagger requires --tagger_tagset FILE!\n";
      exit(1);
    }
    ReadTagset(conf["tagger_tagset"].as<string>(), &tagset_);
  }

  void BuildTrellis(const vector<WordID>& seq, Hypergraph* forest) {
    int prev_node_id = -1;
    for (int i = 0; i < seq.size(); ++i) {
      const WordID& src = seq[i];
      const int new_node_id = forest->AddNode(kXCAT)->id_;
      for (int k = 0; k < tagset_.size(); ++k) {
        TRulePtr rule(TRule::CreateLexicalRule(src, tagset_[k]));
        rule->lhs_ = kXCAT;
        Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector());
        edge->i_ = i;
        edge->j_ = i+1;
        edge->prev_i_ = i;    // we set these for FastLinearIntersect
        edge->prev_j_ = i+1;  //      "      "            "
        forest->ConnectEdgeToHeadNode(edge->id_, new_node_id);
      }
      if (prev_node_id >= 0) {
        const int comb_node_id = forest->AddNode(kXCAT)->id_;
        Hypergraph::TailNodeVector tail(2, prev_node_id);
        tail[1] = new_node_id;
        Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail);
        edge->i_ = 0;
        edge->j_ = i+1;
        forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id);
        prev_node_id = comb_node_id;
      } else {
        prev_node_id = new_node_id;
      }
    }
    Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1);
    Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1);
    Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail);
    forest->ConnectEdgeToHeadNode(hg_edge, goal);
  }

 private:
  vector<WordID> tagset_;
  const WordID kXCAT;
  const WordID kNULL;
  const TRulePtr kBINARY;
  const TRulePtr kGOAL_RULE;
};

Tagger::Tagger(const boost::program_options::variables_map& conf) :
 pimpl_(new TaggerImpl(conf)) {}


bool Tagger::TranslateImpl(const string& input,
                       SentenceMetadata* smeta,
                       const vector<double>& weights,
                       Hypergraph* forest) {
  Lattice& lattice = smeta->src_lattice_;
  LatticeTools::ConvertTextToLattice(input, &lattice);
  smeta->SetSourceLength(lattice.size());
  smeta->ComputeInputLatticeType();
  assert(smeta->GetInputType() == cdec::kSEQUENCE);
  vector<WordID> sequence(lattice.size());
  for (int i = 0; i < lattice.size(); ++i) {
    assert(lattice[i].size() == 1);
    sequence[i] = lattice[i][0].label;
  }
  pimpl_->BuildTrellis(sequence, forest);
  forest->Reweight(weights);
  forest->is_linear_chain_ = true;
  // since we don't do any pruning, the node_hash will be the same for
  // every run of the composer
  int nc = 0;
  for (auto& node : forest->nodes_)
    node.node_hash = ++nc;
  return true;
}