diff options
author | Chris Dyer <redpony@gmail.com> | 2009-12-17 13:57:54 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2009-12-17 13:57:54 -0500 |
commit | bba4ff830c8722cdcaf29e36c1ff5821a912ae5d (patch) | |
tree | 268f2f8118aca09b3cc40dca8b2be7de8295acd5 /decoder/tagger.cc | |
parent | 04ae1beeaeceb0161a64d33112f21956f9741bde (diff) |
added non-pruning intersection and a CRF tagger
- the linear-chain tagger is more of a proof of concept than a real tagger-- the context-free assumptions made in a number of places mean that the algorithms used may not be as efficient as they could be, but the model is as powerful as any CRF
- it would be easy to add latent variables or semi-CRF support (or both!)
- i've added a couple basic features that are often used for POS tagging
- non-pruning intersection is useful for lexical word alignment models and the tagger
- a sample POS tagger model will be committed later
Diffstat (limited to 'decoder/tagger.cc')
-rw-r--r-- | decoder/tagger.cc | 109 |
1 files changed, 109 insertions, 0 deletions
diff --git a/decoder/tagger.cc b/decoder/tagger.cc new file mode 100644 index 00000000..5a0155cc --- /dev/null +++ b/decoder/tagger.cc @@ -0,0 +1,109 @@ +#include "tagger.h" + +#include "tdict.h" +#include "hg_io.h" +#include "filelib.h" +#include "hg.h" +#include "wordid.h" +#include "sentence_metadata.h" + +using namespace std; + +// This is a really simple linear chain tagger. +// You specify a tagset, and it hypothesizes that each word in the +// input can be tagged with any member of the tagset. +// The are a couple sample features implemented in ff_tagger.h/cc +// One thing to note, that while CRFs typically define the label +// sequence as corresponding to the hidden states in a trellis, +// in our model the labels are on edges, but mathematically +// they are identical. +// +// Things to do if you want to make this a "real" tagger: +// - support dictionaries (for each word, limit the tags considered) +// - add latent variables - this is really easy to do + +static void ReadTagset(const string& file, vector<WordID>* tags) { + ReadFile rf(file); + istream& in(*rf.stream()); + while(in) { + string tag; + in >> tag; + if (tag.empty()) continue; + tags->push_back(TD::Convert(tag)); + } + cerr << "Read " << tags->size() << " labels (tags) from " << file << endl; +} + +struct TaggerImpl { + TaggerImpl(const boost::program_options::variables_map& conf) : + kXCAT(TD::Convert("X")*-1), + kNULL(TD::Convert("<eps>")), + kBINARY(new TRule("[X] ||| [X,1] [X,2] ||| [1] [2]")), + kGOAL_RULE(new TRule("[Goal] ||| [X,1] ||| [1]")) { + if (conf.count("tagger_tagset") == 0) { + cerr << "Tagger requires --tagger_tagset FILE!\n"; + exit(1); + } + ReadTagset(conf["tagger_tagset"].as<string>(), &tagset_); + } + + void BuildTrellis(const vector<WordID>& seq, Hypergraph* forest) { + int prev_node_id = -1; + for (int i = 0; i < seq.size(); ++i) { + const WordID& src = seq[i]; + const int new_node_id = forest->AddNode(kXCAT)->id_; + for (int k = 0; k < tagset_.size(); ++k) { + TRulePtr rule(TRule::CreateLexicalRule(src, tagset_[k])); + Hypergraph::Edge* edge = forest->AddEdge(rule, Hypergraph::TailNodeVector()); + edge->i_ = i; + edge->j_ = i+1; + forest->ConnectEdgeToHeadNode(edge->id_, new_node_id); + } + if (prev_node_id >= 0) { + const int comb_node_id = forest->AddNode(kXCAT)->id_; + Hypergraph::TailNodeVector tail(2, prev_node_id); + tail[1] = new_node_id; + Hypergraph::Edge* edge = forest->AddEdge(kBINARY, tail); + edge->i_ = 0; + edge->j_ = i+1; + forest->ConnectEdgeToHeadNode(edge->id_, comb_node_id); + prev_node_id = comb_node_id; + } else { + prev_node_id = new_node_id; + } + } + Hypergraph::TailNodeVector tail(1, forest->nodes_.size() - 1); + Hypergraph::Node* goal = forest->AddNode(TD::Convert("Goal")*-1); + Hypergraph::Edge* hg_edge = forest->AddEdge(kGOAL_RULE, tail); + forest->ConnectEdgeToHeadNode(hg_edge, goal); + } + + private: + vector<WordID> tagset_; + const WordID kXCAT; + const WordID kNULL; + const TRulePtr kBINARY; + const TRulePtr kGOAL_RULE; +}; + +Tagger::Tagger(const boost::program_options::variables_map& conf) : + pimpl_(new TaggerImpl(conf)) {} + + +bool Tagger::Translate(const string& input, + SentenceMetadata* smeta, + const vector<double>& weights, + Hypergraph* forest) { + Lattice lattice; + LatticeTools::ConvertTextToLattice(input, &lattice); + smeta->SetSourceLength(lattice.size()); + vector<WordID> sequence(lattice.size()); + for (int i = 0; i < lattice.size(); ++i) { + assert(lattice[i].size() == 1); + sequence[i] = lattice[i][0].label; + } + pimpl_->BuildTrellis(sequence, forest); + forest->Reweight(weights); + return true; +} + |