From bba4ff830c8722cdcaf29e36c1ff5821a912ae5d Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 17 Dec 2009 13:57:54 -0500 Subject: added non-pruning intersection and a CRF tagger - the linear-chain tagger is more of a proof of concept than a real tagger -- the context-free assumptions made in a number of places mean that the algorithms used may not be as efficient as they could be, but the model is as powerful as any CRF - it would be easy to add latent variables or semi-CRF support (or both!) - I've added a couple of basic features that are often used for POS tagging - non-pruning intersection is useful for lexical word alignment models and the tagger - a sample POS tagger model will be committed later --- decoder/cdec.cc | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'decoder/cdec.cc') diff --git a/decoder/cdec.cc b/decoder/cdec.cc index 6185c79b..c6773cce 100644 --- a/decoder/cdec.cc +++ b/decoder/cdec.cc @@ -17,6 +17,7 @@ #include "filelib.h" #include "sampler.h" #include "sparse_vector.h" +#include "tagger.h" #include "lexcrf.h" #include "csplit.h" #include "weights.h" @@ -48,7 +49,7 @@ void ShowBanner() { void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() - ("formalism,f",po::value(),"Translation formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting)") + ("formalism,f",po::value(),"Decoding formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting), Tagger (sequence labeling)") ("input,i",po::value()->default_value("-"),"Source file") ("grammar,g",po::value >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)") ("weights,w",po::value(),"Feature weights file") @@ -58,16 +59,18 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { ("k_best,k",po::value(),"Extract the k best derivations") 
("unique_k_best,r", "Unique k-best translation list") ("aligner,a", "Run as a word/phrase aligner (src & ref required)") + ("intersection_strategy,I",po::value()->default_value("cube_pruning"), "Intersection strategy for incorporating finite-state features; values include Cube_pruning, Full") ("cubepruning_pop_limit,K",po::value()->default_value(200), "Max number of pops from the candidate heap at each node") ("goal",po::value()->default_value("S"),"Goal symbol (SCFG & FST)") ("scfg_extra_glue_grammar", po::value(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)") ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. by default the SCFG decoder adds Hiero glue rules)") ("scfg_default_nt,d",po::value()->default_value("X"),"Default non-terminal symbol in SCFG") ("scfg_max_span_limit,S",po::value()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)") - ("show_tree_structure,T", "Show the Viterbi derivation structure") + ("show_tree_structure", "Show the Viterbi derivation structure") ("show_expected_length", "Show the expected translation length under the model") ("show_partition,z", "Compute and show the partition (inside score)") ("beam_prune", po::value(), "Prune paths from +LM forest") + ("tagger_tagset,t", po::value(), "(Tagger) file containing tag set") ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") ("extract_rules", po::value(), "Extract the rules used in translation (de-duped) to this file") @@ -111,8 +114,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) { } const string formalism = LowercaseString((*conf)["formalism"].as()); - if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit") { - cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit' or 
'lexcrf'\n"; + if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit" && formalism != "tagger") { + cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lexcrf', or 'tagger'\n"; cerr << dcmdline_options << endl; exit(1); } @@ -255,6 +258,8 @@ int main(int argc, char** argv) { translator.reset(new CompoundSplit(conf)); else if (formalism == "lexcrf") translator.reset(new LexicalCRF(conf)); + else if (formalism == "tagger") + translator.reset(new Tagger(conf)); else assert(!"error"); @@ -285,6 +290,12 @@ int main(int argc, char** argv) { } } ModelSet late_models(feature_weights, late_ffs); + int palg = 1; + if (LowercaseString(conf["intersection_strategy"].as()) == "full") { + palg = 0; + cerr << "Using full intersection (no pruning).\n"; + } + const IntersectionConfiguration inter_conf(palg, conf["cubepruning_pop_limit"].as()); const int sample_max_trans = conf.count("max_translation_sample") ? conf["max_translation_sample"].as() : 0; @@ -374,11 +385,10 @@ int main(int argc, char** argv) { forest.Reweight(feature_weights); forest.SortInEdgesByEdgeWeights(); Hypergraph lm_forest; - int cubepruning_pop_limit = conf["cubepruning_pop_limit"].as(); ApplyModelSet(forest, smeta, late_models, - PruningConfiguration(cubepruning_pop_limit), + inter_conf, &lm_forest); forest.swap(lm_forest); forest.Reweight(feature_weights); -- cgit v1.2.3