author     Chris Dyer <redpony@gmail.com>    2009-12-17 13:57:54 -0500
committer  Chris Dyer <redpony@gmail.com>    2009-12-17 13:57:54 -0500
commit     bba4ff830c8722cdcaf29e36c1ff5821a912ae5d
tree       268f2f8118aca09b3cc40dca8b2be7de8295acd5 /decoder/cdec.cc
parent     04ae1beeaeceb0161a64d33112f21956f9741bde
added non-pruning intersection and a CRF tagger
- the linear-chain tagger is more of a proof of concept than a real tagger -- the context-free assumptions made in a number of places mean the algorithms may not be as efficient as they could be, but the model is as powerful as any CRF (a standalone sketch of what such a tagger computes follows this list)
- it would be easy to add latent variables or semi-CRF support (or both!)
- I've added a couple of basic features that are often used for POS tagging
- non-pruning intersection is useful for lexical word alignment models and the tagger
- a sample POS tagger model will be committed later
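
To make the first point above concrete, here is a minimal, self-contained C++ sketch of what a linear-chain tagger ultimately computes: the highest-scoring tag sequence under per-position (emission) and tag-to-tag (transition) scores. This is not cdec's Tagger code; the tag set, sentence length, and all score values below are invented for illustration, and in a real CRF each score would be the dot product of the feature weights with the active features.

#include <iostream>
#include <limits>
#include <string>
#include <vector>

int main() {
  // Hypothetical 2-tag, 3-word example: tags {N, V} for "dogs chase cats".
  const std::vector<std::string> tags = {"N", "V"};
  const int T = 3;  // sentence length
  const int K = 2;  // tag set size
  // emit[t][k]: score of tag k at position t (in a CRF, a feature dot product).
  const double emit[3][2] = {{2.0, 0.5}, {0.1, 1.8}, {1.5, 0.2}};
  // trans[j][k]: score of moving from tag j to tag k.
  const double trans[2][2] = {{-0.5, 1.0}, {0.8, -1.0}};

  std::vector<std::vector<double> > delta(T, std::vector<double>(K));
  std::vector<std::vector<int> > back(T, std::vector<int>(K, 0));
  for (int k = 0; k < K; ++k) delta[0][k] = emit[0][k];
  for (int t = 1; t < T; ++t) {
    for (int k = 0; k < K; ++k) {
      double best = -std::numeric_limits<double>::infinity();
      int arg = 0;
      for (int j = 0; j < K; ++j) {
        const double s = delta[t - 1][j] + trans[j][k];
        if (s > best) { best = s; arg = j; }
      }
      delta[t][k] = best + emit[t][k];
      back[t][k] = arg;
    }
  }
  // Follow backpointers from the best final tag to recover the sequence.
  int cur = (delta[T - 1][0] >= delta[T - 1][1]) ? 0 : 1;
  std::vector<int> seq(T);
  for (int t = T - 1; t >= 0; --t) { seq[t] = cur; if (t > 0) cur = back[t][cur]; }
  for (int t = 0; t < T; ++t) std::cout << tags[seq[t]] << (t + 1 < T ? ' ' : '\n');
  return 0;
}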
Diffstat (limited to 'decoder/cdec.cc')
-rw-r--r--   decoder/cdec.cc | 22
1 file changed, 16 insertions(+), 6 deletions(-)
diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index 6185c79b..c6773cce 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -17,6 +17,7 @@
 #include "filelib.h"
 #include "sampler.h"
 #include "sparse_vector.h"
+#include "tagger.h"
 #include "lexcrf.h"
 #include "csplit.h"
 #include "weights.h"
@@ -48,7 +49,7 @@ void ShowBanner() {
 void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   po::options_description opts("Configuration options");
   opts.add_options()
-        ("formalism,f",po::value<string>(),"Translation formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting)")
+        ("formalism,f",po::value<string>(),"Decoding formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting), Tagger (sequence labeling)")
         ("input,i",po::value<string>()->default_value("-"),"Source file")
         ("grammar,g",po::value<vector<string> >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)")
         ("weights,w",po::value<string>(),"Feature weights file")
@@ -58,16 +59,18 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
         ("k_best,k",po::value<int>(),"Extract the k best derivations")
         ("unique_k_best,r", "Unique k-best translation list")
         ("aligner,a", "Run as a word/phrase aligner (src & ref required)")
+        ("intersection_strategy,I",po::value<string>()->default_value("cube_pruning"), "Intersection strategy for incorporating finite-state features; values include Cube_pruning, Full")
         ("cubepruning_pop_limit,K",po::value<int>()->default_value(200), "Max number of pops from the candidate heap at each node")
         ("goal",po::value<string>()->default_value("S"),"Goal symbol (SCFG & FST)")
         ("scfg_extra_glue_grammar", po::value<string>(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)")
         ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. by default the SCFG decoder adds Hiero glue rules)")
         ("scfg_default_nt,d",po::value<string>()->default_value("X"),"Default non-terminal symbol in SCFG")
         ("scfg_max_span_limit,S",po::value<int>()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)")
-        ("show_tree_structure,T", "Show the Viterbi derivation structure")
+        ("show_tree_structure", "Show the Viterbi derivation structure")
         ("show_expected_length", "Show the expected translation length under the model")
         ("show_partition,z", "Compute and show the partition (inside score)")
         ("beam_prune", po::value<double>(), "Prune paths from +LM forest")
+        ("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
         ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
         ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
         ("extract_rules", po::value<string>(), "Extract the rules used in translation (de-duped) to this file")
@@ -111,8 +114,8 @@ void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
   }
   const string formalism = LowercaseString((*conf)["formalism"].as<string>());
-  if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit") {
-    cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit' or 'lexcrf'\n";
+  if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit" && formalism != "tagger") {
+    cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lexcrf', or 'tagger'\n";
     cerr << dcmdline_options << endl;
     exit(1);
   }
@@ -255,6 +258,8 @@ int main(int argc, char** argv) {
     translator.reset(new CompoundSplit(conf));
   else if (formalism == "lexcrf")
     translator.reset(new LexicalCRF(conf));
+  else if (formalism == "tagger")
+    translator.reset(new Tagger(conf));
   else
     assert(!"error");
@@ -285,6 +290,12 @@ int main(int argc, char** argv) {
     }
   }
   ModelSet late_models(feature_weights, late_ffs);
+  int palg = 1;
+  if (LowercaseString(conf["intersection_strategy"].as<string>()) == "full") {
+    palg = 0;
+    cerr << "Using full intersection (no pruning).\n";
+  }
+  const IntersectionConfiguration inter_conf(palg, conf["cubepruning_pop_limit"].as<int>());
   const int sample_max_trans = conf.count("max_translation_sample") ?
     conf["max_translation_sample"].as<int>() : 0;
@@ -374,11 +385,10 @@ int main(int argc, char** argv) {
     forest.Reweight(feature_weights);
     forest.SortInEdgesByEdgeWeights();
     Hypergraph lm_forest;
-    int cubepruning_pop_limit = conf["cubepruning_pop_limit"].as<int>();
     ApplyModelSet(forest,
                   smeta,
                   late_models,
-                  PruningConfiguration(cubepruning_pop_limit),
+                  inter_conf,
                   &lm_forest);
     forest.swap(lm_forest);
     forest.Reweight(feature_weights);
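
For context on the --intersection_strategy option added above: cube pruning explores only a bounded number of combinations of antecedent items at each node (controlled by cubepruning_pop_limit), while full intersection keeps them all. The toy C++ sketch below (not cdec's ApplyModelSet code; the scores and the single-node view are invented for illustration) contrasts the two strategies on one node with two sorted candidate lists.

#include <iostream>
#include <queue>
#include <set>
#include <utility>
#include <vector>

// A candidate combination of the i-th item from list A and the j-th from list B.
struct Cand {
  double score;
  int i, j;
  bool operator<(const Cand& o) const { return score < o.score; }  // max-heap order
};

int main() {
  // Hypothetical antecedent item scores, sorted best-first.
  const std::vector<double> a = {3.0, 2.0, 0.5};
  const std::vector<double> b = {2.5, 1.0, 0.2};

  // Full (non-pruning) intersection: every combination survives.
  std::cout << "full intersection: " << a.size() * b.size() << " items kept\n";

  // Cube pruning: pop at most K combinations, expanding neighbors lazily.
  const int K = 4;
  std::priority_queue<Cand> heap;
  std::set<std::pair<int, int> > seen;
  heap.push(Cand{a[0] + b[0], 0, 0});
  seen.insert(std::make_pair(0, 0));
  int pops = 0;
  while (!heap.empty() && pops < K) {
    const Cand c = heap.top();
    heap.pop();
    ++pops;
    std::cout << "cube pruning pop " << pops << ": score " << c.score << '\n';
    // Push the two neighboring combinations (next item in A or next item in B).
    const int ni[2] = {c.i + 1, c.i};
    const int nj[2] = {c.j, c.j + 1};
    for (int n = 0; n < 2; ++n) {
      if (ni[n] < (int)a.size() && nj[n] < (int)b.size() &&
          seen.insert(std::make_pair(ni[n], nj[n])).second) {
        heap.push(Cand{a[ni[n]] + b[nj[n]], ni[n], nj[n]});
      }
    }
  }
  return 0;
}

Full intersection is exact but can blow up combinatorially, which is why it is mainly attractive for small search spaces such as the tagger and the lexical word alignment models mentioned in the commit message.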