#include #include #include #include #include #include #include #include "timing_stats.h" #include "translator.h" #include "phrasebased_translator.h" #include "aligner.h" #include "stringlib.h" #include "forest_writer.h" #include "hg_io.h" #include "filelib.h" #include "sampler.h" #include "sparse_vector.h" #include "tagger.h" #include "lextrans.h" #include "lexalign.h" #include "csplit.h" #include "weights.h" #include "tdict.h" #include "ff.h" #include "ff_factory.h" #include "hg_intersect.h" #include "apply_models.h" #include "viterbi.h" #include "kbest.h" #include "inside_outside.h" #include "exp_semiring.h" #include "sentence_metadata.h" using namespace std; using namespace std::tr1; using boost::shared_ptr; namespace po = boost::program_options; // some globals ... boost::shared_ptr > rng; static const double kMINUS_EPSILON = -1e-6; // don't be too strict namespace Hack { void MaxTrans(const Hypergraph& in, int beam_size); } namespace NgramCache { void Clear(); } void ShowBanner() { cerr << "cdec v1.0 (c) 2009-2010 by Chris Dyer\n"; } void ConvertSV(const SparseVector& src, SparseVector* trg) { for (SparseVector::const_iterator it = src.begin(); it != src.end(); ++it) trg->set_value(it->first, it->second); } void InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); opts.add_options() ("formalism,f",po::value(),"Decoding formalism; values include SCFG, FST, PB, LexTrans (lexical translation model, also disc training), CSplit (compound splitting), Tagger (sequence labeling), LexAlign (alignment only, or EM training)") ("input,i",po::value()->default_value("-"),"Source file") ("grammar,g",po::value >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)") ("weights,w",po::value(),"Feature weights file") ("no_freeze_feature_set,Z", "Do not freeze feature set after reading feature weights file") ("feature_function,F",po::value >()->composing(), "Additional feature function(s) (-L for list)") ("list_feature_functions,L","List available feature functions") ("add_pass_through_rules,P","Add rules to translate OOV words as themselves") ("k_best,k",po::value(),"Extract the k best derivations") ("unique_k_best,r", "Unique k-best translation list") ("aligner,a", "Run as a word/phrase aligner (src & ref required)") ("intersection_strategy,I",po::value()->default_value("cube_pruning"), "Intersection strategy for incorporating finite-state features; values include Cube_pruning, Full") ("cubepruning_pop_limit,K",po::value()->default_value(200), "Max number of pops from the candidate heap at each node") ("goal",po::value()->default_value("S"),"Goal symbol (SCFG & FST)") ("scfg_extra_glue_grammar", po::value(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)") ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. by default the SCFG decoder adds Hiero glue rules)") ("scfg_default_nt,d",po::value()->default_value("X"),"Default non-terminal symbol in SCFG") ("scfg_max_span_limit,S",po::value()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)") ("show_joshua_visualization,J", "Produce output compatible with the Joshua visualization tools") ("show_tree_structure", "Show the Viterbi derivation structure") ("show_expected_length", "Show the expected translation length under the model") ("show_partition,z", "Compute and show the partition (inside score)") ("show_cfg_search_space", "Show the search space as a CFG") ("prelm_beam_prune", po::value(), "Prune paths from -LM forest before LM rescoring, keeping paths within exp(alpha>=0)") ("beam_prune", po::value(), "Prune paths from +LM forest, keep paths within exp(alpha>=0)") ("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices") ("lexalign_use_null", "Support source-side null words in lexical translation") ("tagger_tagset,t", po::value(), "(Tagger) file containing tag set") ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format") ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice") ("extract_rules", po::value(), "Extract the rules used in translation (de-duped) to this file") ("graphviz","Show (constrained) translation forest in GraphViz format") ("max_translation_beam,x", po::value(), "Beam approximation to get max translation from the chart") ("max_translation_sample,X", po::value(), "Sample the max translation from the chart") ("pb_max_distortion,D", po::value()->default_value(4), "Phrase-based decoder: maximum distortion") ("cll_gradient,G","Compute conditional log-likelihood gradient and write to STDOUT (src & ref required)") ("crf_uniform_empirical", "If there are multple references use (i.e., lattice) a uniform distribution rather than posterior weighting a la EM") ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)") ("vector_format",po::value()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)") ("combine_size,C",po::value()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)") ("forest_output,O",po::value(),"Directory to write forests to") ("minimal_forests,m","Write minimal forests (excludes Rule information). Such forests can be used for ML/MAP training, but not rescoring, etc."); po::options_description clo("Command line options"); clo.add_options() ("config,c", po::value(), "Configuration file") ("help,h", "Print this help message and exit"); po::options_description dconfig_options, dcmdline_options; dconfig_options.add(opts); dcmdline_options.add(opts).add(clo); po::store(parse_command_line(argc, argv, dcmdline_options), *conf); if (conf->count("config")) { const string cfg = (*conf)["config"].as(); cerr << "Configuration file: " << cfg << endl; ifstream config(cfg.c_str()); po::store(po::parse_config_file(config, dconfig_options), *conf); } po::notify(*conf); if (conf->count("list_feature_functions")) { cerr << "Available feature functions (specify with -F):\n"; global_ff_registry->DisplayList(); cerr << endl; exit(1); } if (conf->count("help") || conf->count("formalism") == 0) { cerr << dcmdline_options << endl; exit(1); } const string formalism = LowercaseString((*conf)["formalism"].as()); if (formalism != "scfg" && formalism != "fst" && formalism != "lextrans" && formalism != "pb" && formalism != "csplit" && formalism != "tagger" && formalism != "lexalign") { cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit', 'lextrans', 'lexalign', or 'tagger'\n"; cerr << dcmdline_options << endl; exit(1); } } // TODO move out of cdec into some sampling decoder file void SampleRecurse(const Hypergraph& hg, const vector& ss, int n, vector* out) { const SampleSet& s = ss[n]; int i = rng->SelectSample(s); const Hypergraph::Edge& edge = hg.edges_[hg.nodes_[n].in_edges_[i]]; vector > ants(edge.tail_nodes_.size()); for (int j = 0; j < ants.size(); ++j) SampleRecurse(hg, ss, edge.tail_nodes_[j], &ants[j]); vector*> pants(ants.size()); for (int j = 0; j < ants.size(); ++j) pants[j] = &ants[j]; edge.rule_->ESubstitute(pants, out); } struct SampleSort { bool operator()(const pair& a, const pair& b) const { return a.first > b.first; } }; // TODO move out of cdec into some sampling decoder file void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) { unordered_map > m; hg->PushWeightsToGoal(); const int num_nodes = hg->nodes_.size(); vector ss(num_nodes); for (int i = 0; i < num_nodes; ++i) { SampleSet& s = ss[i]; const vector& in_edges = hg->nodes_[i].in_edges_; for (int j = 0; j < in_edges.size(); ++j) { s.add(hg->edges_[in_edges[j]].edge_prob_); } } for (int i = 0; i < samples; ++i) { vector yield; SampleRecurse(*hg, ss, hg->nodes_.size() - 1, &yield); const string trans = TD::GetString(yield); ++m[trans]; } vector > dist; for (unordered_map >::iterator i = m.begin(); i != m.end(); ++i) { dist.push_back(make_pair(i->second, i->first)); } sort(dist.begin(), dist.end(), SampleSort()); if (k) { for (int i = 0; i < k; ++i) cout << dist[i].first << " ||| " << dist[i].second << endl; } else { cout << dist[0].second << endl; } } // TODO decoder output should probably be moved to another file void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique) { cerr << "In kbest\n"; if (unique) { KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k); for (int i = 0; i < k; ++i) { const KBest::KBestDerivations, ESentenceTraversal, KBest::FilterUnique>::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, i); if (!d) break; cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << " ||| " << log(d->score) << endl; } } else { KBest::KBestDerivations, ESentenceTraversal> kbest(forest, k); for (int i = 0; i < k; ++i) { const KBest::KBestDerivations, ESentenceTraversal>::Derivation* d = kbest.LazyKthBest(forest.nodes_.size() - 1, i); if (!d) break; cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| " << d->feature_values << " ||| " << log(d->score) << endl; } } } struct ELengthWeightFunction { double operator()(const Hypergraph::Edge& e) const { return e.rule_->ELength() - e.rule_->Arity(); } }; struct TRPHash { size_t operator()(const TRulePtr& o) const { return reinterpret_cast(o.get()); } }; static void ExtractRulesDedupe(const Hypergraph& hg, ostream* os) { static unordered_set written; for (int i = 0; i < hg.edges_.size(); ++i) { const TRulePtr& rule = hg.edges_[i].rule_; if (written.insert(rule).second) { (*os) << rule->AsString() << endl; } } } void register_feature_functions(); bool beam_param(po::variables_map const& conf,char const* name,double *val,bool scale_srclen=false,double srclen=1) { if (conf.count(name)) { *val=conf[name].as()*(scale_srclen?srclen:1); return true; } return false; } int main(int argc, char** argv) { global_ff_registry.reset(new FFRegistry); register_feature_functions(); ShowBanner(); po::variables_map conf; InitCommandLine(argc, argv, &conf); const bool write_gradient = conf.count("cll_gradient"); const bool feature_expectations = conf.count("feature_expectations"); if (write_gradient && feature_expectations) { cerr << "You can only specify --gradient or --feature_expectations, not both!\n"; exit(1); } const bool output_training_vector = (write_gradient || feature_expectations); boost::shared_ptr translator; const string formalism = LowercaseString(conf["formalism"].as()); const bool csplit_preserve_full_word = conf.count("csplit_preserve_full_word"); if (csplit_preserve_full_word && (formalism != "csplit" || !conf.count("beam_prune"))) { cerr << "--csplit_preserve_full_word should only be " << "used with csplit AND --beam_prune!\n"; exit(1); } const bool scale_prune_srclen=conf.count("scale_prune_srclen"); const bool csplit_output_plf = conf.count("csplit_output_plf"); if (csplit_output_plf && formalism != "csplit") { cerr << "--csplit_output_plf should only be used with csplit!\n"; exit(1); } // load feature weights (and possibly freeze feature set) vector feature_weights; Weights w; if (conf.count("weights")) { w.InitFromFile(conf["weights"].as()); feature_weights.resize(FD::NumFeats()); w.InitVector(&feature_weights); if (!conf.count("no_freeze_feature_set")) { cerr << "Freezing feature set (use --no_freeze_feature_set to change)." << endl; FD::Freeze(); } } // set up translation back end if (formalism == "scfg") translator.reset(new SCFGTranslator(conf)); else if (formalism == "fst") translator.reset(new FSTTranslator(conf)); else if (formalism == "pb") translator.reset(new PhraseBasedTranslator(conf)); else if (formalism == "csplit") translator.reset(new CompoundSplit(conf)); else if (formalism == "lextrans") translator.reset(new LexicalTrans(conf)); else if (formalism == "lexalign") translator.reset(new LexicalAlign(conf)); else if (formalism == "tagger") translator.reset(new Tagger(conf)); else assert(!"error"); // set up additional scoring features vector > pffs; vector late_ffs; if (conf.count("feature_function") > 0) { const vector& add_ffs = conf["feature_function"].as >(); for (int i = 0; i < add_ffs.size(); ++i) { string ff, param; SplitCommandAndParam(add_ffs[i], &ff, ¶m); cerr << "Feature: " << ff; if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n"; else cerr << " (no config parameters)\n"; shared_ptr pff = global_ff_registry->Create(ff, param); if (!pff) { exit(1); } // TODO check that multiple features aren't trying to set the same fid pffs.push_back(pff); late_ffs.push_back(pff.get()); } } ModelSet late_models(feature_weights, late_ffs); int palg = 1; if (LowercaseString(conf["intersection_strategy"].as()) == "full") { palg = 0; cerr << "Using full intersection (no pruning).\n"; } const IntersectionConfiguration inter_conf(palg, conf["cubepruning_pop_limit"].as()); const int sample_max_trans = conf.count("max_translation_sample") ? conf["max_translation_sample"].as() : 0; if (sample_max_trans) rng.reset(new RandomNumberGenerator); const bool aligner_mode = conf.count("aligner"); const bool minimal_forests = conf.count("minimal_forests"); const bool graphviz = conf.count("graphviz"); const bool joshua_viz = conf.count("show_joshua_visualization"); const bool encode_b64 = conf["vector_format"].as() == "b64"; const bool kbest = conf.count("k_best"); const bool unique_kbest = conf.count("unique_k_best"); const bool crf_uniform_empirical = conf.count("crf_uniform_empirical"); shared_ptr extract_file; if (conf.count("extract_rules")) extract_file.reset(new WriteFile(conf["extract_rules"].as())); int combine_size = conf["combine_size"].as(); if (combine_size < 1) combine_size = 1; const string input = conf["input"].as(); cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl; ReadFile in_read(input); istream *in = in_read.stream(); assert(*in); SparseVector acc_vec; // accumulate gradient double acc_obj = 0; // accumulate objective int g_count = 0; // number of gradient pieces computed int sent_id = -1; // line counter while(*in) { NgramCache::Clear(); // clear ngram cache for remote LM (if used) Timer::Summarize(); ++sent_id; string buf; getline(*in, buf); if (buf.empty()) continue; map sgml; ProcessAndStripSGML(&buf, &sgml); if (sgml.find("id") != sgml.end()) sent_id = atoi(sgml["id"].c_str()); cerr << "\nINPUT: "; if (buf.size() < 100) cerr << buf << endl; else { size_t x = buf.rfind(" ", 100); if (x == string::npos) x = 100; cerr << buf.substr(0, x) << " ..." << endl; } cerr << " id = " << sent_id << endl; string to_translate; Lattice ref; ParseTranslatorInputLattice(buf, &to_translate, &ref); const unsigned srclen=NTokens(to_translate,' '); //FIXME: should get the avg. or max source length of the input lattice (like Lattice::dist_(start,end)); but this is only used to scale beam parameters (optionally) anyway so fidelity isn't important. const bool has_ref = ref.size() > 0; SentenceMetadata smeta(sent_id, ref); const bool hadoop_counters = (write_gradient); Hypergraph forest; // -LM forest translator->ProcessMarkupHints(sgml); Timer t("Translation"); const bool translation_successful = translator->Translate(to_translate, &smeta, feature_weights, &forest); translator->SentenceComplete(); if (!translation_successful) { cerr << " NO PARSE FOUND.\n"; if (hadoop_counters) cerr << "reporter:counter:UserCounters,FParseFailed,1" << endl; cout << endl << flush; continue; } const bool show_tree_structure=conf.count("show_tree_structure"); cerr << viterbi_stats(forest," -LM forest",true,show_tree_structure); if (conf.count("show_expected_length")) { const PRPair res = Inside, PRWeightFunction >(forest); cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl; } if (conf.count("show_partition")) { const prob_t z = Inside(forest); cerr << " -LM partition log(Z): " << log(z) << endl; } if (extract_file) ExtractRulesDedupe(forest, extract_file->stream()); double prelm_beam_prune; if (beam_param(conf,"prelm_beam_prune",&prelm_beam_prune,scale_prune_srclen,srclen)) { double presize=forest.edges_.size(); forest.BeamPruneInsideOutside(1.0, false, prelm_beam_prune, NULL); cerr << viterbi_stats(forest," Pruned -LM forest",false,false); cerr << " Pruned -LM forest (beam="< preserve_mask(forest.edges_.size(), false); if (csplit_preserve_full_word) preserve_mask[CompoundSplit::GetFullWordEdgeIndex(forest)] = true; forest.BeamPruneInsideOutside(1.0, false, beam_prune, &preserve_mask); cerr << viterbi_stats(forest," Pruned forest",false,false); } if (conf.count("forest_output") && !has_ref) { ForestWriter writer(conf["forest_output"].as(), sent_id); if (FileExists(writer.fname_)) { cerr << " Unioning...\n"; Hypergraph new_hg; { ReadFile rf(writer.fname_); bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg); assert(succeeded); } new_hg.Union(forest); bool succeeded = writer.Write(new_hg, minimal_forests); assert(succeeded); } else { bool succeeded = writer.Write(forest, minimal_forests); assert(succeeded); } } if (sample_max_trans) { MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as() : 0); } else { vector trans; ViterbiESentence(forest, &trans); if (kbest) { DumpKBest(sent_id, forest, conf["k_best"].as(), unique_kbest); } else if (csplit_output_plf) { cout << HypergraphIO::AsPLF(forest, false) << endl; } else { if (!graphviz && !has_ref && !joshua_viz) { cout << TD::GetString(trans) << endl << flush; } if (joshua_viz) { cout << sent_id << " ||| " << JoshuaVisualizationString(forest) << " ||| 1.0 ||| " << -1.0 << endl << flush; } } } const int max_trans_beam_size = conf.count("max_translation_beam") ? conf["max_translation_beam"].as() : 0; if (max_trans_beam_size) { Hack::MaxTrans(forest, max_trans_beam_size); continue; } if (graphviz && !has_ref) forest.PrintGraphviz(); // the following are only used if write_gradient is true! SparseVector full_exp, ref_exp, gradient; double log_z = 0, log_ref_z = 0; if (write_gradient) { const prob_t z = InsideOutside, EdgeFeaturesAndProbWeightFunction>(forest, &full_exp); log_z = log(z); full_exp /= z; } if (conf.count("show_cfg_search_space")) HypergraphIO::WriteAsCFG(forest); if (has_ref) { if (HG::Intersect(ref, &forest)) { cerr << " Constr. forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl; cerr << " Constr. forest (paths): " << forest.NumberOfPaths() << endl; if (crf_uniform_empirical) { cerr << " USING UNIFORM WEIGHTS\n"; for (int i = 0; i < forest.edges_.size(); ++i) forest.edges_[i].edge_prob_=prob_t::One(); } else { forest.Reweight(feature_weights); cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl; } if (hadoop_counters) cerr << "reporter:counter:UserCounters,SentencePairsParsed,1" << endl; if (conf.count("show_partition")) { const prob_t z = Inside(forest); cerr << " Contst. partition log(Z): " << log(z) << endl; } //DumpKBest(sent_id, forest, 1000); if (conf.count("forest_output")) { ForestWriter writer(conf["forest_output"].as(), sent_id); if (FileExists(writer.fname_)) { cerr << " Unioning...\n"; Hypergraph new_hg; { ReadFile rf(writer.fname_); bool succeeded = HypergraphIO::ReadFromJSON(rf.stream(), &new_hg); assert(succeeded); } new_hg.Union(forest); bool succeeded = writer.Write(new_hg, minimal_forests); assert(succeeded); } else { bool succeeded = writer.Write(forest, minimal_forests); assert(succeeded); } } if (aligner_mode && !output_training_vector) AlignerTools::WriteAlignment(smeta.GetSourceLattice(), smeta.GetReference(), forest, &cout); if (write_gradient) { const prob_t ref_z = InsideOutside, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp); ref_exp /= ref_z; if (crf_uniform_empirical) { log_ref_z = ref_exp.dot(feature_weights); } else { log_ref_z = log(ref_z); } //cerr << " MODEL LOG Z: " << log_z << endl; //cerr << " EMPIRICAL LOG Z: " << log_ref_z << endl; if ((log_z - log_ref_z) < kMINUS_EPSILON) { cerr << "DIFF. ERR! log_z < log_ref_z: " << log_z << " " << log_ref_z << endl; exit(1); } assert(!isnan(log_ref_z)); ref_exp -= full_exp; acc_vec += ref_exp; acc_obj += (log_z - log_ref_z); } if (feature_expectations) { const prob_t z = InsideOutside, EdgeFeaturesAndProbWeightFunction>(forest, &ref_exp); ref_exp /= z; acc_obj += log(z); acc_vec += ref_exp; } if (output_training_vector) { acc_vec.clear_value(0); ++g_count; if (g_count % combine_size == 0) { if (encode_b64) { cout << "0\t"; SparseVector dav; ConvertSV(acc_vec, &dav); B64::Encode(acc_obj, dav, &cout); cout << endl << flush; } else { cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; } acc_vec.clear(); acc_obj = 0; } } if (conf.count("graphviz")) forest.PrintGraphviz(); } else { cerr << " REFERENCE UNREACHABLE.\n"; if (write_gradient) { if (hadoop_counters) cerr << "reporter:counter:UserCounters,EFParseFailed,1" << endl; cout << endl << flush; } } } } if (output_training_vector && !acc_vec.empty()) { if (encode_b64) { cout << "0\t"; SparseVector dav; ConvertSV(acc_vec, &dav); B64::Encode(acc_obj, dav, &cout); cout << endl << flush; } else { cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush; } } }