| author    | Chris Dyer <redpony@gmail.com>                        | 2009-12-14 20:35:11 -0500 |
|-----------|-------------------------------------------------------|---------------------------|
| committer | Chris Dyer <redpony@gmail.com>                        | 2009-12-14 20:35:11 -0500 |
| commit    | 851e389dffdd6996ea32d70defb8906de80b9edc (patch)      |                           |
| tree      | 8c68ee77205badc056b8ab5b332e67e3e98017df /src/cdec.cc |                           |
| parent    | dc6930c00b4b276883280cff1ed6dcd9ddef03c7 (diff)       |                           |
A few small fixes to the alignment tools, a new orthographic similarity feature for the word aligner, and final naming of the directories and libraries in cdec.
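The orthographic similarity feature itself is not part of this diff, which only removes `src/cdec.cc`. For orientation, here is a minimal sketch of the kind of score such a word-aligner feature computes, using the Dice coefficient over character bigrams; the measure, the name `OrthographicSimilarity`, and everything else in the sketch are illustrative assumptions, not the code this commit added.

```cpp
// Illustrative sketch only: a bigram-Dice orthographic similarity score of
// the general kind the commit message describes. All names are hypothetical.
#include <string>
#include <set>

// Collect the set of character bigrams in a word.
static std::set<std::string> Bigrams(const std::string& w) {
  std::set<std::string> bg;
  for (size_t i = 0; i + 1 < w.size(); ++i)
    bg.insert(w.substr(i, 2));
  return bg;
}

// Dice coefficient over character bigrams: 2*|A and B| / (|A| + |B|).
// High for cognate-like pairs ("nation"/"nationale"), 0 when the words
// share no bigrams -- a cheap, length-normalized spelling-overlap signal.
double OrthographicSimilarity(const std::string& src, const std::string& trg) {
  const std::set<std::string> a = Bigrams(src), b = Bigrams(trg);
  if (a.empty() || b.empty()) return 0.0;
  size_t shared = 0;
  for (std::set<std::string>::const_iterator it = a.begin(); it != a.end(); ++it)
    shared += b.count(*it);
  return 2.0 * shared / (a.size() + b.size());
}
```

In an aligner such a score would typically be exposed as a real-valued feature on lexical translation pairs, letting the learned weight decide how much to trust spelling overlap.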
Diffstat (limited to 'src/cdec.cc')
-rw-r--r-- | src/cdec.cc | 507 |
1 files changed, 0 insertions, 507 deletions
```diff
diff --git a/src/cdec.cc b/src/cdec.cc
deleted file mode 100644
index 6185c79b..00000000
--- a/src/cdec.cc
+++ /dev/null
@@ -1,507 +0,0 @@
-#include <iostream>
-#include <fstream>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/program_options.hpp>
-#include <boost/program_options/variables_map.hpp>
-
-#include "timing_stats.h"
-#include "translator.h"
-#include "phrasebased_translator.h"
-#include "aligner.h"
-#include "stringlib.h"
-#include "forest_writer.h"
-#include "hg_io.h"
-#include "filelib.h"
-#include "sampler.h"
-#include "sparse_vector.h"
-#include "lexcrf.h"
-#include "csplit.h"
-#include "weights.h"
-#include "tdict.h"
-#include "ff.h"
-#include "ff_factory.h"
-#include "hg_intersect.h"
-#include "apply_models.h"
-#include "viterbi.h"
-#include "kbest.h"
-#include "inside_outside.h"
-#include "exp_semiring.h"
-#include "sentence_metadata.h"
-
-using namespace std;
-using namespace std::tr1;
-using boost::shared_ptr;
-namespace po = boost::program_options;
-
-// some globals ...
-boost::shared_ptr<RandomNumberGenerator<boost::mt19937> > rng;
-
-namespace Hack { void MaxTrans(const Hypergraph& in, int beam_size); }
-
-void ShowBanner() {
-  cerr << "cdec v1.0 (c) 2009 by Chris Dyer\n";
-}
-
-void InitCommandLine(int argc, char** argv, po::variables_map* conf) {
-  po::options_description opts("Configuration options");
-  opts.add_options()
-        ("formalism,f",po::value<string>(),"Translation formalism; values include SCFG, FST, PB, LexCRF (lexical translation model), CSplit (compound splitting)")
-        ("input,i",po::value<string>()->default_value("-"),"Source file")
-        ("grammar,g",po::value<vector<string> >()->composing(),"Either SCFG grammar file(s) or phrase tables file(s)")
-        ("weights,w",po::value<string>(),"Feature weights file")
-        ("feature_function,F",po::value<vector<string> >()->composing(), "Additional feature function(s) (-L for list)")
-        ("list_feature_functions,L","List available feature functions")
-        ("add_pass_through_rules,P","Add rules to translate OOV words as themselves")
-        ("k_best,k",po::value<int>(),"Extract the k best derivations")
-        ("unique_k_best,r", "Unique k-best translation list")
-        ("aligner,a", "Run as a word/phrase aligner (src & ref required)")
-        ("cubepruning_pop_limit,K",po::value<int>()->default_value(200), "Max number of pops from the candidate heap at each node")
-        ("goal",po::value<string>()->default_value("S"),"Goal symbol (SCFG & FST)")
-        ("scfg_extra_glue_grammar", po::value<string>(), "Extra glue grammar file (Glue grammars apply when i=0 but have no other span restrictions)")
-        ("scfg_no_hiero_glue_grammar,n", "No Hiero glue grammar (nb. by default the SCFG decoder adds Hiero glue rules)")
-        ("scfg_default_nt,d",po::value<string>()->default_value("X"),"Default non-terminal symbol in SCFG")
-        ("scfg_max_span_limit,S",po::value<int>()->default_value(10),"Maximum non-terminal span limit (except \"glue\" grammar)")
-        ("show_tree_structure,T", "Show the Viterbi derivation structure")
-        ("show_expected_length", "Show the expected translation length under the model")
-        ("show_partition,z", "Compute and show the partition (inside score)")
-        ("beam_prune", po::value<double>(), "Prune paths from +LM forest")
-        ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
-        ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
-        ("extract_rules", po::value<string>(), "Extract the rules used in translation (de-duped) to this file")
-        ("graphviz","Show (constrained) translation forest in GraphViz format")
-        ("max_translation_beam,x", po::value<int>(), "Beam approximation to get max translation from the chart")
-        ("max_translation_sample,X", po::value<int>(), "Sample the max translation from the chart")
-        ("pb_max_distortion,D", po::value<int>()->default_value(4), "Phrase-based decoder: maximum distortion")
-        ("gradient,G","Compute d log p(e|f) / d lambda_i and write to STDOUT (src & ref required)")
-        ("feature_expectations","Write feature expectations for all features in chart (**OBJ** will be the partition)")
-        ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
-        ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")
-        ("forest_output,O",po::value<string>(),"Directory to write forests to")
-        ("minimal_forests,m","Write minimal forests (excludes Rule information). Such forests can be used for ML/MAP training, but not rescoring, etc.");
-  po::options_description clo("Command line options");
-  clo.add_options()
-        ("config,c", po::value<string>(), "Configuration file")
-        ("help,h", "Print this help message and exit");
-  po::options_description dconfig_options, dcmdline_options;
-  dconfig_options.add(opts);
-  dcmdline_options.add(opts).add(clo);
-
-  po::store(parse_command_line(argc, argv, dcmdline_options), *conf);
-  if (conf->count("config")) {
-    const string cfg = (*conf)["config"].as<string>();
-    cerr << "Configuration file: " << cfg << endl;
-    ifstream config(cfg.c_str());
-    po::store(po::parse_config_file(config, dconfig_options), *conf);
-  }
-  po::notify(*conf);
-
-  if (conf->count("list_feature_functions")) {
-    cerr << "Available feature functions (specify with -F):\n";
-    global_ff_registry->DisplayList();
-    cerr << endl;
-    exit(1);
-  }
-
-  if (conf->count("help") || conf->count("formalism") == 0) {
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-
-  const string formalism = LowercaseString((*conf)["formalism"].as<string>());
-  if (formalism != "scfg" && formalism != "fst" && formalism != "lexcrf" && formalism != "pb" && formalism != "csplit") {
-    cerr << "Error: --formalism takes only 'scfg', 'fst', 'pb', 'csplit' or 'lexcrf'\n";
-    cerr << dcmdline_options << endl;
-    exit(1);
-  }
-}
-
-// TODO move out of cdec into some sampling decoder file
-void SampleRecurse(const Hypergraph& hg, const vector<SampleSet>& ss, int n, vector<WordID>* out) {
-  const SampleSet& s = ss[n];
-  int i = rng->SelectSample(s);
-  const Hypergraph::Edge& edge = hg.edges_[hg.nodes_[n].in_edges_[i]];
-  vector<vector<WordID> > ants(edge.tail_nodes_.size());
-  for (int j = 0; j < ants.size(); ++j)
-    SampleRecurse(hg, ss, edge.tail_nodes_[j], &ants[j]);
-
-  vector<const vector<WordID>*> pants(ants.size());
-  for (int j = 0; j < ants.size(); ++j) pants[j] = &ants[j];
-  edge.rule_->ESubstitute(pants, out);
-}
-
-struct SampleSort {
-  bool operator()(const pair<int,string>& a, const pair<int,string>& b) const {
-    return a.first > b.first;
-  }
-};
-
-// TODO move out of cdec into some sampling decoder file
-void MaxTranslationSample(Hypergraph* hg, const int samples, const int k) {
-  unordered_map<string, int, boost::hash<string> > m;
-  hg->PushWeightsToGoal();
-  const int num_nodes = hg->nodes_.size();
-  vector<SampleSet> ss(num_nodes);
-  for (int i = 0; i < num_nodes; ++i) {
-    SampleSet& s = ss[i];
-    const vector<int>& in_edges = hg->nodes_[i].in_edges_;
-    for (int j = 0; j < in_edges.size(); ++j) {
-      s.add(hg->edges_[in_edges[j]].edge_prob_);
-    }
-  }
-  for (int i = 0; i < samples; ++i) {
-    vector<WordID> yield;
-    SampleRecurse(*hg, ss, hg->nodes_.size() - 1, &yield);
-    const string trans = TD::GetString(yield);
-    ++m[trans];
-  }
-  vector<pair<int, string> > dist;
-  for (unordered_map<string, int, boost::hash<string> >::iterator i = m.begin();
-       i != m.end(); ++i) {
-    dist.push_back(make_pair(i->second, i->first));
-  }
-  sort(dist.begin(), dist.end(), SampleSort());
-  if (k) {
-    for (int i = 0; i < k; ++i)
-      cout << dist[i].first << " ||| " << dist[i].second << endl;
-  } else {
-    cout << dist[0].second << endl;
-  }
-}
-
-// TODO decoder output should probably be moved to another file
-void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique) {
-  if (unique) {
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k);
-    for (int i = 0; i < k; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal, KBest::FilterUnique>::Derivation* d =
-        kbest.LazyKthBest(forest.nodes_.size() - 1, i);
-      if (!d) break;
-      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
-           << d->feature_values << " ||| " << log(d->score) << endl;
-    }
-  } else {
-    KBest::KBestDerivations<vector<WordID>, ESentenceTraversal> kbest(forest, k);
-    for (int i = 0; i < k; ++i) {
-      const KBest::KBestDerivations<vector<WordID>, ESentenceTraversal>::Derivation* d =
-        kbest.LazyKthBest(forest.nodes_.size() - 1, i);
-      if (!d) break;
-      cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
-           << d->feature_values << " ||| " << log(d->score) << endl;
-    }
-  }
-}
-
-struct ELengthWeightFunction {
-  double operator()(const Hypergraph::Edge& e) const {
-    return e.rule_->ELength() - e.rule_->Arity();
-  }
-};
-
-
-struct TRPHash {
-  size_t operator()(const TRulePtr& o) const { return reinterpret_cast<size_t>(o.get()); }
-};
-static void ExtractRulesDedupe(const Hypergraph& hg, ostream* os) {
-  static unordered_set<TRulePtr, TRPHash> written;
-  for (int i = 0; i < hg.edges_.size(); ++i) {
-    const TRulePtr& rule = hg.edges_[i].rule_;
-    if (written.insert(rule).second) {
-      (*os) << rule->AsString() << endl;
-    }
-  }
-}
-
-void register_feature_functions();
-
-int main(int argc, char** argv) {
-  global_ff_registry.reset(new FFRegistry);
-  register_feature_functions();
-  ShowBanner();
-  po::variables_map conf;
-  InitCommandLine(argc, argv, &conf);
-  const bool write_gradient = conf.count("gradient");
-  const bool feature_expectations = conf.count("feature_expectations");
-  if (write_gradient && feature_expectations) {
-    cerr << "You can only specify --gradient or --feature_expectations, not both!\n";
-    exit(1);
-  }
-  const bool output_training_vector = (write_gradient || feature_expectations);
-
-  boost::shared_ptr<Translator> translator;
-  const string formalism = LowercaseString(conf["formalism"].as<string>());
-  const bool csplit_preserve_full_word = conf.count("csplit_preserve_full_word");
-  if (csplit_preserve_full_word &&
-      (formalism != "csplit" || !conf.count("beam_prune"))) {
-    cerr << "--csplit_preserve_full_word should only be "
-         << "used with csplit AND --beam_prune!\n";
-    exit(1);
-  }
-  const bool csplit_output_plf = conf.count("csplit_output_plf");
-  if (csplit_output_plf && formalism != "csplit") {
-    cerr << "--csplit_output_plf should only be used with csplit!\n";
-    exit(1);
-  }
-
-  if (formalism == "scfg")
-    translator.reset(new SCFGTranslator(conf));
-  else if (formalism == "fst")
-    translator.reset(new FSTTranslator(conf));
-  else if (formalism == "pb")
-    translator.reset(new PhraseBasedTranslator(conf));
-  else if (formalism == "csplit")
-    translator.reset(new CompoundSplit(conf));
-  else if (formalism == "lexcrf")
-    translator.reset(new LexicalCRF(conf));
-  else
-    assert(!"error");
-
-  vector<double> feature_weights;
-  Weights w;
-  if (conf.count("weights")) {
-    w.InitFromFile(conf["weights"].as<string>());
-    feature_weights.resize(FD::NumFeats());
-    w.InitVector(&feature_weights);
-  }
-
-  // set up additional scoring features
-  vector<shared_ptr<FeatureFunction> > pffs;
-  vector<const FeatureFunction*> late_ffs;
-  if (conf.count("feature_function") > 0) {
-    const vector<string>& add_ffs = conf["feature_function"].as<vector<string> >();
-    for (int i = 0; i < add_ffs.size(); ++i) {
-      string ff, param;
-      SplitCommandAndParam(add_ffs[i], &ff, &param);
-      cerr << "Feature: " << ff;
-      if (param.size() > 0) cerr << " (with config parameters '" << param << "')\n";
-      else cerr << " (no config parameters)\n";
-      shared_ptr<FeatureFunction> pff = global_ff_registry->Create(ff, param);
-      if (!pff) { exit(1); }
-      // TODO check that multiple features aren't trying to set the same fid
-      pffs.push_back(pff);
-      late_ffs.push_back(pff.get());
-    }
-  }
-  ModelSet late_models(feature_weights, late_ffs);
-
-  const int sample_max_trans = conf.count("max_translation_sample") ?
-    conf["max_translation_sample"].as<int>() : 0;
-  if (sample_max_trans)
-    rng.reset(new RandomNumberGenerator<boost::mt19937>);
-  const bool aligner_mode = conf.count("aligner");
-  const bool minimal_forests = conf.count("minimal_forests");
-  const bool graphviz = conf.count("graphviz");
-  const bool encode_b64 = conf["vector_format"].as<string>() == "b64";
-  const bool kbest = conf.count("k_best");
-  const bool unique_kbest = conf.count("unique_k_best");
-  shared_ptr<WriteFile> extract_file;
-  if (conf.count("extract_rules"))
-    extract_file.reset(new WriteFile(conf["extract_rules"].as<string>()));
-
-  int combine_size = conf["combine_size"].as<int>();
-  if (combine_size < 1) combine_size = 1;
-  const string input = conf["input"].as<string>();
-  cerr << "Reading input from " << ((input == "-") ? "STDIN" : input.c_str()) << endl;
-  ReadFile in_read(input);
-  istream *in = in_read.stream();
-  assert(*in);
-
-  SparseVector<double> acc_vec;  // accumulate gradient
-  double acc_obj = 0;  // accumulate objective
-  int g_count = 0;  // number of gradient pieces computed
-  int sent_id = -1;  // line counter
-
-  while(*in) {
-    Timer::Summarize();
-    ++sent_id;
-    string buf;
-    getline(*in, buf);
-    if (buf.empty()) continue;
-    map<string, string> sgml;
-    ProcessAndStripSGML(&buf, &sgml);
-    if (sgml.find("id") != sgml.end())
-      sent_id = atoi(sgml["id"].c_str());
-
-    cerr << "\nINPUT: ";
-    if (buf.size() < 100)
-      cerr << buf << endl;
-    else {
-      size_t x = buf.rfind(" ", 100);
-      if (x == string::npos) x = 100;
-      cerr << buf.substr(0, x) << " ..." << endl;
-    }
-    cerr << " id = " << sent_id << endl;
-    string to_translate;
-    Lattice ref;
-    ParseTranslatorInputLattice(buf, &to_translate, &ref);
-    const bool has_ref = ref.size() > 0;
-    SentenceMetadata smeta(sent_id, ref);
-    const bool hadoop_counters = (write_gradient);
-    Hypergraph forest;  // -LM forest
-    Timer t("Translation");
-    if (!translator->Translate(to_translate, &smeta, feature_weights, &forest)) {
-      cerr << " NO PARSE FOUND.\n";
-      if (hadoop_counters)
-        cerr << "reporter:counter:UserCounters,FParseFailed,1" << endl;
-      cout << endl << flush;
-      continue;
-    }
-    cerr << " -LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
-    cerr << " -LM forest (paths): " << forest.NumberOfPaths() << endl;
-    if (conf.count("show_expected_length")) {
-      const PRPair<double, double> res =
-        Inside<PRPair<double, double>,
-               PRWeightFunction<double, EdgeProb, double, ELengthWeightFunction> >(forest);
-      cerr << " Expected length (words): " << res.r / res.p << "\t" << res << endl;
-    }
-    if (conf.count("show_partition")) {
-      const prob_t z = Inside<prob_t, EdgeProb>(forest);
-      cerr << " -LM partition log(Z): " << log(z) << endl;
-    }
-    if (extract_file)
-      ExtractRulesDedupe(forest, extract_file->stream());
-    vector<WordID> trans;
-    const prob_t vs = ViterbiESentence(forest, &trans);
-    cerr << " -LM Viterbi: " << TD::GetString(trans) << endl;
-    if (conf.count("show_tree_structure"))
-      cerr << " -LM tree: " << ViterbiETree(forest) << endl;;
-    cerr << " -LM Viterbi: " << log(vs) << endl;
-
-    bool has_late_models = !late_models.empty();
-    if (has_late_models) {
-      forest.Reweight(feature_weights);
-      forest.SortInEdgesByEdgeWeights();
-      Hypergraph lm_forest;
-      int cubepruning_pop_limit = conf["cubepruning_pop_limit"].as<int>();
-      ApplyModelSet(forest,
-                    smeta,
-                    late_models,
-                    PruningConfiguration(cubepruning_pop_limit),
-                    &lm_forest);
-      forest.swap(lm_forest);
-      forest.Reweight(feature_weights);
-      trans.clear();
-      ViterbiESentence(forest, &trans);
-      cerr << " +LM forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
-      cerr << " +LM forest (paths): " << forest.NumberOfPaths() << endl;
-      cerr << " +LM Viterbi: " << TD::GetString(trans) << endl;
-    }
-    if (conf.count("beam_prune")) {
-      vector<bool> preserve_mask(forest.edges_.size(), false);
-      if (csplit_preserve_full_word)
-        preserve_mask[CompoundSplit::GetFullWordEdgeIndex(forest)] = true;
-      forest.BeamPruneInsideOutside(1.0, false, conf["beam_prune"].as<double>(), &preserve_mask);
-      cerr << " Pruned forest (paths): " << forest.NumberOfPaths() << endl;
-    }
-
-    if (conf.count("forest_output") && !has_ref) {
-      ForestWriter writer(conf["forest_output"].as<string>(), sent_id);
-      assert(writer.Write(forest, minimal_forests));
-    }
-
-    if (sample_max_trans) {
-      MaxTranslationSample(&forest, sample_max_trans, conf.count("k_best") ? conf["k_best"].as<int>() : 0);
-    } else {
-      if (kbest) {
-        DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest);
-      } else if (csplit_output_plf) {
-        cout << HypergraphIO::AsPLF(forest, false) << endl;
-      } else {
-        if (!graphviz && !has_ref) {
-          cout << TD::GetString(trans) << endl << flush;
-        }
-      }
-    }
-
-    const int max_trans_beam_size = conf.count("max_translation_beam") ?
-      conf["max_translation_beam"].as<int>() : 0;
-    if (max_trans_beam_size) {
-      Hack::MaxTrans(forest, max_trans_beam_size);
-      continue;
-    }
-
-    if (graphviz && !has_ref) forest.PrintGraphviz();
-
-    // the following are only used if write_gradient is true!
-    SparseVector<double> full_exp, ref_exp, gradient;
-    double log_z = 0, log_ref_z = 0;
-    if (write_gradient)
-      log_z = log(
-        InsideOutside<prob_t, EdgeProb, SparseVector<double>, EdgeFeaturesWeightFunction>(forest, &full_exp));
-
-    if (has_ref) {
-      if (HG::Intersect(ref, &forest)) {
-        cerr << " Constr. forest (nodes/edges): " << forest.nodes_.size() << '/' << forest.edges_.size() << endl;
-        cerr << " Constr. forest (paths): " << forest.NumberOfPaths() << endl;
-        forest.Reweight(feature_weights);
-        cerr << " Constr. VitTree: " << ViterbiFTree(forest) << endl;
-        if (hadoop_counters)
-          cerr << "reporter:counter:UserCounters,SentencePairsParsed,1" << endl;
-        if (conf.count("show_partition")) {
-          const prob_t z = Inside<prob_t, EdgeProb>(forest);
-          cerr << " Contst. partition log(Z): " << log(z) << endl;
-        }
-        //DumpKBest(sent_id, forest, 1000);
-        if (conf.count("forest_output")) {
-          ForestWriter writer(conf["forest_output"].as<string>(), sent_id);
-          assert(writer.Write(forest, minimal_forests));
-        }
-        if (aligner_mode && !output_training_vector)
-          AlignerTools::WriteAlignment(to_translate, ref, forest);
-        if (write_gradient) {
-          log_ref_z = log(
-            InsideOutside<prob_t, EdgeProb, SparseVector<double>, EdgeFeaturesWeightFunction>(forest, &ref_exp));
-          if (log_z < log_ref_z) {
-            cerr << "DIFF. ERR! log_z < log_ref_z: " << log_z << " " << log_ref_z << endl;
-            exit(1);
-          }
-          //cerr << "FULL: " << full_exp << endl;
-          //cerr << " REF: " << ref_exp << endl;
-          ref_exp -= full_exp;
-          acc_vec += ref_exp;
-          acc_obj += (log_z - log_ref_z);
-        }
-        if (feature_expectations) {
-          acc_obj += log(
-            InsideOutside<prob_t, EdgeProb, SparseVector<double>, EdgeFeaturesWeightFunction>(forest, &ref_exp));
-          acc_vec += ref_exp;
-        }
-
-        if (output_training_vector) {
-          ++g_count;
-          if (g_count % combine_size == 0) {
-            if (encode_b64) {
-              cout << "0\t";
-              B64::Encode(acc_obj, acc_vec, &cout);
-              cout << endl << flush;
-            } else {
-              cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush;
-            }
-            acc_vec.clear();
-            acc_obj = 0;
-          }
-        }
-        if (conf.count("graphviz")) forest.PrintGraphviz();
-      } else {
-        cerr << " REFERENCE UNREACHABLE.\n";
-        if (write_gradient) {
-          if (hadoop_counters)
-            cerr << "reporter:counter:UserCounters,EFParseFailed,1" << endl;
-          cout << endl << flush;
-        }
-      }
-    }
-  }
-  if (output_training_vector && !acc_vec.empty()) {
-    if (encode_b64) {
-      cout << "0\t";
-      B64::Encode(acc_obj, acc_vec, &cout);
-      cout << endl << flush;
-    } else {
-      cout << "0\t**OBJ**=" << acc_obj << ';' << acc_vec << endl << flush;
-    }
-  }
-}
```
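For readers tracing the deleted `--gradient` code path: the option help text defines it as computing `d log p(e|f) / d lambda_i`, and the body does so with two inside-outside passes. Assuming the standard log-linear translation model (consistent with the help text, though not restated in the diff), the per-sentence gradient is

$$\frac{\partial \log p(\mathbf{e}\mid\mathbf{f})}{\partial \lambda_i} = \mathbb{E}_{p(d\mid\mathbf{e},\mathbf{f})}\left[f_i(d)\right] - \mathbb{E}_{p(d\mid\mathbf{f})}\left[f_i(d)\right],$$

the difference between feature expectations over the reference-constrained forest and over the full forest, which is what `ref_exp -= full_exp;` computes. Likewise `acc_obj += (log_z - log_ref_z)` accumulates the negative log-likelihood $\log Z(\mathbf{f}) - \log Z(\mathbf{e},\mathbf{f})$. The `log_z < log_ref_z` sanity check holds because the constrained forest's derivations are a subset of the full forest's, so its partition can never legitimately be larger.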