Cherry picked Mr.MIRA compatibility mode code

author: Wu, Ke <wuke@cs.umd.edu> 2014-10-12 16:30:02 -0400
committer: Wu, Ke <wuke@cs.umd.edu> 2014-10-12 16:30:02 -0400
commit: 8c0e4a5c1f168a419b3a236a94815c97164bddbc (patch)
tree: 177b7cd332482f779f04945fea8325a00e9cfa54 /decoder/decoder.cc
parent: d88186af251ecae60974b20395ce75807bfdda35 (diff)
1 files changed, 32 insertions, 7 deletions
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index c384c33f..93282576 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -17,6 +17,7 @@ namespace std { using std::tr1::unordered_map; }
 #include "fdict.h"
 #include "timing_stats.h"
 #include "verbose.h"
+#include "b64featvector.h"
 
 #include "translator.h"
 #include "phrasebased_translator.h"
@@ -195,7 +196,7 @@ struct DecoderImpl {
       }
       forest.PruneInsideOutside(beam_prune,density_prune,pm,false,1);
       if (!forestname.empty()) forestname=" "+forestname;
-      if (!SILENT) { 
+      if (!SILENT) {
         forest_stats(forest,"  Pruned "+forestname+" forest",false,false);
         cerr << "  Pruned "<<forestname<<" forest portion of edges kept: "<<forest.edges_.size()/presize<<endl;
       }
@@ -261,7 +262,7 @@ struct DecoderImpl {
       assert(ref);
       LatticeTools::ConvertTextOrPLF(sref, ref);
     }
-  } 
+  }
 
   // used to construct the suffix string to get the name of arguments for multiple passes
   // e.g., the "2" in --weights2
@@ -284,7 +285,7 @@ struct DecoderImpl {
   boost::shared_ptr<RandomNumberGenerator<boost::mt19937> > rng;
   int sample_max_trans;
   bool aligner_mode;
-  bool graphviz; 
+  bool graphviz;
   bool joshua_viz;
   bool encode_b64;
   bool kbest;
@@ -301,6 +302,7 @@ struct DecoderImpl {
   bool feature_expectations; // TODO Observer
   bool output_training_vector; // TODO Observer
   bool remove_intersected_rule_annotations;
+  bool mr_mira_compat;  // Mr.MIRA compatibility mode.
   boost::scoped_ptr<IncrementalBase> incremental;
 
 
@@ -414,7 +416,8 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         ("vector_format",po::value<string>()->default_value("b64"), "Sparse vector serialization format for feature expectations or gradients, includes (text or b64)")
         ("combine_size,C",po::value<int>()->default_value(1), "When option -G is used, process this many sentence pairs before writing the gradient (1=emit after every sentence pair)")
         ("forest_output,O",po::value<string>(),"Directory to write forests to")
-        ("remove_intersected_rule_annotations", "After forced decoding is completed, remove nonterminal annotations (i.e., the source side spans)");
+        ("remove_intersected_rule_annotations", "After forced decoding is completed, remove nonterminal annotations (i.e., the source side spans)")
+        ("mr_mira_compat", "Mr.MIRA compatibility mode (applies weight delta if available; outputs number of lines before k-best)");
 
   // ob.AddOptions(&opts);
   po::options_description clo("Command line options");
@@ -666,6 +669,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
   get_oracle_forest = conf.count("get_oracle_forest");
   oracle.show_derivation=conf.count("show_derivations");
   remove_intersected_rule_annotations = conf.count("remove_intersected_rule_annotations");
+  mr_mira_compat = conf.count("mr_mira_compat");
 
   combine_size = conf["combine_size"].as<int>();
   if (combine_size < 1) combine_size = 1;
@@ -699,6 +703,24 @@ void Decoder::AddSupplementalGrammarFromString(const std::string& grammar_string
   static_cast<SCFGTranslator&>(*pimpl_->translator).AddSupplementalGrammarFromString(grammar_string);
 }
 
+static inline void ApplyWeightDelta(const string &delta_b64, vector<weight_t> *weights) {
+  SparseVector<weight_t> delta;
+  DecodeFeatureVector(delta_b64, &delta);
+  if (delta.empty()) return;
+  // Apply updates
+  for (SparseVector<weight_t>::iterator dit = delta.begin();
+       dit != delta.end(); ++dit) {
+    int feat_id = dit->first;
+    union { weight_t weight; unsigned long long repr; } feat_delta;
+    feat_delta.weight = dit->second;
+    if (!SILENT)
+      cerr << "[decoder weight update] " << FD::Convert(feat_id) << " " << feat_delta.weight
+           << " = " << hex << feat_delta.repr << endl;
+    if (weights->size() <= feat_id) weights->resize(feat_id + 1);
+    (*weights)[feat_id] += feat_delta.weight;
+  }
+}
+
 bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   string buf = input;
   NgramCache::Clear();   // clear ngram cache for remote LM (if used)
@@ -709,6 +731,10 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   if (sgml.find("id") != sgml.end())
     sent_id = atoi(sgml["id"].c_str());
 
+  // Add delta from input to weights before decoding
+  if (mr_mira_compat)
+    ApplyWeightDelta(sgml["delta"], init_weights.get());
+
   if (!SILENT) {
     cerr << "\nINPUT: ";
     if (buf.size() < 100)
@@ -947,7 +973,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
     if (kbest && !has_ref) {
       //TODO: does this work properly?
       const string deriv_fname = conf.count("show_derivations") ? str("show_derivations",conf) : "-";
-      oracle.DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest,"-", deriv_fname);
+      oracle.DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest,mr_mira_compat, smeta.GetSourceLength(), "-", deriv_fname);
     } else if (csplit_output_plf) {
       cout << HypergraphIO::AsPLF(forest, false) << endl;
     } else {
@@ -1078,7 +1104,7 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
       if (conf.count("graphviz")) forest.PrintGraphviz();
       if (kbest) {
         const string deriv_fname = conf.count("show_derivations") ? str("show_derivations",conf) : "-";
-        oracle.DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest,"-", deriv_fname);
+        oracle.DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest, mr_mira_compat, smeta.GetSourceLength(), "-", deriv_fname);
       }
       if (conf.count("show_conditional_prob")) {
         const prob_t ref_z = Inside<prob_t, EdgeProb>(forest);
@@ -1098,4 +1124,3 @@ bool DecoderImpl::Decode(const string& input, DecoderObserver* o) {
   o->NotifyDecodingComplete(smeta);
   return true;
 }
-
author	Wu, Ke <wuke@cs.umd.edu>	2014-10-12 16:30:02 -0400
committer	Wu, Ke <wuke@cs.umd.edu>	2014-10-12 16:30:02 -0400
commit	8c0e4a5c1f168a419b3a236a94815c97164bddbc (patch)
tree	177b7cd332482f779f04945fea8325a00e9cfa54 /decoder/decoder.cc
parent	d88186af251ecae60974b20395ce75807bfdda35 (diff)