From 279bc261148978c4ac3e5e4e8e9ef689ca0c25ca Mon Sep 17 00:00:00 2001
From: graehl <graehl@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Sun, 25 Jul 2010 02:52:58 +0000
Subject: cleaned up kbest, new USE_INFO_EDGE 1 logs per edge,
 --show_derivation (needs work; handle kbest deriv, viterbi deriv, sort hg
 exposing viterbi?)

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@405 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 decoder/cdec.cc         | 18 +++++------
 decoder/ff_factory.h    |  9 +++++-
 decoder/ff_lm_fsa.h     |  2 ++
 decoder/ff_sample_fsa.h | 20 ++++++------
 decoder/hg.h            | 81 +++++++++++++++++++++++++++++++++++++++++++++++++
 decoder/oracle_bleu.h   | 59 ++++++++++++++++++-----------------
 decoder/viterbi.cc      | 13 +++++++-
 decoder/viterbi.h       |  2 +-
 8 files changed, 154 insertions(+), 50 deletions(-)

diff --git a/decoder/cdec.cc b/decoder/cdec.cc
index f366a08f..9110a234 100644
--- a/decoder/cdec.cc
+++ b/decoder/cdec.cc
@@ -313,8 +313,8 @@ bool prelm_weights_string(po::variables_map const& conf,string &s)
 }
 
 
-void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_features,WeightVector *weights=0) {
-    cerr << viterbi_stats(forest,name,true,show_tree);
+void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_features,WeightVector *weights=0,bool show_deriv=false) {
+  cerr << viterbi_stats(forest,name,true,show_tree,show_deriv);
     if (show_features) {
       cerr << name<<"     features: ";
 /*      Hypergraph::Edge const* best=forest.ViterbiGoalEdge();
@@ -328,9 +328,9 @@ void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_featur
     }
 }
 
-void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_features,DenseWeightVector const& feature_weights) {
+void forest_stats(Hypergraph &forest,string name,bool show_tree,bool show_features,DenseWeightVector const& feature_weights,bool sd=false) {
     WeightVector fw(feature_weights);
-    forest_stats(forest,name,show_tree,show_features,&fw);
+    forest_stats(forest,name,show_tree,show_features,&fw,sd);
 }
 
 
@@ -348,7 +348,7 @@ void maybe_prune(Hypergraph &forest,po::variables_map const& conf,string nbeam,s
       }
       forest.PruneInsideOutside(beam_prune,density_prune,pm,false,1,conf["promise_power"].as<double>());
       if (!forestname.empty()) forestname=" "+forestname;
-      forest_stats(forest,"  Pruned "+forestname+" forest",false,false);
+      forest_stats(forest,"  Pruned "+forestname+" forest",false,false,false);
       cerr << "  Pruned "<<forestname<<" forest portion of edges kept: "<<forest.edges_.size()/presize<<endl;
     }
 }
@@ -560,7 +560,7 @@ int main(int argc, char** argv) {
     }
     const bool show_tree_structure=conf.count("show_tree_structure");
     const bool show_features=conf.count("show_features");
-    forest_stats(forest,"  -LM forest",show_tree_structure,show_features,feature_weights);
+    forest_stats(forest,"  -LM forest",show_tree_structure,show_features,feature_weights,oracle.show_derivation);
     if (conf.count("show_expected_length")) {
       const PRPair<double, double> res =
         Inside<PRPair<double, double>,
@@ -586,7 +586,7 @@ int main(int argc, char** argv) {
                     &prelm_forest);
       forest.swap(prelm_forest);
       forest.Reweight(prelm_feature_weights);
-      forest_stats(forest," prelm forest",show_tree_structure,show_features,prelm_feature_weights);
+      forest_stats(forest," prelm forest",show_tree_structure,show_features,prelm_feature_weights,oracle.show_derivation);
     }
 
     maybe_prune(forest,conf,"prelm_beam_prune","prelm_density_prune","-LM",srclen);
@@ -605,7 +605,7 @@ int main(int argc, char** argv) {
                     &lm_forest);
       forest.swap(lm_forest);
       forest.Reweight(feature_weights);
-      forest_stats(forest,"  +LM forest",show_tree_structure,show_features,feature_weights);
+      forest_stats(forest,"  +LM forest",show_tree_structure,show_features,feature_weights,oracle.show_derivation);
     }
 
     maybe_prune(forest,conf,"beam_prune","density_prune","+LM",srclen);
@@ -650,7 +650,7 @@ int main(int argc, char** argv) {
     } else {
       if (kbest) {
         //TODO: does this work properly?
-        oracle.DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest,"");
+        oracle.DumpKBest(sent_id, forest, conf["k_best"].as<int>(), unique_kbest,"-");
       } else if (csplit_output_plf) {
         cout << HypergraphIO::AsPLF(forest, false) << endl;
       } else {
diff --git a/decoder/ff_factory.h b/decoder/ff_factory.h
index 12e768aa..93681c5e 100644
--- a/decoder/ff_factory.h
+++ b/decoder/ff_factory.h
@@ -1,7 +1,14 @@
 #ifndef _FF_FACTORY_H_
 #define _FF_FACTORY_H_
 
-//TODO: use http://www.boost.org/doc/libs/1_43_0/libs/functional/factory/doc/html/index.html?
+/*TODO: register state identity separately from feature function identity?  as
+ * in: string registry for name of state somewhere, assert that same result is
+ * computed by all users?  or, we can just require that ff sharing same state
+ * all be mashed into a single ffunc, which can just emit all the fid scores at
+ * once.  that's fine.
+ */
+
+//TODO: use http://www.boost.org/doc/libs/1_43_0/libs/functional/factory/doc/html/index.html ?
 
 #include <iostream>
 #include <string>
diff --git a/decoder/ff_lm_fsa.h b/decoder/ff_lm_fsa.h
index 344cd992..01b3764e 100755
--- a/decoder/ff_lm_fsa.h
+++ b/decoder/ff_lm_fsa.h
@@ -1,6 +1,8 @@
 #ifndef FF_LM_FSA_H
 #define FF_LM_FSA_H
 
+//TODO: use SRI LM::contextBOW, LM::contextID to shorten state
+
 #include "ff_lm.h"
 #include "ff_from_fsa.h"
 
diff --git a/decoder/ff_sample_fsa.h b/decoder/ff_sample_fsa.h
index 74d9e7b5..2aeaa6de 100755
--- a/decoder/ff_sample_fsa.h
+++ b/decoder/ff_sample_fsa.h
@@ -3,11 +3,11 @@
 
 #include "ff_from_fsa.h"
 
-// example: feature val = -1 * # of target words
+// example: feature val = 1 * # of target words
 struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
   static std::string usage(bool param,bool verbose) {
     return FeatureFunction::usage_helper(
-      "WordPenaltyFsa","","-1 per target word"
+      "WordPenaltyFsa","","1 per target word"
       ,param,verbose);
   }
 
@@ -21,7 +21,7 @@ struct WordPenaltyFsa : public FsaFeatureFunctionBase<WordPenaltyFsa> {
   }
   // move from state to next_state after seeing word x, while emitting features->add_value(fid,val) possibly with duplicates.  state and next_state may be same memory.
   Featval Scan1(WordID w,void const* state,void *next_state) const {
-    return -1;
+    return 1;
   }
 };
 
@@ -35,7 +35,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
     return FeatureFunction::usage_helper(
       "LongerThanPrev",
       "",
-      "stupid example stateful (bigram) feature: -1 per target word that's longer than the previous word (<s> sentence begin considered 3 chars long, </s> is sentence end.)",
+      "stupid example stateful (bigram) feature: 1 per target word that's longer than the previous word (<s> sentence begin considered 3 chars long, </s> is sentence end.)",
       param,verbose);
   }
 
@@ -49,7 +49,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
     return std::strlen(TD::Convert(w));
   }
   int markov_order() const { return 1; }
-  LongerThanPrev(std::string const& param) : Base(sizeof(int),singleton_sentence(TD::se)) {
+  LongerThanPrev(std::string const& param) : Base(sizeof(int)/* ,singleton_sentence(TD::se) */) {
     Init();
     if (0) { // all this is done in constructor already
       set_state_bytes(sizeof(int));
@@ -61,7 +61,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
       to_state(h_start.begin(),&ss,1);
     }
 
-    state(start.begin())=3;
+    state(start.begin())=999999;
     state(h_start.begin())=4; // estimate: anything >4 chars is usually longer than previous
 
   }
@@ -70,7 +70,7 @@ struct LongerThanPrev : public FsaFeatureFunctionBase<LongerThanPrev> {
     int prevlen=state(from);
     int len=wordlen(w);
     state(next_state)=len;
-    return len>prevlen ? -1 : 0;
+    return len>prevlen ? 1 : 0;
   }
 };
 
@@ -82,7 +82,7 @@ struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
     return FeatureFunction::usage_helper(
       "ShorterThanPrev",
       "",
-      "stupid example stateful (bigram) feature: -1 per target word that's shorter than the previous word (end of sentence considered '</s>')",
+      "stupid example stateful (bigram) feature: 1 per target word that's shorter than the previous word (end of sentence considered '</s>')",
       param,verbose);
   }
 
@@ -90,7 +90,7 @@ struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
     return std::strlen(TD::Convert(w));
   }
   ShorterThanPrev(std::string const& param)
-  : Base(3,4,singleton_sentence(TD::se))
+  : Base(-1,4/* ,singleton_sentence(TD::se) */)
     // start, h_start, end_phrase
     // estimate: anything <4 chars is usually shorter than previous
   {
@@ -106,7 +106,7 @@ struct ShorterThanPrev : FsaTypedBase<int,ShorterThanPrev> {
   void ScanT(SentenceMetadata const& /* smeta */,const Hypergraph::Edge& /* edge */,WordID w,int prevlen,int &len,FeatureVector *features) const {
     len=wordlen(w);
     if (len<prevlen)
-      features->add_value(fid_,-1);
+      features->add_value(fid_,1);
   }
 
 };
diff --git a/decoder/hg.h b/decoder/hg.h
index b3bfd19c..34638e04 100644
--- a/decoder/hg.h
+++ b/decoder/hg.h
@@ -1,12 +1,24 @@
 #ifndef _HG_H_
 #define _HG_H_
 
+#define USE_INFO_EDGE 1
+#if USE_INFO_EDGE
+# include <sstream>
+# define INFO_EDGE(e,msg) do { std::ostringstream &o=e.info_;o<<msg; } while(0)
+# define INFO_EDGEw(e,msg) do { std::ostringstream &o=e.info_;if (o.empty()) o<<' ';o<<msg; } while(0)
+#else
+# define INFO_EDGE(e,msg)
+# define INFO_EDGEw(e,msg)
+#endif
+#define INFO_EDGEln(e,msg) INFO_EDGE(e,msg<<'\n')
+
 #include <string>
 #include <vector>
 
 #include "feature_vector.h"
 #include "small_vector.h"
 #include "wordid.h"
+#include "tdict.h"
 #include "trule.h"
 #include "prob.h"
 
@@ -23,6 +35,7 @@ class Hypergraph {
 
   // SmallVector is a fast, small vector<int> implementation for sizes <= 2
   typedef SmallVectorInt TailNodeVector;
+  typedef std::vector<int> EdgesVector;
 
   // TODO get rid of cat_?
   // TODO keep cat_ and add span and/or state? :)
@@ -59,8 +72,76 @@ class Hypergraph {
     short int j_;
     short int prev_i_;
     short int prev_j_;
+#if USE_INFO_EDGE
+ private:
+    std::ostringstream info_;
+ public:
+    Edge(Edge const& o) : head_node_(o.head_node_),tail_nodes_(o.tail_nodes_),rule_(o.rule_),feature_values_(o.feature_values_),edge_prob_(o.edge_prob_),id_(o.id_),i_(o.i_),j_(o.j_),prev_i_(o.prev_i_),prev_j_(o.prev_j_), info_(o.info_.str()) {  }
+    void operator=(Edge const& o) {
+      head_node_ = o.head_node_; tail_nodes_ = o.tail_nodes_; rule_ = o.rule_; feature_values_ = o.feature_values_; edge_prob_ = o.edge_prob_; id_ = o.id_; i_ = o.i_; j_ = o.j_; prev_i_ = o.prev_i_; prev_j_ = o.prev_j_;  info_.str(o.info_.str());
+        }
+    std::string info() const { return info_.str(); }
+#else
+    std::string info() const { return std::string(); }
+#endif
+    void show(std::ostream &o,unsigned mask=SPAN|RULE) const {
+      o<<'{';
+      if (mask&CATEGORY)
+        o<<TD::Convert(rule_->GetLHS());
+      if (mask&PREV_SPAN)
+        o<<'<'<<prev_i_<<','<<prev_j_<<'>';
+      if (mask&SPAN)
+        o<<'<'<<i_<<','<<j_<<'>';
+      if (mask&PROB)
+        o<<" p="<<edge_prob_;
+      if (mask&FEATURES)
+        o<<" "<<feature_values_;
+      if (mask&RULE)
+        o<<rule_->AsString(mask&RULE_LHS);
+      if (USE_INFO_EDGE) {
+        if (mask) o << ' ';
+        o<<info();
+      }
+      o<<'}';
+    }
+    std::string show(unsigned mask=SPAN|RULE) const {
+      std::ostringstream o;
+      show(o,mask);
+      return o.str();
+    }
   };
 
+  Edge const* viterbi_edge(int node) const { // assumes sorting has best edge as first.  FIXME: check.  SortInEdgesByEdgeWeights appears to accomplish this
+    EdgesVector const& v=nodes_[node].in_edges_;
+    return v.empty() ? 0 : &edges_[v.front()];
+  }
+
+  enum {
+    NONE=0,CATEGORY=1,SPAN=2,PROB=4,FEATURES=8,RULE=16,RULE_LHS=32,PREV_SPAN=64,ALL=0xFFFFFFFF
+  };
+  std::string show_tree(Edge const& e, bool indent=true,unsigned show_mask=SPAN|RULE,int maxdepth=0x7FFFFFFF,int depth=0) const {
+    std::ostringstream o;
+    show_tree(o,e,show_mask,indent,maxdepth,depth);
+    return o.str();
+  }
+  void show_tree(std::ostream &o,Edge const& e,bool indent=true,unsigned show_mask=SPAN|RULE,int maxdepth=0x7FFFFFFF,int depth=0) const {
+    if (depth>maxdepth) return;
+    if (indent) for(int i=0;i<depth;++i) o<<' ';
+    o<<'(';
+    e.show(o,show_mask);
+    if (indent) o<<'\n';
+    for (int i=0;i<e.tail_nodes_.size();++i) {
+      Edge const *c=viterbi_edge(e.tail_nodes_[i]);
+      if (!c) continue;
+      assert(c); // leaf?
+      show_tree(o,*c,indent,show_mask,maxdepth,depth+1);
+      if (!indent) o<<' ';
+    }
+    if (indent) for(int i=0;i<depth;++i) o<<' ';
+    o<<")";
+    if (indent) o<<"\n";
+  }
+
   // returns edge with rule_.IsGoal, returns 0 if none found.  otherwise gives best edge_prob_ - note: I don't think edge_prob_ is viterbi cumulative, so this would just be the best local probability.
   Edge const* ViterbiGoalEdge() const;
 
diff --git a/decoder/oracle_bleu.h b/decoder/oracle_bleu.h
index 4dc86bc7..4a2cbbe5 100755
--- a/decoder/oracle_bleu.h
+++ b/decoder/oracle_bleu.h
@@ -94,6 +94,7 @@ struct OracleBleu {
       ("references,R", value<Refs >(&refs), "Translation reference files")
       ("oracle_loss", value<string>(&loss_name)->default_value("IBM_BLEU_3"), "IBM_BLEU_3 (default), IBM_BLEU etc")
       ("bleu_weight", value<double>(&bleu_weight)->default_value(1.), "weight to give the hope/fear loss function vs. model score")
+      ("show_derivation", bool_switch(&show_derivation), "show derivation tree in kbest")
       ("verbose",bool_switch(&verbose),"detailed logs")
       ;
   }
@@ -248,46 +249,48 @@ struct OracleBleu {
 //	dest_forest->SortInEdgesByEdgeWeights();
   }
 
-// TODO decoder output should probably be moved to another file - how about oracle_bleu.h
-  void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique, std::string const &kbest_out_filename_) {
+  bool show_derivation;
+  template <class Filter>
+  void kbest(int sent_id,Hypergraph const& forest,int k,std::ostream &kbest_out=std::cout,std::ostream &deriv_out=std::cerr) {
     using namespace std;
     using namespace boost;
-    cerr << "In kbest\n";
-
-    ofstream kbest_out;
-    kbest_out.open(kbest_out_filename_.c_str());
-    cerr << "Output kbest to " << kbest_out_filename_;
-
+    typedef KBest::KBestDerivations<Sentence, ESentenceTraversal,Filter> K;
+    K kbest(forest,k);
     //add length (f side) src length of this sentence to the psuedo-doc src length count
     float curr_src_length = doc_src_length + tmp_src_length;
-
-    if (unique) {
-      KBest::KBestDerivations<Sentence, ESentenceTraversal, KBest::FilterUnique> kbest(forest, k);
-      for (int i = 0; i < k; ++i) {
-        const KBest::KBestDerivations<Sentence, ESentenceTraversal, KBest::FilterUnique>::Derivation* d =
-          kbest.LazyKthBest(forest.nodes_.size() - 1, i);
-        if (!d) break;
-        //calculate score in context of psuedo-doc
+    for (int i = 0; i < k; ++i) {
+      typename K::Derivation *d = kbest.LazyKthBest(forest.nodes_.size() - 1, i);
+      if (!d) break;
+      kbest_out << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
+                << d->feature_values << " ||| " << log(d->score);
+      if (!refs.empty()) {
         ScoreP sentscore = GetScore(d->yield,sent_id);
         sentscore->PlusEquals(*doc_score,float(1));
         float bleu = curr_src_length * sentscore->ComputeScore();
-        kbest_out << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
-                  << d->feature_values << " ||| " << log(d->score) << " ||| " << bleu << endl;
-        // cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
-        //     << d->feature_values << " ||| " << log(d->score) << endl;
+        kbest_out << " ||| " << bleu;
       }
-    } else {
-      KBest::KBestDerivations<Sentence, ESentenceTraversal> kbest(forest, k);
-      for (int i = 0; i < k; ++i) {
-        const KBest::KBestDerivations<Sentence, ESentenceTraversal>::Derivation* d =
-          kbest.LazyKthBest(forest.nodes_.size() - 1, i);
-        if (!d) break;
-        cout << sent_id << " ||| " << TD::GetString(d->yield) << " ||| "
-             << d->feature_values << " ||| " << log(d->score) << endl;
+      kbest_out<<endl<<flush;
+      if (show_derivation) {
+        deriv_out<<"\nsent_id="<<sent_id<<"\n";
+        forest.show_tree(cerr,*d->edge);
+        deriv_out<<flush;
       }
     }
   }
 
+// TODO decoder output should probably be moved to another file - how about oracle_bleu.h
+  void DumpKBest(const int sent_id, const Hypergraph& forest, const int k, const bool unique, std::string const &kbest_out_filename_) {
+
+    WriteFile ko(kbest_out_filename_);
+    std::cerr << "Output kbest to " << kbest_out_filename_;
+
+    if (unique)
+      kbest<KBest::NoFilter>(sent_id,forest,k,ko.get(),std::cerr);
+    else {
+      kbest<KBest::FilterUnique>(sent_id,forest,k,ko.get(),std::cerr);
+    }
+  }
+
 void DumpKBest(std::string const& suffix,const int sent_id, const Hypergraph& forest, const int k, const bool unique, std::string const& forest_output)
   {
     std::ostringstream kbest_string_stream;
diff --git a/decoder/viterbi.cc b/decoder/viterbi.cc
index 7719de32..d0b7e6ec 100644
--- a/decoder/viterbi.cc
+++ b/decoder/viterbi.cc
@@ -6,7 +6,7 @@
 
 using namespace std;
 
-std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool estring, bool etree)
+std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool estring, bool etree,bool show_derivation)
 {
   ostringstream o;
   o << hg.stats(name);
@@ -19,6 +19,17 @@ std::string viterbi_stats(Hypergraph const& hg, std::string const& name, bool es
   if (etree) {
     o<<name<<"          tree: "<<ViterbiETree(hg)<<endl;
   }
+  if (show_derivation) {
+    o<<name<<"          derivation: ";
+    ViterbiPathTraversal::Result d;
+    Viterbi<ViterbiPathTraversal>(hg, &d);
+    if (d.empty())
+      o<<"(empty viterbi hyperpath - no translation)";
+    else
+      hg.show_tree(o,*d.back(),false); // last item should be goal (or at least depend on prev items).  TODO: this doesn't actually reorder the nodes in hg.
+    o<<endl;
+  }
+
   return o.str();
 }
 
diff --git a/decoder/viterbi.h b/decoder/viterbi.h
index 388bff3c..ea2bc6c2 100644
--- a/decoder/viterbi.h
+++ b/decoder/viterbi.h
@@ -6,7 +6,7 @@
 #include "hg.h"
 #include "tdict.h"
 
-std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest", bool estring=true, bool etree=false);
+std::string viterbi_stats(Hypergraph const& hg, std::string const& name="forest", bool estring=true, bool etree=false, bool derivation_tree=false);
 
 /// computes for each hg node the best (according to WeightType/WeightFunction) derivation, and some homomorphism (bottom up expression tree applied through Traversal) of it. T is the "return type" of Traversal, which is called only once for the best edge for a node's result (i.e. result will start default constructed)
 //TODO: make T a typename inside Traversal and WeightType a typename inside WeightFunction?
-- 
cgit v1.2.3