1 files changed, 286 insertions, 0 deletions
diff --git a/decoder/dwarf.h b/decoder/dwarf.h
new file mode 100644
index 00000000..83a0cae9
--- /dev/null
+++ b/decoder/dwarf.h
@@ -0,0 +1,286 @@
+#ifndef DWARF_H
+#define DWARF_H
+
+#include <cstdlib>
+#include <vector>
+#include <map>
+#include <string>
+#include <ostream>
+#include "wordid.h"
+#include "lattice.h"
+#include "trule.h"
+#include "tdict.h"
+#include <boost/functional/hash.hpp>
+#include <tr1/unordered_map>
+#include <boost/tuple/tuple.hpp>
+
+using namespace std;
+using namespace std::tr1;
+using namespace boost::tuples;
+using namespace boost;
+
+const static bool DEBUG = false;
+
+class CountTable {
+public:
+        int* ultimate;
+        map<WordID,int*> model;
+        int mode;
+        int numColumn;
+        const void print();
+        void setup(int _numcolumn, int _mode) {
+          mode = _mode; numColumn = _numcolumn;
+        }
+};
+
+class Alignment {
+/* Alignment represents an alignment object in a 2D format to support function word-based models calculation 
+
+   A note about model's parameter estimation:
+   ==========================================
+   The model is estimated as a two-level Dirichlet process. 
+   For orientation model, the first tier estimation is:
+   P(o|f,e) where *o* is the orientation value to estimate, *f* is the source function word aligned to *e* 
+   its second tier is: P(o|f), while its third tier is P(o)
+   For dominance model, the first tier estimation is:
+   P(d|f1,f2,e1,e2) where *d* is a dominance value to estimate, *f1,f2* are the neighboring function words on the source
+   aligned to *e1,e2* on the target side
+   its second tier is: P(d|f1,f2) while its third tier is P(d)
+    
+   Taking orientation model as a case in point, a two level estimation proceeds as follow:
+   P(o|f,e) = c(o,f,e) + alpha { c(o,f) + beta [ c (o) / c(.) ] }
+                                 ------------------------------
+                                 c(f)   + beta
+              -------------------------------------------------
+              c(f,e)   + alpha
+   where c() is a count function, alpha and beta are the concentration parameter 
+         of the first and second Dirichlet process respectively 
+   To encourage or penalize the use of second and third tier statistics, bo1 and bo2 binary features are introduced 
+*/
+public:
+  const static int MAX_WORDS = 200;  
+  const static int MINIMUM_INIT = 1000;
+  const static int MAXIMUM_INIT = -1000;
+  const static int MAX_ARITY = 2;
+  WordID kSOS;
+  WordID kEOS;
+  WordID kUNK;
+  double alpha_oris; // 1st concentration parameter for orientation model 
+  double beta_oris;  // 2nd concentration parameter for orientation model
+  double alpha_orit; // 1st concentration parameter for orientation model 
+  double beta_orit;  // 2nd concentration parameter for orientation model
+  double alpha_doms; // idem as above but for dominance model
+  double beta_doms;
+  double alpha_domt; // idem as above but for dominance model
+  double beta_domt;
+  
+  // ACCESS to alignment
+  void set(int j,int i);   // j is the source index, while i is the target index
+  void reset(int j,int i); // idem as above
+  inline bool at(int j, int i) { return _matrix[j][i]; };
+  inline int getJ() {return _J;}; // max source of the current alignment
+  inline int getI() {return _I;}; // max target of the current alignment
+  inline void setI(int I) { _I = I; };
+  inline void setJ(int J) { _J = J; };
+  inline void setF(vector<WordID> f) { _f=f;};
+  inline void setE(vector<WordID> e) { _e=e;};
+  inline WordID getF(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_f.size()) return TD::Convert("</s>"); return _f[id];};
+  inline WordID getE(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_e.size()) return TD::Convert("</s>"); return _e[id];};
+  void clearAls(int prevJ=200, int prevI=200);
+  int sourceOf(int i, int start = -1);
+  int targetOf(int j, int start = -1);
+  inline int minSSpan(int i) { return _sSpan[i][0];}
+  inline int maxSSpan(int i) { return _sSpan[i][1];}
+  inline int minTSpan(int j) { return _tSpan[j][0];}
+  inline int maxTSpan(int j) { return _tSpan[j][1];}
+  static inline int link(int s, int t) { return (s << 16) | t; }
+  static inline int source(int st) {return st >> 16; }
+  static inline int target(int st) {return st & 0xffff; }
+  inline void setAlphaOris(double val) { alpha_oris=val; }
+  inline void setAlphaOrit(double val) { alpha_orit=val; }
+  inline void setAlphaDoms(double val) { alpha_doms=val; }
+  inline void setAlphaDomt(double val) { alpha_domt=val; }
+  inline void setBetaOris(double val) { beta_oris=val; }
+  inline void setBetaOrit(double val) { beta_orit=val; }
+  inline void setBetaDoms(double val) { beta_doms=val; }
+  inline void setBetaDomt(double val) { beta_domt=val; }
+  inline void setFreqCutoff(int val) { cout << _freq_cutoff << " to " << val << endl;  _freq_cutoff=val; }
+  string AsString();
+  string AsStringSimple();
+  int* SOS();
+  int* EOS();
+
+  // Model related function  
+  Alignment();
+  // Given the current *rule* and its antecedents, construct an alignment space and mark the function word alignments 
+  // according *sfw* and *tfw*
+  bool prepare(TRule& rule, const std::vector<const void*>& ant_contexts, 
+               const map<WordID,int>& sfw, const map<WordID,int>& tfw, const Lattice& sourcelattice, int spanstart, int spanend);
+
+  // Compute orientation model score which parameters are stored in *table* and pass the values accordingly
+  // will call Orientation(Source|Target) and ScoreOrientation(Source|Target)
+  void computeOrientationSource(const CountTable& table, double *cost, double *bonus, double *bo1, 
+                                double *bo1_bonus, double *bo2, double *bo2_bonus);
+  void computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus,
+                double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
+  void computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus, double *bo1, 
+                                double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags);
+  void computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, double *bo1, 
+                                double *bo1_bonus, double *bo2, double *bo2_bonus);
+  void computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, double *bo1, 
+                                double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
+  void computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1, 
+                                double *bo1_bonus, double *bo2, double *bo2_bonus);
+  void computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1, 
+                                double *bo1_bonus, double *bo2, double *bo2_bonus);
+  // Get the orientation value of a function word at a particular index *fw*
+  // assign the value to either *oril* or *orir* accoring to *Lcompute* and *Rcompute*
+  void OrientationSource(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+  void OrientationSource(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+  int  OrientationSource(int* left, int* right);
+  void OrientationTarget(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+  void OrientationTarget(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+
+  vector<int> OrientationSourceLeft4Sampler(int fw0, int fw1);
+  vector<int> OrientationSourceLeft4Sampler(int fw);
+  vector<int> OrientationSourceRight4Sampler(int fw0, int fw1);
+  vector<int> OrientationSourceRight4Sampler(int fw);
+  vector<int> OrientationTargetLeft4Sampler(int fw0, int fw1);
+  vector<int> OrientationTargetLeft4Sampler(int fw);
+  vector<int> OrientationTargetRight4Sampler(int fw0, int fw1);
+  vector<int> OrientationTargetRight4Sampler(int fw);
+
+  // Given an orientation value *ori*, estimate the score accoding to *cond1*, *cond2* 
+  // and assign the value accordingly according to *isBonus* and whether the first or the second tier estimation
+  // is used or not
+  void ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2, 
+                             bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
+                             double *bo2, double *bo2_bonus, double alpha1, double beta1);
+  void ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond, 
+                            bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
+                            double *bo2, double *bo2_bonus, double alpha1, double beta1);
+  double ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2); 
+  double ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond); 
+  void ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2, 
+                             bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
+                             double *bo2, double *bo2_bonus, double alpha1, double beta1);
+  void ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond, 
+                            bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
+                            double *bo2, double *bo2_bonus, double alpha1, double beta1);
+  double ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2); 
+  double ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond); 
+  void ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2, 
+                            bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
+                            double *bo2, double *bo2_bonus, double alpha1, double beta1);
+  double ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2); 
+
+  // idem as above except these are for dominance model
+  void computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, 
+                              double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
+  void computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, 
+                              double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
+  void computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, 
+                              double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
+  void computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus, 
+        double *state_mono, double *state_nonmono,
+        TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
+  int DominanceSource(int fw1, int fw2);
+  int DominanceTarget(int fw1, int fw2);
+  vector<int> DominanceSource4Sampler(int fw1, int fw2);
+  vector<int> DominanceTarget4Sampler(int fw1, int fw2);
+  void ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2, 
+                      double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2);
+  double ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2); 
+
+  // Remove all function word alignments except those at the borders
+  // May result in more than two function word alignments at each side, because this function 
+  // will continue keeping function word alignments until the first aligned word at each side
+  void BorderingSFWsOnly();
+  void BorderingTFWsOnly();
+  void simplify(int *ret); // preparing the next state
+  void simplify_nofw(int *ret); // preparing the next state when no function word appears
+  // set the first part of the next state, which concerns with function word
+  // fas, las, fat, lat is the (f)irst or (l)ast function word alignments either on the (s)ource or (t)arget
+  // these parameters to anticipate cases where there are more than two function word alignments
+  void FillFWIdxsState(int *state, int fas, int las, int fat, int lat);
+
+  // Helper function to obtain the aligned words on the other side 
+  // WARNING!!! Only to be used if the als are in sync with either source or target sentences
+  WordID F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
+  WordID E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
+  // WARNING!!! Only to be used in dwarf_main.cc 
+  // These two function words assume that the alignment contains phrase boundary 
+  // but the source and target sentences do not
+  WordID F2EProjection(int idx, const string& delimiter=" ");
+  WordID E2FProjection(int idx, const string& delimiter=" ");
+  void SetCurrAlVector();
+  int* blockSource(int fw1, int fw2);
+  int* blockTarget(int fw1, int fw2);
+  void ToArrayInt(vector<int>* arr);
+  int* neighborLeft(int startidx, int endidx, bool* found);
+  int* neighborRight(int startidx, int endidx, bool* found);
+private:
+  // Hash to avoid redundancy
+  unordered_map<vector<int>, int, boost::hash<vector<int> > > oris_hash;
+  unordered_map<vector<int>, int, boost::hash<vector<int> > > orit_hash;
+  unordered_map<vector<int>, int, boost::hash<vector<int> > > doms_hash;
+  unordered_map<vector<int>, int, boost::hash<vector<int> > > domt_hash;
+  unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > simplify_hash;
+  unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > prepare_hash;
+ 
+  int _J; // effective source length;
+  int _I; // effective target length;
+  bool _matrix[MAX_WORDS][MAX_WORDS]; // true if aligned 
+  short _sSpan[MAX_WORDS][2]; //the source span of a target index; 0->min, 1->max
+  short _tSpan[MAX_WORDS][2]; //the target span of a source index; 0->min, 2->max
+  int _freq_cutoff;
+  int SourceFWRuleIdxs[40]; //the indexes of function words in the rule; 
+          // The following applies to all *FW*Idxs
+          // *FW*Idxs[0] = size
+          // *FW*Idxs[idx*3-2] = index in the alignment, where idx starts from 1 to size
+          // *FW*Idxs[idx*3-1] = source WordID
+          // *FW*Idxs[idx*3]   = target WordID
+  int SourceFWRuleAbsIdxs[40];
+  int TargetFWRuleIdxs[40]; //the indexes of function words in the rule; zeroth element is the count
+  int ** SourceFWAntsIdxs;  //the indexes of function words in antecedents
+  int ** SourceFWAntsAbsIdxs;
+  int ** TargetFWAntsIdxs;  //the indexes of function words in antecedents
+  int SourceRuleIdxs[40]; //the indexes of SOURCE tokens (zeroth element is the number of source tokens)
+        //>0 means terminal, -i means the i-th Xs
+  int TargetRuleIdxs[40]; //the indexes of TARGET tokens (zeroth element is the number of target tokens)
+  int ** SourceAntsIdxs;  //the array of indexes of a particular antecedent's SOURCE tokens
+  int ** TargetAntsIdxs;  //the array of indexes of a particular antecedent's TARGET tokens
+  int SourceFWIdxs[40];
+  int SourceFWAbsIdxs[40];
+  int TargetFWIdxs[40];
+  // *sort* and *quickSort* are used to sort *FW*Idxs
+  void sort(int* num);
+  void quickSort(int arr[], int top, int bottom);
+
+  // *block(Source|Target)* finds the minimum block that containts two indexes (fw1 and fw2)
+  inline int least(int i1, int i2) { return (i1<i2)?i1:i2; }
+  inline int most(int i1, int i2) { return (i1>i2)?i1:i2; }
+  void simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings);
+  // used in simplify to check whether an atomic block according to source function words is also atomic according
+  // to target function words as well, otherwise break it 
+  // the resulting blocks are added into *blocks*
+  int _Arity;
+  std::vector<WordID> _f; // the source sentence of the **current** rule (may not consistent with the current alignment)
+  std::vector<WordID> _e; // the target sentence of the **current** rule
+  int RuleAl[40];
+  int **AntsAl;
+  int firstSourceAligned(int start);
+  int firstTargetAligned(int start);
+  int lastSourceAligned(int end);
+  int lastTargetAligned(int end);
+  int fas, las, fat, lat; // first aligned source, last aligned source, first aligned target, last aligned target
+  bool MemberOf(int* FWIdxs, int pos1, int pos2); // whether FWIdxs contains pos1 and pos2 consecutively
+  // Convert the alignment to vector form, will be used for hashing purposes
+  vector<int> curr_al;
+  int GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
+  int GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
+  int GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
+  WordID generalize(WordID original, const map<WordID,WordID>& tags, bool pos=false);
+};
+
+#endif