summaryrefslogtreecommitdiff
path: root/decoder/dwarf.h
blob: 49d2a3b760adddc77f872db6c4c86f8e95913e14 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
#ifndef DWARF_H
#define DWARF_H

#include <cstdlib>
#include <vector>
#include <map>
#include <string>
#include <ostream>
#include "wordid.h"
#include "lattice.h"
#include "trule.h"
#include "tdict.h"
#include <boost/functional/hash.hpp>
#include <tr1/unordered_map>
#include <boost/tuple/tuple.hpp>

using namespace std;
using namespace std::tr1;
using namespace boost::tuples;
using namespace boost;

const static bool DEBUG = false;

class CountTable {
public:
        int* ultimate;
        map<WordID,int*> model;
        int mode;
        int numColumn;
        void print() const;
        void setup(int _numcolumn, int _mode) {
          mode = _mode; numColumn = _numcolumn;
        }
};

class Alignment {
/* Alignment represents an alignment object in a 2D format to support function word-based models calculation 

   A note about model's parameter estimation:
   ==========================================
   The model is estimated as a two-level Dirichlet process. 
   For orientation model, the first tier estimation is:
   P(o|f,e) where *o* is the orientation value to estimate, *f* is the source function word aligned to *e* 
   its second tier is: P(o|f), while its third tier is P(o)
   For dominance model, the first tier estimation is:
   P(d|f1,f2,e1,e2) where *d* is a dominance value to estimate, *f1,f2* are the neighboring function words on the source
   aligned to *e1,e2* on the target side
   its second tier is: P(d|f1,f2) while its third tier is P(d)
    
   Taking orientation model as a case in point, a two level estimation proceeds as follow:
   P(o|f,e) = c(o,f,e) + alpha { c(o,f) + beta [ c (o) / c(.) ] }
                                 ------------------------------
                                 c(f)   + beta
              -------------------------------------------------
              c(f,e)   + alpha
   where c() is a count function, alpha and beta are the concentration parameter 
         of the first and second Dirichlet process respectively 
   To encourage or penalize the use of second and third tier statistics, bo1 and bo2 binary features are introduced 
*/
public:
  const static int MAX_WORDS = 200;  
  const static int MINIMUM_INIT = 1000;
  const static int MAXIMUM_INIT = -1000;
  const static int MAX_ARITY = 2;
  WordID kSOS;
  WordID kEOS;
  WordID kUNK;
  double alpha_oris; // 1st concentration parameter for orientation model 
  double beta_oris;  // 2nd concentration parameter for orientation model
  double alpha_orit; // 1st concentration parameter for orientation model 
  double beta_orit;  // 2nd concentration parameter for orientation model
  double alpha_doms; // idem as above but for dominance model
  double beta_doms;
  double alpha_domt; // idem as above but for dominance model
  double beta_domt;
  
  // ACCESS to alignment
  void set(int j,int i);   // j is the source index, while i is the target index
  void reset(int j,int i); // idem as above
  inline bool at(int j, int i) { return _matrix[j][i]; };
  inline int getJ() {return _J;}; // max source of the current alignment
  inline int getI() {return _I;}; // max target of the current alignment
  inline void setI(int I) { _I = I; };
  inline void setJ(int J) { _J = J; };
  inline void setF(vector<WordID> f) { _f=f;};
  inline void setE(vector<WordID> e) { _e=e;};
  inline WordID getF(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_f.size()) return TD::Convert("</s>"); return _f[id];};
  inline WordID getE(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_e.size()) return TD::Convert("</s>"); return _e[id];};
  void clearAls(int prevJ=200, int prevI=200);
  int sourceOf(int i, int start = -1);
  int targetOf(int j, int start = -1);
  inline int minSSpan(int i) { return _sSpan[i][0];}
  inline int maxSSpan(int i) { return _sSpan[i][1];}
  inline int minTSpan(int j) { return _tSpan[j][0];}
  inline int maxTSpan(int j) { return _tSpan[j][1];}
  static inline int link(int s, int t) { return (s << 16) | t; }
  static inline int source(int st) {return st >> 16; }
  static inline int target(int st) {return st & 0xffff; }
  inline void setAlphaOris(double val) { alpha_oris=val; }
  inline void setAlphaOrit(double val) { alpha_orit=val; }
  inline void setAlphaDoms(double val) { alpha_doms=val; }
  inline void setAlphaDomt(double val) { alpha_domt=val; }
  inline void setBetaOris(double val) { beta_oris=val; }
  inline void setBetaOrit(double val) { beta_orit=val; }
  inline void setBetaDoms(double val) { beta_doms=val; }
  inline void setBetaDomt(double val) { beta_domt=val; }
  inline void setFreqCutoff(int val) { cout << _freq_cutoff << " to " << val << endl;  _freq_cutoff=val; }
  string AsString();
  string AsStringSimple();
  int* SOS();
  int* EOS();

  // Model related function  
  Alignment();
  // Given the current *rule* and its antecedents, construct an alignment space and mark the function word alignments 
  // according *sfw* and *tfw*
  bool prepare(TRule& rule, const std::vector<const void*>& ant_contexts, 
               const map<WordID,int>& sfw, const map<WordID,int>& tfw, const Lattice& sourcelattice, int spanstart, int spanend);

  // Compute orientation model score which parameters are stored in *table* and pass the values accordingly
  // will call Orientation(Source|Target) and ScoreOrientation(Source|Target)
  void computeOrientationSource(const CountTable& table, double *cost, double *bonus, double *bo1, 
                                double *bo1_bonus, double *bo2, double *bo2_bonus);
  void computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus,
                double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
  void computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus, double *bo1, 
                                double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags);
  void computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, double *bo1, 
                                double *bo1_bonus, double *bo2, double *bo2_bonus);
  void computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, double *bo1, 
                                double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
  void computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1, 
                                double *bo1_bonus, double *bo2, double *bo2_bonus);
  void computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1, 
                                double *bo1_bonus, double *bo2, double *bo2_bonus);
  // Get the orientation value of a function word at a particular index *fw*
  // assign the value to either *oril* or *orir* accoring to *Lcompute* and *Rcompute*
  void OrientationSource(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
  void OrientationSource(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
  int  OrientationSource(int* left, int* right);
  void OrientationTarget(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
  void OrientationTarget(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);

  vector<int> OrientationSourceLeft4Sampler(int fw0, int fw1);
  vector<int> OrientationSourceLeft4Sampler(int fw);
  vector<int> OrientationSourceRight4Sampler(int fw0, int fw1);
  vector<int> OrientationSourceRight4Sampler(int fw);
  vector<int> OrientationTargetLeft4Sampler(int fw0, int fw1);
  vector<int> OrientationTargetLeft4Sampler(int fw);
  vector<int> OrientationTargetRight4Sampler(int fw0, int fw1);
  vector<int> OrientationTargetRight4Sampler(int fw);

  // Given an orientation value *ori*, estimate the score accoding to *cond1*, *cond2* 
  // and assign the value accordingly according to *isBonus* and whether the first or the second tier estimation
  // is used or not
  void ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2, 
                             bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
                             double *bo2, double *bo2_bonus, double alpha1, double beta1);
  void ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond, 
                            bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
                            double *bo2, double *bo2_bonus, double alpha1, double beta1);
  double ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2); 
  double ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond); 
  void ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2, 
                             bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
                             double *bo2, double *bo2_bonus, double alpha1, double beta1);
  void ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond, 
                            bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
                            double *bo2, double *bo2_bonus, double alpha1, double beta1);
  double ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2); 
  double ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond); 
  void ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2, 
                            bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, 
                            double *bo2, double *bo2_bonus, double alpha1, double beta1);
  double ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2); 

  // idem as above except these are for dominance model
  void computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, 
                              double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
  void computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, 
                              double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
  void computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, 
                              double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
  void computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus, 
        double *state_mono, double *state_nonmono,
        TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
  int DominanceSource(int fw1, int fw2);
  int DominanceTarget(int fw1, int fw2);
  vector<int> DominanceSource4Sampler(int fw1, int fw2);
  vector<int> DominanceTarget4Sampler(int fw1, int fw2);
  void ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2, 
                      double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2);
  double ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2); 

  // Remove all function word alignments except those at the borders
  // May result in more than two function word alignments at each side, because this function 
  // will continue keeping function word alignments until the first aligned word at each side
  void BorderingSFWsOnly();
  void BorderingTFWsOnly();
  void simplify(int *ret); // preparing the next state
  void simplify_nofw(int *ret); // preparing the next state when no function word appears
  // set the first part of the next state, which concerns with function word
  // fas, las, fat, lat is the (f)irst or (l)ast function word alignments either on the (s)ource or (t)arget
  // these parameters to anticipate cases where there are more than two function word alignments
  void FillFWIdxsState(int *state, int fas, int las, int fat, int lat);

  // Helper function to obtain the aligned words on the other side 
  // WARNING!!! Only to be used if the als are in sync with either source or target sentences
  WordID F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
  WordID E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
  // WARNING!!! Only to be used in dwarf_main.cc 
  // These two function words assume that the alignment contains phrase boundary 
  // but the source and target sentences do not
  WordID F2EProjection(int idx, const string& delimiter=" ");
  WordID E2FProjection(int idx, const string& delimiter=" ");
  void SetCurrAlVector();
  int* blockSource(int fw1, int fw2);
  int* blockTarget(int fw1, int fw2);
  void ToArrayInt(vector<int>* arr);
  int* neighborLeft(int startidx, int endidx, bool* found);
  int* neighborRight(int startidx, int endidx, bool* found);
private:
  // Hash to avoid redundancy
  unordered_map<vector<int>, int, boost::hash<vector<int> > > oris_hash;
  unordered_map<vector<int>, int, boost::hash<vector<int> > > orit_hash;
  unordered_map<vector<int>, int, boost::hash<vector<int> > > doms_hash;
  unordered_map<vector<int>, int, boost::hash<vector<int> > > domt_hash;
  unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > simplify_hash;
  unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > prepare_hash;
 
  int _J; // effective source length;
  int _I; // effective target length;
  bool _matrix[MAX_WORDS][MAX_WORDS]; // true if aligned 
  short _sSpan[MAX_WORDS][2]; //the source span of a target index; 0->min, 1->max
  short _tSpan[MAX_WORDS][2]; //the target span of a source index; 0->min, 2->max
  int _freq_cutoff;
  int SourceFWRuleIdxs[40]; //the indexes of function words in the rule; 
          // The following applies to all *FW*Idxs
          // *FW*Idxs[0] = size
          // *FW*Idxs[idx*3-2] = index in the alignment, where idx starts from 1 to size
          // *FW*Idxs[idx*3-1] = source WordID
          // *FW*Idxs[idx*3]   = target WordID
  int SourceFWRuleAbsIdxs[40];
  int TargetFWRuleIdxs[40]; //the indexes of function words in the rule; zeroth element is the count
  int ** SourceFWAntsIdxs;  //the indexes of function words in antecedents
  int ** SourceFWAntsAbsIdxs;
  int ** TargetFWAntsIdxs;  //the indexes of function words in antecedents
  int SourceRuleIdxs[40]; //the indexes of SOURCE tokens (zeroth element is the number of source tokens)
        //>0 means terminal, -i means the i-th Xs
  int TargetRuleIdxs[40]; //the indexes of TARGET tokens (zeroth element is the number of target tokens)
  int ** SourceAntsIdxs;  //the array of indexes of a particular antecedent's SOURCE tokens
  int ** TargetAntsIdxs;  //the array of indexes of a particular antecedent's TARGET tokens
  int SourceFWIdxs[40];
  int SourceFWAbsIdxs[40];
  int TargetFWIdxs[40];
  // *sort* and *quickSort* are used to sort *FW*Idxs
  void sort(int* num);
  void quickSort(int arr[], int top, int bottom);

  // *block(Source|Target)* finds the minimum block that containts two indexes (fw1 and fw2)
  inline int least(int i1, int i2) { return (i1<i2)?i1:i2; }
  inline int most(int i1, int i2) { return (i1>i2)?i1:i2; }
  void simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings);
  // used in simplify to check whether an atomic block according to source function words is also atomic according
  // to target function words as well, otherwise break it 
  // the resulting blocks are added into *blocks*
  int _Arity;
  std::vector<WordID> _f; // the source sentence of the **current** rule (may not consistent with the current alignment)
  std::vector<WordID> _e; // the target sentence of the **current** rule
  int RuleAl[40];
  int **AntsAl;
  int firstSourceAligned(int start);
  int firstTargetAligned(int start);
  int lastSourceAligned(int end);
  int lastTargetAligned(int end);
  int fas, las, fat, lat; // first aligned source, last aligned source, first aligned target, last aligned target
  bool MemberOf(int* FWIdxs, int pos1, int pos2); // whether FWIdxs contains pos1 and pos2 consecutively
  // Convert the alignment to vector form, will be used for hashing purposes
  vector<int> curr_al;
  int GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
  int GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
  int GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
  WordID generalize(WordID original, const map<WordID,WordID>& tags, bool pos=false);
};

#endif