summaryrefslogtreecommitdiff
path: root/decoder/dwarf.h
diff options
context:
space:
mode:
Diffstat (limited to 'decoder/dwarf.h')
-rw-r--r--decoder/dwarf.h286
1 files changed, 286 insertions, 0 deletions
diff --git a/decoder/dwarf.h b/decoder/dwarf.h
new file mode 100644
index 00000000..83a0cae9
--- /dev/null
+++ b/decoder/dwarf.h
@@ -0,0 +1,286 @@
+#ifndef DWARF_H
+#define DWARF_H
+
+#include <cstdlib>
+#include <vector>
+#include <map>
+#include <string>
+#include <ostream>
+#include "wordid.h"
+#include "lattice.h"
+#include "trule.h"
+#include "tdict.h"
+#include <boost/functional/hash.hpp>
+#include <tr1/unordered_map>
+#include <boost/tuple/tuple.hpp>
+
+using namespace std;
+using namespace std::tr1;
+using namespace boost::tuples;
+using namespace boost;
+
+const static bool DEBUG = false;
+
+class CountTable {
+public:
+ int* ultimate;
+ map<WordID,int*> model;
+ int mode;
+ int numColumn;
+ const void print();
+ void setup(int _numcolumn, int _mode) {
+ mode = _mode; numColumn = _numcolumn;
+ }
+};
+
+class Alignment {
+/* Alignment represents an alignment object in a 2D format to support function word-based models calculation
+
+ A note about model's parameter estimation:
+ ==========================================
+ The model is estimated as a two-level Dirichlet process.
+ For orientation model, the first tier estimation is:
+ P(o|f,e) where *o* is the orientation value to estimate, *f* is the source function word aligned to *e*
+ its second tier is: P(o|f), while its third tier is P(o)
+ For dominance model, the first tier estimation is:
+ P(d|f1,f2,e1,e2) where *d* is a dominance value to estimate, *f1,f2* are the neighboring function words on the source
+ aligned to *e1,e2* on the target side
+ its second tier is: P(d|f1,f2) while its third tier is P(d)
+
+ Taking orientation model as a case in point, a two level estimation proceeds as follow:
+ P(o|f,e) = c(o,f,e) + alpha { c(o,f) + beta [ c (o) / c(.) ] }
+ ------------------------------
+ c(f) + beta
+ -------------------------------------------------
+ c(f,e) + alpha
+ where c() is a count function, alpha and beta are the concentration parameter
+ of the first and second Dirichlet process respectively
+ To encourage or penalize the use of second and third tier statistics, bo1 and bo2 binary features are introduced
+*/
+public:
+ const static int MAX_WORDS = 200;
+ const static int MINIMUM_INIT = 1000;
+ const static int MAXIMUM_INIT = -1000;
+ const static int MAX_ARITY = 2;
+ WordID kSOS;
+ WordID kEOS;
+ WordID kUNK;
+ double alpha_oris; // 1st concentration parameter for orientation model
+ double beta_oris; // 2nd concentration parameter for orientation model
+ double alpha_orit; // 1st concentration parameter for orientation model
+ double beta_orit; // 2nd concentration parameter for orientation model
+ double alpha_doms; // idem as above but for dominance model
+ double beta_doms;
+ double alpha_domt; // idem as above but for dominance model
+ double beta_domt;
+
+ // ACCESS to alignment
+ void set(int j,int i); // j is the source index, while i is the target index
+ void reset(int j,int i); // idem as above
+ inline bool at(int j, int i) { return _matrix[j][i]; };
+ inline int getJ() {return _J;}; // max source of the current alignment
+ inline int getI() {return _I;}; // max target of the current alignment
+ inline void setI(int I) { _I = I; };
+ inline void setJ(int J) { _J = J; };
+ inline void setF(vector<WordID> f) { _f=f;};
+ inline void setE(vector<WordID> e) { _e=e;};
+ inline WordID getF(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_f.size()) return TD::Convert("</s>"); return _f[id];};
+ inline WordID getE(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_e.size()) return TD::Convert("</s>"); return _e[id];};
+ void clearAls(int prevJ=200, int prevI=200);
+ int sourceOf(int i, int start = -1);
+ int targetOf(int j, int start = -1);
+ inline int minSSpan(int i) { return _sSpan[i][0];}
+ inline int maxSSpan(int i) { return _sSpan[i][1];}
+ inline int minTSpan(int j) { return _tSpan[j][0];}
+ inline int maxTSpan(int j) { return _tSpan[j][1];}
+ static inline int link(int s, int t) { return (s << 16) | t; }
+ static inline int source(int st) {return st >> 16; }
+ static inline int target(int st) {return st & 0xffff; }
+ inline void setAlphaOris(double val) { alpha_oris=val; }
+ inline void setAlphaOrit(double val) { alpha_orit=val; }
+ inline void setAlphaDoms(double val) { alpha_doms=val; }
+ inline void setAlphaDomt(double val) { alpha_domt=val; }
+ inline void setBetaOris(double val) { beta_oris=val; }
+ inline void setBetaOrit(double val) { beta_orit=val; }
+ inline void setBetaDoms(double val) { beta_doms=val; }
+ inline void setBetaDomt(double val) { beta_domt=val; }
+ inline void setFreqCutoff(int val) { cout << _freq_cutoff << " to " << val << endl; _freq_cutoff=val; }
+ string AsString();
+ string AsStringSimple();
+ int* SOS();
+ int* EOS();
+
+ // Model related function
+ Alignment();
+ // Given the current *rule* and its antecedents, construct an alignment space and mark the function word alignments
+ // according *sfw* and *tfw*
+ bool prepare(TRule& rule, const std::vector<const void*>& ant_contexts,
+ const map<WordID,int>& sfw, const map<WordID,int>& tfw, const Lattice& sourcelattice, int spanstart, int spanend);
+
+ // Compute orientation model score which parameters are stored in *table* and pass the values accordingly
+ // will call Orientation(Source|Target) and ScoreOrientation(Source|Target)
+ void computeOrientationSource(const CountTable& table, double *cost, double *bonus, double *bo1,
+ double *bo1_bonus, double *bo2, double *bo2_bonus);
+ void computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus,
+ double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
+ void computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus, double *bo1,
+ double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags);
+ void computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, double *bo1,
+ double *bo1_bonus, double *bo2, double *bo2_bonus);
+ void computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, double *bo1,
+ double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
+ void computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1,
+ double *bo1_bonus, double *bo2, double *bo2_bonus);
+ void computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1,
+ double *bo1_bonus, double *bo2, double *bo2_bonus);
+ // Get the orientation value of a function word at a particular index *fw*
+ // assign the value to either *oril* or *orir* accoring to *Lcompute* and *Rcompute*
+ void OrientationSource(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+ void OrientationSource(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+ int OrientationSource(int* left, int* right);
+ void OrientationTarget(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+ void OrientationTarget(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
+
+ vector<int> OrientationSourceLeft4Sampler(int fw0, int fw1);
+ vector<int> OrientationSourceLeft4Sampler(int fw);
+ vector<int> OrientationSourceRight4Sampler(int fw0, int fw1);
+ vector<int> OrientationSourceRight4Sampler(int fw);
+ vector<int> OrientationTargetLeft4Sampler(int fw0, int fw1);
+ vector<int> OrientationTargetLeft4Sampler(int fw);
+ vector<int> OrientationTargetRight4Sampler(int fw0, int fw1);
+ vector<int> OrientationTargetRight4Sampler(int fw);
+
+ // Given an orientation value *ori*, estimate the score accoding to *cond1*, *cond2*
+ // and assign the value accordingly according to *isBonus* and whether the first or the second tier estimation
+ // is used or not
+ void ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2,
+ bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
+ double *bo2, double *bo2_bonus, double alpha1, double beta1);
+ void ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond,
+ bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
+ double *bo2, double *bo2_bonus, double alpha1, double beta1);
+ double ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2);
+ double ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond);
+ void ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2,
+ bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
+ double *bo2, double *bo2_bonus, double alpha1, double beta1);
+ void ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond,
+ bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
+ double *bo2, double *bo2_bonus, double alpha1, double beta1);
+ double ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2);
+ double ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond);
+ void ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2,
+ bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
+ double *bo2, double *bo2_bonus, double alpha1, double beta1);
+ double ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2);
+
+ // idem as above except these are for dominance model
+ void computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus,
+ double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
+ void computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus,
+ double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
+ void computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus,
+ double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
+ void computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus,
+ double *state_mono, double *state_nonmono,
+ TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
+ int DominanceSource(int fw1, int fw2);
+ int DominanceTarget(int fw1, int fw2);
+ vector<int> DominanceSource4Sampler(int fw1, int fw2);
+ vector<int> DominanceTarget4Sampler(int fw1, int fw2);
+ void ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2,
+ double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2);
+ double ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2);
+
+ // Remove all function word alignments except those at the borders
+ // May result in more than two function word alignments at each side, because this function
+ // will continue keeping function word alignments until the first aligned word at each side
+ void BorderingSFWsOnly();
+ void BorderingTFWsOnly();
+ void simplify(int *ret); // preparing the next state
+ void simplify_nofw(int *ret); // preparing the next state when no function word appears
+ // set the first part of the next state, which concerns with function word
+ // fas, las, fat, lat is the (f)irst or (l)ast function word alignments either on the (s)ource or (t)arget
+ // these parameters to anticipate cases where there are more than two function word alignments
+ void FillFWIdxsState(int *state, int fas, int las, int fat, int lat);
+
+ // Helper function to obtain the aligned words on the other side
+ // WARNING!!! Only to be used if the als are in sync with either source or target sentences
+ WordID F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
+ WordID E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
+ // WARNING!!! Only to be used in dwarf_main.cc
+ // These two function words assume that the alignment contains phrase boundary
+ // but the source and target sentences do not
+ WordID F2EProjection(int idx, const string& delimiter=" ");
+ WordID E2FProjection(int idx, const string& delimiter=" ");
+ void SetCurrAlVector();
+ int* blockSource(int fw1, int fw2);
+ int* blockTarget(int fw1, int fw2);
+ void ToArrayInt(vector<int>* arr);
+ int* neighborLeft(int startidx, int endidx, bool* found);
+ int* neighborRight(int startidx, int endidx, bool* found);
+private:
+ // Hash to avoid redundancy
+ unordered_map<vector<int>, int, boost::hash<vector<int> > > oris_hash;
+ unordered_map<vector<int>, int, boost::hash<vector<int> > > orit_hash;
+ unordered_map<vector<int>, int, boost::hash<vector<int> > > doms_hash;
+ unordered_map<vector<int>, int, boost::hash<vector<int> > > domt_hash;
+ unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > simplify_hash;
+ unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > prepare_hash;
+
+ int _J; // effective source length;
+ int _I; // effective target length;
+ bool _matrix[MAX_WORDS][MAX_WORDS]; // true if aligned
+ short _sSpan[MAX_WORDS][2]; //the source span of a target index; 0->min, 1->max
+ short _tSpan[MAX_WORDS][2]; //the target span of a source index; 0->min, 2->max
+ int _freq_cutoff;
+ int SourceFWRuleIdxs[40]; //the indexes of function words in the rule;
+ // The following applies to all *FW*Idxs
+ // *FW*Idxs[0] = size
+ // *FW*Idxs[idx*3-2] = index in the alignment, where idx starts from 1 to size
+ // *FW*Idxs[idx*3-1] = source WordID
+ // *FW*Idxs[idx*3] = target WordID
+ int SourceFWRuleAbsIdxs[40];
+ int TargetFWRuleIdxs[40]; //the indexes of function words in the rule; zeroth element is the count
+ int ** SourceFWAntsIdxs; //the indexes of function words in antecedents
+ int ** SourceFWAntsAbsIdxs;
+ int ** TargetFWAntsIdxs; //the indexes of function words in antecedents
+ int SourceRuleIdxs[40]; //the indexes of SOURCE tokens (zeroth element is the number of source tokens)
+ //>0 means terminal, -i means the i-th Xs
+ int TargetRuleIdxs[40]; //the indexes of TARGET tokens (zeroth element is the number of target tokens)
+ int ** SourceAntsIdxs; //the array of indexes of a particular antecedent's SOURCE tokens
+ int ** TargetAntsIdxs; //the array of indexes of a particular antecedent's TARGET tokens
+ int SourceFWIdxs[40];
+ int SourceFWAbsIdxs[40];
+ int TargetFWIdxs[40];
+ // *sort* and *quickSort* are used to sort *FW*Idxs
+ void sort(int* num);
+ void quickSort(int arr[], int top, int bottom);
+
+ // *block(Source|Target)* finds the minimum block that containts two indexes (fw1 and fw2)
+ inline int least(int i1, int i2) { return (i1<i2)?i1:i2; }
+ inline int most(int i1, int i2) { return (i1>i2)?i1:i2; }
+ void simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings);
+ // used in simplify to check whether an atomic block according to source function words is also atomic according
+ // to target function words as well, otherwise break it
+ // the resulting blocks are added into *blocks*
+ int _Arity;
+ std::vector<WordID> _f; // the source sentence of the **current** rule (may not consistent with the current alignment)
+ std::vector<WordID> _e; // the target sentence of the **current** rule
+ int RuleAl[40];
+ int **AntsAl;
+ int firstSourceAligned(int start);
+ int firstTargetAligned(int start);
+ int lastSourceAligned(int end);
+ int lastTargetAligned(int end);
+ int fas, las, fat, lat; // first aligned source, last aligned source, first aligned target, last aligned target
+ bool MemberOf(int* FWIdxs, int pos1, int pos2); // whether FWIdxs contains pos1 and pos2 consecutively
+ // Convert the alignment to vector form, will be used for hashing purposes
+ vector<int> curr_al;
+ int GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
+ int GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
+ int GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
+ WordID generalize(WordID original, const map<WordID,WordID>& tags, bool pos=false);
+};
+
+#endif