#ifndef DWARF_H #define DWARF_H #include #include #include #include #include #include "wordid.h" #include "lattice.h" #include "trule.h" #include "tdict.h" #include #include #include using namespace std; using namespace std::tr1; using namespace boost::tuples; using namespace boost; const static bool DEBUG = false; class CountTable { public: int* ultimate; map model; int mode; int numColumn; const void print(); void setup(int _numcolumn, int _mode) { mode = _mode; numColumn = _numcolumn; } }; class Alignment { /* Alignment represents an alignment object in a 2D format to support function word-based models calculation A note about model's parameter estimation: ========================================== The model is estimated as a two-level Dirichlet process. For orientation model, the first tier estimation is: P(o|f,e) where *o* is the orientation value to estimate, *f* is the source function word aligned to *e* its second tier is: P(o|f), while its third tier is P(o) For dominance model, the first tier estimation is: P(d|f1,f2,e1,e2) where *d* is a dominance value to estimate, *f1,f2* are the neighboring function words on the source aligned to *e1,e2* on the target side its second tier is: P(d|f1,f2) while its third tier is P(d) Taking orientation model as a case in point, a two level estimation proceeds as follow: P(o|f,e) = c(o,f,e) + alpha { c(o,f) + beta [ c (o) / c(.) ] } ------------------------------ c(f) + beta ------------------------------------------------- c(f,e) + alpha where c() is a count function, alpha and beta are the concentration parameter of the first and second Dirichlet process respectively To encourage or penalize the use of second and third tier statistics, bo1 and bo2 binary features are introduced */ public: const static int MAX_WORDS = 200; const static int MINIMUM_INIT = 1000; const static int MAXIMUM_INIT = -1000; const static int MAX_ARITY = 2; WordID kSOS; WordID kEOS; WordID kUNK; double alpha_oris; // 1st concentration parameter for orientation model double beta_oris; // 2nd concentration parameter for orientation model double alpha_orit; // 1st concentration parameter for orientation model double beta_orit; // 2nd concentration parameter for orientation model double alpha_doms; // idem as above but for dominance model double beta_doms; double alpha_domt; // idem as above but for dominance model double beta_domt; // ACCESS to alignment void set(int j,int i); // j is the source index, while i is the target index void reset(int j,int i); // idem as above inline bool at(int j, int i) { return _matrix[j][i]; }; inline int getJ() {return _J;}; // max source of the current alignment inline int getI() {return _I;}; // max target of the current alignment inline void setI(int I) { _I = I; }; inline void setJ(int J) { _J = J; }; inline void setF(vector f) { _f=f;}; inline void setE(vector e) { _e=e;}; inline WordID getF(int id) { if (id<0) return TD::Convert(""); if (id>=_f.size()) return TD::Convert(""); return _f[id];}; inline WordID getE(int id) { if (id<0) return TD::Convert(""); if (id>=_e.size()) return TD::Convert(""); return _e[id];}; void clearAls(int prevJ=200, int prevI=200); int sourceOf(int i, int start = -1); int targetOf(int j, int start = -1); inline int minSSpan(int i) { return _sSpan[i][0];} inline int maxSSpan(int i) { return _sSpan[i][1];} inline int minTSpan(int j) { return _tSpan[j][0];} inline int maxTSpan(int j) { return _tSpan[j][1];} static inline int link(int s, int t) { return (s << 16) | t; } static inline int source(int st) {return st >> 16; } static inline int target(int st) {return st & 0xffff; } inline void setAlphaOris(double val) { alpha_oris=val; } inline void setAlphaOrit(double val) { alpha_orit=val; } inline void setAlphaDoms(double val) { alpha_doms=val; } inline void setAlphaDomt(double val) { alpha_domt=val; } inline void setBetaOris(double val) { beta_oris=val; } inline void setBetaOrit(double val) { beta_orit=val; } inline void setBetaDoms(double val) { beta_doms=val; } inline void setBetaDomt(double val) { beta_domt=val; } inline void setFreqCutoff(int val) { cout << _freq_cutoff << " to " << val << endl; _freq_cutoff=val; } string AsString(); string AsStringSimple(); int* SOS(); int* EOS(); // Model related function Alignment(); // Given the current *rule* and its antecedents, construct an alignment space and mark the function word alignments // according *sfw* and *tfw* bool prepare(TRule& rule, const std::vector& ant_contexts, const map& sfw, const map& tfw, const Lattice& sourcelattice, int spanstart, int spanend); // Compute orientation model score which parameters are stored in *table* and pass the values accordingly // will call Orientation(Source|Target) and ScoreOrientation(Source|Target) void computeOrientationSource(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); void computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); void computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, const map& tags); void computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); void computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); void computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); void computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); // Get the orientation value of a function word at a particular index *fw* // assign the value to either *oril* or *orir* accoring to *Lcompute* and *Rcompute* void OrientationSource(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); void OrientationSource(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); int OrientationSource(int* left, int* right); void OrientationTarget(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); void OrientationTarget(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); vector OrientationSourceLeft4Sampler(int fw0, int fw1); vector OrientationSourceLeft4Sampler(int fw); vector OrientationSourceRight4Sampler(int fw0, int fw1); vector OrientationSourceRight4Sampler(int fw); vector OrientationTargetLeft4Sampler(int fw0, int fw1); vector OrientationTargetLeft4Sampler(int fw); vector OrientationTargetRight4Sampler(int fw0, int fw1); vector OrientationTargetRight4Sampler(int fw); // Given an orientation value *ori*, estimate the score accoding to *cond1*, *cond2* // and assign the value accordingly according to *isBonus* and whether the first or the second tier estimation // is used or not void ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2, bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1); void ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond, bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1); double ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2); double ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond); void ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2, bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1); void ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond, bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1); double ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2); double ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond); void ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2, bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1); double ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2); // idem as above except these are for dominance model void computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); void computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); void computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); void computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus, double *state_mono, double *state_nonmono, TRule &rule, const std::vector& ant_contexts, const map& sfw); int DominanceSource(int fw1, int fw2); int DominanceTarget(int fw1, int fw2); vector DominanceSource4Sampler(int fw1, int fw2); vector DominanceTarget4Sampler(int fw1, int fw2); void ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2, double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2); double ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2); // Remove all function word alignments except those at the borders // May result in more than two function word alignments at each side, because this function // will continue keeping function word alignments until the first aligned word at each side void BorderingSFWsOnly(); void BorderingTFWsOnly(); void simplify(int *ret); // preparing the next state void simplify_nofw(int *ret); // preparing the next state when no function word appears // set the first part of the next state, which concerns with function word // fas, las, fat, lat is the (f)irst or (l)ast function word alignments either on the (s)ource or (t)arget // these parameters to anticipate cases where there are more than two function word alignments void FillFWIdxsState(int *state, int fas, int las, int fat, int lat); // Helper function to obtain the aligned words on the other side // WARNING!!! Only to be used if the als are in sync with either source or target sentences WordID F2EProjectionFromExternal(int idx, const vector& als, const string& delimiter=" "); WordID E2FProjectionFromExternal(int idx, const vector& als, const string& delimiter=" "); // WARNING!!! Only to be used in dwarf_main.cc // These two function words assume that the alignment contains phrase boundary // but the source and target sentences do not WordID F2EProjection(int idx, const string& delimiter=" "); WordID E2FProjection(int idx, const string& delimiter=" "); void SetCurrAlVector(); int* blockSource(int fw1, int fw2); int* blockTarget(int fw1, int fw2); void ToArrayInt(vector* arr); int* neighborLeft(int startidx, int endidx, bool* found); int* neighborRight(int startidx, int endidx, bool* found); private: // Hash to avoid redundancy unordered_map, int, boost::hash > > oris_hash; unordered_map, int, boost::hash > > orit_hash; unordered_map, int, boost::hash > > doms_hash; unordered_map, int, boost::hash > > domt_hash; unordered_map, vector, boost::hash > > simplify_hash; unordered_map, vector, boost::hash > > prepare_hash; int _J; // effective source length; int _I; // effective target length; bool _matrix[MAX_WORDS][MAX_WORDS]; // true if aligned short _sSpan[MAX_WORDS][2]; //the source span of a target index; 0->min, 1->max short _tSpan[MAX_WORDS][2]; //the target span of a source index; 0->min, 2->max int _freq_cutoff; int SourceFWRuleIdxs[40]; //the indexes of function words in the rule; // The following applies to all *FW*Idxs // *FW*Idxs[0] = size // *FW*Idxs[idx*3-2] = index in the alignment, where idx starts from 1 to size // *FW*Idxs[idx*3-1] = source WordID // *FW*Idxs[idx*3] = target WordID int SourceFWRuleAbsIdxs[40]; int TargetFWRuleIdxs[40]; //the indexes of function words in the rule; zeroth element is the count int ** SourceFWAntsIdxs; //the indexes of function words in antecedents int ** SourceFWAntsAbsIdxs; int ** TargetFWAntsIdxs; //the indexes of function words in antecedents int SourceRuleIdxs[40]; //the indexes of SOURCE tokens (zeroth element is the number of source tokens) //>0 means terminal, -i means the i-th Xs int TargetRuleIdxs[40]; //the indexes of TARGET tokens (zeroth element is the number of target tokens) int ** SourceAntsIdxs; //the array of indexes of a particular antecedent's SOURCE tokens int ** TargetAntsIdxs; //the array of indexes of a particular antecedent's TARGET tokens int SourceFWIdxs[40]; int SourceFWAbsIdxs[40]; int TargetFWIdxs[40]; // *sort* and *quickSort* are used to sort *FW*Idxs void sort(int* num); void quickSort(int arr[], int top, int bottom); // *block(Source|Target)* finds the minimum block that containts two indexes (fw1 and fw2) inline int least(int i1, int i2) { return (i1i2)?i1:i2; } void simplifyBackward(vector*blocks, int* block, const vector& danglings); // used in simplify to check whether an atomic block according to source function words is also atomic according // to target function words as well, otherwise break it // the resulting blocks are added into *blocks* int _Arity; std::vector _f; // the source sentence of the **current** rule (may not consistent with the current alignment) std::vector _e; // the target sentence of the **current** rule int RuleAl[40]; int **AntsAl; int firstSourceAligned(int start); int firstTargetAligned(int start); int lastSourceAligned(int end); int lastTargetAligned(int end); int fas, las, fat, lat; // first aligned source, last aligned source, first aligned target, last aligned target bool MemberOf(int* FWIdxs, int pos1, int pos2); // whether FWIdxs contains pos1 and pos2 consecutively // Convert the alignment to vector form, will be used for hashing purposes vector curr_al; int GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector& sources, int spanstart, int spanend, const std::vector& ant_contexts, const map& sfw); int GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map& sfw); int GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map& sfw); WordID generalize(WordID original, const map& tags, bool pos=false); }; #endif