diff options
Diffstat (limited to 'decoder/dwarf.h')
-rw-r--r-- | decoder/dwarf.h | 286 |
1 files changed, 0 insertions, 286 deletions
diff --git a/decoder/dwarf.h b/decoder/dwarf.h deleted file mode 100644 index 49d2a3b7..00000000 --- a/decoder/dwarf.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef DWARF_H -#define DWARF_H - -#include <cstdlib> -#include <vector> -#include <map> -#include <string> -#include <ostream> -#include "wordid.h" -#include "lattice.h" -#include "trule.h" -#include "tdict.h" -#include <boost/functional/hash.hpp> -#include <tr1/unordered_map> -#include <boost/tuple/tuple.hpp> - -using namespace std; -using namespace std::tr1; -using namespace boost::tuples; -using namespace boost; - -const static bool DEBUG = false; - -class CountTable { -public: - int* ultimate; - map<WordID,int*> model; - int mode; - int numColumn; - void print() const; - void setup(int _numcolumn, int _mode) { - mode = _mode; numColumn = _numcolumn; - } -}; - -class Alignment { -/* Alignment represents an alignment object in a 2D format to support function word-based models calculation - - A note about model's parameter estimation: - ========================================== - The model is estimated as a two-level Dirichlet process. - For orientation model, the first tier estimation is: - P(o|f,e) where *o* is the orientation value to estimate, *f* is the source function word aligned to *e* - its second tier is: P(o|f), while its third tier is P(o) - For dominance model, the first tier estimation is: - P(d|f1,f2,e1,e2) where *d* is a dominance value to estimate, *f1,f2* are the neighboring function words on the source - aligned to *e1,e2* on the target side - its second tier is: P(d|f1,f2) while its third tier is P(d) - - Taking orientation model as a case in point, a two level estimation proceeds as follow: - P(o|f,e) = c(o,f,e) + alpha { c(o,f) + beta [ c (o) / c(.) ] } - ------------------------------ - c(f) + beta - ------------------------------------------------- - c(f,e) + alpha - where c() is a count function, alpha and beta are the concentration parameter - of the first and second Dirichlet process respectively - To encourage or penalize the use of second and third tier statistics, bo1 and bo2 binary features are introduced -*/ -public: - const static int MAX_WORDS = 200; - const static int MINIMUM_INIT = 1000; - const static int MAXIMUM_INIT = -1000; - const static int MAX_ARITY = 2; - WordID kSOS; - WordID kEOS; - WordID kUNK; - double alpha_oris; // 1st concentration parameter for orientation model - double beta_oris; // 2nd concentration parameter for orientation model - double alpha_orit; // 1st concentration parameter for orientation model - double beta_orit; // 2nd concentration parameter for orientation model - double alpha_doms; // idem as above but for dominance model - double beta_doms; - double alpha_domt; // idem as above but for dominance model - double beta_domt; - - // ACCESS to alignment - void set(int j,int i); // j is the source index, while i is the target index - void reset(int j,int i); // idem as above - inline bool at(int j, int i) { return _matrix[j][i]; }; - inline int getJ() {return _J;}; // max source of the current alignment - inline int getI() {return _I;}; // max target of the current alignment - inline void setI(int I) { _I = I; }; - inline void setJ(int J) { _J = J; }; - inline void setF(vector<WordID> f) { _f=f;}; - inline void setE(vector<WordID> e) { _e=e;}; - inline WordID getF(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_f.size()) return TD::Convert("</s>"); return _f[id];}; - inline WordID getE(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_e.size()) return TD::Convert("</s>"); return _e[id];}; - void clearAls(int prevJ=200, int prevI=200); - int sourceOf(int i, int start = -1); - int targetOf(int j, int start = -1); - inline int minSSpan(int i) { return _sSpan[i][0];} - inline int maxSSpan(int i) { return _sSpan[i][1];} - inline int minTSpan(int j) { return _tSpan[j][0];} - inline int maxTSpan(int j) { return _tSpan[j][1];} - static inline int link(int s, int t) { return (s << 16) | t; } - static inline int source(int st) {return st >> 16; } - static inline int target(int st) {return st & 0xffff; } - inline void setAlphaOris(double val) { alpha_oris=val; } - inline void setAlphaOrit(double val) { alpha_orit=val; } - inline void setAlphaDoms(double val) { alpha_doms=val; } - inline void setAlphaDomt(double val) { alpha_domt=val; } - inline void setBetaOris(double val) { beta_oris=val; } - inline void setBetaOrit(double val) { beta_orit=val; } - inline void setBetaDoms(double val) { beta_doms=val; } - inline void setBetaDomt(double val) { beta_domt=val; } - inline void setFreqCutoff(int val) { cout << _freq_cutoff << " to " << val << endl; _freq_cutoff=val; } - string AsString(); - string AsStringSimple(); - int* SOS(); - int* EOS(); - - // Model related function - Alignment(); - // Given the current *rule* and its antecedents, construct an alignment space and mark the function word alignments - // according *sfw* and *tfw* - bool prepare(TRule& rule, const std::vector<const void*>& ant_contexts, - const map<WordID,int>& sfw, const map<WordID,int>& tfw, const Lattice& sourcelattice, int spanstart, int spanend); - - // Compute orientation model score which parameters are stored in *table* and pass the values accordingly - // will call Orientation(Source|Target) and ScoreOrientation(Source|Target) - void computeOrientationSource(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); - void computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags); - void computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); - void computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - // Get the orientation value of a function word at a particular index *fw* - // assign the value to either *oril* or *orir* accoring to *Lcompute* and *Rcompute* - void OrientationSource(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - void OrientationSource(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - int OrientationSource(int* left, int* right); - void OrientationTarget(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - void OrientationTarget(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - - vector<int> OrientationSourceLeft4Sampler(int fw0, int fw1); - vector<int> OrientationSourceLeft4Sampler(int fw); - vector<int> OrientationSourceRight4Sampler(int fw0, int fw1); - vector<int> OrientationSourceRight4Sampler(int fw); - vector<int> OrientationTargetLeft4Sampler(int fw0, int fw1); - vector<int> OrientationTargetLeft4Sampler(int fw); - vector<int> OrientationTargetRight4Sampler(int fw0, int fw1); - vector<int> OrientationTargetRight4Sampler(int fw); - - // Given an orientation value *ori*, estimate the score accoding to *cond1*, *cond2* - // and assign the value accordingly according to *isBonus* and whether the first or the second tier estimation - // is used or not - void ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - void ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - double ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2); - double ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond); - void ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - void ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - double ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2); - double ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond); - void ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - double ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2); - - // idem as above except these are for dominance model - void computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); - void computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus, - double *state_mono, double *state_nonmono, - TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw); - int DominanceSource(int fw1, int fw2); - int DominanceTarget(int fw1, int fw2); - vector<int> DominanceSource4Sampler(int fw1, int fw2); - vector<int> DominanceTarget4Sampler(int fw1, int fw2); - void ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2, - double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2); - double ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2); - - // Remove all function word alignments except those at the borders - // May result in more than two function word alignments at each side, because this function - // will continue keeping function word alignments until the first aligned word at each side - void BorderingSFWsOnly(); - void BorderingTFWsOnly(); - void simplify(int *ret); // preparing the next state - void simplify_nofw(int *ret); // preparing the next state when no function word appears - // set the first part of the next state, which concerns with function word - // fas, las, fat, lat is the (f)irst or (l)ast function word alignments either on the (s)ource or (t)arget - // these parameters to anticipate cases where there are more than two function word alignments - void FillFWIdxsState(int *state, int fas, int las, int fat, int lat); - - // Helper function to obtain the aligned words on the other side - // WARNING!!! Only to be used if the als are in sync with either source or target sentences - WordID F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" "); - WordID E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" "); - // WARNING!!! Only to be used in dwarf_main.cc - // These two function words assume that the alignment contains phrase boundary - // but the source and target sentences do not - WordID F2EProjection(int idx, const string& delimiter=" "); - WordID E2FProjection(int idx, const string& delimiter=" "); - void SetCurrAlVector(); - int* blockSource(int fw1, int fw2); - int* blockTarget(int fw1, int fw2); - void ToArrayInt(vector<int>* arr); - int* neighborLeft(int startidx, int endidx, bool* found); - int* neighborRight(int startidx, int endidx, bool* found); -private: - // Hash to avoid redundancy - unordered_map<vector<int>, int, boost::hash<vector<int> > > oris_hash; - unordered_map<vector<int>, int, boost::hash<vector<int> > > orit_hash; - unordered_map<vector<int>, int, boost::hash<vector<int> > > doms_hash; - unordered_map<vector<int>, int, boost::hash<vector<int> > > domt_hash; - unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > simplify_hash; - unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > prepare_hash; - - int _J; // effective source length; - int _I; // effective target length; - bool _matrix[MAX_WORDS][MAX_WORDS]; // true if aligned - short _sSpan[MAX_WORDS][2]; //the source span of a target index; 0->min, 1->max - short _tSpan[MAX_WORDS][2]; //the target span of a source index; 0->min, 2->max - int _freq_cutoff; - int SourceFWRuleIdxs[40]; //the indexes of function words in the rule; - // The following applies to all *FW*Idxs - // *FW*Idxs[0] = size - // *FW*Idxs[idx*3-2] = index in the alignment, where idx starts from 1 to size - // *FW*Idxs[idx*3-1] = source WordID - // *FW*Idxs[idx*3] = target WordID - int SourceFWRuleAbsIdxs[40]; - int TargetFWRuleIdxs[40]; //the indexes of function words in the rule; zeroth element is the count - int ** SourceFWAntsIdxs; //the indexes of function words in antecedents - int ** SourceFWAntsAbsIdxs; - int ** TargetFWAntsIdxs; //the indexes of function words in antecedents - int SourceRuleIdxs[40]; //the indexes of SOURCE tokens (zeroth element is the number of source tokens) - //>0 means terminal, -i means the i-th Xs - int TargetRuleIdxs[40]; //the indexes of TARGET tokens (zeroth element is the number of target tokens) - int ** SourceAntsIdxs; //the array of indexes of a particular antecedent's SOURCE tokens - int ** TargetAntsIdxs; //the array of indexes of a particular antecedent's TARGET tokens - int SourceFWIdxs[40]; - int SourceFWAbsIdxs[40]; - int TargetFWIdxs[40]; - // *sort* and *quickSort* are used to sort *FW*Idxs - void sort(int* num); - void quickSort(int arr[], int top, int bottom); - - // *block(Source|Target)* finds the minimum block that containts two indexes (fw1 and fw2) - inline int least(int i1, int i2) { return (i1<i2)?i1:i2; } - inline int most(int i1, int i2) { return (i1>i2)?i1:i2; } - void simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings); - // used in simplify to check whether an atomic block according to source function words is also atomic according - // to target function words as well, otherwise break it - // the resulting blocks are added into *blocks* - int _Arity; - std::vector<WordID> _f; // the source sentence of the **current** rule (may not consistent with the current alignment) - std::vector<WordID> _e; // the target sentence of the **current** rule - int RuleAl[40]; - int **AntsAl; - int firstSourceAligned(int start); - int firstTargetAligned(int start); - int lastSourceAligned(int end); - int lastTargetAligned(int end); - int fas, las, fat, lat; // first aligned source, last aligned source, first aligned target, last aligned target - bool MemberOf(int* FWIdxs, int pos1, int pos2); // whether FWIdxs contains pos1 and pos2 consecutively - // Convert the alignment to vector form, will be used for hashing purposes - vector<int> curr_al; - int GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw); - int GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw); - int GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw); - WordID generalize(WordID original, const map<WordID,WordID>& tags, bool pos=false); -}; - -#endif |