diff options
Diffstat (limited to 'decoder/ff_dwarf.cc')
-rw-r--r-- | decoder/ff_dwarf.cc | 893 |
1 files changed, 893 insertions, 0 deletions
diff --git a/decoder/ff_dwarf.cc b/decoder/ff_dwarf.cc new file mode 100644 index 00000000..3daa85ac --- /dev/null +++ b/decoder/ff_dwarf.cc @@ -0,0 +1,893 @@ +#include <vector> +#include <sstream> +#include <fstream> +#include <string> +#include <iostream> +#include <map> +#include "ff_dwarf.h" +#include "dwarf.h" +#include "wordid.h" +#include "tdict.h" +#include "filelib.h" +#include "sentence_metadata.h" +#include "stringlib.h" + +using namespace std; + +Dwarf::Dwarf(const std::string& param) { +/* Param is a space separated string which contains any or all of the following: + oris|orit|doms|domt=filename + e.g. oris=/fs/clip-galep3eval/hendra/z2e/oris128.gz +*/ + sSOS="<s>"; + sEOS="</s>"; + kSOS=TD::Convert(sSOS); + kEOS=TD::Convert(sEOS); + kGOAL=TD::Convert("S")*-1; + _sent_id = (int *)malloc(sizeof(int)); + *_sent_id = -1; + if (DEBUG) cerr << "here = " << *_sent_id << endl; + _fwcount = (int *)malloc(sizeof(int)); + *_fwcount = -1; + cerr << "initializing dwarf" << endl; + flag_oris=false; flag_orit=false; flag_doms=false; flag_domt=false; flag_tfw_count=false; + flag_bdoms=false; flag_porislr=false, flag_porisrl=false, flag_goris=false; flag_pgorislr=false, flag_pgorisrl=false; + flag_pdomslr=false; flag_pdomsrl=false; flag_pgdomslr=false; flag_pgdomsrl=false; flag_gdoms=false; + flag_oris_backward=false; flag_orit_backward=false; + explicit_soseos=false; + SetStateSize(STATE_SIZE*sizeof(int)); + als = new Alignment(); + als->clearAls(Alignment::MAX_WORDS,Alignment::MAX_WORDS); + istringstream iss(param); string w; + while(iss >> w) { + int equal = w.find_first_of("="); + if (equal!=string::npos) { + string model = w.substr(0,equal); + vector<string> params; + Tokenize(w.substr(equal+1),',',¶ms); + string fn = params[0]; + if (model == "minfreq") { + cerr << "model minfreq " << fn << endl; + als->setFreqCutoff(atoi(fn.c_str())); + } else if (model == "oris") { + flag_oris = readOrientation(&toris,fn,&sfw); + if (flag_oris) { + oris_ = FD::Convert("OrientationSource"); + //oris_bo1_ = FD::Convert("OrientationSource_BO1"); + //oris_bo2_ = FD::Convert("OrientationSource_BO2"); + } + if (params.size()>1) als->setAlphaOris(atof(params[1].c_str())); + if (params.size()>2) als->setBetaOris(atof(params[2].c_str())); + } else if (model == "porislr") { + flag_porislr = readOrientation(&tporislr,fn,&sfw,true); + poris_nlr = 0; + if (flag_porislr) { + porislr_ = FD::Convert("OrientationSourcePositionfulLeftRight"); + } + if (params.size()>1) poris_nlr = atoi(params[1].c_str()); + if (DEBUG) cerr << " maximum poris depth=" << poris_nlr << endl; + } else if (model == "porisrl") { + flag_porisrl = readOrientation(&tporisrl,fn,&sfw,true); + poris_nrl = 0; + if (flag_porisrl) { + porisrl_ = FD::Convert("OrientationSourcePositionfulRightLeft"); + } + if (params.size()>1) poris_nrl = atoi(params[1].c_str()); + if (DEBUG) cerr << " maximum poris depth=" << poris_nrl << endl; + } else if (model=="goris") { + flag_goris = readOrientation(&tgoris,fn,&sfw); + if (flag_goris) { + goris_ = FD::Convert("OrientationSourceGeneralized"); + } + if (params.size()>1) { + readTags(params[1],&tags); + generalizeOrientation(&tgoris,tags); + } + } else if (model=="pgorislr") { + flag_pgorislr = readOrientation(&tpgorislr,fn,&sfw,true); + pgoris_nlr = 0; + if (flag_pgorislr) { + pgorislr_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight"); + } + if (DEBUG) { + cerr << "BEFORE GENERALIZATION" << endl; + tpgorislr.print(); + } + if (params.size()>1) pgoris_nlr = atoi(params[1].c_str()); + if (params.size()>2) { + readTags(params[2],&tags); + generalizeOrientation(&tpgorislr,tags,true); + } + if (DEBUG) { + cerr << "AFTER GENERALIZATION" << endl; + tpgorislr.print(); + } + } else if (model=="pgorisrl") { + flag_pgorisrl = readOrientation(&tpgorisrl,fn,&sfw,true); + pgoris_nrl = 0; + if (flag_pgorisrl) { + pgorisrl_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight"); + } + if (params.size()>1) pgoris_nrl = atoi(params[1].c_str()); + if (params.size()>2) { + readTags(params[2],&tags); + generalizeOrientation(&tpgorisrl,tags,true); + } + } else if (model == "oris_backward") { + flag_oris_backward = true; + if (!flag_oris) readOrientation(&toris,fn,&sfw); + oris_backward_ = FD::Convert("OrientationSourceBackward"); + if (params.size()>1) als->setAlphaOris(atof(params[1].c_str())); + if (params.size()>2) als->setBetaOris(atof(params[2].c_str())); + } else if (model == "orit") { + flag_orit = readOrientation(&torit,fn,&tfw); + if (flag_orit) { + orit_ = FD::Convert("OrientationTarget"); + //orit_bo1_ = FD::Convert("OrientationTarget_BO1"); + //orit_bo2_ = FD::Convert("OrientationTarget_BO2"); + } + if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str())); + if (params.size()>2) als->setBetaOrit(atof(params[2].c_str())); + } else if (model == "orit_backward") { + flag_orit_backward = true; + if (!flag_orit) readOrientation(&torit,fn,&tfw); + orit_backward_ = FD::Convert("OrientationTargetBackward"); + if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str())); + if (params.size()>2) als->setBetaOrit(atof(params[2].c_str())); + } else if (model == "doms") { + flag_doms = readDominance(&tdoms,fn,&sfw); + if (flag_doms) { + doms_ = FD::Convert("DominanceSource"); + //doms_bo1_ = FD::Convert("DominanceSource_BO1"); + //doms_bo2_ = FD::Convert("DominanceSource_BO2"); + } + if (params.size()>1) als->setAlphaDoms(atof(params[1].c_str())); + if (params.size()>2) als->setBetaDoms(atof(params[2].c_str())); + } else if (model == "pdomsrl") { + flag_pdomsrl = readDominance(&tpdomsrl,fn,&sfw,true); + if (flag_pdomsrl) { + pdomsrl_ = FD::Convert("DominanceSourcePositionfulRightLeft"); + } + if (params.size()>1) pdoms_nrl = atoi(params[1].c_str()); + } else if (model == "pdomslr") { + flag_pdomslr = readDominance(&tpdomslr,fn,&sfw,true); + tpdomslr.print(); + if (flag_pdomslr) { + pdomslr_ = FD::Convert("DominanceSourcePositionfulLeftRight"); + } + if (params.size()>1) pdoms_nlr = atoi(params[1].c_str()); + } else if (model == "pgdomsrl") { + flag_pgdomsrl = readDominance(&tpgdomsrl,fn,&sfw,true); + if (flag_pgdomsrl) { + pgdomsrl_ = FD::Convert("DominanceSourceGeneralizedPositionfulRightLeft"); + } + if (params.size()>1) pgdoms_nrl = atoi(params[1].c_str()); + if (params.size()>2) { + readTags(params[2],&tags); + generalizeDominance(&tpgdomsrl,tags,true); + } + } else if (model == "pgdomslr") { + flag_pgdomslr = readDominance(&tpgdomslr,fn,&sfw,true); + if (flag_pgdomslr) { + pgdomslr_ = FD::Convert("DominanceSourceGeneralizedPositionfulLeftRight"); + } + if (params.size()>1) pgdoms_nlr = atoi(params[1].c_str()); + if (params.size()>2) { + readTags(params[2],&tags); + if (DEBUG) { + for (map<WordID,WordID>::const_iterator it=tags.begin(); it!=tags.end(); it++) { + cerr << "tags = " << TD::Convert(it->first) << ", " << TD::Convert(it->second) << endl; + } + } + generalizeDominance(&tpgdomslr,tags,true); + } + if (DEBUG) tpgdomslr.print(); + } else if (model == "bdoms") { + flag_bdoms = readDominance(&tbdoms,fn,&sfw); + if (flag_bdoms) { + bdoms_ = FD::Convert("BorderDominanceSource"); + } + } else if (model == "domt") { + flag_domt = readDominance(&tdomt,fn,&tfw); + if (flag_domt) { + domt_ = FD::Convert("DominanceTarget"); + //domt_bo1_ = FD::Convert("DominanceTarget_BO1"); + //domt_bo2_ = FD::Convert("DominanceTarget_BO2"); + } + if (params.size()>1) als->setAlphaDomt(atof(params[1].c_str())); + if (params.size()>2) als->setBetaDomt(atof(params[2].c_str())); + } else if (model== "tfw_count") { + flag_tfw_count = readList(fn,&tfw); + tfw_count_ = FD::Convert("TargetFunctionWordsCount"); + } else { + cerr << "DWARF doesn't understand this model: " << model << endl; + } + } else { + if (w=="tfw_count") { + flag_tfw_count = true; + tfw_count_ = FD::Convert("TargetFunctionWordsCount"); + } else if (w=="oris_backward") { + flag_oris_backward = true; + oris_backward_ = FD::Convert("OrientationSourceBackward"); + } else if (w=="orit_backward") { + flag_orit_backward = true; + orit_backward_ = FD::Convert("OrientationTargetBackward"); + } else if (w=="explicit_soseos") { + explicit_soseos=true; + } else { + cerr << "DWARF doesn't need this param: " << param << endl; + } + } + } + for (map<WordID,int>::const_iterator it=sfw.begin(); it!=sfw.end() && DEBUG; it++) { + cerr << " FW:" << TD::Convert(it->first) << endl; + } +} + +void Dwarf::TraversalFeaturesImpl(const SentenceMetadata& smeta, + const Hypergraph::Edge& edge, + const std::vector<const void*>& ant_contexts, + SparseVector<double>* features, + SparseVector<double>* estimated_features, + void* context) const { + if (DEBUG) cerr << "TraversalFeaturesImpl" << endl; + double cost, bonus, bo1, bo2, bo1_bonus, bo2_bonus; + double bdoms_state_mono= 0; double bdoms_state_nonmono = 0; + TRule r = *edge.rule_; + if (DEBUG) cerr << " sent_id=" << *_sent_id << ", " << smeta.GetSentenceID() << endl; + if (DEBUG) cerr << "rule = " << r.AsString() << endl; + if (DEBUG) cerr << "rule[i,j] = " << edge.i_ << "," << edge.j_ << endl; + if (*_sent_id != smeta.GetSentenceID()) { //new sentence + *_sent_id = smeta.GetSentenceID(); + const Lattice l = smeta.GetSourceLattice(); + *_fwcount=0; + for (int i=0; i<smeta.GetSourceLength(); i++) { + if (sfw.find(l[i][0].label)!=sfw.end()) { + *_fwcount+=1; + } + } + if (DEBUG) cerr << "new sentence[" << *_sent_id << "]="<<*_fwcount<<endl; + } + bool nofw = als->prepare(*edge.rule_, ant_contexts, sfw, tfw,smeta.GetSourceLattice(),edge.i_,edge.j_); + bool isFinal = (edge.i_==0 && edge.j_==smeta.GetSourceLength() && r.GetLHS()==kGOAL); + // prepare *nofw* outputs whether the resulting alignment, contains function words or not + // if not, the models do not have to be calcualted and *simplify* is very simple + if (DEBUG) cerr << "nofw = " << nofw << endl; + if (flag_tfw_count) { + double count = 0; + for (int i=0; i<r.e_.size(); i++) { + if (tfw.find(r.e_[i])!=tfw.end()) count++; + } + features->set_value(tfw_count_,count); + } + if (flag_oris) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) als->computeOrientationSource(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(oris_,cost); + //features->set_value(oris_bo1_,bo1); + //features->set_value(oris_bo2_,bo2); + estimated_features->set_value(oris_,bonus); + //estimated_features->set_value(oris_bo1_,bo1_bonus); + //estimated_features->set_value(oris_bo2_,bo2_bonus); + } + if (flag_porislr) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) + als->computeOrientationSourcePos(tporislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,poris_nlr,0); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(porislr_,cost); + estimated_features->set_value(porislr_,bonus); + } + if (flag_porisrl) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) + als->computeOrientationSourcePos(tporisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,poris_nrl); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(porisrl_,cost); + estimated_features->set_value(porisrl_,bonus); + } + if (flag_pgorislr) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) + als->computeOrientationSourcePos(tpgorislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgoris_nlr,0); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(pgorislr_,cost); + estimated_features->set_value(pgorislr_,bonus); + } + if (flag_pgorisrl) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) + als->computeOrientationSourcePos(tpgorisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgoris_nrl); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(pgorisrl_,cost); + estimated_features->set_value(pgorisrl_,bonus); + } + if (flag_goris) { + cost=0; bonus=0; + if (!nofw) als->computeOrientationSource(tgoris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(goris_,cost); + estimated_features->set_value(goris_,bonus); + } + if (flag_oris_backward) { + cost=0; bonus=0; + if (!nofw) + als->computeOrientationSourceBackward(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(oris_backward_,cost); + estimated_features->set_value(oris_backward_,bonus); + } + WordID _lfw = kSOS; + WordID _rfw = kEOS; + if (flag_doms || flag_pdomslr || flag_pdomsrl || flag_pgdomslr || flag_pgdomsrl) { + if (DEBUG) cerr << " seeking lfw and rfw" << endl; + int start = edge.i_; + int end = edge.j_; + if (DEBUG) cerr << " start=" << start << ", end=" << end << endl; + const Lattice l = smeta.GetSourceLattice(); + for (int idx=start-1; idx>=0; idx--) { + if (DEBUG) cerr << " checking idx=" << idx << ", label=" << l[idx][0].label << "-" << TD::Convert(l[idx][0].label) << endl; + if (sfw.find(l[idx][0].label) !=sfw.end()) { + if (DEBUG) cerr << "+"; + _lfw=l[idx][0].label; break; + } + } + for (int idx=end; idx<l.size(); idx++) { // end or end+1 + if (DEBUG) cerr << " checking idx=" << idx << ", label=" << l[idx][0].label << "-" << TD::Convert(l[idx][0].label) << endl; + if (sfw.find(l[idx][0].label)!=sfw.end()) { + if (DEBUG) cerr << "."; + _rfw=l[idx][0].label; break; + } + } + if (isFinal&&!explicit_soseos) { + _lfw=kSOS; _rfw=kEOS; + } + } + if (flag_doms) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) als->computeDominanceSource(tdoms,_lfw,_rfw,&cost,&bonus, + &bo1,&bo1_bonus,&bo2,&bo2_bonus); + if (DEBUG) cerr << " COST=" << cost << ", BONUS=" << bonus << endl; + if (isFinal&&!explicit_soseos) { + cost += bonus; + if (DEBUG) cerr << " final and !explicit_soseos, thus cost = " << cost << endl; + bonus = 0; + } + features->set_value(doms_,cost); + estimated_features->set_value(doms_,bonus); + } + if (flag_pdomslr) { + if (DEBUG) cerr << " flag_pdomslr true, nofw=" << nofw << endl; + if (DEBUG) cerr << " lfw=" << _lfw << ", rfw=" << _rfw << endl; + if (DEBUG) cerr << " kSOS=" << kSOS << ", kEOS=" << kEOS << endl; + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) als->computeDominanceSourcePos(tpdomslr,_lfw,_rfw,&cost,&bonus, + &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pdoms_nlr,0); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(pdomslr_,cost); + estimated_features->set_value(pdomslr_,bonus); + } + if (flag_pdomsrl) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) als->computeDominanceSourcePos(tpdomsrl,_lfw,_rfw,&cost,&bonus, + &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pdoms_nrl); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(pdomsrl_,cost); + estimated_features->set_value(pdomsrl_,bonus); + } + if (flag_pgdomslr) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) als->computeDominanceSourcePos(tpgdomslr,_lfw,_rfw,&cost,&bonus, + &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgdoms_nlr,0); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(pgdomslr_,cost); + estimated_features->set_value(pgdomslr_,bonus); + } + if (flag_pgdomsrl) { cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) als->computeDominanceSourcePos(tpgdomsrl,_lfw,_rfw,&cost,&bonus, + &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgdoms_nrl); + if (isFinal&&!explicit_soseos) { + cost += bonus; + bonus = 0; + } + features->set_value(pgdomsrl_,cost); + estimated_features->set_value(pgdomsrl_,bonus); + } + + + if (flag_bdoms) { + cost=0; bonus=0; bdoms_state_mono=0; bdoms_state_nonmono=0; + if (!nofw) + als->computeBorderDominanceSource(tbdoms,&cost,&bonus, + &bdoms_state_mono, &bdoms_state_nonmono,*edge.rule_, ant_contexts, sfw); + features->set_value(bdoms_,cost); + estimated_features->set_value(bdoms_,bonus); + } + if (flag_orit) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + if (!nofw) als->computeOrientationTarget(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); + if (DEBUG) cerr << "cost=" << cost << ", bonus=" << bonus << ", bo1=" << bo1 << ", bo1_bonus=" << bo1_bonus << ", bo2=" << bo2 << ", bo2_bonus=" << bo2_bonus << endl; + features->set_value(orit_,cost); + //features->set_value(orit_bo1_,bo1); + //features->set_value(orit_bo2_,bo2); + estimated_features->set_value(orit_,bonus); + //estimated_features->set_value(orit_bo1_,bo1_bonus); + //estimated_features->set_value(orit_bo2_,bo2_bonus); + } + if (flag_orit_backward) { + cost=0; bonus=0; + if (!nofw) als->computeOrientationTargetBackward(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); + features->set_value(orit_backward_,cost); + estimated_features->set_value(orit_backward_,bonus); + } + if (flag_domt) { + cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; + WordID _lfw=-1; int start = edge.i_; + WordID _rfw=-1; int end = edge.j_; + if (smeta.HasReference()) { + const Lattice l = smeta.GetReference(); + for (int idx=start-1; idx>=0; idx--) { + if (l.size()>0) + if (tfw.find(l[idx][0].label) !=tfw.end()) { + _lfw=l[idx][0].label; break; + } + } + for (int idx=end; idx<l.size(); idx++) { // end or end+1 + if (l[idx].size()>0) + if (tfw.find(l[idx][0].label)!=tfw.end()) { + _rfw=l[idx][0].label; break; + } + } + } + //neighboringFWs(smeta.GetReference(),edge.i_,edge.j_,tfw,&_lfw,&_rfw); + if (!nofw) als->computeDominanceTarget(tdomt,_lfw,_rfw,&cost,&bonus, + &bo1,&bo1_bonus,&bo2,&bo2_bonus); + features->set_value(domt_,cost); + //features->set_value(domt_bo1_,bo1); + //features->set_value(domt_bo2_,bo2); + estimated_features->set_value(domt_,bonus); + //estimated_features->set_value(domt_bo1_,bo1_bonus); + //estimated_features->set_value(domt_bo2_,bo2_bonus); + } + int* vcontext = reinterpret_cast<int *>(context); + if (!nofw) { + als->BorderingSFWsOnly(); + als->BorderingTFWsOnly(); + als->simplify(vcontext); + } else { + als->simplify_nofw(vcontext); + } + vcontext[50] = DoubleToInteger(bdoms_state_mono); + vcontext[51] = DoubleToInteger(bdoms_state_nonmono); + vcontext[STATE_SIZE-1] = Alignment::link(edge.i_,edge.j_); + if (DEBUG) { + cerr << "state@traverse = "; + for (int idx=0; idx<STATE_SIZE; idx++) cerr << idx << "." << vcontext[idx] << " "; + cerr << endl; + cerr << "bdoms_state_mono=" << bdoms_state_mono << ", state[50]=" << IntegerToDouble(vcontext[50]) << endl; + cerr << "bdoms_state_nonmono=" << bdoms_state_nonmono << ", state[51]=" << IntegerToDouble(vcontext[51]) << endl; + } +} + +int Dwarf::DoubleToInteger(double val) { + float x = (float)val; + float* px = &x; + int* pix = reinterpret_cast<int *>(px); + return *pix; +} + +double Dwarf::IntegerToDouble(int val) { + int *py = &val; + float* pd = reinterpret_cast<float *>(py); + return (double)*pd; +} + +void Dwarf::neighboringFWs(const Lattice& l, const int& i, const int& j, const map<WordID,int>& fw_hash, int* lfw, int* rfw) { + *lfw=0; *rfw=0; + int idx=i-l[i][0].dist2next; + while (idx>=0) { + if (l[idx].size()>0) { + if (fw_hash.find(l[idx][0].label)!=fw_hash.end()) { + *lfw++; + } + } + idx-=l[idx][0].dist2next; + } + idx=j+l[j][0].dist2next; + while (idx<l.size()) { + if (l[idx].size()>0) { + if (fw_hash.find(l[idx][0].label)!=fw_hash.end()) { + *rfw++; + } + } + idx+=l[idx][0].dist2next; + } +} + +bool Dwarf::readOrientation(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos) { + // the input format is + // source target 0 1 2 3 4 0 1 2 3 4 + // 0 -> MA, 1 -> RA, 2 -> MG, 3 -> RG, 4 -> NO_NEIGHBOR + // first 01234 corresponds to the left neighbor, the second 01234 corresponds to the right neighbor + // append 2 more at the end as precomputed total + + // TONS of hack here. CountTable should be wrapped as a class + // TODO: check whether the file exists or not, return false if not + if (DEBUG) cerr << " readOrientation(" << filename << ", pos=" << pos << ")" << endl; + ReadFile rf(filename); + istream& in = *rf.stream(); + table->setup(24,pos); + table->ultimate = new int[24]; + for (int i=0; i<24; i++) table->ultimate[i]=0; + ostringstream oss; + while (in) { + string line; + getline(in,line); + if (line=="") break; + istringstream tokenizer(line); + string sourceidx, source, target, word; + tokenizer >> source >> target; + if (pos) { + sourceidx = source; + source = sourceidx.substr(0,sourceidx.find_last_of("/")); + } + if (fw->find(TD::Convert(source))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source),1)); + + + int* element = new int[24]; + element[5] = 0; + for (int i=0; i<5; i++) { + element[i] = 0; + if (tokenizer >> word) element[i] = atoi(word.c_str()); + element[5] += element[i]; + } + element[11] = 0; + for (int i=6; i<11; i++) { + element[i] = 0; + if (tokenizer >> word) element[i] = atoi(word.c_str()); + element[11] += element[i]; + } + element[17] = 0; + for (int i=12; i<17; i++) { + element[i] = 0; + if (tokenizer >> word) element[i] = atoi(word.c_str()); + element[17] += element[i]; + } + element[23] = 0; + for (int i=18; i<23; i++) { + element[i] = 0; + if (tokenizer >> word) element[i] = atoi(word.c_str()); + element[23] += element[i]; + } + for (int i=0; i<24; i++) table->ultimate[i] += element[i]; + oss << source << " " << target; + WordID key_id = TD::Convert(oss.str()); + oss.str(""); + if (table->model.find(key_id)!=table->model.end()) { + for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; + } else { + int* el2 = new int[24]; + for (int i=0; i<24; i++) el2[i] = element[i]; + table->model.insert(pair<WordID,int*>(key_id,el2)); + } + + oss << source; + key_id = TD::Convert(oss.str()); + oss.str(""); + if (table->model.find(key_id)!=table->model.end()) { + for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; + } else { + int* el2 = new int[24]; + for (int i=0; i<24; i++) el2[i] = element[i]; + table->model.insert(pair<WordID,int*>(key_id,el2)); + } + + if (pos) { + oss << sourceidx << " " << target; + key_id = TD::Convert(oss.str()); + oss.str(""); + if (table->model.find(key_id)!=table->model.end()) { + for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; + } else { + int* el2 = new int[24]; + for (int i=0; i<24; i++) el2[i] = element[i]; + table->model.insert(pair<WordID,int*>(key_id,el2)); + } + } + delete[] element; + } + return true; +} + +bool Dwarf::readList(const std::string& filename, std::map<WordID,int>* fw) { + ReadFile rf(filename); + istream& in = *rf.stream(); + while (in) { + string word; + getline(in,word); + if (fw->find(TD::Convert(word))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(word),1)); + } + return true; +} + +bool Dwarf::readDominance(CountTable* table, const std::string& filename, std::map<WordID,int>* fw, bool pos) { + // the input format is + // source1 source2 target1 target2 0 1 2 3 + // 0 -> dontcase 1->leftfirst 2->rightfirst 3->neither + if (DEBUG) cerr << "readDominance(" << filename << ",pos="<< pos << ")" << endl; + ReadFile rf(filename); + istream& in = *rf.stream(); + table->ultimate = new int[5]; + table->setup(5,pos); + for (int i=0; i<5; i++) table->ultimate[i]=0; + while (in) { + string line, word; + getline(in,line); + if (line=="") break; + string source1idx, source2idx, target1, target2, source1, source2; + ostringstream oss; + WordID key_id; + istringstream tokenizer(line); + tokenizer >> source1 >> source2 >> target1 >> target2; + if (pos) { + source1idx = source1; + source2idx = source2; + source1 = source1idx.substr(0,source1idx.find_last_of("/")); + source2 = source2idx.substr(0,source2idx.find_last_of("/")); + } + if (fw->find(TD::Convert(source1))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source1),1)); + if (fw->find(TD::Convert(source2))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source2),1)); + + int* element = new int[5]; + element[4]=0; + for (int i=0; i<4; i++) { + element[i] = 0; + if (tokenizer >> word) element[i] = atoi(word.c_str()); + element[4]+=element[i]; + } + for (int i=0; i<5; i++) table->ultimate[i] += element[i]; + + oss << source1 << " " << source2 << " " << target1 << " " << target2; + key_id = TD::Convert(oss.str()); + oss.str(""); + if (table->model.find(key_id)!=table->model.end()) { + for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; + } else { + int* el2 = new int[5]; + for (int i=0; i<5; i++) el2[i]=element[i]; + table->model.insert(pair<WordID,int*>(key_id,el2)); + } + + oss << source1 << " " << source2; + key_id = TD::Convert(oss.str()); + oss.str(""); + if (table->model.find(key_id)!=table->model.end()) { + for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; + } else { + int* el2 = new int[5]; + for (int i=0; i<5; i++) el2[i]=element[i]; + table->model.insert(pair<WordID,int*>(key_id,el2)); + } + + if (pos) { + oss << source1idx << " " << source2idx << " " << target1 << " " << target2; + key_id = TD::Convert(oss.str()); + oss.str(""); + if (table->model.find(key_id)!=table->model.end()) { + for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; + } else { + int* el2 = new int[5]; + for (int i=0; i<5; i++) el2[i]=element[i]; + table->model.insert(pair<WordID,int*>(key_id,el2)); + } + } + delete element; + } + + return true; +} + +bool Dwarf::readTags(const std::string& filename, std::map<WordID,WordID>* tags) { + ReadFile rf(filename); + istream& in = *rf.stream(); + while(in) { + string line, word, tag; + getline(in,line); + if (line=="") break; + istringstream tokenizer(line); + tokenizer >> tag >> word; + tags->insert(pair<WordID,WordID>(TD::Convert(word),TD::Convert(tag))); + } + return true; +} + +bool Dwarf::generalizeOrientation(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) { + map<string,int*> generalized; + for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { + string source, target; + istringstream tokenizer(TD::Convert(it->first)); + tokenizer >> source >> target; + string idx = ""; + if (pos) { + int found = source.find_last_of("/"); + if (found!=string::npos && found>0) { + idx = source.substr(found+1); + source = source.substr(0,found); + } + } + map<WordID,WordID>::const_iterator tags_iter = tags.find(TD::Convert(source)); + if (tags_iter!=tags.end()) { + ostringstream genkey; + genkey << TD::Convert(tags_iter->second); + if (idx!="") genkey << "/" << idx; + if (target!="") genkey << " " << target; + int* model; + if (generalized.find(genkey.str())!=generalized.end()) { + model = generalized[genkey.str()]; + for (int i=0; i<24; i++) model[i] += it->second[i]; + } else { + int* el = new int[24]; + for (int i=0; i<24; i++) el[i] = it->second[i]; + generalized.insert(pair<string,int*>(genkey.str(),el)); + } + } + } + for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { + string source, target; + istringstream tokenizer(TD::Convert(it->first)); + tokenizer >> source >> target; + string idx = ""; + if (pos) { + int found = source.find_last_of("/"); + if (found!=string::npos && found>0) { + idx = source.substr(found+1); + source = source.substr(0,found); + } + } + map<WordID,WordID>::const_iterator tags_iter = tags.find(TD::Convert(source)); + if (tags_iter!=tags.end()) { + ostringstream genkey; + genkey << TD::Convert(tags_iter->second); + if (idx!="") genkey << "/" << idx; + if (target!="") genkey << " " << target; + if (generalized.find(genkey.str())!=generalized.end()) { + delete it->second; + it->second = generalized[genkey.str()]; + } + } + } + +} + + + +bool Dwarf::generalizeDominance(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) { + map<string,int*> generalized; + ostringstream oss; + for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { + string source1, source2, target1, target2; + string idx1 = ""; string idx2 = ""; + istringstream tokenizer(TD::Convert(it->first)); + tokenizer >> source1 >> source2 >> target1 >> target2; + if (DEBUG) cerr << "source1=|" << source1 << "|, source2=|" << source2 << "|, target1=|" << target1 << "|, target2=|" << target2 << "|" << endl; + if (pos) { + int found1 = source1.find_last_of("/"); + int found2 = source2.find_last_of("/"); + if (found1!=string::npos && found2!=string::npos && found1>0 && found2>0) { + idx1 = source1.substr(found1+1); + source1 = source1.substr(0,found1); + idx2 = source2.substr(found2+1); + source2 = source2.substr(0,found2); + } + } + if (DEBUG) + cerr << "[U]source1='" << source1 << "', idx1='"<< idx1 << "', source2='" << source2 << "', idx2='"<< idx2 << "', target1='" << target1 << "', target2='" << target2 << "'" << endl; + map<WordID,WordID>::const_iterator tags_iter1 = tags.find(TD::Convert(source1)); + map<WordID,WordID>::const_iterator tags_iter2 = tags.find(TD::Convert(source2)); + if (tags_iter1!=tags.end()) + source1 = TD::Convert(tags_iter1->second); + oss << source1; + if (idx1!="") oss << "/" << idx1; + if (tags_iter2!=tags.end()) + source2 = TD::Convert(tags_iter2->second); + oss << " " << source2; + if (idx2!="") oss << "/" << idx2; + if (target1!="" && target2!="") oss << " " << target1 << " " << target2; + + if (DEBUG) cerr << "generalized key = '" << oss.str() << "'" << endl; + if (generalized.find(oss.str())!=generalized.end()) { + int* model = generalized[oss.str()]; + for (int i=0; i<5; i++) model[i] += it->second[i]; + } else { + int* model = new int[5]; + for (int i=0; i<5; i++) model[i] = it->second[i]; + generalized.insert(pair<string,int*>(oss.str(),model)); + } + oss.str(""); + } + + if (DEBUG) { + for (map<string,int*>::const_iterator it=generalized.begin(); it!=generalized.end(); it++) { + cerr << "GENERALIZED = " << it->first << ", "; + for (int i=0; i<5; i++) cerr << it->second[i] << " "; + cerr << endl; + } + } + + for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { + string source1, source2, target1, target2; + string idx1 = ""; string idx2 = ""; + istringstream tokenizer(TD::Convert(it->first)); + tokenizer >> source1 >> source2 >> target1 >> target2; + if (pos) { + int found1 = source1.find_last_of("/"); + int found2 = source2.find_last_of("/"); + if (found1!=string::npos && found2!=string::npos && found1>0 && found2>0) { + idx1 = source1.substr(found1+1); + source1 = source1.substr(0,found1); + idx2 = source2.substr(found2+1); + source2 = source2.substr(0,found2); + } + } + map<WordID,WordID>::const_iterator tags_iter1 = tags.find(TD::Convert(source1)); + map<WordID,WordID>::const_iterator tags_iter2 = tags.find(TD::Convert(source2)); + if (tags_iter1!=tags.end()) + source1 = TD::Convert(tags_iter1->second); + oss << source1; + if (idx1!="") oss << "/" << idx1; + if (tags_iter2!=tags.end()) + source2 = TD::Convert(tags_iter2->second); + oss << " " << source2; + if (idx2!="") oss << "/" << idx2; + if (target1!="" && target2!="") oss << " " << target1 << " " << target2; + + if (generalized.find(oss.str())!=generalized.end()) { + if (DEBUG) cerr << " generalizing "<< TD::Convert(it->first) << " into " << oss.str() << endl; + if (DEBUG) { + cerr << " model from "; + for (int i=0; i<5; i++) cerr << it->second[i] << " "; + cerr << endl; + } + delete it->second; + it->second = generalized[oss.str()]; + if (DEBUG) { + cerr << " into "; + for (int i=0; i<5; i++) cerr << it->second[i] << " "; + cerr << endl; + } + } + oss.str(""); + } + +} |