diff options
| author | chris dyer <redpony@umd.edu> | 2010-12-28 20:56:42 -0500 | 
|---|---|---|
| committer | chris dyer <redpony@umd.edu> | 2010-12-28 20:56:42 -0500 | 
| commit | b85986c762bc8a2a74bfe0e2eb1d88fba991d554 (patch) | |
| tree | aae7a3b2127a14b67afda54184d261b56a670ec4 /decoder/ff_dwarf.cc | |
| parent | 6b4b4f19f44e051e5f62bcb1243c3d199d537cc6 (diff) | |
incorporate dwarf features
Diffstat (limited to 'decoder/ff_dwarf.cc')
| -rw-r--r-- | decoder/ff_dwarf.cc | 893 | 
1 files changed, 893 insertions, 0 deletions
| diff --git a/decoder/ff_dwarf.cc b/decoder/ff_dwarf.cc new file mode 100644 index 00000000..3daa85ac --- /dev/null +++ b/decoder/ff_dwarf.cc @@ -0,0 +1,893 @@ +#include <vector> +#include <sstream> +#include <fstream> +#include <string> +#include <iostream> +#include <map> +#include "ff_dwarf.h" +#include "dwarf.h" +#include "wordid.h" +#include "tdict.h" +#include "filelib.h" +#include "sentence_metadata.h" +#include "stringlib.h" + +using namespace std; + +Dwarf::Dwarf(const std::string& param) {  +/* Param is a space separated string which contains any or all of the following: +   oris|orit|doms|domt=filename  +   e.g. oris=/fs/clip-galep3eval/hendra/z2e/oris128.gz +*/ +  sSOS="<s>"; +  sEOS="</s>"; +  kSOS=TD::Convert(sSOS); +  kEOS=TD::Convert(sEOS); +  kGOAL=TD::Convert("S")*-1; +  _sent_id = (int *)malloc(sizeof(int)); +  *_sent_id = -1; +  if (DEBUG) cerr << "here = " << *_sent_id << endl; +  _fwcount = (int *)malloc(sizeof(int)); +  *_fwcount = -1; +  cerr << "initializing dwarf" << endl; +  flag_oris=false; flag_orit=false; flag_doms=false; flag_domt=false; flag_tfw_count=false; +  flag_bdoms=false; flag_porislr=false, flag_porisrl=false, flag_goris=false; flag_pgorislr=false, flag_pgorisrl=false; +  flag_pdomslr=false; flag_pdomsrl=false; flag_pgdomslr=false; flag_pgdomsrl=false; flag_gdoms=false; +  flag_oris_backward=false; flag_orit_backward=false;  +  explicit_soseos=false; +  SetStateSize(STATE_SIZE*sizeof(int));    +  als = new Alignment();   +  als->clearAls(Alignment::MAX_WORDS,Alignment::MAX_WORDS); +  istringstream iss(param); string w; +  while(iss >> w) { +    int equal = w.find_first_of("="); +    if (equal!=string::npos) { +      string model = w.substr(0,equal); +      vector<string> params;  +      Tokenize(w.substr(equal+1),',',¶ms); +      string fn = params[0]; +      if (model == "minfreq") { +        cerr << "model minfreq " << fn << endl; +        als->setFreqCutoff(atoi(fn.c_str())); +      } else if (model == "oris") { +        flag_oris = readOrientation(&toris,fn,&sfw);  +        if (flag_oris) { +          oris_ = FD::Convert("OrientationSource"); +          //oris_bo1_ = FD::Convert("OrientationSource_BO1"); +          //oris_bo2_ = FD::Convert("OrientationSource_BO2"); +        } +        if (params.size()>1) als->setAlphaOris(atof(params[1].c_str())); +        if (params.size()>2) als->setBetaOris(atof(params[2].c_str())); +      } else if (model == "porislr") { +        flag_porislr = readOrientation(&tporislr,fn,&sfw,true); +        poris_nlr = 0; +        if (flag_porislr) { +          porislr_ = FD::Convert("OrientationSourcePositionfulLeftRight"); +        } +        if (params.size()>1) poris_nlr = atoi(params[1].c_str()); +        if (DEBUG) cerr << "  maximum poris depth=" << poris_nlr  << endl; +      } else if (model == "porisrl") { +        flag_porisrl = readOrientation(&tporisrl,fn,&sfw,true); +        poris_nrl = 0; +        if (flag_porisrl) { +          porisrl_ = FD::Convert("OrientationSourcePositionfulRightLeft"); +        } +        if (params.size()>1) poris_nrl = atoi(params[1].c_str()); +        if (DEBUG) cerr << "  maximum poris depth=" << poris_nrl  << endl; +      } else if (model=="goris") { +        flag_goris = readOrientation(&tgoris,fn,&sfw); +        if (flag_goris) { +          goris_ = FD::Convert("OrientationSourceGeneralized"); +        } +        if (params.size()>1) { +          readTags(params[1],&tags); +          generalizeOrientation(&tgoris,tags);           +        } +      } else if (model=="pgorislr") { +        flag_pgorislr = readOrientation(&tpgorislr,fn,&sfw,true); +        pgoris_nlr = 0; +        if (flag_pgorislr) { +          pgorislr_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight"); +        } +        if (DEBUG) { +          cerr << "BEFORE GENERALIZATION" << endl; +          tpgorislr.print(); +        } +        if (params.size()>1) pgoris_nlr = atoi(params[1].c_str()); +        if (params.size()>2) { +          readTags(params[2],&tags); +          generalizeOrientation(&tpgorislr,tags,true); +        } +        if (DEBUG) { +          cerr << "AFTER GENERALIZATION" << endl; +          tpgorislr.print(); +        } +      } else if (model=="pgorisrl") { +        flag_pgorisrl = readOrientation(&tpgorisrl,fn,&sfw,true); +        pgoris_nrl = 0; +        if (flag_pgorisrl) { +          pgorisrl_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight"); +        }  +        if (params.size()>1) pgoris_nrl = atoi(params[1].c_str()); +        if (params.size()>2) { +          readTags(params[2],&tags); +          generalizeOrientation(&tpgorisrl,tags,true); +        } +      } else if (model == "oris_backward") { +        flag_oris_backward = true; +        if (!flag_oris) readOrientation(&toris,fn,&sfw); +        oris_backward_ = FD::Convert("OrientationSourceBackward"); +        if (params.size()>1) als->setAlphaOris(atof(params[1].c_str())); +        if (params.size()>2) als->setBetaOris(atof(params[2].c_str())); +      } else if (model == "orit") { +        flag_orit = readOrientation(&torit,fn,&tfw);  +        if (flag_orit) { +          orit_ = FD::Convert("OrientationTarget"); +          //orit_bo1_ = FD::Convert("OrientationTarget_BO1"); +          //orit_bo2_ = FD::Convert("OrientationTarget_BO2"); +        } +        if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str())); +        if (params.size()>2) als->setBetaOrit(atof(params[2].c_str())); +      } else if (model == "orit_backward") { +        flag_orit_backward = true; +        if (!flag_orit) readOrientation(&torit,fn,&tfw); +        orit_backward_ = FD::Convert("OrientationTargetBackward"); +        if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str())); +        if (params.size()>2) als->setBetaOrit(atof(params[2].c_str())); +      } else if (model == "doms") { +        flag_doms = readDominance(&tdoms,fn,&sfw);  +        if (flag_doms) { +          doms_ = FD::Convert("DominanceSource"); +          //doms_bo1_ = FD::Convert("DominanceSource_BO1"); +          //doms_bo2_ = FD::Convert("DominanceSource_BO2"); +        } +        if (params.size()>1) als->setAlphaDoms(atof(params[1].c_str())); +        if (params.size()>2) als->setBetaDoms(atof(params[2].c_str())); +      } else if (model == "pdomsrl") { +        flag_pdomsrl = readDominance(&tpdomsrl,fn,&sfw,true); +        if (flag_pdomsrl) { +          pdomsrl_ = FD::Convert("DominanceSourcePositionfulRightLeft"); +        } +        if (params.size()>1) pdoms_nrl = atoi(params[1].c_str()); +      } else if (model == "pdomslr") { +        flag_pdomslr = readDominance(&tpdomslr,fn,&sfw,true); +        tpdomslr.print(); +        if (flag_pdomslr) { +          pdomslr_ = FD::Convert("DominanceSourcePositionfulLeftRight"); +        } +        if (params.size()>1) pdoms_nlr = atoi(params[1].c_str()); +      } else if (model == "pgdomsrl") { +        flag_pgdomsrl = readDominance(&tpgdomsrl,fn,&sfw,true); +        if (flag_pgdomsrl) { +          pgdomsrl_ = FD::Convert("DominanceSourceGeneralizedPositionfulRightLeft"); +        } +        if (params.size()>1) pgdoms_nrl = atoi(params[1].c_str()); +        if (params.size()>2) { +          readTags(params[2],&tags); +          generalizeDominance(&tpgdomsrl,tags,true); +        } +      } else if (model == "pgdomslr") { +        flag_pgdomslr = readDominance(&tpgdomslr,fn,&sfw,true); +        if (flag_pgdomslr) { +          pgdomslr_ = FD::Convert("DominanceSourceGeneralizedPositionfulLeftRight"); +        } +        if (params.size()>1) pgdoms_nlr = atoi(params[1].c_str()); +        if (params.size()>2) { +          readTags(params[2],&tags); +          if (DEBUG) { +            for (map<WordID,WordID>::const_iterator it=tags.begin(); it!=tags.end(); it++) { +              cerr << "tags = " << TD::Convert(it->first) << ", " << TD::Convert(it->second) << endl; +            } +          }  +          generalizeDominance(&tpgdomslr,tags,true); +        } +        if (DEBUG) tpgdomslr.print(); +      } else if (model == "bdoms") { +        flag_bdoms = readDominance(&tbdoms,fn,&sfw); +        if (flag_bdoms) { +          bdoms_ = FD::Convert("BorderDominanceSource"); +        } +      } else if (model == "domt") { +        flag_domt = readDominance(&tdomt,fn,&tfw);  +        if (flag_domt) { +          domt_ = FD::Convert("DominanceTarget"); +          //domt_bo1_ = FD::Convert("DominanceTarget_BO1"); +          //domt_bo2_ = FD::Convert("DominanceTarget_BO2"); +        } +        if (params.size()>1) als->setAlphaDomt(atof(params[1].c_str())); +        if (params.size()>2) als->setBetaDomt(atof(params[2].c_str())); +      } else if (model== "tfw_count") { +        flag_tfw_count = readList(fn,&tfw); +        tfw_count_ = FD::Convert("TargetFunctionWordsCount");         +      } else { +        cerr << "DWARF doesn't understand this model: " << model << endl; +      } +    } else { +      if (w=="tfw_count") { +        flag_tfw_count = true; +        tfw_count_ = FD::Convert("TargetFunctionWordsCount"); +      } else if (w=="oris_backward") { +        flag_oris_backward = true; +        oris_backward_ = FD::Convert("OrientationSourceBackward");  +      } else if (w=="orit_backward") { +        flag_orit_backward = true; +        orit_backward_ = FD::Convert("OrientationTargetBackward"); +      } else if (w=="explicit_soseos") { +        explicit_soseos=true; +      } else { +        cerr << "DWARF doesn't need this param: " << param << endl;  +      } +    }   +  } +  for (map<WordID,int>::const_iterator it=sfw.begin(); it!=sfw.end() && DEBUG; it++) { +    cerr << "   FW:" << TD::Convert(it->first) << endl; +  } +} + +void Dwarf::TraversalFeaturesImpl(const SentenceMetadata& smeta, +                                     const Hypergraph::Edge& edge, +                                     const std::vector<const void*>& ant_contexts, +                                     SparseVector<double>* features, +                                     SparseVector<double>* estimated_features, +                                     void* context) const { +  if (DEBUG) cerr << "TraversalFeaturesImpl" << endl; +  double cost, bonus, bo1, bo2, bo1_bonus, bo2_bonus; +  double bdoms_state_mono= 0; double bdoms_state_nonmono = 0; +  TRule r = *edge.rule_; +  if (DEBUG) cerr << " sent_id=" << *_sent_id << ", " << smeta.GetSentenceID() << endl; +  if (DEBUG) cerr << "rule = " << r.AsString() << endl;  +  if (DEBUG) cerr << "rule[i,j] = " << edge.i_ << "," << edge.j_ << endl; +  if (*_sent_id != smeta.GetSentenceID()) { //new sentence +    *_sent_id = smeta.GetSentenceID(); +    const Lattice l = smeta.GetSourceLattice(); +    *_fwcount=0; +    for (int i=0; i<smeta.GetSourceLength(); i++) { +      if (sfw.find(l[i][0].label)!=sfw.end()) { +        *_fwcount+=1; +      } +    } +    if (DEBUG) cerr << "new sentence[" << *_sent_id << "]="<<*_fwcount<<endl; +  } +  bool nofw = als->prepare(*edge.rule_, ant_contexts, sfw, tfw,smeta.GetSourceLattice(),edge.i_,edge.j_);  +  bool isFinal = (edge.i_==0 && edge.j_==smeta.GetSourceLength() && r.GetLHS()==kGOAL); +  // prepare *nofw* outputs whether the resulting alignment, contains function words or not +  // if not, the models do not have to be calcualted and *simplify* is very simple +  if (DEBUG) cerr << "nofw = " << nofw << endl; +  if (flag_tfw_count) { +    double count = 0; +    for (int i=0; i<r.e_.size(); i++) { +      if (tfw.find(r.e_[i])!=tfw.end()) count++; +    } +    features->set_value(tfw_count_,count);   +  } +  if (flag_oris) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) als->computeOrientationSource(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus);  +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(oris_,cost);  +    //features->set_value(oris_bo1_,bo1);  +    //features->set_value(oris_bo2_,bo2); +    estimated_features->set_value(oris_,bonus);  +    //estimated_features->set_value(oris_bo1_,bo1_bonus);  +    //estimated_features->set_value(oris_bo2_,bo2_bonus); +  } +  if (flag_porislr) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw)  +      als->computeOrientationSourcePos(tporislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,poris_nlr,0); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(porislr_,cost); +    estimated_features->set_value(porislr_,bonus); +  } +  if (flag_porisrl) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) +      als->computeOrientationSourcePos(tporisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,poris_nrl); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(porisrl_,cost); +    estimated_features->set_value(porisrl_,bonus); +  } +  if (flag_pgorislr) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) +      als->computeOrientationSourcePos(tpgorislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgoris_nlr,0); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(pgorislr_,cost); +    estimated_features->set_value(pgorislr_,bonus); +  } +  if (flag_pgorisrl) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) +      als->computeOrientationSourcePos(tpgorisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgoris_nrl); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(pgorisrl_,cost); +    estimated_features->set_value(pgorisrl_,bonus); +  } +  if (flag_goris) { +    cost=0; bonus=0; +    if (!nofw) als->computeOrientationSource(tgoris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(goris_,cost); +    estimated_features->set_value(goris_,bonus); +  } +  if (flag_oris_backward) { +    cost=0; bonus=0; +    if (!nofw)  +      als->computeOrientationSourceBackward(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(oris_backward_,cost); +    estimated_features->set_value(oris_backward_,bonus); +  } +  WordID _lfw = kSOS; +  WordID _rfw = kEOS;  +  if (flag_doms || flag_pdomslr || flag_pdomsrl || flag_pgdomslr || flag_pgdomsrl) { +    if (DEBUG) cerr << "   seeking lfw and rfw" << endl; +    int start = edge.i_; +    int end   = edge.j_; +    if (DEBUG) cerr << "   start=" << start << ", end=" << end << endl; +    const Lattice l = smeta.GetSourceLattice(); +    for (int idx=start-1; idx>=0; idx--) { +      if (DEBUG) cerr << "  checking idx=" << idx << ", label=" << l[idx][0].label << "-" << TD::Convert(l[idx][0].label) << endl; +      if (sfw.find(l[idx][0].label) !=sfw.end()) { +        if (DEBUG) cerr << "+"; +        _lfw=l[idx][0].label; break; +      } +    } +    for (int idx=end; idx<l.size(); idx++) { // end or end+1 +      if (DEBUG) cerr << "  checking idx=" << idx << ", label=" << l[idx][0].label << "-" << TD::Convert(l[idx][0].label) << endl; +      if (sfw.find(l[idx][0].label)!=sfw.end()) { +        if (DEBUG) cerr << "."; +        _rfw=l[idx][0].label; break; +      } +    } +    if (isFinal&&!explicit_soseos) { +      _lfw=kSOS; _rfw=kEOS; +    } +  } +  if (flag_doms) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) als->computeDominanceSource(tdoms,_lfw,_rfw,&cost,&bonus, +                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus);  +    if (DEBUG) cerr << "   COST=" << cost << ", BONUS=" << bonus << endl; +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      if (DEBUG) cerr << "    final and !explicit_soseos, thus cost = " << cost <<  endl; +      bonus = 0; +    } +    features->set_value(doms_,cost);  +    estimated_features->set_value(doms_,bonus); +  } +  if (flag_pdomslr) { +   if (DEBUG) cerr << " flag_pdomslr true, nofw=" << nofw << endl; +   if (DEBUG) cerr << "   lfw=" << _lfw << ", rfw=" << _rfw << endl; +   if (DEBUG) cerr << "   kSOS=" << kSOS << ", kEOS=" << kEOS << endl; +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) als->computeDominanceSourcePos(tpdomslr,_lfw,_rfw,&cost,&bonus, +                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pdoms_nlr,0); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(pdomslr_,cost); +    estimated_features->set_value(pdomslr_,bonus);   +  } +  if (flag_pdomsrl) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) als->computeDominanceSourcePos(tpdomsrl,_lfw,_rfw,&cost,&bonus, +                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pdoms_nrl); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(pdomsrl_,cost); +    estimated_features->set_value(pdomsrl_,bonus);  +  } +  if (flag_pgdomslr) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) als->computeDominanceSourcePos(tpgdomslr,_lfw,_rfw,&cost,&bonus, +                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgdoms_nlr,0); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(pgdomslr_,cost); +    estimated_features->set_value(pgdomslr_,bonus);   +  } +  if (flag_pgdomsrl) {    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) als->computeDominanceSourcePos(tpgdomsrl,_lfw,_rfw,&cost,&bonus, +                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgdoms_nrl); +    if (isFinal&&!explicit_soseos) { +      cost += bonus; +      bonus = 0; +    } +    features->set_value(pgdomsrl_,cost); +    estimated_features->set_value(pgdomsrl_,bonus);  +  } + + +  if (flag_bdoms) { +    cost=0; bonus=0; bdoms_state_mono=0; bdoms_state_nonmono=0;  +    if (!nofw) +      als->computeBorderDominanceSource(tbdoms,&cost,&bonus, +        &bdoms_state_mono, &bdoms_state_nonmono,*edge.rule_, ant_contexts, sfw); +    features->set_value(bdoms_,cost); +    estimated_features->set_value(bdoms_,bonus);  +  } +  if (flag_orit) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    if (!nofw) als->computeOrientationTarget(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus);  +    if (DEBUG) cerr << "cost=" << cost << ", bonus=" << bonus << ", bo1=" << bo1 << ", bo1_bonus=" << bo1_bonus << ", bo2=" << bo2 << ", bo2_bonus=" << bo2_bonus << endl; +    features->set_value(orit_,cost);  +    //features->set_value(orit_bo1_,bo1);  +    //features->set_value(orit_bo2_,bo2); +    estimated_features->set_value(orit_,bonus); +    //estimated_features->set_value(orit_bo1_,bo1_bonus); +    //estimated_features->set_value(orit_bo2_,bo2_bonus); +  } +  if (flag_orit_backward) { +    cost=0; bonus=0; +    if (!nofw) als->computeOrientationTargetBackward(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); +    features->set_value(orit_backward_,cost); +    estimated_features->set_value(orit_backward_,bonus); +  } +  if (flag_domt) { +    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; +    WordID _lfw=-1; int start = edge.i_; +    WordID _rfw=-1; int end   = edge.j_; +    if (smeta.HasReference()) { +      const Lattice l = smeta.GetReference(); +      for (int idx=start-1; idx>=0; idx--) { +        if (l.size()>0) +          if (tfw.find(l[idx][0].label) !=tfw.end()) { +            _lfw=l[idx][0].label; break; +          } +      } +      for (int idx=end; idx<l.size(); idx++) { // end or end+1 +        if (l[idx].size()>0) +          if (tfw.find(l[idx][0].label)!=tfw.end()) { +            _rfw=l[idx][0].label; break; +          } +      } +    } +    //neighboringFWs(smeta.GetReference(),edge.i_,edge.j_,tfw,&_lfw,&_rfw); +    if (!nofw) als->computeDominanceTarget(tdomt,_lfw,_rfw,&cost,&bonus, +                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus); +    features->set_value(domt_,cost);  +    //features->set_value(domt_bo1_,bo1);  +    //features->set_value(domt_bo2_,bo2); +    estimated_features->set_value(domt_,bonus); +    //estimated_features->set_value(domt_bo1_,bo1_bonus); +    //estimated_features->set_value(domt_bo2_,bo2_bonus); +  } +  int* vcontext = reinterpret_cast<int *>(context); +  if (!nofw) { +    als->BorderingSFWsOnly(); +    als->BorderingTFWsOnly(); +    als->simplify(vcontext);   +  } else { +    als->simplify_nofw(vcontext); +  } +  vcontext[50] = DoubleToInteger(bdoms_state_mono); +  vcontext[51] = DoubleToInteger(bdoms_state_nonmono); +  vcontext[STATE_SIZE-1] = Alignment::link(edge.i_,edge.j_);  +  if (DEBUG) { +    cerr << "state@traverse = "; +    for (int idx=0; idx<STATE_SIZE; idx++) cerr << idx << "." << vcontext[idx] << " "; +    cerr << endl; +    cerr << "bdoms_state_mono=" << bdoms_state_mono << ", state[50]=" << IntegerToDouble(vcontext[50]) << endl; +    cerr << "bdoms_state_nonmono=" << bdoms_state_nonmono << ", state[51]=" << IntegerToDouble(vcontext[51]) << endl; +  } +} + +int Dwarf::DoubleToInteger(double val) { +  float x = (float)val; +  float* px = &x; +  int* pix = reinterpret_cast<int *>(px); +  return *pix;  +} + +double Dwarf::IntegerToDouble(int val) { +  int *py = &val; +  float* pd = reinterpret_cast<float *>(py); +  return (double)*pd; +} + +void Dwarf::neighboringFWs(const Lattice& l, const int& i, const int& j, const map<WordID,int>& fw_hash, int* lfw, int* rfw) { +  *lfw=0; *rfw=0; +  int idx=i-l[i][0].dist2next; +  while (idx>=0) { +    if (l[idx].size()>0) {  +      if (fw_hash.find(l[idx][0].label)!=fw_hash.end()) { +        *lfw++;   +      } +    } +    idx-=l[idx][0].dist2next; +  } +  idx=j+l[j][0].dist2next; +  while (idx<l.size()) { +    if (l[idx].size()>0) {  +      if (fw_hash.find(l[idx][0].label)!=fw_hash.end()) { +        *rfw++; +      } +    } +    idx+=l[idx][0].dist2next; +  } +} + +bool Dwarf::readOrientation(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos) { +  // the input format is +  // source target 0 1 2 3 4 0 1 2 3 4 +  // 0 -> MA, 1 -> RA, 2 -> MG, 3 -> RG, 4 -> NO_NEIGHBOR +  // first 01234 corresponds to the left neighbor, the second 01234 corresponds to the right neighbor +  // append 2 more at the end as precomputed total +   +  // TONS of hack here. CountTable should be wrapped as a class   +  // TODO: check whether the file exists or not, return false if not +  if (DEBUG) cerr << "  readOrientation(" << filename << ", pos=" << pos << ")" << endl; +  ReadFile rf(filename);   +  istream& in = *rf.stream(); +  table->setup(24,pos); +  table->ultimate = new int[24]; +  for (int i=0; i<24; i++) table->ultimate[i]=0; +  ostringstream oss; +  while (in) { +    string line; +    getline(in,line); +    if (line=="") break; +    istringstream tokenizer(line); +    string sourceidx, source, target, word; +    tokenizer >> source >> target;  +    if (pos) { +      sourceidx = source; +      source = sourceidx.substr(0,sourceidx.find_last_of("/")); +    } +    if (fw->find(TD::Convert(source))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source),1)); + + +    int* element = new int[24]; +    element[5] = 0; +    for (int i=0; i<5; i++) { +      element[i] = 0; +      if (tokenizer >> word) element[i] = atoi(word.c_str()); +      element[5] += element[i]; +    } +    element[11] = 0; +    for (int i=6; i<11; i++) { +      element[i] = 0; +      if (tokenizer >> word) element[i] = atoi(word.c_str()); +      element[11] += element[i]; +    } +    element[17] = 0; +    for (int i=12; i<17; i++) { +      element[i] = 0; +      if (tokenizer >> word) element[i] = atoi(word.c_str()); +      element[17] += element[i]; +    } +    element[23] = 0; +    for (int i=18; i<23; i++) { +      element[i] = 0; +      if (tokenizer >> word) element[i] = atoi(word.c_str()); +      element[23] += element[i]; +    } +    for (int i=0; i<24; i++) table->ultimate[i] += element[i]; +    oss << source << " " << target; +    WordID key_id = TD::Convert(oss.str()); +    oss.str(""); +    if (table->model.find(key_id)!=table->model.end()) {   +      for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; +    } else { +      int* el2 = new int[24]; +      for (int i=0; i<24; i++) el2[i] = element[i]; +      table->model.insert(pair<WordID,int*>(key_id,el2)); +    } +     +    oss << source; +    key_id = TD::Convert(oss.str()); +    oss.str(""); +    if (table->model.find(key_id)!=table->model.end()) {     +      for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; +    } else { +      int* el2 = new int[24]; +      for (int i=0; i<24; i++) el2[i] = element[i]; +      table->model.insert(pair<WordID,int*>(key_id,el2)); +    } + +    if (pos) { +      oss << sourceidx << " " << target; +      key_id = TD::Convert(oss.str()); +      oss.str("");  +      if (table->model.find(key_id)!=table->model.end()) { +        for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; +      } else { +        int* el2 = new int[24]; +        for (int i=0; i<24; i++) el2[i] = element[i]; +        table->model.insert(pair<WordID,int*>(key_id,el2)); +      } +    } +    delete[] element; +  }   +  return true;     +} + +bool Dwarf::readList(const std::string& filename, std::map<WordID,int>* fw) { +  ReadFile rf(filename); +  istream& in = *rf.stream(); +  while (in) { +    string word; +    getline(in,word); +    if (fw->find(TD::Convert(word))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(word),1));  +  } +  return true; +} + +bool Dwarf::readDominance(CountTable* table, const std::string& filename, std::map<WordID,int>* fw, bool pos) { +  // the input format is  +  // source1 source2 target1 target2 0 1 2 3 +  // 0 -> dontcase 1->leftfirst 2->rightfirst 3->neither  +  if (DEBUG) cerr << "readDominance(" << filename << ",pos="<< pos << ")" << endl; +  ReadFile rf(filename); +  istream& in = *rf.stream(); +  table->ultimate = new int[5]; +  table->setup(5,pos); +  for (int i=0; i<5; i++) table->ultimate[i]=0; +  while (in) { +    string line, word; +    getline(in,line); +    if (line=="") break; +    string source1idx, source2idx, target1, target2, source1, source2; +    ostringstream oss;  +    WordID key_id; +    istringstream tokenizer(line); +    tokenizer >> source1 >> source2 >> target1 >> target2;  +    if (pos) { +      source1idx = source1; +      source2idx = source2; +      source1 = source1idx.substr(0,source1idx.find_last_of("/")); +      source2 = source2idx.substr(0,source2idx.find_last_of("/")); +    } +    if (fw->find(TD::Convert(source1))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source1),1)); +    if (fw->find(TD::Convert(source2))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source2),1)); + +    int* element = new int[5]; +    element[4]=0; +    for (int i=0; i<4; i++) { +      element[i]  = 0; +      if (tokenizer >> word) element[i] = atoi(word.c_str()); +      element[4]+=element[i]; +    } +    for (int i=0; i<5; i++) table->ultimate[i] += element[i]; + +    oss << source1 << " " << source2 << " " << target1 << " " << target2; +    key_id = TD::Convert(oss.str()); +    oss.str(""); +    if (table->model.find(key_id)!=table->model.end()) {  +      for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; +    } else { +      int* el2 = new int[5];  +      for (int i=0; i<5; i++) el2[i]=element[i]; +      table->model.insert(pair<WordID,int*>(key_id,el2)); +    } + +    oss << source1 << " " << source2; +    key_id = TD::Convert(oss.str()); +    oss.str(""); +    if (table->model.find(key_id)!=table->model.end()) {   +      for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; +    } else { +      int* el2 = new int[5];  +      for (int i=0; i<5; i++) el2[i]=element[i]; +      table->model.insert(pair<WordID,int*>(key_id,el2)); +    } + +    if (pos) { +      oss << source1idx << " " << source2idx << " " << target1 << " " << target2; +      key_id = TD::Convert(oss.str()); +      oss.str(""); +      if (table->model.find(key_id)!=table->model.end()) {   +        for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; +      } else { +        int* el2 = new int[5];  +        for (int i=0; i<5; i++) el2[i]=element[i]; +        table->model.insert(pair<WordID,int*>(key_id,el2)); +      } +    } +    delete element; +  } + +  return true;     +} + +bool Dwarf::readTags(const std::string& filename, std::map<WordID,WordID>* tags) { +  ReadFile rf(filename); +  istream& in = *rf.stream(); +  while(in) { +    string line, word, tag; +    getline(in,line); +    if (line=="") break; +    istringstream tokenizer(line); +    tokenizer >> tag >> word; +    tags->insert(pair<WordID,WordID>(TD::Convert(word),TD::Convert(tag))); +  } +  return true; +} + +bool Dwarf::generalizeOrientation(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) { +  map<string,int*> generalized; +  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { +    string source, target; +    istringstream tokenizer(TD::Convert(it->first)); +    tokenizer >> source >> target; +    string idx = ""; +    if (pos) { +      int found = source.find_last_of("/"); +      if (found!=string::npos && found>0) {  +        idx = source.substr(found+1); +        source = source.substr(0,found); +      } +    } +    map<WordID,WordID>::const_iterator tags_iter = tags.find(TD::Convert(source)); +    if (tags_iter!=tags.end()) { +      ostringstream genkey; +      genkey << TD::Convert(tags_iter->second); +      if (idx!="") genkey << "/" << idx; +      if (target!="") genkey << " " << target; +      int* model; +      if (generalized.find(genkey.str())!=generalized.end()) { +        model = generalized[genkey.str()]; +        for (int i=0; i<24; i++) model[i] += it->second[i]; +      } else { +        int* el = new int[24]; +        for (int i=0; i<24; i++) el[i] = it->second[i]; +        generalized.insert(pair<string,int*>(genkey.str(),el)); +      } +    } +  } +  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { +    string source, target; +    istringstream tokenizer(TD::Convert(it->first)); +    tokenizer >> source >> target; +    string idx = ""; +    if (pos) { +      int found = source.find_last_of("/"); +      if (found!=string::npos && found>0) { +        idx = source.substr(found+1); +        source = source.substr(0,found); +      } +    } +    map<WordID,WordID>::const_iterator tags_iter = tags.find(TD::Convert(source)); +    if (tags_iter!=tags.end()) { +      ostringstream genkey; +      genkey << TD::Convert(tags_iter->second); +      if (idx!="") genkey << "/" << idx; +      if (target!="") genkey << " " << target; +      if (generalized.find(genkey.str())!=generalized.end()) { +        delete it->second; +        it->second = generalized[genkey.str()]; +      } +    } +  } + +} +  + + +bool Dwarf::generalizeDominance(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) { +  map<string,int*> generalized; +  ostringstream oss; +  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { +    string source1, source2, target1, target2; +    string idx1 = ""; string idx2 = ""; +    istringstream tokenizer(TD::Convert(it->first)); +    tokenizer >> source1 >> source2 >> target1 >> target2; +    if (DEBUG) cerr << "source1=|" << source1 << "|, source2=|" << source2 << "|, target1=|" << target1 << "|, target2=|" << target2 << "|" << endl; +    if (pos) { +      int found1 = source1.find_last_of("/"); +      int found2 = source2.find_last_of("/"); +      if (found1!=string::npos && found2!=string::npos && found1>0 && found2>0) { +        idx1 = source1.substr(found1+1); +        source1 = source1.substr(0,found1); +        idx2 = source2.substr(found2+1); +        source2 = source2.substr(0,found2); +      } +    } +    if (DEBUG)  +      cerr << "[U]source1='" << source1 << "', idx1='"<< idx1 << "', source2='" << source2 << "', idx2='"<< idx2 << "', target1='" << target1 << "', target2='" << target2 << "'" << endl; +    map<WordID,WordID>::const_iterator tags_iter1 = tags.find(TD::Convert(source1)); +    map<WordID,WordID>::const_iterator tags_iter2 = tags.find(TD::Convert(source2)); +    if (tags_iter1!=tags.end())  +      source1 = TD::Convert(tags_iter1->second); +    oss << source1; +    if (idx1!="") oss << "/" << idx1; +    if (tags_iter2!=tags.end()) +      source2 = TD::Convert(tags_iter2->second); +    oss << " " << source2; +    if (idx2!="") oss << "/" << idx2; +    if (target1!="" && target2!="") oss << " " << target1 << " " << target2; +     +    if (DEBUG) cerr << "generalized key = '" << oss.str() << "'" << endl;  +    if (generalized.find(oss.str())!=generalized.end()) { +      int* model = generalized[oss.str()]; +      for (int i=0; i<5; i++) model[i] += it->second[i]; +    } else { +      int* model = new int[5]; +      for (int i=0; i<5; i++) model[i] = it->second[i]; +      generalized.insert(pair<string,int*>(oss.str(),model)); +    }     +    oss.str(""); +  } +   +  if (DEBUG) { +    for (map<string,int*>::const_iterator it=generalized.begin(); it!=generalized.end(); it++) { +      cerr << "GENERALIZED = " << it->first << ", "; +      for (int i=0; i<5; i++) cerr << it->second[i] << " "; +      cerr << endl; +    } +  } + +  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { +    string source1, source2, target1, target2; +    string idx1 = ""; string idx2 = ""; +    istringstream tokenizer(TD::Convert(it->first)); +    tokenizer >> source1 >> source2 >> target1 >> target2; +    if (pos) { +      int found1 = source1.find_last_of("/"); +      int found2 = source2.find_last_of("/"); +      if (found1!=string::npos && found2!=string::npos && found1>0 && found2>0) { +        idx1 = source1.substr(found1+1); +        source1 = source1.substr(0,found1); +        idx2 = source2.substr(found2+1); +        source2 = source2.substr(0,found2); +      } +    } +    map<WordID,WordID>::const_iterator tags_iter1 = tags.find(TD::Convert(source1)); +    map<WordID,WordID>::const_iterator tags_iter2 = tags.find(TD::Convert(source2)); +    if (tags_iter1!=tags.end()) +      source1 = TD::Convert(tags_iter1->second); +    oss << source1; +    if (idx1!="") oss << "/" << idx1; +    if (tags_iter2!=tags.end()) +      source2 = TD::Convert(tags_iter2->second); +    oss << " " << source2; +    if (idx2!="") oss << "/" << idx2; +    if (target1!="" && target2!="") oss << " " << target1 << " " << target2; +     +    if (generalized.find(oss.str())!=generalized.end()) { +      if (DEBUG) cerr << " generalizing "<< TD::Convert(it->first) << " into " << oss.str() << endl;  +      if (DEBUG) { +        cerr << "  model from "; +        for (int i=0; i<5; i++) cerr << it->second[i] << " ";  +        cerr << endl; +      } +      delete it->second; +      it->second = generalized[oss.str()]; +      if (DEBUG) { +        cerr << "  into "; +        for (int i=0; i<5; i++) cerr << it->second[i] << " ";  +        cerr << endl; +      } +    }     +    oss.str(""); +  } + +} | 
