#include "agrammar.h" #include "Util.h" #include #include #include #include "rule_lexer.h" #include "filelib.h" #include "tdict.h" #include #include map > grSplitNonterminals; //const vector Grammar::NO_RULES; // vector substituteF(TRulePtr & rule, WordID wordID, vector & v){ // vector vRules; //outputs // vector f = rule->f(); // vector > newfvector; // for (int i =0; i< f.size(); i++){ // if (f[i] == wordID){ // newfvector.push_back(v); // } // else // newfvector.push_back(vector (1, f[i])); // } // //now creates new rules; // return vRules; // } struct aTextRuleBin : public RuleBin { int GetNumRules() const { return rules_.size(); } TRulePtr GetIthRule(int i) const { return rules_[i]; } void AddRule(TRulePtr t) { rules_.push_back(t); } int Arity() const { return rules_.front()->Arity(); } void Dump() const { for (int i = 0; i < rules_.size(); ++i) cerr << rules_[i]->AsString() << endl; } vector getRules(){ return rules_;} void substituteF(vector & f_path, map > & grSplitNonterminals){ //this substituteF method is different with substituteF procedure found in cdec code; // //aTextRuleBin has a collection of rules with the same f() on the rhs, //substituteF() replaces the f_ of all the rules with f_path vector, //the grSplitNonterminals input to split the lhs_ nonterminals of the rules incase the lhs_ nonterminal found in grSplitNonterminals vector newrules; for (vector::iterator it = rules_.begin() ; it != rules_.end(); it++){ assert(f_path.size() == (*it)->f_.size()); if (grSplitNonterminals.find( (*it)->lhs_) == grSplitNonterminals.end()){ (*it)->f_ = f_path; } else{ // split the lhs NT, vector new_lhs = grSplitNonterminals[ (*it)->lhs_ ]; for (vector::iterator vit = new_lhs.begin(); vit != new_lhs.end(); vit++){ TRulePtr newrule; newrule -> e_ = (*it)->e_; newrule -> f_ = (*it)->f_; newrule->lhs_ = *vit; newrule -> scores_ = (*it)->scores_; newrule -> arity_ = (*it)->arity_; newrules.push_back (newrule); } rules_.erase(it); } } //now add back newrules(output of splitting lhs_) to rules_ rules_.insert(newrules.begin(),newrules.begin(), newrules.end()); } private: vector rules_; }; struct aTextGrammarNode : public GrammarIter { aTextGrammarNode() : rb_(NULL) {} aTextGrammarNode(const aTextGrammarNode & a){ nonterminals_ = a.nonterminals_; tree_ = a.tree_; rb_ = new aTextRuleBin(); //cp constructor: don't cp the set of rules over } ~aTextGrammarNode() { delete rb_; } const GrammarIter* Extend(int symbol) const { map::const_iterator i = tree_.find(symbol); if (i == tree_.end()) return NULL; return &i->second; } const RuleBin* GetRules() const { if (rb_) { //rb_->Dump(); } return rb_; } void DFS(); void visit (); //todo: make this as a function pointer vector path_; //vector of f_ nonterminals/terminals from the top to the current node; set nonterminals_; //Linh added: the set of nonterminals extend the current TextGrammarNode, WordID is the label in the dict; i.e WordID>0 map tree_; aTextRuleBin* rb_; void print_path(){ //for debug only cout<<"path="<::iterator it = tree_.begin(); it != tree_.end(); it++){ (it->second).DFS(); } } void aTextGrammarNode::visit( ){ cout<<"start visit()"< vsplits = grSplitNonterminals[*it]; //split *it into vsplits //iterate through next terminals/nonterminals in tree_ vector tobe_removedNTs; //the list of nonterminal children in tree_ were splited hence will be removed from tree_ for (map::iterator it = tree_.begin() ; it != tree_.end(); it++){ cout<<"in visit(): inside for loop: wordID=="<first< >::const_iterator git = grSplitNonterminals.find(it->first * -1 ); if (git == grSplitNonterminals.end() || it->first >0){ //the next symbols is not to be split cout<<"not split\n"; tree_[it->first ].path_ = path_; tree_[it->first ].path_.push_back(it->first); cout<<"in visit() tree_[it->first ].path_= "; tree_[it->first ].print_path(); continue; } cout<<"tmp2"; vector vsplits = grSplitNonterminals[it->first * -1]; // vector vsplits = git->second; cout<<"tmp3"; // vector vsplits = agrammar_ ->splitNonterminals_[it->first * -1]; cout <<"got vsplits"<first]); //cp the subtree to new nonterminal tree_[vsplits[i] * -1].path_ = path_; //update the path if the subtrees tree_[vsplits[i] * -1].path_.push_back(vsplits[i] * -1); tree_[vsplits[i] * -1].print_path(); } //remove the old node: tobe_removedNTs.push_back(it->first); } for (int i =0; isubstituteF(path_, grSplitNonterminals); } cout<<"visit() end"<root_; } void aTextGrammar::addNonterminal(WordID wordID){ //addNonterminal add the nonterminal wordID (wordID<0) to the list of nonterminals (map) nonterminals_ of grammar //if the input parameter wordID<0 then do nothing if (wordID <0){ //it is a nonterminal map::iterator it = nonterminals_.find(wordID * -1); if (it == nonterminals_.end()) //if not found in the list of nonterminals(a new nonterminals) nonterminals_[wordID * -1] = 1; } } void aTextGrammar::AddRule(const TRulePtr& rule) { //add the LHS nonterminal to nonterminals_ map this->addNonterminal(rule->lhs_); if (rule->IsUnary()) { rhs2unaries_[rule->f().front()].push_back(rule); unaries_.push_back(rule); if (rule->f().front() <0) //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1) this->addNonterminal(rule->f().front()); } else { aTextGrammarNode* cur = &pimpl_->root_; for (int i = 0; i < rule->f_.size(); ++i){ if (rule->f_[i] <0){ cur->nonterminals_.insert(rule->f_[i] * -1); //add the next(extend) nonterminals to the current node's nonterminals_ set this->addNonterminal(rule->f_[i]); //add the rhs nonterminal to the grammar's list of nonterminals } cur = &cur->tree_[rule->f_[i]]; } if (cur->rb_ == NULL) cur->rb_ = new aTextRuleBin; cur->rb_->AddRule(rule); } } static void aAddRuleHelper(const TRulePtr& new_rule, void* extra) { static_cast(extra)->AddRule(new_rule); } void aTextGrammar::ReadFromFile(const string& filename) { ReadFile in(filename); RuleLexer::ReadRules(in.stream(), &aAddRuleHelper, this); } bool aTextGrammar::HasRuleForSpan(int i, int j, int distance) const { return (max_span_ >= distance); } ////Linh added void aTextGrammar::setMaxSplit(int max_split){max_split_ = max_split;} void aTextGrammar::printAllNonterminals() const{ for (map::const_iterator it =nonterminals_.begin(); it != nonterminals_.end(); it++){ if (it->second >0){ cout <first<<"\t"<first)< v_splits;//split nonterminal wordID into the list of nonterminals in v_splits for (int i =0; i< this->max_split_; i++){ string split_str = old_str + "+" + itos(i); WordID splitID = TD::Convert(split_str); v_splits.push_back(splitID); nonterminals_[splitID] = 1; } grSplitNonterminals[wordID] = v_splits; //set wordID to be an inactive nonterminal nonterminals_[wordID] = 0; //print split nonterminas of wordID v_splits = grSplitNonterminals[wordID]; cout<<"print split nonterminals\n"; for (int i =0; i newrules; //first unary rules: //iterate through unary rules for (int i =0; i < unaries_.size(); i++){ TRulePtr rule = unaries_[i]; WordID lhs = rule.lhs_; if (grSplitNonterminals.find(rule->f().front() ) != grSplitNonterminals.end()//if the rhs is in the list of splitting nonterminal && grSplitNonterminals.find(lhs ) != grSplitNonterminals.end() //and the lhs is in the list of splitting nonterminal too ){ vector rhs_nonterminals = grSplitNonterminals[rule->f().front()]; //split the rhs nonterminal into the list of nonterminals in 'rhs_nonterminals' vector lhs_nonterminals = grSplitNonterminals[lhs]; //split the rhs nonterminal into the list of nonterminals in 'lhs_nonterminals' for (int k =0; k e_ = rule->e_; newrule -> f_ = rhs_nonterminals[k]->f_; newrule->lhs_ = lhs_nonterminals[j]->lhs_; newrule -> scores_ = rule->scores_; newrule -> arity_ = (*it)->arity_; newrules.push_back (newrule); //update } } else{//the rhs terminal/nonterminal is not in the list of splitting nonterminal } } // for (Cat2Rule::const_iterator it = rhs2unaries_.begin(); it != rhs2unaries_.end(); it++){ // } // if (rule->IsUnary()) { // rhs2unaries_[rule->f().front()].push_back(rule); // unaries_.push_back(rule); // if (rule->f().front() <0) // //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1) // this->addNonterminal(rule->f().front()); pimpl_->root_.DFS(); } // void aTextGrammar::splitNonterminal0(WordID wordID){ // TextGrammarNode* cur = &pimpl_->root_; // for (int i = 0; i < rule->f_.size(); ++i) // cur = &cur->tree_[rule->f_[i]]; // } void aTextGrammar::splitAllNonterminals(){ }