diff options
author | linh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-13 20:20:55 +0000 |
---|---|---|
committer | linh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-13 20:20:55 +0000 |
commit | 298eed4f842bcc01b392a40c4a316110a6347f6c (patch) | |
tree | 3b2f4df81bafc58138c7b8f278eae0fc2241eb42 /gi/scfg/abc/old_agrammar.cc | |
parent | 19dc80f89a047b09010187f2306c85a16b25f575 (diff) |
linh added
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@241 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/scfg/abc/old_agrammar.cc')
-rw-r--r-- | gi/scfg/abc/old_agrammar.cc | 383 |
1 files changed, 383 insertions, 0 deletions
diff --git a/gi/scfg/abc/old_agrammar.cc b/gi/scfg/abc/old_agrammar.cc new file mode 100644 index 00000000..33d70dfc --- /dev/null +++ b/gi/scfg/abc/old_agrammar.cc @@ -0,0 +1,383 @@ +#include "agrammar.h" +#include "Util.h" + +#include <algorithm> +#include <utility> +#include <map> + +#include "rule_lexer.h" +#include "filelib.h" +#include "tdict.h" +#include <iostream> +#include <fstream> + +map<WordID, vector<WordID> > grSplitNonterminals; +//const vector<TRulePtr> Grammar::NO_RULES; + + +// vector<TRulePtr> substituteF(TRulePtr & rule, WordID wordID, vector<WordID> & v){ +// vector<TRulePtr> vRules; //outputs + +// vector<WordID> f = rule->f(); +// vector<vector<WordID> > newfvector; +// for (int i =0; i< f.size(); i++){ +// if (f[i] == wordID){ +// newfvector.push_back(v); +// } +// else +// newfvector.push_back(vector<WordID> (1, f[i])); +// } + +// //now creates new rules; + + +// return vRules; +// } + + +struct aTextRuleBin : public RuleBin { + int GetNumRules() const { + return rules_.size(); + } + TRulePtr GetIthRule(int i) const { + return rules_[i]; + } + void AddRule(TRulePtr t) { + rules_.push_back(t); + } + int Arity() const { + return rules_.front()->Arity(); + } + void Dump() const { + for (int i = 0; i < rules_.size(); ++i) + cerr << rules_[i]->AsString() << endl; + } + + + vector<TRulePtr> getRules(){ return rules_;} + + + void substituteF(vector<WordID> & f_path, map<WordID, vector<WordID> > & grSplitNonterminals){ + //this substituteF method is different with substituteF procedure found in cdec code; + // + //aTextRuleBin has a collection of rules with the same f() on the rhs, + //substituteF() replaces the f_ of all the rules with f_path vector, + //the grSplitNonterminals input to split the lhs_ nonterminals of the rules incase the lhs_ nonterminal found in grSplitNonterminals + + vector <TRulePtr> newrules; + for (vector<TRulePtr>::iterator it = rules_.begin() ; it != rules_.end(); it++){ + assert(f_path.size() == (*it)->f_.size()); + + if (grSplitNonterminals.find( (*it)->lhs_) == grSplitNonterminals.end()){ + (*it)->f_ = f_path; + } + else{ // split the lhs NT, + vector<WordID> new_lhs = grSplitNonterminals[ (*it)->lhs_ ]; + for (vector<WordID>::iterator vit = new_lhs.begin(); vit != new_lhs.end(); vit++){ + TRulePtr newrule; + newrule -> e_ = (*it)->e_; + newrule -> f_ = (*it)->f_; + newrule->lhs_ = *vit; + newrule -> scores_ = (*it)->scores_; + newrule -> arity_ = (*it)->arity_; + newrules.push_back (newrule); + } + rules_.erase(it); + } + } + + //now add back newrules(output of splitting lhs_) to rules_ + rules_.insert(newrules.begin(),newrules.begin(), newrules.end()); + } + +private: + vector<TRulePtr> rules_; +}; + + + +struct aTextGrammarNode : public GrammarIter { + aTextGrammarNode() : rb_(NULL) {} + + aTextGrammarNode(const aTextGrammarNode & a){ + nonterminals_ = a.nonterminals_; + tree_ = a.tree_; + rb_ = new aTextRuleBin(); //cp constructor: don't cp the set of rules over + } + + ~aTextGrammarNode() { + delete rb_; + } + const GrammarIter* Extend(int symbol) const { + map<WordID, aTextGrammarNode>::const_iterator i = tree_.find(symbol); + if (i == tree_.end()) return NULL; + return &i->second; + } + + const RuleBin* GetRules() const { + if (rb_) { + //rb_->Dump(); + } + return rb_; + } + + void DFS(); + + void visit (); //todo: make this as a function pointer + + vector <WordID > path_; //vector of f_ nonterminals/terminals from the top to the current node; + set<WordID> nonterminals_; //Linh added: the set of nonterminals extend the current TextGrammarNode, WordID is the label in the dict; i.e WordID>0 + map<WordID, aTextGrammarNode> tree_; + aTextRuleBin* rb_; + + void print_path(){ //for debug only + cout<<"path="<<endl; + for (int i =0; i< path_.size(); i++) + cout<<path_[i]<<" "; + cout<<endl; + } +}; + +void aTextGrammarNode::DFS(){ //because the grammar is a tree without circle, DFS does not require to color the nodes + + visit(); + + for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin(); it != tree_.end(); it++){ + (it->second).DFS(); + } +} + + +void aTextGrammarNode::visit( ){ + + cout<<"start visit()"<<endl; + + cout<<"got grSplitNonterminals"<<endl; +// if (grSplitNonterminals.find(*it) != grSplitNonterminals.end()){ //split this *it nonterminal +// vector<WordID> vsplits = grSplitNonterminals[*it]; //split *it into vsplits + + //iterate through next terminals/nonterminals in tree_ + vector<WordID> tobe_removedNTs; //the list of nonterminal children in tree_ were splited hence will be removed from tree_ + + for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin() ; it != tree_.end(); it++){ + cout<<"in visit(): inside for loop: wordID=="<<it->first<<endl; + + map<WordID, vector<WordID> >::const_iterator git = grSplitNonterminals.find(it->first * -1 ); + + if (git == grSplitNonterminals.end() || it->first >0){ //the next symbols is not to be split + cout<<"not split\n"; + tree_[it->first ].path_ = path_; + tree_[it->first ].path_.push_back(it->first); + cout<<"in visit() tree_[it->first ].path_= "; + tree_[it->first ].print_path(); + continue; + } + + + cout<<"tmp2"; + vector<WordID> vsplits = grSplitNonterminals[it->first * -1]; + // vector<WordID> vsplits = git->second; + cout<<"tmp3"; + // vector<WordID> vsplits = agrammar_ ->splitNonterminals_[it->first * -1]; + cout <<"got vsplits"<<endl; + for (int i =0 ; i<vsplits.size(); i++){ + // nonterminals_.insert(vsplits[i]); //add vsplits[i] into nonterminals_ of the current TextGrammarNode + tree_[vsplits[i] * -1] = aTextGrammarNode(tree_[it->first]); //cp the subtree to new nonterminal + tree_[vsplits[i] * -1].path_ = path_; //update the path if the subtrees + tree_[vsplits[i] * -1].path_.push_back(vsplits[i] * -1); + tree_[vsplits[i] * -1].print_path(); + } + + //remove the old node: + tobe_removedNTs.push_back(it->first); + + } + + for (int i =0; i<tobe_removedNTs.size(); i++) + tree_.erase(tobe_removedNTs[i]); + + if (tree_.size() ==0){ //the last (terminal/nonterminal + cout<<"inside visit(): the last terminal/nonterminal"<<endl; + rb_->substituteF(path_, grSplitNonterminals); + + } + cout<<"visit() end"<<endl; +} + +struct aTGImpl { + aTextGrammarNode root_; +}; + +aTextGrammar::aTextGrammar() : max_span_(10), pimpl_(new aTGImpl) {} +aTextGrammar::aTextGrammar(const std::string& file) : + max_span_(10), + pimpl_(new aTGImpl) { + ReadFromFile(file); +} + + +const GrammarIter* aTextGrammar::GetRoot() const { + return &pimpl_->root_; +} + + +void aTextGrammar::addNonterminal(WordID wordID){ + //addNonterminal add the nonterminal wordID (wordID<0) to the list of nonterminals (map<WordID, int>) nonterminals_ of grammar + //if the input parameter wordID<0 then do nothing + + if (wordID <0){ //it is a nonterminal + + map<WordID, int>::iterator it = nonterminals_.find(wordID * -1); + if (it == nonterminals_.end()) //if not found in the list of nonterminals(a new nonterminals) + nonterminals_[wordID * -1] = 1; + } +} + + + +void aTextGrammar::AddRule(const TRulePtr& rule) { + //add the LHS nonterminal to nonterminals_ map + + this->addNonterminal(rule->lhs_); + + if (rule->IsUnary()) { + rhs2unaries_[rule->f().front()].push_back(rule); + unaries_.push_back(rule); + if (rule->f().front() <0) + //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1) + this->addNonterminal(rule->f().front()); + + + } else { + aTextGrammarNode* cur = &pimpl_->root_; + for (int i = 0; i < rule->f_.size(); ++i){ + if (rule->f_[i] <0){ + cur->nonterminals_.insert(rule->f_[i] * -1); //add the next(extend) nonterminals to the current node's nonterminals_ set + this->addNonterminal(rule->f_[i]); //add the rhs nonterminal to the grammar's list of nonterminals + } + cur = &cur->tree_[rule->f_[i]]; + + } + if (cur->rb_ == NULL) + cur->rb_ = new aTextRuleBin; + cur->rb_->AddRule(rule); + + } +} + +static void aAddRuleHelper(const TRulePtr& new_rule, void* extra) { + static_cast<aTextGrammar*>(extra)->AddRule(new_rule); +} + + +void aTextGrammar::ReadFromFile(const string& filename) { + ReadFile in(filename); + RuleLexer::ReadRules(in.stream(), &aAddRuleHelper, this); +} + +bool aTextGrammar::HasRuleForSpan(int i, int j, int distance) const { + return (max_span_ >= distance); +} + + +////Linh added + +void aTextGrammar::setMaxSplit(int max_split){max_split_ = max_split;} + + +void aTextGrammar::printAllNonterminals() const{ + for (map<WordID, int>::const_iterator it =nonterminals_.begin(); + it != nonterminals_.end(); it++){ + if (it->second >0){ + cout <<it->first<<"\t"<<TD::Convert(it->first)<<endl; + } + } + +} + + +void aTextGrammar::splitNonterminal(WordID wordID){ + + //first added the splits nonterminal into the TD dictionary + + string old_str = TD::Convert(wordID); //get the nonterminal label of wordID, the new nonterminals will be old_str+t where t=1..max_split + + vector<WordID> v_splits;//split nonterminal wordID into the list of nonterminals in v_splits + for (int i =0; i< this->max_split_; i++){ + string split_str = old_str + "+" + itos(i); + WordID splitID = TD::Convert(split_str); + v_splits.push_back(splitID); + nonterminals_[splitID] = 1; + } + + grSplitNonterminals[wordID] = v_splits; + //set wordID to be an inactive nonterminal + nonterminals_[wordID] = 0; + + //print split nonterminas of wordID + v_splits = grSplitNonterminals[wordID]; + cout<<"print split nonterminals\n"; + for (int i =0; i<v_splits.size(); i++) + cout<<v_splits[i]<<"\t"<<TD::Convert(v_splits[i])<<endl; + + + //now update in grammar rules and gramar tree: + vector<TRulePtr> newrules; + //first unary rules: + //iterate through unary rules + for (int i =0; i < unaries_.size(); i++){ + TRulePtr rule = unaries_[i]; + WordID lhs = rule.lhs_; + if (grSplitNonterminals.find(rule->f().front() ) != grSplitNonterminals.end()//if the rhs is in the list of splitting nonterminal + && grSplitNonterminals.find(lhs ) != grSplitNonterminals.end() //and the lhs is in the list of splitting nonterminal too + ){ + vector<WordID> rhs_nonterminals = grSplitNonterminals[rule->f().front()]; //split the rhs nonterminal into the list of nonterminals in 'rhs_nonterminals' + vector<WordID> lhs_nonterminals = grSplitNonterminals[lhs]; //split the rhs nonterminal into the list of nonterminals in 'lhs_nonterminals' + for (int k =0; k <rhs_nonterminals.size(); k++) + for (int j =0; j <lhs_nonterminals.size(); j++){ + TRulePtr newrule; + newrule -> e_ = rule->e_; + newrule -> f_ = rhs_nonterminals[k]->f_; + newrule->lhs_ = lhs_nonterminals[j]->lhs_; + newrule -> scores_ = rule->scores_; + newrule -> arity_ = (*it)->arity_; + newrules.push_back (newrule); + + //update + } + } + else{//the rhs terminal/nonterminal is not in the list of splitting nonterminal + + + } + } + + // for (Cat2Rule::const_iterator it = rhs2unaries_.begin(); it != rhs2unaries_.end(); it++){ + + // } + // if (rule->IsUnary()) { + // rhs2unaries_[rule->f().front()].push_back(rule); + // unaries_.push_back(rule); + // if (rule->f().front() <0) + // //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1) + // this->addNonterminal(rule->f().front()); + + + pimpl_->root_.DFS(); + +} + + +// void aTextGrammar::splitNonterminal0(WordID wordID){ + +// TextGrammarNode* cur = &pimpl_->root_; +// for (int i = 0; i < rule->f_.size(); ++i) +// cur = &cur->tree_[rule->f_[i]]; + +// } + +void aTextGrammar::splitAllNonterminals(){ + + +} + |