linh added

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@241 ec762483-ff6d-05da-a07a-a48fb63a330f
author: linh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 20:20:55 +0000
committer: linh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-13 20:20:55 +0000
commit: f305e7b0e23b952fb4b7299b2607176ab7409ef9 (patch)
tree: 8d8d10484b7f10d1bc2a5f28b694490773ca8e6e /gi/scfg/abc/old_agrammar.cc
parent: c807e0b514f21a80df0268c686c7ba70fe39611a (diff)
1 files changed, 383 insertions, 0 deletions
diff --git a/gi/scfg/abc/old_agrammar.cc b/gi/scfg/abc/old_agrammar.cc
new file mode 100644
index 00000000..33d70dfc
--- /dev/null
+++ b/gi/scfg/abc/old_agrammar.cc
@@ -0,0 +1,383 @@
+#include "agrammar.h"
+#include "Util.h"
+
+#include <algorithm>
+#include <utility>
+#include <map>
+
+#include "rule_lexer.h"
+#include "filelib.h"
+#include "tdict.h"
+#include <iostream>
+#include <fstream>
+
+map<WordID, vector<WordID> > grSplitNonterminals;
+//const vector<TRulePtr> Grammar::NO_RULES;
+
+
+// vector<TRulePtr> substituteF(TRulePtr & rule, WordID wordID, vector<WordID> & v){
+//   vector<TRulePtr> vRules; //outputs
+
+//   vector<WordID> f = rule->f();
+//   vector<vector<WordID> > newfvector;
+//   for (int i =0; i< f.size(); i++){
+//     if (f[i] == wordID){
+//       newfvector.push_back(v);
+//     }
+//     else
+//       newfvector.push_back(vector<WordID> (1, f[i]));
+//   }
+  
+//   //now creates new rules;
+
+
+//   return vRules;
+// }
+
+
+struct aTextRuleBin : public RuleBin {
+  int GetNumRules() const {
+    return rules_.size();
+  }
+  TRulePtr GetIthRule(int i) const {
+    return rules_[i];
+  }
+  void AddRule(TRulePtr t) {
+    rules_.push_back(t);
+  }
+  int Arity() const {
+    return rules_.front()->Arity();
+  }
+  void Dump() const {
+    for (int i = 0; i < rules_.size(); ++i)
+      cerr << rules_[i]->AsString() << endl;
+  }
+
+
+  vector<TRulePtr> getRules(){ return rules_;}
+
+
+  void substituteF(vector<WordID> & f_path,   map<WordID, vector<WordID> > &   grSplitNonterminals){
+    //this substituteF method is different with substituteF procedure found in cdec code;
+  //
+  //aTextRuleBin has a collection of rules with the same f() on the rhs, 
+  //substituteF() replaces the f_ of all the rules with f_path vector, 
+  //the grSplitNonterminals input to split the lhs_ nonterminals of the rules  incase the lhs_ nonterminal found in grSplitNonterminals
+
+    vector <TRulePtr> newrules;
+    for (vector<TRulePtr>::iterator it = rules_.begin() ; it != rules_.end(); it++){
+      assert(f_path.size() == (*it)->f_.size());
+      
+      if (grSplitNonterminals.find( (*it)->lhs_) == grSplitNonterminals.end()){
+	(*it)->f_ = f_path;
+      }
+      else{ // split the lhs NT, 
+	vector<WordID> new_lhs = grSplitNonterminals[ (*it)->lhs_ ];
+	for (vector<WordID>::iterator vit = new_lhs.begin(); vit != new_lhs.end(); vit++){
+	  TRulePtr newrule;
+	  newrule -> e_ = (*it)->e_;
+	  newrule -> f_ = (*it)->f_;
+	  newrule->lhs_ = *vit;
+	  newrule -> scores_ = (*it)->scores_;
+	  newrule -> arity_ = (*it)->arity_;
+	  newrules.push_back (newrule);
+	}
+	rules_.erase(it);
+      }
+    }
+
+    //now add back newrules(output of splitting lhs_) to rules_
+    rules_.insert(newrules.begin(),newrules.begin(), newrules.end());
+  }
+  
+private:
+  vector<TRulePtr> rules_;
+};
+
+
+
+struct aTextGrammarNode : public GrammarIter {
+  aTextGrammarNode() : rb_(NULL) {}
+
+  aTextGrammarNode(const aTextGrammarNode  & a){
+    nonterminals_ = a.nonterminals_;
+    tree_ = a.tree_;
+    rb_  = new  aTextRuleBin(); //cp constructor: don't cp the set of rules over 
+  }
+
+  ~aTextGrammarNode() {
+    delete rb_;
+  }
+  const GrammarIter* Extend(int symbol) const {
+    map<WordID, aTextGrammarNode>::const_iterator i = tree_.find(symbol);
+    if (i == tree_.end()) return NULL;
+    return &i->second;
+  }
+
+  const RuleBin* GetRules() const {
+    if (rb_) {
+      //rb_->Dump();
+    }
+    return rb_;
+  }
+ 
+  void DFS();
+
+  void visit (); //todo: make this as a function pointer
+
+  vector <WordID > path_; //vector of f_ nonterminals/terminals from the top to the current node;
+  set<WordID> nonterminals_; //Linh added: the set of nonterminals extend the current TextGrammarNode, WordID  is the label in the dict; i.e WordID>0 
+  map<WordID, aTextGrammarNode> tree_;
+  aTextRuleBin* rb_;
+  
+  void print_path(){ //for debug only
+    cout<<"path="<<endl;
+    for (int i =0; i< path_.size(); i++)
+      cout<<path_[i]<<"  ";
+    cout<<endl;
+  }
+};
+
+void aTextGrammarNode::DFS(){ //because the grammar is a tree without circle, DFS does not require to color the nodes
+
+  visit();
+  
+  for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin(); it != tree_.end(); it++){
+    (it->second).DFS();
+  }
+}
+
+
+void aTextGrammarNode::visit( ){  
+
+  cout<<"start visit()"<<endl;
+  
+  cout<<"got grSplitNonterminals"<<endl;
+//   if (grSplitNonterminals.find(*it) != grSplitNonterminals.end()){ //split this *it nonterminal
+//     vector<WordID> vsplits = grSplitNonterminals[*it]; //split *it into vsplits
+
+  //iterate through next terminals/nonterminals in tree_
+  vector<WordID> tobe_removedNTs; //the list of nonterminal children in tree_ were splited hence will be removed from tree_
+
+  for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin() ; it != tree_.end(); it++){
+    cout<<"in visit(): inside for loop: wordID=="<<it->first<<endl;
+
+    map<WordID, vector<WordID> >::const_iterator git = grSplitNonterminals.find(it->first * -1 );
+
+    if (git == grSplitNonterminals.end() || it->first >0){ //the next symbols is not to be split
+      cout<<"not split\n";
+      tree_[it->first ].path_  = path_;
+      tree_[it->first ].path_.push_back(it->first);
+      cout<<"in visit() tree_[it->first ].path_= ";
+      tree_[it->first ].print_path();
+      continue;
+    }
+
+
+    cout<<"tmp2";
+    vector<WordID> vsplits = grSplitNonterminals[it->first * -1];
+    //    vector<WordID> vsplits = git->second;
+    cout<<"tmp3";
+    //    vector<WordID> vsplits = agrammar_ ->splitNonterminals_[it->first * -1];
+    cout <<"got vsplits"<<endl;
+    for (int i =0 ; i<vsplits.size(); i++){
+      //  nonterminals_.insert(vsplits[i]); //add vsplits[i] into nonterminals_ of the current TextGrammarNode
+      tree_[vsplits[i] * -1] = aTextGrammarNode(tree_[it->first]); //cp the subtree to new nonterminal
+      tree_[vsplits[i] * -1].path_  = path_; //update the path if the subtrees
+      tree_[vsplits[i] * -1].path_.push_back(vsplits[i] * -1);
+      tree_[vsplits[i] * -1].print_path();
+    }
+
+    //remove the old node:
+    tobe_removedNTs.push_back(it->first); 
+    
+  }
+
+  for (int i =0; i<tobe_removedNTs.size(); i++)
+    tree_.erase(tobe_removedNTs[i]);
+  
+  if (tree_.size() ==0){ //the last (terminal/nonterminal
+    cout<<"inside visit(): the last terminal/nonterminal"<<endl;
+    rb_->substituteF(path_, grSplitNonterminals);
+    
+  }
+  cout<<"visit() end"<<endl;
+}
+
+struct aTGImpl {
+  aTextGrammarNode root_;
+};
+
+aTextGrammar::aTextGrammar() : max_span_(10), pimpl_(new aTGImpl) {}
+aTextGrammar::aTextGrammar(const std::string&  file) : 
+  max_span_(10),
+  pimpl_(new aTGImpl) {
+  ReadFromFile(file);
+}
+
+
+const GrammarIter* aTextGrammar::GetRoot() const {
+  return &pimpl_->root_;
+}
+
+
+void aTextGrammar::addNonterminal(WordID wordID){ 
+  //addNonterminal add the nonterminal wordID (wordID<0) to the list of nonterminals (map<WordID, int>) nonterminals_ of grammar
+  //if the input parameter wordID<0 then do nothing
+
+  if (wordID <0){ //it is a nonterminal
+
+    map<WordID, int>::iterator it = nonterminals_.find(wordID * -1);
+    if (it == nonterminals_.end()) //if not found in the list of nonterminals(a new nonterminals)
+        nonterminals_[wordID * -1] = 1;
+  }
+}
+
+
+
+void aTextGrammar::AddRule(const TRulePtr& rule) {
+  //add the LHS nonterminal to nonterminals_ map
+
+  this->addNonterminal(rule->lhs_);
+
+  if (rule->IsUnary()) {
+    rhs2unaries_[rule->f().front()].push_back(rule);
+    unaries_.push_back(rule);
+    if (rule->f().front() <0)
+      //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a  nonterminal then multiply by -1)
+      this->addNonterminal(rule->f().front());
+    
+    
+  } else {
+    aTextGrammarNode* cur = &pimpl_->root_;
+    for (int i = 0; i < rule->f_.size(); ++i){
+      if (rule->f_[i] <0){ 
+	cur->nonterminals_.insert(rule->f_[i] * -1); //add the next(extend) nonterminals to the current node's nonterminals_ set
+	this->addNonterminal(rule->f_[i]); //add the rhs nonterminal to the  grammar's list of nonterminals
+      }
+      cur = &cur->tree_[rule->f_[i]];
+
+    }
+    if (cur->rb_ == NULL)
+      cur->rb_ = new aTextRuleBin;
+    cur->rb_->AddRule(rule);
+    
+  }
+}
+
+static void aAddRuleHelper(const TRulePtr& new_rule, void* extra) {
+  static_cast<aTextGrammar*>(extra)->AddRule(new_rule);
+}
+
+
+void aTextGrammar::ReadFromFile(const string& filename) {
+  ReadFile in(filename);
+  RuleLexer::ReadRules(in.stream(), &aAddRuleHelper, this);
+}
+
+bool aTextGrammar::HasRuleForSpan(int i, int j, int distance) const {
+  return (max_span_ >= distance);
+}
+
+
+////Linh added
+
+void aTextGrammar::setMaxSplit(int max_split){max_split_ = max_split;}
+
+
+void aTextGrammar::printAllNonterminals() const{
+  for (map<WordID, int>::const_iterator it =nonterminals_.begin();
+       it != nonterminals_.end(); it++){
+    if (it->second >0){
+      cout <<it->first<<"\t"<<TD::Convert(it->first)<<endl;
+    }
+  }
+  
+}
+ 
+
+void aTextGrammar::splitNonterminal(WordID wordID){
+
+  //first added the splits nonterminal into the TD dictionary 
+  
+  string old_str = TD::Convert(wordID); //get the nonterminal label of wordID, the new nonterminals will be old_str+t where t=1..max_split
+  
+  vector<WordID> v_splits;//split nonterminal wordID into the list of nonterminals in v_splits
+  for (int i =0; i< this->max_split_; i++){
+    string split_str = old_str + "+" + itos(i);
+    WordID splitID = TD::Convert(split_str);
+    v_splits.push_back(splitID);
+    nonterminals_[splitID] = 1;
+  }
+  
+  grSplitNonterminals[wordID] = v_splits;
+  //set wordID to be an inactive nonterminal
+  nonterminals_[wordID] = 0;
+
+  //print split nonterminas of wordID
+  v_splits = grSplitNonterminals[wordID];
+  cout<<"print split nonterminals\n";
+  for (int i =0; i<v_splits.size(); i++)
+    cout<<v_splits[i]<<"\t"<<TD::Convert(v_splits[i])<<endl;
+
+
+  //now update in grammar rules and gramar tree:
+  vector<TRulePtr> newrules;
+  //first unary rules:
+  //iterate through unary rules
+  for (int i =0; i < unaries_.size(); i++){
+    TRulePtr rule = unaries_[i];
+    WordID lhs = rule.lhs_;
+    if (grSplitNonterminals.find(rule->f().front() ) != grSplitNonterminals.end()//if the rhs is in the list of splitting nonterminal
+	&& grSplitNonterminals.find(lhs ) != grSplitNonterminals.end() //and the lhs is in the list of splitting nonterminal too
+	){ 
+      vector<WordID> rhs_nonterminals = grSplitNonterminals[rule->f().front()]; //split the  rhs nonterminal into the list of nonterminals in 'rhs_nonterminals'
+      vector<WordID> lhs_nonterminals = grSplitNonterminals[lhs]; //split the  rhs nonterminal into the list of nonterminals in 'lhs_nonterminals'      
+      for (int k =0; k <rhs_nonterminals.size(); k++)
+	for (int j =0; j <lhs_nonterminals.size(); j++){
+	  TRulePtr newrule;
+	  newrule -> e_ = rule->e_;
+	  newrule -> f_ = rhs_nonterminals[k]->f_;
+	  newrule->lhs_ = lhs_nonterminals[j]->lhs_;
+	  newrule -> scores_ = rule->scores_;
+	  newrule -> arity_ = (*it)->arity_;
+	  newrules.push_back (newrule);
+	  
+	  //update 
+	}
+    }
+    else{//the rhs terminal/nonterminal is not in the list of splitting nonterminal
+      
+
+    }
+  }
+  
+  // for (Cat2Rule::const_iterator it = rhs2unaries_.begin(); it != rhs2unaries_.end(); it++){
+
+  // }  
+  // if (rule->IsUnary()) {
+  //   rhs2unaries_[rule->f().front()].push_back(rule);
+  //   unaries_.push_back(rule);
+  //   if (rule->f().front() <0)
+  //     //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a  nonterminal then multiply by -1)
+  //     this->addNonterminal(rule->f().front());
+    
+
+  pimpl_->root_.DFS();
+  
+}
+
+
+// void aTextGrammar::splitNonterminal0(WordID wordID){
+
+//   TextGrammarNode* cur = &pimpl_->root_;
+//   for (int i = 0; i < rule->f_.size(); ++i)
+//     cur = &cur->tree_[rule->f_[i]];
+ 
+// }
+
+void aTextGrammar::splitAllNonterminals(){
+
+
+}
+
author	linh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-13 20:20:55 +0000
committer	linh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-07-13 20:20:55 +0000
commit	f305e7b0e23b952fb4b7299b2607176ab7409ef9 (patch)
tree	8d8d10484b7f10d1bc2a5f28b694490773ca8e6e /gi/scfg/abc/old_agrammar.cc
parent	c807e0b514f21a80df0268c686c7ba70fe39611a (diff)