summaryrefslogtreecommitdiff
path: root/gi/scfg/abc/old_agrammar.cc
blob: 33d70dfc3602578f2078d14a98cf6a7c080d20ae (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
#include "agrammar.h"
#include "Util.h"

#include <algorithm>
#include <utility>
#include <map>

#include "rule_lexer.h"
#include "filelib.h"
#include "tdict.h"
#include <iostream>
#include <fstream>

map<WordID, vector<WordID> > grSplitNonterminals;
//const vector<TRulePtr> Grammar::NO_RULES;


// vector<TRulePtr> substituteF(TRulePtr & rule, WordID wordID, vector<WordID> & v){
//   vector<TRulePtr> vRules; //outputs

//   vector<WordID> f = rule->f();
//   vector<vector<WordID> > newfvector;
//   for (int i =0; i< f.size(); i++){
//     if (f[i] == wordID){
//       newfvector.push_back(v);
//     }
//     else
//       newfvector.push_back(vector<WordID> (1, f[i]));
//   }
  
//   //now creates new rules;


//   return vRules;
// }


struct aTextRuleBin : public RuleBin {
  int GetNumRules() const {
    return rules_.size();
  }
  TRulePtr GetIthRule(int i) const {
    return rules_[i];
  }
  void AddRule(TRulePtr t) {
    rules_.push_back(t);
  }
  int Arity() const {
    return rules_.front()->Arity();
  }
  void Dump() const {
    for (int i = 0; i < rules_.size(); ++i)
      cerr << rules_[i]->AsString() << endl;
  }


  vector<TRulePtr> getRules(){ return rules_;}


  void substituteF(vector<WordID> & f_path,   map<WordID, vector<WordID> > &   grSplitNonterminals){
    //this substituteF method is different with substituteF procedure found in cdec code;
  //
  //aTextRuleBin has a collection of rules with the same f() on the rhs, 
  //substituteF() replaces the f_ of all the rules with f_path vector, 
  //the grSplitNonterminals input to split the lhs_ nonterminals of the rules  incase the lhs_ nonterminal found in grSplitNonterminals

    vector <TRulePtr> newrules;
    for (vector<TRulePtr>::iterator it = rules_.begin() ; it != rules_.end(); it++){
      assert(f_path.size() == (*it)->f_.size());
      
      if (grSplitNonterminals.find( (*it)->lhs_) == grSplitNonterminals.end()){
	(*it)->f_ = f_path;
      }
      else{ // split the lhs NT, 
	vector<WordID> new_lhs = grSplitNonterminals[ (*it)->lhs_ ];
	for (vector<WordID>::iterator vit = new_lhs.begin(); vit != new_lhs.end(); vit++){
	  TRulePtr newrule;
	  newrule -> e_ = (*it)->e_;
	  newrule -> f_ = (*it)->f_;
	  newrule->lhs_ = *vit;
	  newrule -> scores_ = (*it)->scores_;
	  newrule -> arity_ = (*it)->arity_;
	  newrules.push_back (newrule);
	}
	rules_.erase(it);
      }
    }

    //now add back newrules(output of splitting lhs_) to rules_
    rules_.insert(newrules.begin(),newrules.begin(), newrules.end());
  }
  
private:
  vector<TRulePtr> rules_;
};



struct aTextGrammarNode : public GrammarIter {
  aTextGrammarNode() : rb_(NULL) {}

  aTextGrammarNode(const aTextGrammarNode  & a){
    nonterminals_ = a.nonterminals_;
    tree_ = a.tree_;
    rb_  = new  aTextRuleBin(); //cp constructor: don't cp the set of rules over 
  }

  ~aTextGrammarNode() {
    delete rb_;
  }
  const GrammarIter* Extend(int symbol) const {
    map<WordID, aTextGrammarNode>::const_iterator i = tree_.find(symbol);
    if (i == tree_.end()) return NULL;
    return &i->second;
  }

  const RuleBin* GetRules() const {
    if (rb_) {
      //rb_->Dump();
    }
    return rb_;
  }
 
  void DFS();

  void visit (); //todo: make this as a function pointer

  vector <WordID > path_; //vector of f_ nonterminals/terminals from the top to the current node;
  set<WordID> nonterminals_; //Linh added: the set of nonterminals extend the current TextGrammarNode, WordID  is the label in the dict; i.e WordID>0 
  map<WordID, aTextGrammarNode> tree_;
  aTextRuleBin* rb_;
  
  void print_path(){ //for debug only
    cout<<"path="<<endl;
    for (int i =0; i< path_.size(); i++)
      cout<<path_[i]<<"  ";
    cout<<endl;
  }
};

void aTextGrammarNode::DFS(){ //because the grammar is a tree without circle, DFS does not require to color the nodes

  visit();
  
  for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin(); it != tree_.end(); it++){
    (it->second).DFS();
  }
}


void aTextGrammarNode::visit( ){  

  cout<<"start visit()"<<endl;
  
  cout<<"got grSplitNonterminals"<<endl;
//   if (grSplitNonterminals.find(*it) != grSplitNonterminals.end()){ //split this *it nonterminal
//     vector<WordID> vsplits = grSplitNonterminals[*it]; //split *it into vsplits

  //iterate through next terminals/nonterminals in tree_
  vector<WordID> tobe_removedNTs; //the list of nonterminal children in tree_ were splited hence will be removed from tree_

  for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin() ; it != tree_.end(); it++){
    cout<<"in visit(): inside for loop: wordID=="<<it->first<<endl;

    map<WordID, vector<WordID> >::const_iterator git = grSplitNonterminals.find(it->first * -1 );

    if (git == grSplitNonterminals.end() || it->first >0){ //the next symbols is not to be split
      cout<<"not split\n";
      tree_[it->first ].path_  = path_;
      tree_[it->first ].path_.push_back(it->first);
      cout<<"in visit() tree_[it->first ].path_= ";
      tree_[it->first ].print_path();
      continue;
    }


    cout<<"tmp2";
    vector<WordID> vsplits = grSplitNonterminals[it->first * -1];
    //    vector<WordID> vsplits = git->second;
    cout<<"tmp3";
    //    vector<WordID> vsplits = agrammar_ ->splitNonterminals_[it->first * -1];
    cout <<"got vsplits"<<endl;
    for (int i =0 ; i<vsplits.size(); i++){
      //  nonterminals_.insert(vsplits[i]); //add vsplits[i] into nonterminals_ of the current TextGrammarNode
      tree_[vsplits[i] * -1] = aTextGrammarNode(tree_[it->first]); //cp the subtree to new nonterminal
      tree_[vsplits[i] * -1].path_  = path_; //update the path if the subtrees
      tree_[vsplits[i] * -1].path_.push_back(vsplits[i] * -1);
      tree_[vsplits[i] * -1].print_path();
    }

    //remove the old node:
    tobe_removedNTs.push_back(it->first); 
    
  }

  for (int i =0; i<tobe_removedNTs.size(); i++)
    tree_.erase(tobe_removedNTs[i]);
  
  if (tree_.size() ==0){ //the last (terminal/nonterminal
    cout<<"inside visit(): the last terminal/nonterminal"<<endl;
    rb_->substituteF(path_, grSplitNonterminals);
    
  }
  cout<<"visit() end"<<endl;
}

struct aTGImpl {
  aTextGrammarNode root_;
};

aTextGrammar::aTextGrammar() : max_span_(10), pimpl_(new aTGImpl) {}
aTextGrammar::aTextGrammar(const std::string&  file) : 
  max_span_(10),
  pimpl_(new aTGImpl) {
  ReadFromFile(file);
}


const GrammarIter* aTextGrammar::GetRoot() const {
  return &pimpl_->root_;
}


void aTextGrammar::addNonterminal(WordID wordID){ 
  //addNonterminal add the nonterminal wordID (wordID<0) to the list of nonterminals (map<WordID, int>) nonterminals_ of grammar
  //if the input parameter wordID<0 then do nothing

  if (wordID <0){ //it is a nonterminal

    map<WordID, int>::iterator it = nonterminals_.find(wordID * -1);
    if (it == nonterminals_.end()) //if not found in the list of nonterminals(a new nonterminals)
        nonterminals_[wordID * -1] = 1;
  }
}



void aTextGrammar::AddRule(const TRulePtr& rule) {
  //add the LHS nonterminal to nonterminals_ map

  this->addNonterminal(rule->lhs_);

  if (rule->IsUnary()) {
    rhs2unaries_[rule->f().front()].push_back(rule);
    unaries_.push_back(rule);
    if (rule->f().front() <0)
      //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a  nonterminal then multiply by -1)
      this->addNonterminal(rule->f().front());
    
    
  } else {
    aTextGrammarNode* cur = &pimpl_->root_;
    for (int i = 0; i < rule->f_.size(); ++i){
      if (rule->f_[i] <0){ 
	cur->nonterminals_.insert(rule->f_[i] * -1); //add the next(extend) nonterminals to the current node's nonterminals_ set
	this->addNonterminal(rule->f_[i]); //add the rhs nonterminal to the  grammar's list of nonterminals
      }
      cur = &cur->tree_[rule->f_[i]];

    }
    if (cur->rb_ == NULL)
      cur->rb_ = new aTextRuleBin;
    cur->rb_->AddRule(rule);
    
  }
}

static void aAddRuleHelper(const TRulePtr& new_rule, void* extra) {
  static_cast<aTextGrammar*>(extra)->AddRule(new_rule);
}


void aTextGrammar::ReadFromFile(const string& filename) {
  ReadFile in(filename);
  RuleLexer::ReadRules(in.stream(), &aAddRuleHelper, this);
}

bool aTextGrammar::HasRuleForSpan(int i, int j, int distance) const {
  return (max_span_ >= distance);
}


////Linh added

void aTextGrammar::setMaxSplit(int max_split){max_split_ = max_split;}


void aTextGrammar::printAllNonterminals() const{
  for (map<WordID, int>::const_iterator it =nonterminals_.begin();
       it != nonterminals_.end(); it++){
    if (it->second >0){
      cout <<it->first<<"\t"<<TD::Convert(it->first)<<endl;
    }
  }
  
}
 

void aTextGrammar::splitNonterminal(WordID wordID){

  //first added the splits nonterminal into the TD dictionary 
  
  string old_str = TD::Convert(wordID); //get the nonterminal label of wordID, the new nonterminals will be old_str+t where t=1..max_split
  
  vector<WordID> v_splits;//split nonterminal wordID into the list of nonterminals in v_splits
  for (int i =0; i< this->max_split_; i++){
    string split_str = old_str + "+" + itos(i);
    WordID splitID = TD::Convert(split_str);
    v_splits.push_back(splitID);
    nonterminals_[splitID] = 1;
  }
  
  grSplitNonterminals[wordID] = v_splits;
  //set wordID to be an inactive nonterminal
  nonterminals_[wordID] = 0;

  //print split nonterminas of wordID
  v_splits = grSplitNonterminals[wordID];
  cout<<"print split nonterminals\n";
  for (int i =0; i<v_splits.size(); i++)
    cout<<v_splits[i]<<"\t"<<TD::Convert(v_splits[i])<<endl;


  //now update in grammar rules and gramar tree:
  vector<TRulePtr> newrules;
  //first unary rules:
  //iterate through unary rules
  for (int i =0; i < unaries_.size(); i++){
    TRulePtr rule = unaries_[i];
    WordID lhs = rule.lhs_;
    if (grSplitNonterminals.find(rule->f().front() ) != grSplitNonterminals.end()//if the rhs is in the list of splitting nonterminal
	&& grSplitNonterminals.find(lhs ) != grSplitNonterminals.end() //and the lhs is in the list of splitting nonterminal too
	){ 
      vector<WordID> rhs_nonterminals = grSplitNonterminals[rule->f().front()]; //split the  rhs nonterminal into the list of nonterminals in 'rhs_nonterminals'
      vector<WordID> lhs_nonterminals = grSplitNonterminals[lhs]; //split the  rhs nonterminal into the list of nonterminals in 'lhs_nonterminals'      
      for (int k =0; k <rhs_nonterminals.size(); k++)
	for (int j =0; j <lhs_nonterminals.size(); j++){
	  TRulePtr newrule;
	  newrule -> e_ = rule->e_;
	  newrule -> f_ = rhs_nonterminals[k]->f_;
	  newrule->lhs_ = lhs_nonterminals[j]->lhs_;
	  newrule -> scores_ = rule->scores_;
	  newrule -> arity_ = (*it)->arity_;
	  newrules.push_back (newrule);
	  
	  //update 
	}
    }
    else{//the rhs terminal/nonterminal is not in the list of splitting nonterminal
      

    }
  }
  
  // for (Cat2Rule::const_iterator it = rhs2unaries_.begin(); it != rhs2unaries_.end(); it++){

  // }  
  // if (rule->IsUnary()) {
  //   rhs2unaries_[rule->f().front()].push_back(rule);
  //   unaries_.push_back(rule);
  //   if (rule->f().front() <0)
  //     //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a  nonterminal then multiply by -1)
  //     this->addNonterminal(rule->f().front());
    

  pimpl_->root_.DFS();
  
}


// void aTextGrammar::splitNonterminal0(WordID wordID){

//   TextGrammarNode* cur = &pimpl_->root_;
//   for (int i = 0; i < rule->f_.size(); ++i)
//     cur = &cur->tree_[rule->f_[i]];
 
// }

void aTextGrammar::splitAllNonterminals(){


}