summaryrefslogtreecommitdiff
path: root/gi/scfg/abc/old_agrammar.cc
diff options
context:
space:
mode:
authorlinh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-13 20:20:55 +0000
committerlinh.kitty <linh.kitty@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-13 20:20:55 +0000
commitf305e7b0e23b952fb4b7299b2607176ab7409ef9 (patch)
tree8d8d10484b7f10d1bc2a5f28b694490773ca8e6e /gi/scfg/abc/old_agrammar.cc
parentc807e0b514f21a80df0268c686c7ba70fe39611a (diff)
linh added
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@241 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/scfg/abc/old_agrammar.cc')
-rw-r--r--gi/scfg/abc/old_agrammar.cc383
1 files changed, 383 insertions, 0 deletions
diff --git a/gi/scfg/abc/old_agrammar.cc b/gi/scfg/abc/old_agrammar.cc
new file mode 100644
index 00000000..33d70dfc
--- /dev/null
+++ b/gi/scfg/abc/old_agrammar.cc
@@ -0,0 +1,383 @@
+#include "agrammar.h"
+#include "Util.h"
+
+#include <algorithm>
+#include <utility>
+#include <map>
+
+#include "rule_lexer.h"
+#include "filelib.h"
+#include "tdict.h"
+#include <iostream>
+#include <fstream>
+
+map<WordID, vector<WordID> > grSplitNonterminals;
+//const vector<TRulePtr> Grammar::NO_RULES;
+
+
+// vector<TRulePtr> substituteF(TRulePtr & rule, WordID wordID, vector<WordID> & v){
+// vector<TRulePtr> vRules; //outputs
+
+// vector<WordID> f = rule->f();
+// vector<vector<WordID> > newfvector;
+// for (int i =0; i< f.size(); i++){
+// if (f[i] == wordID){
+// newfvector.push_back(v);
+// }
+// else
+// newfvector.push_back(vector<WordID> (1, f[i]));
+// }
+
+// //now creates new rules;
+
+
+// return vRules;
+// }
+
+
+struct aTextRuleBin : public RuleBin {
+ int GetNumRules() const {
+ return rules_.size();
+ }
+ TRulePtr GetIthRule(int i) const {
+ return rules_[i];
+ }
+ void AddRule(TRulePtr t) {
+ rules_.push_back(t);
+ }
+ int Arity() const {
+ return rules_.front()->Arity();
+ }
+ void Dump() const {
+ for (int i = 0; i < rules_.size(); ++i)
+ cerr << rules_[i]->AsString() << endl;
+ }
+
+
+ vector<TRulePtr> getRules(){ return rules_;}
+
+
+ void substituteF(vector<WordID> & f_path, map<WordID, vector<WordID> > & grSplitNonterminals){
+ //this substituteF method is different with substituteF procedure found in cdec code;
+ //
+ //aTextRuleBin has a collection of rules with the same f() on the rhs,
+ //substituteF() replaces the f_ of all the rules with f_path vector,
+ //the grSplitNonterminals input to split the lhs_ nonterminals of the rules incase the lhs_ nonterminal found in grSplitNonterminals
+
+ vector <TRulePtr> newrules;
+ for (vector<TRulePtr>::iterator it = rules_.begin() ; it != rules_.end(); it++){
+ assert(f_path.size() == (*it)->f_.size());
+
+ if (grSplitNonterminals.find( (*it)->lhs_) == grSplitNonterminals.end()){
+ (*it)->f_ = f_path;
+ }
+ else{ // split the lhs NT,
+ vector<WordID> new_lhs = grSplitNonterminals[ (*it)->lhs_ ];
+ for (vector<WordID>::iterator vit = new_lhs.begin(); vit != new_lhs.end(); vit++){
+ TRulePtr newrule;
+ newrule -> e_ = (*it)->e_;
+ newrule -> f_ = (*it)->f_;
+ newrule->lhs_ = *vit;
+ newrule -> scores_ = (*it)->scores_;
+ newrule -> arity_ = (*it)->arity_;
+ newrules.push_back (newrule);
+ }
+ rules_.erase(it);
+ }
+ }
+
+ //now add back newrules(output of splitting lhs_) to rules_
+ rules_.insert(newrules.begin(),newrules.begin(), newrules.end());
+ }
+
+private:
+ vector<TRulePtr> rules_;
+};
+
+
+
+struct aTextGrammarNode : public GrammarIter {
+ aTextGrammarNode() : rb_(NULL) {}
+
+ aTextGrammarNode(const aTextGrammarNode & a){
+ nonterminals_ = a.nonterminals_;
+ tree_ = a.tree_;
+ rb_ = new aTextRuleBin(); //cp constructor: don't cp the set of rules over
+ }
+
+ ~aTextGrammarNode() {
+ delete rb_;
+ }
+ const GrammarIter* Extend(int symbol) const {
+ map<WordID, aTextGrammarNode>::const_iterator i = tree_.find(symbol);
+ if (i == tree_.end()) return NULL;
+ return &i->second;
+ }
+
+ const RuleBin* GetRules() const {
+ if (rb_) {
+ //rb_->Dump();
+ }
+ return rb_;
+ }
+
+ void DFS();
+
+ void visit (); //todo: make this as a function pointer
+
+ vector <WordID > path_; //vector of f_ nonterminals/terminals from the top to the current node;
+ set<WordID> nonterminals_; //Linh added: the set of nonterminals extend the current TextGrammarNode, WordID is the label in the dict; i.e WordID>0
+ map<WordID, aTextGrammarNode> tree_;
+ aTextRuleBin* rb_;
+
+ void print_path(){ //for debug only
+ cout<<"path="<<endl;
+ for (int i =0; i< path_.size(); i++)
+ cout<<path_[i]<<" ";
+ cout<<endl;
+ }
+};
+
+void aTextGrammarNode::DFS(){ //because the grammar is a tree without circle, DFS does not require to color the nodes
+
+ visit();
+
+ for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin(); it != tree_.end(); it++){
+ (it->second).DFS();
+ }
+}
+
+
+void aTextGrammarNode::visit( ){
+
+ cout<<"start visit()"<<endl;
+
+ cout<<"got grSplitNonterminals"<<endl;
+// if (grSplitNonterminals.find(*it) != grSplitNonterminals.end()){ //split this *it nonterminal
+// vector<WordID> vsplits = grSplitNonterminals[*it]; //split *it into vsplits
+
+ //iterate through next terminals/nonterminals in tree_
+ vector<WordID> tobe_removedNTs; //the list of nonterminal children in tree_ were splited hence will be removed from tree_
+
+ for (map<WordID, aTextGrammarNode>::iterator it = tree_.begin() ; it != tree_.end(); it++){
+ cout<<"in visit(): inside for loop: wordID=="<<it->first<<endl;
+
+ map<WordID, vector<WordID> >::const_iterator git = grSplitNonterminals.find(it->first * -1 );
+
+ if (git == grSplitNonterminals.end() || it->first >0){ //the next symbols is not to be split
+ cout<<"not split\n";
+ tree_[it->first ].path_ = path_;
+ tree_[it->first ].path_.push_back(it->first);
+ cout<<"in visit() tree_[it->first ].path_= ";
+ tree_[it->first ].print_path();
+ continue;
+ }
+
+
+ cout<<"tmp2";
+ vector<WordID> vsplits = grSplitNonterminals[it->first * -1];
+ // vector<WordID> vsplits = git->second;
+ cout<<"tmp3";
+ // vector<WordID> vsplits = agrammar_ ->splitNonterminals_[it->first * -1];
+ cout <<"got vsplits"<<endl;
+ for (int i =0 ; i<vsplits.size(); i++){
+ // nonterminals_.insert(vsplits[i]); //add vsplits[i] into nonterminals_ of the current TextGrammarNode
+ tree_[vsplits[i] * -1] = aTextGrammarNode(tree_[it->first]); //cp the subtree to new nonterminal
+ tree_[vsplits[i] * -1].path_ = path_; //update the path if the subtrees
+ tree_[vsplits[i] * -1].path_.push_back(vsplits[i] * -1);
+ tree_[vsplits[i] * -1].print_path();
+ }
+
+ //remove the old node:
+ tobe_removedNTs.push_back(it->first);
+
+ }
+
+ for (int i =0; i<tobe_removedNTs.size(); i++)
+ tree_.erase(tobe_removedNTs[i]);
+
+ if (tree_.size() ==0){ //the last (terminal/nonterminal
+ cout<<"inside visit(): the last terminal/nonterminal"<<endl;
+ rb_->substituteF(path_, grSplitNonterminals);
+
+ }
+ cout<<"visit() end"<<endl;
+}
+
+struct aTGImpl {
+ aTextGrammarNode root_;
+};
+
+aTextGrammar::aTextGrammar() : max_span_(10), pimpl_(new aTGImpl) {}
+aTextGrammar::aTextGrammar(const std::string& file) :
+ max_span_(10),
+ pimpl_(new aTGImpl) {
+ ReadFromFile(file);
+}
+
+
+const GrammarIter* aTextGrammar::GetRoot() const {
+ return &pimpl_->root_;
+}
+
+
+void aTextGrammar::addNonterminal(WordID wordID){
+ //addNonterminal add the nonterminal wordID (wordID<0) to the list of nonterminals (map<WordID, int>) nonterminals_ of grammar
+ //if the input parameter wordID<0 then do nothing
+
+ if (wordID <0){ //it is a nonterminal
+
+ map<WordID, int>::iterator it = nonterminals_.find(wordID * -1);
+ if (it == nonterminals_.end()) //if not found in the list of nonterminals(a new nonterminals)
+ nonterminals_[wordID * -1] = 1;
+ }
+}
+
+
+
+void aTextGrammar::AddRule(const TRulePtr& rule) {
+ //add the LHS nonterminal to nonterminals_ map
+
+ this->addNonterminal(rule->lhs_);
+
+ if (rule->IsUnary()) {
+ rhs2unaries_[rule->f().front()].push_back(rule);
+ unaries_.push_back(rule);
+ if (rule->f().front() <0)
+ //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1)
+ this->addNonterminal(rule->f().front());
+
+
+ } else {
+ aTextGrammarNode* cur = &pimpl_->root_;
+ for (int i = 0; i < rule->f_.size(); ++i){
+ if (rule->f_[i] <0){
+ cur->nonterminals_.insert(rule->f_[i] * -1); //add the next(extend) nonterminals to the current node's nonterminals_ set
+ this->addNonterminal(rule->f_[i]); //add the rhs nonterminal to the grammar's list of nonterminals
+ }
+ cur = &cur->tree_[rule->f_[i]];
+
+ }
+ if (cur->rb_ == NULL)
+ cur->rb_ = new aTextRuleBin;
+ cur->rb_->AddRule(rule);
+
+ }
+}
+
+static void aAddRuleHelper(const TRulePtr& new_rule, void* extra) {
+ static_cast<aTextGrammar*>(extra)->AddRule(new_rule);
+}
+
+
+void aTextGrammar::ReadFromFile(const string& filename) {
+ ReadFile in(filename);
+ RuleLexer::ReadRules(in.stream(), &aAddRuleHelper, this);
+}
+
+bool aTextGrammar::HasRuleForSpan(int i, int j, int distance) const {
+ return (max_span_ >= distance);
+}
+
+
+////Linh added
+
+void aTextGrammar::setMaxSplit(int max_split){max_split_ = max_split;}
+
+
+void aTextGrammar::printAllNonterminals() const{
+ for (map<WordID, int>::const_iterator it =nonterminals_.begin();
+ it != nonterminals_.end(); it++){
+ if (it->second >0){
+ cout <<it->first<<"\t"<<TD::Convert(it->first)<<endl;
+ }
+ }
+
+}
+
+
+void aTextGrammar::splitNonterminal(WordID wordID){
+
+ //first added the splits nonterminal into the TD dictionary
+
+ string old_str = TD::Convert(wordID); //get the nonterminal label of wordID, the new nonterminals will be old_str+t where t=1..max_split
+
+ vector<WordID> v_splits;//split nonterminal wordID into the list of nonterminals in v_splits
+ for (int i =0; i< this->max_split_; i++){
+ string split_str = old_str + "+" + itos(i);
+ WordID splitID = TD::Convert(split_str);
+ v_splits.push_back(splitID);
+ nonterminals_[splitID] = 1;
+ }
+
+ grSplitNonterminals[wordID] = v_splits;
+ //set wordID to be an inactive nonterminal
+ nonterminals_[wordID] = 0;
+
+ //print split nonterminas of wordID
+ v_splits = grSplitNonterminals[wordID];
+ cout<<"print split nonterminals\n";
+ for (int i =0; i<v_splits.size(); i++)
+ cout<<v_splits[i]<<"\t"<<TD::Convert(v_splits[i])<<endl;
+
+
+ //now update in grammar rules and gramar tree:
+ vector<TRulePtr> newrules;
+ //first unary rules:
+ //iterate through unary rules
+ for (int i =0; i < unaries_.size(); i++){
+ TRulePtr rule = unaries_[i];
+ WordID lhs = rule.lhs_;
+ if (grSplitNonterminals.find(rule->f().front() ) != grSplitNonterminals.end()//if the rhs is in the list of splitting nonterminal
+ && grSplitNonterminals.find(lhs ) != grSplitNonterminals.end() //and the lhs is in the list of splitting nonterminal too
+ ){
+ vector<WordID> rhs_nonterminals = grSplitNonterminals[rule->f().front()]; //split the rhs nonterminal into the list of nonterminals in 'rhs_nonterminals'
+ vector<WordID> lhs_nonterminals = grSplitNonterminals[lhs]; //split the rhs nonterminal into the list of nonterminals in 'lhs_nonterminals'
+ for (int k =0; k <rhs_nonterminals.size(); k++)
+ for (int j =0; j <lhs_nonterminals.size(); j++){
+ TRulePtr newrule;
+ newrule -> e_ = rule->e_;
+ newrule -> f_ = rhs_nonterminals[k]->f_;
+ newrule->lhs_ = lhs_nonterminals[j]->lhs_;
+ newrule -> scores_ = rule->scores_;
+ newrule -> arity_ = (*it)->arity_;
+ newrules.push_back (newrule);
+
+ //update
+ }
+ }
+ else{//the rhs terminal/nonterminal is not in the list of splitting nonterminal
+
+
+ }
+ }
+
+ // for (Cat2Rule::const_iterator it = rhs2unaries_.begin(); it != rhs2unaries_.end(); it++){
+
+ // }
+ // if (rule->IsUnary()) {
+ // rhs2unaries_[rule->f().front()].push_back(rule);
+ // unaries_.push_back(rule);
+ // if (rule->f().front() <0)
+ // //add the RHS nonterminal to the list of nonterminals (the addNonterminal() function will check if it is the rhs symbol is a nonterminal then multiply by -1)
+ // this->addNonterminal(rule->f().front());
+
+
+ pimpl_->root_.DFS();
+
+}
+
+
+// void aTextGrammar::splitNonterminal0(WordID wordID){
+
+// TextGrammarNode* cur = &pimpl_->root_;
+// for (int i = 0; i < rule->f_.size(); ++i)
+// cur = &cur->tree_[rule->f_[i]];
+
+// }
+
+void aTextGrammar::splitAllNonterminals(){
+
+
+}
+