| author | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-05-31 13:57:24 +0200 |
|---|---|---|
| committer | Patrick Simianer <simianer@cl.uni-heidelberg.de> | 2012-05-31 13:57:24 +0200 |
| commit | f1ba05780db1705493d9afb562332498b93d26f1 (patch) | |
| tree | fb429a657ba97f33e8140742de9bc74d9fc88e75 /gi/scfg/abc/scfg.cpp | |
| parent | aadabfdf37dfd451485277cb77fad02f77b361c6 (diff) | |
| parent | 317d650f6cb1e24ac6f3be6f7bf9d4246a59e0e5 (diff) | |
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'gi/scfg/abc/scfg.cpp')
-rw-r--r-- | gi/scfg/abc/scfg.cpp | 277 |
1 files changed, 0 insertions, 277 deletions
diff --git a/gi/scfg/abc/scfg.cpp b/gi/scfg/abc/scfg.cpp
deleted file mode 100644
index 1e59fb4a..00000000
--- a/gi/scfg/abc/scfg.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-#include <iostream>
-#include <fstream>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/pointer_cast.hpp>
-#include "lattice.h"
-#include "tdict.h"
-#include "agrammar.h"
-#include "bottom_up_parser.h"
-#include "hg.h"
-#include "hg_intersect.h"
-#include "../utils/ParamsArray.h"
-
-
-using namespace std;
-
-vector<string> src_corpus;
-vector<string> tgt_corpus;
-
-bool openParallelCorpora(string & input_filename){
-  ifstream input_file;
-
-  input_file.open(input_filename.c_str());
-  if (!input_file) {
-    cerr << "Cannot open input file " << input_filename << ". Exiting..." << endl;
-    return false;
-  }
-
-  int line =0;
-  while (!input_file.eof()) {
-    // get a line of source language data
-    // cerr<<"new line "<<ctr<<endl;
-    string str;
-
-    getline(input_file, str);
-    line++;
-    if (str.length()==0){
-      cerr<<" sentence number "<<line<<" is empty, skip the sentence\n";
-      continue;
-    }
-    string delimiters("|||");
-
-    vector<string> v = tokenize(str, delimiters);
-
-    if ( (v.size() != 2) and (v.size() != 3) ) {
-      cerr<<str<<endl;
-      cerr<<" source or target sentence is not found in sentence number "<<line<<" , skip the sentence\n";
-      continue;
-    }
-
-    src_corpus.push_back(v[0]);
-    tgt_corpus.push_back(v[1]);
-  }
-  return true;
-}
-
-
-typedef aTextGrammar aGrammar;
-aGrammar * load_grammar(string & grammar_filename){
-  cerr<<"start_load_grammar "<<grammar_filename<<endl;
-
-  aGrammar * test = new aGrammar(grammar_filename);
-
-  return test;
-}
-
-Lattice convertSentenceToLattice(const string & str){
-
-  std::vector<WordID> vID;
-  TD::ConvertSentence(str , &vID);
-  Lattice lsentence;
-  lsentence.resize(vID.size());
-
-  for (int i=0; i<vID.size(); i++){
-
-    lsentence[i].push_back( LatticeArc(vID[i], 0.0, 1) );
-  }
-
-  // if(!lsentence.IsSentence())
-  //   cout<<"not a sentence"<<endl;
-
-  return lsentence;
-
-}
-
-bool parseSentencePair(const string & goal_sym, const string & src, const string & tgt, GrammarPtr & g, Hypergraph &hg){
-
-
-  // cout<<" Start parse the sentence pairs\n"<<endl;
-  Lattice lsource = convertSentenceToLattice(src);
-
-  //parse the source sentence by the grammar
-
-  vector<GrammarPtr> grammars(1, g);
-
-  ExhaustiveBottomUpParser parser = ExhaustiveBottomUpParser(goal_sym, grammars);
-
-  if (!parser.Parse(lsource, &hg)){
-
-    cerr<<"source sentence is not parsed by the grammar!"<<endl;
-    return false;
-  }
-
-  //intersect the hg with the target sentence
-  Lattice ltarget = convertSentenceToLattice(tgt);
-
-  //forest.PrintGraphviz();
-  if (!HG::Intersect(ltarget, & hg)) return false;
-
-  SparseVector<double> reweight;
-
-  reweight.set_value(FD::Convert("MinusLogP"), -1 );
-  hg.Reweight(reweight);
-
-  return true;
-
-}
-
-
-
-
-int main(int argc, char** argv){
-
-  ParamsArray params(argc, argv);
-  params.setDescription("scfg models");
-
-  params.addConstraint("grammar_file", "grammar file (default ./grammar.pr )", true); // optional
-
-  params.addConstraint("input_file", "parallel input file (default ./parallel_corpora)", true); //optional
-
-  params.addConstraint("output_file", "grammar output file (default ./grammar_output)", true); //optional
-
-  params.addConstraint("goal_symbol", "top nonterminal symbol (default: X)", true); //optional
-
-  params.addConstraint("split", "split one nonterminal into 'split' nonterminals (default: 2)", true); //optional
-
-  params.addConstraint("prob_iters", "number of iterations (default: 10)", true); //optional
-
-  params.addConstraint("split_iters", "number of splitting iterations (default: 3)", true); //optional
-
-  params.addConstraint("alpha", "alpha (default: 0.1)", true); //optional
-
-  if (!params.runConstraints("scfg")) {
-    return 0;
-  }
-  cerr<<"get parametters\n\n\n";
-
-
-  string grammar_file = params.asString("grammar_file", "./grammar.pr");
-
-  string input_file = params.asString("input_file", "parallel_corpora");
-
-  string output_file = params.asString("output_file", "grammar_output");
-
-  string goal_sym = params.asString("goal_symbol", "X");
-
-  int max_split = atoi(params.asString("split", "2").c_str());
-
-  int prob_iters = atoi(params.asString("prob_iters", "2").c_str());
-  int split_iters = atoi(params.asString("split_iters", "1").c_str());
-  double alpha = atof(params.asString("alpha", ".001").c_str());
-
-  /////
-  cerr<<"grammar_file ="<<grammar_file<<endl;
-  cerr<<"input_file ="<< input_file<<endl;
-  cerr<<"output_file ="<< output_file<<endl;
-  cerr<<"goal_sym ="<< goal_sym<<endl;
-  cerr<<"max_split ="<< max_split<<endl;
-  cerr<<"prob_iters ="<< prob_iters<<endl;
-  cerr<<"split_iters ="<< split_iters<<endl;
-  cerr<<"alpha ="<< alpha<<endl;
-  //////////////////////////
-
-  cerr<<"\n\nLoad parallel corpus...\n";
-  if (! openParallelCorpora(input_file))
-    exit(1);
-
-  cerr<<"Load grammar file ...\n";
-  aGrammar * agrammar = load_grammar(grammar_file);
-  agrammar->SetGoalNT(goal_sym);
-  agrammar->setMaxSplit(max_split);
-  agrammar->set_alpha(alpha);
-
-  srand(123);
-
-  GrammarPtr g( agrammar);
-  Hypergraph hg;
-
-  int data_size = src_corpus.size();
-  int cnt_unparsed =0;
-  for (int i =0; i <split_iters; i++){
-
-    cerr<<"Split Nonterminals, iteration "<<(i+1)<<endl;
-    agrammar->PrintAllRules(output_file+".s" + itos(i+1));
-    agrammar->splitAllNonterminals();
-
-    //vector<string> src_corpus;
-    //vector<string> tgt_corpus;
-
-    for (int j=0; j<prob_iters; j++){
-      cerr<<"reset grammar score\n";
-      agrammar->ResetScore();
-      // cerr<<"done reset grammar score\n";
-      for (int k=0; k <data_size; k++){
-        string src = src_corpus[k];
-
-        string tgt = tgt_corpus[k];
-        cerr <<"parse sentence pair: "<<src<<" ||| "<<tgt<<endl;
-
-        if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){
-          cerr<<"target sentence is not parsed by the grammar!\n";
-          //return 1;
-          cnt_unparsed++;
-          continue;
-
-        }
-
-        cerr<<"update edge posterior prob"<<endl;
-        boost::static_pointer_cast<aGrammar>(g)->UpdateHgProsteriorProb(hg);
-        hg.clear();
-        if (k%1000 ==0 ) cerr<<"sentences "<<k<<endl;
-      }
-      cerr<<"cnt_unparased="<<cnt_unparsed<<endl;
-      boost::static_pointer_cast<aGrammar>(g)->UpdateScore();
-    }
-    boost::static_pointer_cast<aGrammar>(g)->PrintAllRules(output_file+".e" + itos(i+1));
-  }
-
-
-
-
-
-
-
-
-
-  //   //  agrammar->ResetScore();
-  //   //  agrammar->UpdateScore();
-  //   if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){
-  //     cerr<<"target sentence is not parsed by the grammar!\n";
-  //     return 1;
-
-  //   }
-  //   //  hg.PrintGraphviz();
-  //   //hg.clear();
-
-  //   agrammar->PrintAllRules();
-  //   /*split grammar*/
-  //   cout<<"split NTs\n";
-  //   cerr<<"first of all write all nonterminals"<<endl;
-  //   //  agrammar->printAllNonterminals();
-  //   cout<<"after split nonterminal"<<endl;
-  //   agrammar->PrintAllRules();
-  //   Hypergraph hg1;
-  //   if (! parseSentencePair(goal_sym, src, tgt, g, hg1) ){
-  //     cerr<<"target sentence is not parsed by the grammar!\n";
-  //     return 1;
-
-  //   }
-
-  //   hg1.PrintGraphviz();
-
-
-  //   agrammar->splitNonterminal(15);
-  //   cout<<"after split nonterminal"<<TD::Convert(15)<<endl;
-  //   agrammar->PrintAllRules();
-
-
-  /*load training corpus*/
-
-
-  /*for each sentence pair in training corpus*/
-
-  //  forest.PrintGraphviz();
-  /*calculate expected count*/
-
-}