summary | refs | log | tree | commit | diff
path: root/gi/scfg/abc/scfg.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'gi/scfg/abc/scfg.cpp')
-rw-r--r-- gi/scfg/abc/scfg.cpp 277
1 files changed, 0 insertions, 277 deletions
diff --git a/gi/scfg/abc/scfg.cpp b/gi/scfg/abc/scfg.cpp
deleted file mode 100644
index 1e59fb4a..00000000
--- a/gi/scfg/abc/scfg.cpp
+++ /dev/null
@@ -1,277 +0,0 @@
-#include <iostream>
-#include <fstream>
-
-#include <boost/shared_ptr.hpp>
-#include <boost/pointer_cast.hpp>
-#include "lattice.h"
-#include "tdict.h"
-#include "agrammar.h"
-#include "bottom_up_parser.h"
-#include "hg.h"
-#include "hg_intersect.h"
-#include "../utils/ParamsArray.h"
-
-
-using namespace std;
-
-// Source-side sentences of the parallel corpus. openParallelCorpora() appends
-// one entry per accepted input line.
-vector<string> src_corpus;
-// Target-side sentences. openParallelCorpora() pushes entry k together with
-// src_corpus[k], so the two vectors stay index-aligned.
-vector<string> tgt_corpus;
-
-// Reads a parallel corpus from `input_filename` and appends it to the global
-// src_corpus / tgt_corpus vectors.
-//
-// Each non-empty line is split with tokenize() using "|||" as the delimiter
-// argument and must yield 2 or 3 fields. Field 0 is stored as the source
-// sentence and field 1 as the target sentence; a third field, if present, is
-// not used. Empty or malformed lines are reported on stderr and skipped.
-//
-// Returns false if the file cannot be opened, true otherwise.
-bool openParallelCorpora(string & input_filename){
-  ifstream input_file(input_filename.c_str());
-  if (!input_file) {
-    cerr << "Cannot open input file " << input_filename << ". Exiting..." << endl;
-    return false;
-  }
-
-  string delimiters("|||");
-  string str;
-  int line = 0;  // 1-based number of the line currently being processed
-
-  // Loop on the result of getline() rather than on eof(): eof() only becomes
-  // true after a read has already failed, so the old loop processed one
-  // phantom empty "line" after the last real one and reported it as empty.
-  while (getline(input_file, str)) {
-    line++;
-    if (str.empty()) {
-      cerr<<" sentence number "<<line<<" is empty, skip the sentence\n";
-      continue;
-    }
-
-    const vector<string> v = tokenize(str, delimiters);
-
-    if ( (v.size() != 2) && (v.size() != 3) ) {
-      cerr<<str<<endl;
-      cerr<<" source or target sentence is not found in sentence number "<<line<<" , skip the sentence\n";
-      continue;
-    }
-
-    src_corpus.push_back(v[0]);
-    tgt_corpus.push_back(v[1]);
-  }
-  return true;
-}
-
-
-// Grammar implementation used by the rest of this file.
-typedef aTextGrammar aGrammar;
-
-// Logs the grammar path to stderr and constructs an aGrammar from it.
-// The object is allocated with `new` and handed back as a raw pointer; the
-// caller decides who owns it.
-aGrammar * load_grammar(string & grammar_filename){
-  cerr << "start_load_grammar " << grammar_filename << endl;
-  return new aGrammar(grammar_filename);
-}
-
-// Converts a sentence into a Lattice with one position per word.
-//
-// `str` is mapped to word IDs via TD::ConvertSentence; position i of the
-// result receives a single LatticeArc built from (word ID i, 0.0, 1).
-// NOTE(review): 0.0 and 1 are presumably the arc cost and the distance to the
-// next node (i.e. a linear chain) — confirm against lattice.h.
-Lattice convertSentenceToLattice(const string & str){
-  std::vector<WordID> word_ids;
-  TD::ConvertSentence(str, &word_ids);
-
-  Lattice lsentence;
-  lsentence.resize(word_ids.size());
-
-  // Index with the container's own size type; the previous `int` counter
-  // produced a signed/unsigned comparison against size().
-  for (std::vector<WordID>::size_type i = 0; i < word_ids.size(); ++i) {
-    lsentence[i].push_back(LatticeArc(word_ids[i], 0.0, 1));
-  }
-
-  return lsentence;
-}
-
-// Builds the hypergraph for one sentence pair.
-//
-// Steps, in order:
-//   1. parse `src` (converted to a lattice) with grammar `g`, starting from
-//      `goal_sym`, writing the forest into `hg`;
-//   2. intersect `hg` with the lattice built from `tgt`;
-//   3. reweight `hg` with a vector that sets feature "MinusLogP" to -1.
-//
-// Returns true when both the parse and the intersection succeed. On failure
-// `hg` is cleared, so a caller that reuses one Hypergraph across calls never
-// carries a partial forest into the next call. A failed source parse is also
-// reported on stderr; a failed intersection is silent here.
-bool parseSentencePair(const string & goal_sym, const string & src, const string & tgt, GrammarPtr & g, Hypergraph &hg){
-  Lattice lsource = convertSentenceToLattice(src);
-
-  // Parse the source sentence with the (single) grammar.
-  vector<GrammarPtr> grammars(1, g);
-  // Direct-initialise: no temporary parser object, no copy constructor needed.
-  ExhaustiveBottomUpParser parser(goal_sym, grammars);
-
-  if (!parser.Parse(lsource, &hg)) {
-    cerr<<"source sentence is not parsed by the grammar!"<<endl;
-    hg.clear();
-    return false;
-  }
-
-  // Intersect the source forest with the target sentence.
-  Lattice ltarget = convertSentenceToLattice(tgt);
-  if (!HG::Intersect(ltarget, &hg)) {
-    hg.clear();
-    return false;
-  }
-
-  SparseVector<double> reweight;
-  reweight.set_value(FD::Convert("MinusLogP"), -1);
-  hg.Reweight(reweight);
-
-  return true;
-}
-
-
-
-
-// Entry point: iterative nonterminal splitting and score re-estimation for a
-// grammar over a parallel corpus.
-//
-// Reads options through ParamsArray, loads the parallel corpus and the
-// grammar, then for each of `split_iters` rounds:
-//   - calls PrintAllRules with "<output_file>.s<round>",
-//   - splits all nonterminals,
-//   - runs `prob_iters` passes; each pass resets the grammar scores, parses
-//     every sentence pair, feeds each resulting hypergraph to
-//     UpdateHgProsteriorProb, and finally calls UpdateScore,
-//   - calls PrintAllRules with "<output_file>.e<round>".
-//
-// Returns 1 if the corpus cannot be opened, 0 otherwise (including when
-// runConstraints() rejects the command line).
-int main(int argc, char** argv){
-
-  ParamsArray params(argc, argv);
-  params.setDescription("scfg models");
-
-  // Every option is optional (trailing `true`). The defaults quoted in the
-  // help text are kept in sync with the fallbacks passed to asString() below.
-  params.addConstraint("grammar_file", "grammar file (default ./grammar.pr )", true);
-  params.addConstraint("input_file", "parallel input file (default ./parallel_corpora)", true);
-  params.addConstraint("output_file", "grammar output file (default ./grammar_output)", true);
-  params.addConstraint("goal_symbol", "top nonterminal symbol (default: X)", true);
-  params.addConstraint("split", "split one nonterminal into 'split' nonterminals (default: 2)", true);
-  params.addConstraint("prob_iters", "number of iterations (default: 2)", true);
-  params.addConstraint("split_iters", "number of splitting iterations (default: 1)", true);
-  params.addConstraint("alpha", "alpha (default: .001)", true);
-
-  if (!params.runConstraints("scfg")) {
-    return 0;
-  }
-  cerr<<"get parametters\n\n\n";
-
-  string grammar_file = params.asString("grammar_file", "./grammar.pr");
-  string input_file = params.asString("input_file", "parallel_corpora");
-  string output_file = params.asString("output_file", "grammar_output");
-  string goal_sym = params.asString("goal_symbol", "X");
-
-  int max_split = atoi(params.asString("split", "2").c_str());
-  int prob_iters = atoi(params.asString("prob_iters", "2").c_str());
-  int split_iters = atoi(params.asString("split_iters", "1").c_str());
-  double alpha = atof(params.asString("alpha", ".001").c_str());
-
-  // Echo the effective configuration.
-  cerr<<"grammar_file ="<<grammar_file<<endl;
-  cerr<<"input_file ="<< input_file<<endl;
-  cerr<<"output_file ="<< output_file<<endl;
-  cerr<<"goal_sym ="<< goal_sym<<endl;
-  cerr<<"max_split ="<< max_split<<endl;
-  cerr<<"prob_iters ="<< prob_iters<<endl;
-  cerr<<"split_iters ="<< split_iters<<endl;
-  cerr<<"alpha ="<< alpha<<endl;
-
-  cerr<<"\n\nLoad parallel corpus...\n";
-  if (!openParallelCorpora(input_file)) {
-    // Return instead of exit() so the locals above are destroyed normally.
-    return 1;
-  }
-
-  cerr<<"Load grammar file ...\n";
-  aGrammar * agrammar = load_grammar(grammar_file);
-  agrammar->SetGoalNT(goal_sym);
-  agrammar->setMaxSplit(max_split);
-  agrammar->set_alpha(alpha);
-
-  // Fixed seed so that runs are repeatable.
-  srand(123);
-
-  // `g` wraps the grammar object for the parser API; `agrammar` refers to the
-  // same object and is used for the aGrammar-specific calls below.
-  // NOTE(review): GrammarPtr is presumably a shared-ownership smart pointer
-  // that now owns the object — confirm against the grammar headers.
-  GrammarPtr g(agrammar);
-  Hypergraph hg;
-
-  const vector<string>::size_type data_size = src_corpus.size();
-
-  // NOTE(review): this counter is cumulative over all passes and rounds, yet
-  // it is printed after every pass — confirm that is the intended meaning.
-  int cnt_unparsed = 0;
-
-  for (int i = 0; i < split_iters; i++) {
-
-    cerr<<"Split Nonterminals, iteration "<<(i+1)<<endl;
-    agrammar->PrintAllRules(output_file+".s" + itos(i+1));
-    agrammar->splitAllNonterminals();
-
-    for (int j = 0; j < prob_iters; j++) {
-      cerr<<"reset grammar score\n";
-      agrammar->ResetScore();
-
-      for (vector<string>::size_type k = 0; k < data_size; k++) {
-        // Bind by reference: the corpus is not modified while iterating.
-        const string & src = src_corpus[k];
-        const string & tgt = tgt_corpus[k];
-        cerr <<"parse sentence pair: "<<src<<" ||| "<<tgt<<endl;
-
-        if (!parseSentencePair(goal_sym, src, tgt, g, hg)) {
-          // The failure may come from the source parse or from the target
-          // intersection, so the message does not single out one side.
-          cerr<<"sentence pair is not parsed by the grammar!\n";
-          cnt_unparsed++;
-          // Drop whatever the failed attempt left behind before `hg` is
-          // reused for the next pair.
-          hg.clear();
-          continue;
-        }
-
-        cerr<<"update edge posterior prob"<<endl;
-        agrammar->UpdateHgProsteriorProb(hg);
-        hg.clear();
-        if (k%1000 ==0 ) cerr<<"sentences "<<k<<endl;
-      }
-      cerr<<"cnt_unparased="<<cnt_unparsed<<endl;
-      agrammar->UpdateScore();
-    }
-    agrammar->PrintAllRules(output_file+".e" + itos(i+1));
-  }
-
-  return 0;
-}