From 15a587e247dc0954de27e2627f5511126243943d Mon Sep 17 00:00:00 2001 From: "linh.kitty" Date: Fri, 16 Jul 2010 17:44:44 +0000 Subject: add git-svn-id: https://ws10smt.googlecode.com/svn/trunk@286 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/scfg/abc/scfg.cpp | 213 +++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 166 insertions(+), 47 deletions(-) (limited to 'gi/scfg/abc/scfg.cpp') diff --git a/gi/scfg/abc/scfg.cpp b/gi/scfg/abc/scfg.cpp index 4d094488..b3dbad34 100644 --- a/gi/scfg/abc/scfg.cpp +++ b/gi/scfg/abc/scfg.cpp @@ -1,3 +1,8 @@ +#include +#include + +#include +#include #include "lattice.h" #include "tdict.h" #include "agrammar.h" @@ -9,13 +14,53 @@ using namespace std; +vector src_corpus; +vector tgt_corpus; + +bool openParallelCorpora(string & input_filename){ + ifstream input_file; + + input_file.open(input_filename.c_str()); + if (!input_file) { + cerr << "Cannot open input file " << input_filename << ". Exiting..." << endl; + return false; + } + + int line =0; + while (!input_file.eof()) { + // get a line of source language data + // cerr<<"new line "< v = tokenize(str, delimiters); + + if ( (v.size() != 2) and (v.size() != 3) ) { + cerr< reweight; + + reweight.set_value(FD::Convert("MinusLogP"), -1 ); + hg.Reweight(reweight); + return true; + } @@ -71,74 +124,140 @@ int main(int argc, char** argv){ ParamsArray params(argc, argv); params.setDescription("scfg models"); - params.addConstraint("grammar_file", "grammar file ", true); // optional + params.addConstraint("grammar_file", "grammar file (default ./grammar.pr )", true); // optional + + params.addConstraint("input_file", "parallel input file (default ./parallel_corpora)", true); //optional + + params.addConstraint("output_file", "grammar output file (default ./grammar_output)", true); //optional + + params.addConstraint("goal_symbol", "top nonterminal symbol (default: X)", true); //optional + + params.addConstraint("split", "split one nonterminal into 'split' nonterminals (default: 2)", true); //optional - params.addConstraint("input_file", "parallel input file", true); //optional + params.addConstraint("prob_iters", "number of iterations (default: 10)", true); //optional + + params.addConstraint("split_iters", "number of splitting iterations (default: 3)", true); //optional + + params.addConstraint("alpha", "alpha (default: 0.1)", true); //optional if (!params.runConstraints("scfg")) { return 0; } cerr<<"get parametters\n\n\n"; - string input_file = params.asString("input_file", "parallel_corpora"); string grammar_file = params.asString("grammar_file", "./grammar.pr"); + string input_file = params.asString("input_file", "parallel_corpora"); - string src = "el gato ."; - - string tgt = "the cat ."; - - - string goal_sym = "X"; - srand(123); - /*load grammar*/ + string output_file = params.asString("output_file", "grammar_output"); + string goal_sym = params.asString("goal_symbol", "X"); + int max_split = atoi(params.asString("split", "2").c_str()); + + int prob_iters = atoi(params.asString("prob_iters", "2").c_str()); + int split_iters = atoi(params.asString("split_iters", "1").c_str()); + double alpha = atof(params.asString("alpha", ".001").c_str()); + + ///// + cerr<<"grammar_file ="<SetGoalNT(goal_sym); - cout<<"before split nonterminal"<setMaxSplit(max_split); + agrammar->set_alpha(alpha); + srand(123); + + GrammarPtr g( agrammar); Hypergraph hg; - if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){ - cerr<<"target sentence is not parsed by the grammar!\n"; - return 1; - } - hg.PrintGraphviz(); + int data_size = src_corpus.size(); + for (int i =0; i PrintAllRules(output_file+".s" + itos(i+1)); + agrammar->splitAllNonterminals(); + + //vector src_corpus; + //vector tgt_corpus; + + for (int j=0; jResetScore(); + // cerr<<"done reset grammar score\n"; + for (int k=0; k (g)->UpdateHgProsteriorProb(hg); + hg.clear(); + } + boost::static_pointer_cast(g)->UpdateScore(); + } + boost::static_pointer_cast(g)->PrintAllRules(output_file+".e" + itos(i+1)); + } - if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){ - cerr<<"target sentence is not parsed by the grammar!\n"; - return 1; - } - hg.PrintGraphviz(); - //hg.clear(); - if (1==1) return 1; + + + - agrammar->PrintAllRules(); - /*split grammar*/ - cout<<"split NTs\n"; - cerr<<"first of all write all nonterminals"<printAllNonterminals(); - agrammar->setMaxSplit(2); - agrammar->splitNonterminal(4); - cout<<"after split nonterminal"<PrintAllRules(); - Hypergraph hg1; - if (! parseSentencePair(goal_sym, src, tgt, g, hg1) ){ - cerr<<"target sentence is not parsed by the grammar!\n"; - return 1; - } - hg1.PrintGraphviz(); + // // agrammar->ResetScore(); + // // agrammar->UpdateScore(); + // if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){ + // cerr<<"target sentence is not parsed by the grammar!\n"; + // return 1; + + // } + // // hg.PrintGraphviz(); + // //hg.clear(); + + // agrammar->PrintAllRules(); + // /*split grammar*/ + // cout<<"split NTs\n"; + // cerr<<"first of all write all nonterminals"<printAllNonterminals(); + // cout<<"after split nonterminal"<PrintAllRules(); + // Hypergraph hg1; + // if (! parseSentencePair(goal_sym, src, tgt, g, hg1) ){ + // cerr<<"target sentence is not parsed by the grammar!\n"; + // return 1; + + // } + + // hg1.PrintGraphviz(); - agrammar->splitNonterminal(15); - cout<<"after split nonterminal"<PrintAllRules(); + // agrammar->splitNonterminal(15); + // cout<<"after split nonterminal"<PrintAllRules(); /*load training corpus*/ -- cgit v1.2.3