// Driver for splitting nonterminals of an SCFG grammar and re-estimating rule
// scores over a parallel corpus (ParamsArray description: "scfg models").
//
// NOTE(review): this copy of the file looks damaged. The #include directives
// have no targets, the std::vector declarations have no template arguments,
// several stream-insertion chains and loop headers stop at a '<', and all line
// breaks are gone, so the first "//" comment swallows the rest of each
// physical line. It presumably lost every "<...>" span in an extraction step;
// restore it from version control before changing any logic. The notes below
// describe only what is still visible.
//
// Globals:
//   src_corpus, tgt_corpus - vectors (element type lost); src_corpus.size()
//     is used in main as the number of training items.
//
// openParallelCorpora(input_filename):
//   Opens the file with an ifstream. If the open fails it prints
//   "Cannot open input file <name>. Exiting..." to cerr and returns false.
//   Otherwise it loops until EOF; each line is tokenized and a line is
//   rejected unless it yields 2 or 3 fields. The remainder of the body is
//   missing.
//
// Parse helper (header missing; presumably parseSentencePair, the name used in
// the commented-out calls at the end of main - TODO confirm):
//   Converts a sentence to word ids with TD::ConvertSentence and builds a
//   Lattice sized to the id count. It then runs an ExhaustiveBottomUpParser
//   constructed from goal_sym and a one-element grammar list. When Parse()
//   fails it reports "source sentence is not parsed by the grammar!". On the
//   success path it reweights the hypergraph with feature "MinusLogP" set to
//   -1 and returns true.
//
// main(argc, argv):
//   Declares eight command-line options and returns 0 if
//   params.runConstraints("scfg") fails. It then reads the options, configures
//   the grammar object (SetGoalNT, setMaxSplit, set_alpha), seeds rand() with
//   the fixed value 123, and iterates over the corpus. Visible calls in the
//   loops are PrintAllRules(output_file + ".s" + itos(i+1)),
//   splitAllNonterminals(), ResetScore() and UpdateHgProsteriorProb(hg),
//   followed by hg.clear(). A progress message is printed when k%1000 == 0.
//   The trailing text is commented-out experimental code.
//
// NOTE(review): the help strings passed to addConstraint do not match the
// fallback values passed to asString:
//   prob_iters  - help says 10,  fallback is "2"
//   split_iters - help says 3,   fallback is "1"
//   alpha       - help says 0.1, fallback is ".001"
//   input_file / output_file - help shows a "./" prefix, fallbacks have none
// Decide which set is intended and align them once the file is restored.
#include #include #include #include #include "lattice.h" #include "tdict.h" #include "agrammar.h" #include "bottom_up_parser.h" #include "hg.h" #include "hg_intersect.h" #include "../utils/ParamsArray.h" using namespace std; vector src_corpus; vector tgt_corpus; bool openParallelCorpora(string & input_filename){ ifstream input_file; input_file.open(input_filename.c_str()); if (!input_file) { cerr << "Cannot open input file " << input_filename << ". Exiting..." << endl; return false; } int line =0; while (!input_file.eof()) { // get a line of source language data // cerr<<"new line "< v = tokenize(str, delimiters); if ( (v.size() != 2) and (v.size() != 3) ) { cerr< vID; TD::ConvertSentence(str , &vID); Lattice lsentence; lsentence.resize(vID.size()); for (int i=0; i grammars(1, g); ExhaustiveBottomUpParser parser = ExhaustiveBottomUpParser(goal_sym, grammars); if (!parser.Parse(lsource, &hg)){ cerr<<"source sentence is not parsed by the grammar!"< reweight; reweight.set_value(FD::Convert("MinusLogP"), -1 ); hg.Reweight(reweight); return true; } int main(int argc, char** argv){ ParamsArray params(argc, argv); params.setDescription("scfg models"); params.addConstraint("grammar_file", "grammar file (default ./grammar.pr )", true); // optional params.addConstraint("input_file", "parallel input file (default ./parallel_corpora)", true); //optional params.addConstraint("output_file", "grammar output file (default ./grammar_output)", true); //optional params.addConstraint("goal_symbol", "top nonterminal symbol (default: X)", true); //optional params.addConstraint("split", "split one nonterminal into 'split' nonterminals (default: 2)", true); //optional params.addConstraint("prob_iters", "number of iterations (default: 10)", true); //optional params.addConstraint("split_iters", "number of splitting iterations (default: 3)", true); //optional params.addConstraint("alpha", "alpha (default: 0.1)", true); //optional if (!params.runConstraints("scfg")) { return 0; } cerr<<"get 
parametters\n\n\n"; string grammar_file = params.asString("grammar_file", "./grammar.pr"); string input_file = params.asString("input_file", "parallel_corpora"); string output_file = params.asString("output_file", "grammar_output"); string goal_sym = params.asString("goal_symbol", "X"); int max_split = atoi(params.asString("split", "2").c_str()); int prob_iters = atoi(params.asString("prob_iters", "2").c_str()); int split_iters = atoi(params.asString("split_iters", "1").c_str()); double alpha = atof(params.asString("alpha", ".001").c_str()); ///// cerr<<"grammar_file ="<SetGoalNT(goal_sym); agrammar->setMaxSplit(max_split); agrammar->set_alpha(alpha); srand(123); GrammarPtr g( agrammar); Hypergraph hg; int data_size = src_corpus.size(); int cnt_unparsed =0; for (int i =0; i PrintAllRules(output_file+".s" + itos(i+1)); agrammar->splitAllNonterminals(); //vector src_corpus; //vector tgt_corpus; for (int j=0; jResetScore(); // cerr<<"done reset grammar score\n"; for (int k=0; k (g)->UpdateHgProsteriorProb(hg); hg.clear(); if (k%1000 ==0 ) cerr<<"sentences "<ResetScore(); // // agrammar->UpdateScore(); // if (! parseSentencePair(goal_sym, src, tgt, g, hg) ){ // cerr<<"target sentence is not parsed by the grammar!\n"; // return 1; // } // // hg.PrintGraphviz(); // //hg.clear(); // agrammar->PrintAllRules(); // /*split grammar*/ // cout<<"split NTs\n"; // cerr<<"first of all write all nonterminals"<printAllNonterminals(); // cout<<"after split nonterminal"<PrintAllRules(); // Hypergraph hg1; // if (! parseSentencePair(goal_sym, src, tgt, g, hg1) ){ // cerr<<"target sentence is not parsed by the grammar!\n"; // return 1; // } // hg1.PrintGraphviz(); // agrammar->splitNonterminal(15); // cout<<"after split nonterminal"<PrintAllRules(); /*load training corpus*/ /*for each sentence pair in training corpus*/ // forest.PrintGraphviz(); /*calculate expected count*/ }