/* * argument_reorder_model.cc * * Created on: Dec 15, 2013 * Author: lijunhui */ #include #include #include "argument_reorder_model.h" #include "utility.h" #include "tsuruoka_maxent.h" inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { SFReader *pFReader = new STxtFileReader(pszFName); char *pszLine = new char[ 100001 ]; int iLen; Map hashPredicate; while (pFReader->fnReadNextLine(pszLine, &iLen)) { if (iLen == 0) continue; vector vecTerms; SplitOnWhitespace(string(pszLine), &vecTerms); for (size_t i = 0; i < vecTerms.size() - 1; i++) { Iterator iter = hashPredicate.find(vecTerms[i]); if (iter == hashPredicate.end()) { hashPredicate[vecTerms[i]] = 1; } else { iter->second++; } } } delete pFReader; pFReader = new STxtFileReader(pszFName); FILE *fpOut = fopen(pszNewFName, "w"); while (pFReader->fnReadNextLine(pszLine, &iLen)) { if (iLen == 0) continue; vector vecTerms; SplitOnWhitespace(string(pszLine), &vecTerms); ostringstream ostr; for (size_t i = 0; i < vecTerms.size() - 1; i++) { Iterator iter = hashPredicate.find(vecTerms[i]); assert(iter != hashPredicate.end()); if (iter->second >= iCutoff) { ostr << vecTerms[i] << " "; } } if (ostr.str().length() > 0) { ostr << vecTerms[vecTerms.size() - 1]; fprintf(fpOut, "%s\n", ostr.str().c_str()); } } fclose(fpOut); delete pFReader; delete [] pszLine; } struct SArgumentReorderTrainer{ SArgumentReorderTrainer(const char* pszSRLFname, //source-side srl tree file name const char* pszAlignFname, //alignment filename const char* pszSourceFname, //source file name const char* pszTargetFname, //target file name const char* pszTopPredicateFname, //target file name const char* pszInstanceFname, //training instance file name const char* pszModelFname, //classifier model file name int iCutoff) { fnGenerateInstanceFiles(pszSRLFname, pszAlignFname, pszSourceFname, pszTargetFname, pszTopPredicateFname, pszInstanceFname); string strInstanceFname, strModelFname; strInstanceFname = string(pszInstanceFname) + string(".left"); strModelFname = string(pszModelFname) + string(".left"); fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); strInstanceFname = string(pszInstanceFname) + string(".right"); strModelFname = string(pszModelFname) + string(".right"); fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); } ~SArgumentReorderTrainer() { } private: void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; if (iCutoff > 0) { sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); } else { strcpy(pszNewInstanceFName, pszInstanceFname); } Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); delete pMaxent; if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); system(pszNewInstanceFName); } delete [] pszNewInstanceFName; } void fnGenerateInstanceFiles(const char* pszSRLFname, //source-side flattened parse tree file name const char* pszAlignFname, //alignment filename const char* pszSourceFname, //source file name const char* pszTargetFname, //target file name const char* pszTopPredicateFname, //top predicate file name (we only consider predicates with 100+ occurrences const char* pszInstanceFname //training instance file name ) { SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); SSrlSentenceReader *pSRLReader = new SSrlSentenceReader(pszSRLFname); STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); Map *pMapPredicate; if (pszTopPredicateFname != NULL) pMapPredicate = fnLoadTopPredicates(pszTopPredicateFname); else pMapPredicate = NULL; char *pszLine = new char[50001]; FILE *fpLeftOut, *fpRightOut; sprintf(pszLine, "%s.left", pszInstanceFname); fpLeftOut = fopen(pszLine, "w"); sprintf(pszLine, "%s.right", pszInstanceFname); fpRightOut = fopen(pszLine, "w"); //read sentence by sentence SAlignment *pAlign; SSrlSentence *pSRL; SParsedTree *pTree; int iSentNum = 0; while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { pSRL = pSRLReader->fnReadNextSrlSentence(); assert(pSRL != NULL); pTree = pSRL->m_pTree; assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); vector vecSTerms; SplitOnWhitespace(string(pszLine), &vecSTerms); assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); vector vecTTerms; SplitOnWhitespace(string(pszLine), &vecTTerms); //vecTPOSTerms.size() == 0, given the case when an english sentence fails parsing if (pTree != NULL) { for (size_t i = 0; i < pSRL->m_vecPred.size(); i++) { SPredicate *pPred = pSRL->m_vecPred[i]; if (strcmp(pTree->m_vecTerminals[pPred->m_iPosition]->m_ptParent->m_pszTerm, "VA") == 0) continue; string strPred = string(pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm); if (pMapPredicate != NULL) { Map::iterator iter_map = pMapPredicate->find(strPred); if (pMapPredicate != NULL && iter_map == pMapPredicate->end()) continue; } SPredicateItem *pPredItem = new SPredicateItem(pTree, pPred); vector vecStrBlock; for (size_t j = 0; j < pPredItem->vec_items_.size(); j++) { SSRLItem *pItem1 = pPredItem->vec_items_[j]; vecStrBlock.push_back(SArgumentReorderModel::fnGetBlockOutcome(pItem1->tree_item_->m_iBegin, pItem1->tree_item_->m_iEnd, pAlign)); } vector vecStrLeftReorderType; vector vecStrRightReorderType; SArgumentReorderModel::fnGetReorderType(pPredItem, pAlign, vecStrLeftReorderType, vecStrRightReorderType); for (int j = 1; j < pPredItem->vec_items_.size(); j++) { string strLeftOutcome, strRightOutcome; strLeftOutcome = vecStrLeftReorderType[j - 1]; strRightOutcome = vecStrRightReorderType[j - 1]; ostringstream ostr; SArgumentReorderModel::fnGenerateFeature(pTree, pPred, pPredItem, j, vecStrBlock[j - 1], vecStrBlock[j], ostr); //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); //fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(), strOutcome.c_str()); fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); fprintf(fpRightOut, "%s %s\n", ostr.str().c_str(), strRightOutcome.c_str()); } } } delete pSRL; delete pAlign; iSentNum++; if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); } delete [] pszLine; fclose(fpLeftOut); fclose(fpRightOut); delete pAlignReader; delete pSRLReader; delete pTxtSReader; delete pTxtTReader; } Map* fnLoadTopPredicates(const char* pszTopPredicateFname) { if (pszTopPredicateFname == NULL) return NULL; Map* pMapPredicate = new Map(); STxtFileReader *pReader = new STxtFileReader(pszTopPredicateFname); char *pszLine = new char[50001]; int iNumCount = 0; while (pReader->fnReadNextLine(pszLine, NULL)) { if (pszLine[0] == '#') continue; char *p = strchr(pszLine, ' '); assert(p != NULL); p[0] = '\0'; p++; int iCount = atoi(p); if (iCount < 100) break; (*pMapPredicate)[string(pszLine)] = iNumCount++; } delete pReader; delete [] pszLine; return pMapPredicate; } }; namespace po = boost::program_options; inline void print_options(std::ostream &out,po::options_description const& opts) { typedef std::vector< boost::shared_ptr > Ds; Ds const& ds=opts.options(); out << '"'; for (unsigned i=0;ilong_name(); } out << '"\n'; } inline string str(char const* name,po::variables_map const& conf) { return conf[name].as(); } //--srl_file /scratch0/mt_exp/gale-align/gale-align.nw.srl.cn --align_file /scratch0/mt_exp/gale-align/gale-align.nw.al --source_file /scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file /scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2 //--srl_file /scratch0/mt_exp/gale-ctb/gale-ctb.srl.cn --align_file /scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file /scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file /scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2 int main(int argc, char** argv) { po::options_description opts("Configuration options"); opts.add_options() ("srl_file",po::value(),"srl file path (input)") ("align_file",po::value(),"Alignment file path (input)") ("source_file",po::value(),"Source text file path (input)") ("target_file",po::value(),"Target text file path (input)") ("instance_file",po::value(),"Instance file path (output)") ("model_prefix",po::value(),"Model file path prefix (output): three files will be generated") ("feature_cutoff",po::value()->default_value(100),"Feature cutoff threshold") ("help", "produce help message"); po::variables_map vm; if (argc) { po::store(po::parse_command_line(argc, argv, opts), vm); po::notify(vm); } if (vm.count("help")) { print_options(cout, opts); return 1; } if (!vm.count("srl_file") || !vm.count("align_file") || !vm.count("source_file") || !vm.count("target_file") || !vm.count("instance_file") || !vm.count("model_prefix") ) { print_options(cout, opts); if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; exit(0); } SArgumentReorderTrainer *pTrainer = new SArgumentReorderTrainer(str("srl_file", vm).c_str(), str("align_file", vm).c_str(), str("source_file", vm).c_str(), str("target_file", vm).c_str(), NULL, str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), vm["feature_cutoff"].as()); delete pTrainer; return 1; }