From 37cfe3758a07f414dfd28c36890567db2f2224c4 Mon Sep 17 00:00:00 2001 From: "Wu, Ke" Date: Fri, 10 Oct 2014 17:12:03 -0400 Subject: Trainers for reordering constraint models --- .gitignore | 1 + utils/Makefile.am | 4 +- utils/argument_reorder_model.cc | 6 ++ utils/constituent_reorder_model.cc | 197 +------------------------------------ 4 files changed, 13 insertions(+), 195 deletions(-) diff --git a/.gitignore b/.gitignore index 4efcdd8d..c75328dc 100644 --- a/.gitignore +++ b/.gitignore @@ -181,6 +181,7 @@ training/mr_reduce_to_weights training/optimize_test training/plftools training/test_ngram +utils/argument_reorder_model_trainer utils/atools utils/bin/ utils/const_reorder_model_trainer diff --git a/utils/Makefile.am b/utils/Makefile.am index 0bd21b2b..53967561 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer +bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer argument_reorder_model_trainer noinst_PROGRAMS = \ ts \ @@ -108,6 +108,8 @@ atools_LDADD = libutils.a atools_LDFLAGS = $(STATIC_FLAGS) const_reorder_model_trainer_SOURCES = constituent_reorder_model.cc const_reorder_model_trainer_LDADD = libutils.a +argument_reorder_model_trainer_SOURCES = argument_reorder_model.cc +argument_reorder_model_trainer_LDADD = libutils.a phmt_SOURCES = phmt.cc phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) diff --git a/utils/argument_reorder_model.cc b/utils/argument_reorder_model.cc index 537dfb20..40647f8a 100644 --- a/utils/argument_reorder_model.cc +++ b/utils/argument_reorder_model.cc @@ -6,12 +6,18 @@ */ #include +#include #include +#include +#include +#include #include "argument_reorder_model.h" #include "synutils.h" #include "tsuruoka_maxent.h" +using namespace std; + inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { SFReader* pFReader = new STxtFileReader(pszFName); diff --git a/utils/constituent_reorder_model.cc b/utils/constituent_reorder_model.cc index d5c1cf5b..5a5c5546 100644 --- a/utils/constituent_reorder_model.cc +++ b/utils/constituent_reorder_model.cc @@ -79,7 +79,6 @@ struct SConstReorderTrainer { const char* pszTargetFname, // target file name const char* pszInstanceFname, // training instance file name const char* pszModelPrefix, // classifier model file name prefix - int iClassifierType, // classifier type int iCutoff, // feature count threshold const char* pszOption // other classifier parameters (for svmlight) ) { @@ -578,193 +577,6 @@ delete pZhangleMaxent;*/ } }; -struct SConstContTrainer { - SConstContTrainer( - const char* pszFlattenedSynFname, // source-side flattened parse tree - // file name - const char* pszAlignFname, // alignment filename - const char* pszSourceFname, // source file name - const char* pszTargetFname, // target file name - const char* pszInstanceFname, // training instance file name - const char* pszModelPrefix, // classifier model file name prefix - int iClassifierType, // classifier type - int iCutoff, // feature count threshold - const char* pszOption // other classifier parameters (for svmlight) - ) { - fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, - pszTargetFname, pszInstanceFname); - // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, - // pszOption); - fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); - } - ~SConstContTrainer() {} - - private: - void fnTraining(const char* pszInstanceFname, const char* pszModelFname, - int iCutoff) { - char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; - if (iCutoff > 0) { - sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); - fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); - } else { - strcpy(pszNewInstanceFName, pszInstanceFname); - } - - /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); - pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, - 2.0); - delete pZhangleMaxent;*/ - - Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); - pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); - delete pMaxent; - - if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { - sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); - system(pszNewInstanceFName); - } - delete[] pszNewInstanceFName; - } - - void fnGetFocusedParentNodes(const SParsedTree* pTree, - vector& vecFocused) { - for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { - STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; - - while (pParent != NULL) { - // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - - // pParent->m_iBegin > 5) { - if (pParent->m_vecChildren.size() > 1) { - // do constituent reordering for all children of pParent - vecFocused.push_back(pParent); - } - if (pParent->m_iBrotherIndex != 0) break; - pParent = pParent->m_ptParent; - } - } - } - - inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign, - string& strOutcome) { - strOutcome = pAlign->fnIsContinuous(iL1, iR1); - } - - inline string fnGetLengthType(int iLen) { - if (iLen == 1) return string("1"); - if (iLen == 2) return string("2"); - if (iLen == 3) return string("3"); - if (iLen < 6) return string("4"); - if (iLen < 11) return string("6"); - return string("11"); - } - - /* - * Source side (11 features): - * f1: the syntactic category - * f2: the syntactic category of its parent - * f3: the head word's pos - * f4: =1 if it's the head of its parent node - * or - * the head of its parent node - * f5: length type - */ - void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1, - const SAlignment* pAlign, - const vector& vecSTerms, - const vector& vecTTerms, string& strOutcome, - ostringstream& ostr) { - - fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); - - // generate features - // f1 - ostr << "f1=" << pCon1->m_pszTerm; - // f2 - ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; - // f3 - ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord] - ->m_ptParent->m_pszTerm; - // f4 - if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { - ostr << " f4=1"; - } else { - ostr << " f4=" - << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild] - ->m_pszTerm; - } - // f5 - ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); - } - - void fnGenerateInstanceFile( - const char* pszFlattenedSynFname, // source-side flattened parse tree - // file name - const char* pszAlignFname, // alignment filename - const char* pszSourceFname, // source file name - const char* pszTargetFname, // target file name - const char* pszInstanceFname // training instance file name - ) { - SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); - SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true); - STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); - - FILE* fpOut = fopen(pszInstanceFname, "w"); - assert(fpOut != NULL); - - // read sentence by sentence - SAlignment* pAlign; - SParsedTree* pTree; - char* pszLine = new char[50001]; - int iSentNum = 0; - while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { - pTree = pParseReader->fnReadNextParseTree(); - assert(pTree != NULL); - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); - vector vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); - vector vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); - - vector vecFocused; - fnGetFocusedParentNodes(pTree, vecFocused); - - for (size_t i = 0; - i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - - STreeItem* pParent = vecFocused[i]; - - for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { - // children[j-1] vs. children[j] reordering - - string strOutcome; - ostringstream ostr; - - fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, - vecSTerms, vecTTerms, strOutcome, ostr); - - // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - } - } - - delete pAlign; - delete pTree; - iSentNum++; - - if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); - } - - fclose(fpOut); - delete pAlignReader; - delete pParseReader; - delete pTxtSReader; - delete pTxtTReader; - delete[] pszLine; - } -}; - inline void print_options(std::ostream& out, po::options_description const& opts) { typedef std::vector > Ds; @@ -786,7 +598,6 @@ inline string str(char const* name, po::variables_map const& conf) { /// scratch0/mt_exp/gq-ctb/data/train.en --instance_file /// scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix /// scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 -//--classifier_type 1 int main(int argc, char** argv) { po::options_description opts("Configuration options"); @@ -798,9 +609,7 @@ int main(int argc, char** argv) { "instance_file", po::value(), "Instance file path (output)")( "model_prefix", po::value(), "Model file path prefix (output): three files will be generated")( - "classifier_type", po::value()->default_value(1), - "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for " - "SVMLight")("feature_cutoff", po::value()->default_value(100), + "feature_cutoff", po::value()->default_value(100), "Feature cutoff threshold")( "svm_option", po::value(), "Parameters for SVMLight classifier")( "help", "produce help message"); @@ -839,8 +648,8 @@ int main(int argc, char** argv) { str("parse_file", vm).c_str(), str("align_file", vm).c_str(), str("source_file", vm).c_str(), str("target_file", vm).c_str(), str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), - vm["classifier_type"].as(), vm["feature_cutoff"].as(), pOption); + vm["feature_cutoff"].as(), pOption); delete pTrainer; - return 1; + return 0; } -- cgit v1.2.3