diff options
Diffstat (limited to 'utils')
-rw-r--r-- | utils/Makefile.am | 4 | ||||
-rw-r--r-- | utils/alignment.h | 18 | ||||
-rw-r--r-- | utils/argument_reorder_model.cc | 46 | ||||
-rw-r--r-- | utils/argument_reorder_model.h | 73 | ||||
-rw-r--r-- | utils/constituent_reorder_model.cc | 239 | ||||
-rw-r--r-- | utils/lbfgs.h | 3 | ||||
-rw-r--r-- | utils/maxent.cpp | 2 | ||||
-rw-r--r-- | utils/srl_sentence.h | 31 | ||||
-rw-r--r-- | utils/synutils.h | 18 | ||||
-rw-r--r-- | utils/tree.h | 55 | ||||
-rw-r--r-- | utils/tsuruoka_maxent.h | 49 |
11 files changed, 176 insertions, 362 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am index 0bd21b2b..53967561 100644 --- a/utils/Makefile.am +++ b/utils/Makefile.am @@ -1,4 +1,4 @@ -bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer +bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer argument_reorder_model_trainer noinst_PROGRAMS = \ ts \ @@ -108,6 +108,8 @@ atools_LDADD = libutils.a atools_LDFLAGS = $(STATIC_FLAGS) const_reorder_model_trainer_SOURCES = constituent_reorder_model.cc const_reorder_model_trainer_LDADD = libutils.a +argument_reorder_model_trainer_SOURCES = argument_reorder_model.cc +argument_reorder_model_trainer_LDADD = libutils.a phmt_SOURCES = phmt.cc phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS) diff --git a/utils/alignment.h b/utils/alignment.h index c0648aab..456577ca 100644 --- a/utils/alignment.h +++ b/utils/alignment.h @@ -15,8 +15,6 @@ #include "stringlib.h" -using namespace std; - /* * Note: * m_vec_s_align.size() may not be equal to the length of source side @@ -25,12 +23,12 @@ using namespace std; * */ struct SAlignment { - typedef vector<int> SingleAlign; + typedef std::vector<int> SingleAlign; SAlignment(const char* pszAlign) { fnInitializeAlignment(pszAlign); } ~SAlignment() {} bool fnIsAligned(int i, bool s) const { - const vector<SingleAlign>* palign; + const std::vector<SingleAlign>* palign; if (s == true) palign = &m_vec_s_align; else @@ -70,7 +68,7 @@ struct SAlignment { } bool fnIsAlignedTightPhrase(int b, int e, bool s, int* pob, int* poe) const { - const vector<SingleAlign>* palign; + const std::vector<SingleAlign>* palign; if (s == true) palign = &m_vec_s_align; else @@ -97,7 +95,7 @@ struct SAlignment { * aligned to any word outside source[b, e] * 3) return "Discon't": otherwise; */ - string fnIsContinuous(int b, int e) const { + std::string fnIsContinuous(int b, int e) const { int ob, oe; fnGetLeftRightMost(b, e, true, ob, oe); if (ob == -1) return "Unaligned"; @@ -124,7 +122,7 @@ struct SAlignment { } private: - void fnGetLeftRightMost(int b, int e, const vector<SingleAlign>& align, + void fnGetLeftRightMost(int b, int e, const std::vector<SingleAlign>& align, int& ob, int& oe) const { ob = oe = -1; for (int i = b; i <= e && i < align.size(); i++) { @@ -139,7 +137,7 @@ struct SAlignment { m_vec_s_align.clear(); m_vec_t_align.clear(); - vector<string> terms = SplitOnWhitespace(string(pszAlign)); + std::vector<std::string> terms = SplitOnWhitespace(std::string(pszAlign)); int si, ti; for (size_t i = 0; i < terms.size(); i++) { sscanf(terms[i].c_str(), "%d-%d", &si, &ti); @@ -167,8 +165,8 @@ struct SAlignment { } private: - vector<SingleAlign> m_vec_s_align; // source side words' alignment - vector<SingleAlign> m_vec_t_align; // target side words' alignment + std::vector<SingleAlign> m_vec_s_align; // source side words' alignment + std::vector<SingleAlign> m_vec_t_align; // target side words' alignment }; struct SAlignmentReader { diff --git a/utils/argument_reorder_model.cc b/utils/argument_reorder_model.cc index 58886251..5caf318f 100644 --- a/utils/argument_reorder_model.cc +++ b/utils/argument_reorder_model.cc @@ -6,12 +6,18 @@ */ #include <boost/program_options.hpp> +#include <iostream> #include <fstream> +#include <sstream> +#include <string> +#include <vector> #include "argument_reorder_model.h" #include "synutils.h" #include "tsuruoka_maxent.h" +using namespace std; + inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { SFReader* pFReader = new STxtFileReader(pszFName); @@ -64,13 +70,13 @@ inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, struct SArgumentReorderTrainer { SArgumentReorderTrainer( - const char* pszSRLFname, // source-side srl tree file name - const char* pszAlignFname, // alignment filename - const char* pszSourceFname, // source file name - const char* pszTargetFname, // target file name + const char* pszSRLFname, // source-side srl tree file name + const char* pszAlignFname, // alignment filename + const char* pszSourceFname, // source file name + const char* pszTargetFname, // target file name const char* pszTopPredicateFname, // target file name - const char* pszInstanceFname, // training instance file name - const char* pszModelFname, // classifier model file name + const char* pszInstanceFname, // training instance file name + const char* pszModelFname, // classifier model file name int iCutoff) { fnGenerateInstanceFiles(pszSRLFname, pszAlignFname, pszSourceFname, pszTargetFname, pszTopPredicateFname, @@ -110,14 +116,14 @@ struct SArgumentReorderTrainer { } void fnGenerateInstanceFiles( - const char* pszSRLFname, // source-side flattened parse tree file name - const char* pszAlignFname, // alignment filename + const char* pszSRLFname, // source-side flattened parse tree file name + const char* pszAlignFname, // alignment filename const char* pszSourceFname, // source file name const char* pszTargetFname, // target file name const char* pszTopPredicateFname, // top predicate file name (we only // consider predicates with 100+ // occurrences - const char* pszInstanceFname // training instance file name + const char* pszInstanceFname // training instance file name ) { SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); SSrlSentenceReader* pSRLReader = new SSrlSentenceReader(pszSRLFname); @@ -257,24 +263,24 @@ inline void print_options(std::ostream& out, if (i) out << ' '; out << "--" << ds[i]->long_name(); } - out << '"\n'; + out << '\n'; } inline string str(char const* name, po::variables_map const& conf) { return conf[name].as<string>(); } //--srl_file /scratch0/mt_exp/gale-align/gale-align.nw.srl.cn --align_file -///scratch0/mt_exp/gale-align/gale-align.nw.al --source_file -///scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file -///scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file -///scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix -///scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2 +/// scratch0/mt_exp/gale-align/gale-align.nw.al --source_file +/// scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file +/// scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file +/// scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix +/// scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2 //--srl_file /scratch0/mt_exp/gale-ctb/gale-ctb.srl.cn --align_file -///scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file -///scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file -///scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file -///scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix -///scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2 +/// scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file +/// scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file +/// scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file +/// scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix +/// scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2 int main(int argc, char** argv) { po::options_description opts("Configuration options"); diff --git a/utils/argument_reorder_model.h b/utils/argument_reorder_model.h index 062b8841..077fa5ba 100644 --- a/utils/argument_reorder_model.h +++ b/utils/argument_reorder_model.h @@ -8,17 +8,20 @@ #ifndef ARGUMENT_REORDER_MODEL_H_ #define ARGUMENT_REORDER_MODEL_H_ +#include <string> +#include <vector> + #include "alignment.h" #include "tree.h" #include "srl_sentence.h" // an argument item or a predicate item (the verb itself) struct SSRLItem { - SSRLItem(const STreeItem *tree_item, string role) + SSRLItem(const STreeItem *tree_item, std::string role) : tree_item_(tree_item), role_(role) {} ~SSRLItem() {} const STreeItem *tree_item_; - const string role_; + const std::string role_; }; struct SPredicateItem { @@ -26,11 +29,13 @@ struct SPredicateItem { : pred_(pred) { vec_items_.reserve(pred->m_vecArgt.size() + 1); for (int i = 0; i < pred->m_vecArgt.size(); i++) { - vec_items_.push_back(new SSRLItem(pred->m_vecArgt[i]->m_pTreeItem, - string(pred->m_vecArgt[i]->m_pszRole))); + vec_items_.push_back( + new SSRLItem(pred->m_vecArgt[i]->m_pTreeItem, + std::string(pred->m_vecArgt[i]->m_pszRole))); } - vec_items_.push_back(new SSRLItem( - tree->m_vecTerminals[pred->m_iPosition]->m_ptParent, string("Pred"))); + vec_items_.push_back( + new SSRLItem(tree->m_vecTerminals[pred->m_iPosition]->m_ptParent, + std::string("Pred"))); sort(vec_items_.begin(), vec_items_.end(), SortFunction); begin_ = vec_items_[0]->tree_item_->m_iBegin; @@ -43,7 +48,7 @@ struct SPredicateItem { return (i->tree_item_->m_iBegin < j->tree_item_->m_iBegin); } - vector<SSRLItem *> vec_items_; + std::vector<SSRLItem *> vec_items_; int begin_; int end_; const SPredicate *pred_; @@ -51,13 +56,14 @@ struct SPredicateItem { struct SArgumentReorderModel { public: - static string fnGetBlockOutcome(int iBegin, int iEnd, SAlignment *pAlign) { + static std::string fnGetBlockOutcome(int iBegin, int iEnd, + SAlignment *pAlign) { return pAlign->fnIsContinuous(iBegin, iEnd); } static void fnGetReorderType(SPredicateItem *pPredItem, SAlignment *pAlign, - vector<string> &vecStrLeftReorder, - vector<string> &vecStrRightReorder) { - vector<int> vecLeft, vecRight; + std::vector<std::string> &vecStrLeftReorder, + std::vector<std::string> &vecStrRightReorder) { + std::vector<int> vecLeft, vecRight; for (int i = 0; i < pPredItem->vec_items_.size(); i++) { const STreeItem *pCon1 = pPredItem->vec_items_[i]->tree_item_; int iLeft1, iRight1; @@ -66,15 +72,15 @@ struct SArgumentReorderModel { vecLeft.push_back(iLeft1); vecRight.push_back(iRight1); } - vector<int> vecLeftPosition; + std::vector<int> vecLeftPosition; fnGetRelativePosition(vecLeft, vecLeftPosition); - vector<int> vecRightPosition; + std::vector<int> vecRightPosition; fnGetRelativePosition(vecRight, vecRightPosition); vecStrLeftReorder.clear(); vecStrRightReorder.clear(); for (int i = 1; i < vecLeftPosition.size(); i++) { - string strOutcome; + std::string strOutcome; fnGetOutcome(vecLeftPosition[i - 1], vecLeftPosition[i], strOutcome); vecStrLeftReorder.push_back(strOutcome); fnGetOutcome(vecRightPosition[i - 1], vecRightPosition[i], strOutcome); @@ -115,32 +121,33 @@ struct SArgumentReorderModel { static void fnGenerateFeature(const SParsedTree *pTree, const SPredicate *pPred, const SPredicateItem *pPredItem, int iPos, - const string &strBlock1, - const string &strBlock2, ostringstream &ostr) { + const std::string &strBlock1, + const std::string &strBlock2, + std::ostringstream &ostr) { SSRLItem *pSRLItem1 = pPredItem->vec_items_[iPos - 1]; SSRLItem *pSRLItem2 = pPredItem->vec_items_[iPos]; const STreeItem *pCon1 = pSRLItem1->tree_item_; const STreeItem *pCon2 = pSRLItem2->tree_item_; - string left_role = pSRLItem1->role_; - string right_role = pSRLItem2->role_; + std::string left_role = pSRLItem1->role_; + std::string right_role = pSRLItem2->role_; - string predicate_term = + std::string predicate_term = pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm; - vector<string> vec_other_right_sibling; + std::vector<std::string> vec_other_right_sibling; for (int i = iPos + 1; i < pPredItem->vec_items_.size(); i++) vec_other_right_sibling.push_back( - string(pPredItem->vec_items_[i]->role_)); + std::string(pPredItem->vec_items_[i]->role_)); if (vec_other_right_sibling.size() == 0) - vec_other_right_sibling.push_back(string("NULL")); + vec_other_right_sibling.push_back(std::string("NULL")); - vector<string> vec_other_left_sibling; + std::vector<std::string> vec_other_left_sibling; for (int i = 0; i < iPos - 1; i++) vec_other_right_sibling.push_back( - string(pPredItem->vec_items_[i]->role_)); + std::string(pPredItem->vec_items_[i]->role_)); if (vec_other_left_sibling.size() == 0) - vec_other_left_sibling.push_back(string("NULL")); + vec_other_left_sibling.push_back(std::string("NULL")); // generate features // f1 @@ -190,26 +197,26 @@ struct SArgumentReorderModel { } private: - static void fnGetOutcome(int i1, int i2, string &strOutcome) { + static void fnGetOutcome(int i1, int i2, std::string &strOutcome) { assert(i1 != i2); if (i1 < i2) { if (i2 > i1 + 1) - strOutcome = string("DM"); + strOutcome = std::string("DM"); else - strOutcome = string("M"); + strOutcome = std::string("M"); } else { if (i1 > i2 + 1) - strOutcome = string("DS"); + strOutcome = std::string("DS"); else - strOutcome = string("S"); + strOutcome = std::string("S"); } } - static void fnGetRelativePosition(const vector<int> &vecLeft, - vector<int> &vecPosition) { + static void fnGetRelativePosition(const std::vector<int> &vecLeft, + std::vector<int> &vecPosition) { vecPosition.clear(); - vector<float> vec; + std::vector<float> vec; for (int i = 0; i < vecLeft.size(); i++) { if (vecLeft[i] == -1) { if (i == 0) diff --git a/utils/constituent_reorder_model.cc b/utils/constituent_reorder_model.cc index 042c751b..df75a1a0 100644 --- a/utils/constituent_reorder_model.cc +++ b/utils/constituent_reorder_model.cc @@ -73,15 +73,14 @@ inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, struct SConstReorderTrainer { SConstReorderTrainer( - const char* pszSynFname, // source-side flattened parse tree file name - const char* pszAlignFname, // alignment filename + const char* pszSynFname, // source-side flattened parse tree file name + const char* pszAlignFname, // alignment filename const char* pszSourceFname, // source file name const char* pszTargetFname, // target file name const char* pszInstanceFname, // training instance file name - const char* pszModelPrefix, // classifier model file name prefix - int iClassifierType, // classifier type - int iCutoff, // feature count threshold - const char* pszOption // other classifier parameters (for svmlight) + const char* pszModelPrefix, // classifier model file name prefix + int iCutoff, // feature count threshold + const char* /*pszOption*/ // other classifier parameters (for svmlight) ) { fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); @@ -135,14 +134,14 @@ delete pZhangleMaxent;*/ } inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, - const SAlignment* pAlign, string& strOutcome) { + const SAlignment* /*pAlign*/, string& strOutcome) { if (iL1 == -1 && iL2 == -1) strOutcome = "BU"; // 1. both are untranslated else if (iL1 == -1) strOutcome = "1U"; // 2. XP1 is untranslated else if (iL2 == -1) strOutcome = "2U"; // 3. XP2 is untranslated - else if (iL1 == iL2 && iR2 == iR2) + else if (iL1 == iL2 && iR1 == iR2) strOutcome = "SS"; // 4. Have same scope else if (iL1 <= iL2 && iR1 >= iR2) strOutcome = "1C2"; // 5. XP1's translation covers XP2's @@ -241,7 +240,7 @@ delete pZhangleMaxent;*/ int iPos, const vector<string>& vecChunkStatus, const vector<int>& vecPosition, const vector<string>& vecSTerms, - const vector<string>& vecTTerms, string& strOutcome, + const vector<string>& /*vecTTerms*/, string& strOutcome, ostringstream& ostr) { STreeItem* pCon1, *pCon2; pCon1 = pParent->m_vecChildren[iPos - 1]; @@ -314,11 +313,11 @@ delete pZhangleMaxent;*/ * f8: the first and the last word of XP2's translation (f8_f, f8_l) * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) */ - void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent, + void fnGenerateInstance(const SParsedTree* /*pTree*/, const STreeItem* pParent, const STreeItem* pCon1, const STreeItem* pCon2, const SAlignment* pAlign, const vector<string>& vecSTerms, - const vector<string>& vecTTerms, string& strOutcome, + const vector<string>& /*vecTTerms*/, string& strOutcome, ostringstream& ostr) { int iLeft1, iRight1, iLeft2, iRight2; @@ -401,8 +400,8 @@ delete pZhangleMaxent;*/ } void fnGenerateInstanceFile( - const char* pszSynFname, // source-side flattened parse tree file name - const char* pszAlignFname, // alignment filename + const char* pszSynFname, // source-side flattened parse tree file name + const char* pszAlignFname, // alignment filename const char* pszSourceFname, // source file name const char* pszTargetFname, // target file name const char* pszInstanceFname // training instance file name @@ -507,8 +506,8 @@ delete pZhangleMaxent;*/ } void fnGenerateInstanceFile2( - const char* pszSynFname, // source-side flattened parse tree file name - const char* pszAlignFname, // alignment filename + const char* pszSynFname, // source-side flattened parse tree file name + const char* pszAlignFname, // alignment filename const char* pszSourceFname, // source file name const char* pszTargetFname, // target file name const char* pszInstanceFname // training instance file name @@ -578,193 +577,6 @@ delete pZhangleMaxent;*/ } }; -struct SConstContTrainer { - SConstContTrainer( - const char* pszFlattenedSynFname, // source-side flattened parse tree - // file name - const char* pszAlignFname, // alignment filename - const char* pszSourceFname, // source file name - const char* pszTargetFname, // target file name - const char* pszInstanceFname, // training instance file name - const char* pszModelPrefix, // classifier model file name prefix - int iClassifierType, // classifier type - int iCutoff, // feature count threshold - const char* pszOption // other classifier parameters (for svmlight) - ) { - fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, - pszTargetFname, pszInstanceFname); - // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, - // pszOption); - fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); - } - ~SConstContTrainer() {} - - private: - void fnTraining(const char* pszInstanceFname, const char* pszModelFname, - int iCutoff) { - char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; - if (iCutoff > 0) { - sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); - fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); - } else { - strcpy(pszNewInstanceFName, pszInstanceFname); - } - - /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); - pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, - 2.0); - delete pZhangleMaxent;*/ - - Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); - pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); - delete pMaxent; - - if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { - sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); - system(pszNewInstanceFName); - } - delete[] pszNewInstanceFName; - } - - void fnGetFocusedParentNodes(const SParsedTree* pTree, - vector<STreeItem*>& vecFocused) { - for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { - STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; - - while (pParent != NULL) { - // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - - // pParent->m_iBegin > 5) { - if (pParent->m_vecChildren.size() > 1) { - // do constituent reordering for all children of pParent - vecFocused.push_back(pParent); - } - if (pParent->m_iBrotherIndex != 0) break; - pParent = pParent->m_ptParent; - } - } - } - - inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign, - string& strOutcome) { - strOutcome = pAlign->fnIsContinuous(iL1, iR1); - } - - inline string fnGetLengthType(int iLen) { - if (iLen == 1) return string("1"); - if (iLen == 2) return string("2"); - if (iLen == 3) return string("3"); - if (iLen < 6) return string("4"); - if (iLen < 11) return string("6"); - return string("11"); - } - - /* - * Source side (11 features): - * f1: the syntactic category - * f2: the syntactic category of its parent - * f3: the head word's pos - * f4: =1 if it's the head of its parent node - * or - * the head of its parent node - * f5: length type - */ - void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1, - const SAlignment* pAlign, - const vector<string>& vecSTerms, - const vector<string>& vecTTerms, string& strOutcome, - ostringstream& ostr) { - - fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); - - // generate features - // f1 - ostr << "f1=" << pCon1->m_pszTerm; - // f2 - ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; - // f3 - ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord] - ->m_ptParent->m_pszTerm; - // f4 - if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { - ostr << " f4=1"; - } else { - ostr << " f4=" - << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild] - ->m_pszTerm; - } - // f5 - ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); - } - - void fnGenerateInstanceFile( - const char* pszFlattenedSynFname, // source-side flattened parse tree - // file name - const char* pszAlignFname, // alignment filename - const char* pszSourceFname, // source file name - const char* pszTargetFname, // target file name - const char* pszInstanceFname // training instance file name - ) { - SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); - SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true); - STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); - - FILE* fpOut = fopen(pszInstanceFname, "w"); - assert(fpOut != NULL); - - // read sentence by sentence - SAlignment* pAlign; - SParsedTree* pTree; - char* pszLine = new char[50001]; - int iSentNum = 0; - while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { - pTree = pParseReader->fnReadNextParseTree(); - assert(pTree != NULL); - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); - - vector<STreeItem*> vecFocused; - fnGetFocusedParentNodes(pTree, vecFocused); - - for (size_t i = 0; - i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - - STreeItem* pParent = vecFocused[i]; - - for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { - // children[j-1] vs. children[j] reordering - - string strOutcome; - ostringstream ostr; - - fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, - vecSTerms, vecTTerms, strOutcome, ostr); - - // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - } - } - - delete pAlign; - delete pTree; - iSentNum++; - - if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); - } - - fclose(fpOut); - delete pAlignReader; - delete pParseReader; - delete pTxtSReader; - delete pTxtTReader; - delete[] pszLine; - } -}; - inline void print_options(std::ostream& out, po::options_description const& opts) { typedef std::vector<boost::shared_ptr<po::option_description> > Ds; @@ -781,12 +593,11 @@ inline string str(char const* name, po::variables_map const& conf) { } //--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file -///scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file -///scratch0/mt_exp/gq-ctb/data/train.cn --target_file -///scratch0/mt_exp/gq-ctb/data/train.en --instance_file -///scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix -///scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 -//--classifier_type 1 +/// scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file +/// scratch0/mt_exp/gq-ctb/data/train.cn --target_file +/// scratch0/mt_exp/gq-ctb/data/train.en --instance_file +/// scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix +/// scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 int main(int argc, char** argv) { po::options_description opts("Configuration options"); @@ -798,11 +609,9 @@ int main(int argc, char** argv) { "instance_file", po::value<string>(), "Instance file path (output)")( "model_prefix", po::value<string>(), "Model file path prefix (output): three files will be generated")( - "classifier_type", po::value<int>()->default_value(1), - "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for " - "SVMLight")("feature_cutoff", po::value<int>()->default_value(100), - "Feature cutoff threshold")( - "svm_option", po::value<string>(), "Parameters for SVMLight classifier")( + "feature_cutoff", po::value<int>()->default_value(100), + "Feature cutoff threshold")("svm_option", po::value<string>(), + "Parameters for SVMLight classifier")( "help", "produce help message"); po::variables_map vm; @@ -839,8 +648,8 @@ int main(int argc, char** argv) { str("parse_file", vm).c_str(), str("align_file", vm).c_str(), str("source_file", vm).c_str(), str("target_file", vm).c_str(), str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), - vm["classifier_type"].as<int>(), vm["feature_cutoff"].as<int>(), pOption); + vm["feature_cutoff"].as<int>(), pOption); delete pTrainer; - return 1; + return 0; } diff --git a/utils/lbfgs.h b/utils/lbfgs.h index ed5cd944..4d706f7a 100644 --- a/utils/lbfgs.h +++ b/utils/lbfgs.h @@ -1,6 +1,8 @@ #ifndef _LBFGS_H_ #define _LBFGS_H_ +#include <vector> + // template<class FuncGrad> // std::vector<double> // perform_LBFGS(FuncGrad func_grad, const std::vector<double> & x0); @@ -13,7 +15,6 @@ std::vector<double> perform_OWLQN( double (*func_grad)(const std::vector<double> &, std::vector<double> &), const std::vector<double> &x0, const double C); -// const int LBFGS_M = 7; const int LBFGS_M = 10; #endif diff --git a/utils/maxent.cpp b/utils/maxent.cpp index 9115f6f2..0f49ee9d 100644 --- a/utils/maxent.cpp +++ b/utils/maxent.cpp @@ -142,7 +142,7 @@ int ME_Model::make_feature_bag(const int cutoff) { // count the occurrences of features #ifdef USE_HASH_MAP - typedef __gnu_cxx::hash_map<unsigned int, int> map_type; + typedef std::unordered_map<unsigned int, int> map_type; #else typedef std::map<unsigned int, int> map_type; #endif diff --git a/utils/srl_sentence.h b/utils/srl_sentence.h index 61532fb2..9d509600 100644 --- a/utils/srl_sentence.h +++ b/utils/srl_sentence.h @@ -8,15 +8,12 @@ #ifndef SRL_SENTENCE_H_ #define SRL_SENTENCE_H_ - #include <sstream> #include <vector> #include "tree.h" #include "stringlib.h" -using namespace std; - struct SArgument { SArgument(const char* pszRole, int iBegin, int iEnd, float fProb) { m_pszRole = new char[strlen(pszRole) + 1]; @@ -38,7 +35,7 @@ struct SArgument { char* m_pszRole; // argument rule, e.g., ARG0, ARGM-TMP int m_iBegin; - int m_iEnd; // the span of the argument, [m_iBegin, m_iEnd] + int m_iEnd; // the span of the argument, [m_iBegin, m_iEnd] float m_fProb; // the probability of this role, STreeItem* m_pTreeItem; }; @@ -68,8 +65,8 @@ struct SPredicate { char* m_pszLemma; // lemma of the predicate, for Chinese, it's always as same // as the predicate itself - int m_iPosition; // the position in sentence - vector<SArgument*> m_vecArgt; // arguments associated to the predicate + int m_iPosition; // the position in sentence + std::vector<SArgument*> m_vecArgt; // arguments associated to the predicate }; struct SSrlSentence { @@ -91,7 +88,7 @@ struct SSrlSentence { int GetPredicateNum() { return m_vecPred.size(); } SParsedTree* m_pTree; - vector<SPredicate*> m_vecPred; + std::vector<SPredicate*> m_vecPred; }; struct SSrlSentenceReader { @@ -116,7 +113,7 @@ struct SSrlSentenceReader { // TODO: here only considers flat predicate-argument structure // i.e., no overlap among them SSrlSentence* fnReadNextSrlSentence() { - vector<vector<string> > vecContent; + std::vector<std::vector<std::string> > vecContent; if (fnReadNextContent(vecContent) == false) return NULL; SSrlSentence* pSrlSentence = new SSrlSentence(); @@ -124,18 +121,18 @@ struct SSrlSentenceReader { // put together syntactic text std::ostringstream ostr; for (int i = 0; i < iSize; i++) { - string strSynSeg = + std::string strSynSeg = vecContent[i][5]; // the 5th column is the syntactic segment size_t iPosition = strSynSeg.find_first_of('*'); - assert(iPosition != string::npos); - ostringstream ostrTmp; + assert(iPosition != std::string::npos); + std::ostringstream ostrTmp; ostrTmp << "(" << vecContent[i][2] << " " << vecContent[i][0] << ")"; // the 2th column is POS-tag, and the 0th column is word strSynSeg.replace(iPosition, 1, ostrTmp.str()); fnReplaceAll(strSynSeg, "(", " ("); ostr << strSynSeg; } - string strSyn = ostr.str(); + std::string strSyn = ostr.str(); pSrlSentence->m_pTree = SParsedTree::fnConvertFromString(strSyn.c_str()); pSrlSentence->m_pTree->fnSetHeadWord(); pSrlSentence->m_pTree->fnSetSpanInfo(); @@ -143,9 +140,9 @@ struct SSrlSentenceReader { // read predicate-argument structure int iNumPred = vecContent[0].size() - 8; for (int i = 0; i < iNumPred; i++) { - vector<string> vecRole; - vector<int> vecBegin; - vector<int> vecEnd; + std::vector<std::string> vecRole; + std::vector<int> vecBegin; + std::vector<int> vecEnd; int iPred = -1; for (int j = 0; j < iSize; j++) { const char* p = vecContent[j][i + 8].c_str(); @@ -184,7 +181,7 @@ struct SSrlSentenceReader { } private: - bool fnReadNextContent(vector<vector<string> >& vecContent) { + bool fnReadNextContent(std::vector<std::vector<std::string> >& vecContent) { vecContent.clear(); if (feof(m_fpIn) == true) return false; char* pszLine; @@ -200,7 +197,7 @@ struct SSrlSentenceReader { } if (iLen == 0) break; // end of this sentence - vector<string> terms = SplitOnWhitespace(string(pszLine)); + std::vector<std::string> terms = SplitOnWhitespace(std::string(pszLine)); assert(terms.size() > 7); vecContent.push_back(terms); } diff --git a/utils/synutils.h b/utils/synutils.h index ef7b78b7..f611553e 100644 --- a/utils/synutils.h +++ b/utils/synutils.h @@ -17,21 +17,17 @@ #include <string> #include <unordered_map> -using namespace std; - typedef std::unordered_map<std::string, int> MapString2Int; typedef std::unordered_map<std::string, float> MapString2Float; typedef std::unordered_map<std::string, float>::iterator MapString2FloatIterator; -using namespace std; - struct SFReader { SFReader() {} virtual ~SFReader() {} virtual bool fnReadNextLine(char* pszLine, int* piLength) = 0; - virtual bool fnReadNextLine(string& strLine) = 0; + virtual bool fnReadNextLine(std::string& strLine) = 0; }; struct STxtFileReader : public SFReader { @@ -63,13 +59,13 @@ struct STxtFileReader : public SFReader { return true; } - bool fnReadNextLine(string& strLine) { + bool fnReadNextLine(std::string& strLine) { char* pszLine = new char[10001]; bool bOut = fnReadNextLine(pszLine, NULL); if (bOut) - strLine = string(pszLine); + strLine = std::string(pszLine); else - strLine = string(""); + strLine = std::string(""); delete[] pszLine; return bOut; @@ -108,13 +104,13 @@ struct SGZFileReader : public SFReader { return true; } - bool fnReadNextLine(string& strLine) { + bool fnReadNextLine(std::string& strLine) { char* pszLine = new char[10001]; bool bOut = fnReadNextLine(pszLine, NULL); if (bOut) - strLine = string(pszLine); + strLine = std::string(pszLine); else - strLine = string(""); + strLine = std::string(""); delete[] pszLine; return bOut; diff --git a/utils/tree.h b/utils/tree.h index 8070f828..6c3406d6 100644 --- a/utils/tree.h +++ b/utils/tree.h @@ -14,8 +14,6 @@ #include <string> #include <vector> -using namespace std; - struct STreeItem { STreeItem(const char *pszTerm) { m_pszTerm = new char[strlen(pszTerm) + 1]; @@ -53,18 +51,18 @@ struct STreeItem { public: char *m_pszTerm; - vector<STreeItem *> m_vecChildren; // children items - STreeItem *m_ptParent; // the parent item + std::vector<STreeItem *> m_vecChildren; // children items + STreeItem *m_ptParent; // the parent item int m_iBegin; - int m_iEnd; // the node span words[m_iBegin, m_iEnd] - int m_iHeadChild; // the index of its head child - int m_iHeadWord; // the index of its head word + int m_iEnd; // the node span words[m_iBegin, m_iEnd] + int m_iHeadChild; // the index of its head child + int m_iHeadWord; // the index of its head word int m_iBrotherIndex; // the index in his brothers }; struct SGetHeadWord { - typedef vector<string> CVectorStr; + typedef std::vector<std::string> CVectorStr; SGetHeadWord() {} ~SGetHeadWord() {} int fnGetHeadWord(char *pszCFGLeft, CVectorStr vectRight) { @@ -311,7 +309,7 @@ struct SParsedTree { if (strcmp(pszStr, "(())") == 0) return NULL; SParsedTree *pTree = new SParsedTree(); - vector<string> vecSyn; + std::vector<std::string> vecSyn; fnReadSyntactic(pszStr, vecSyn); int iLeft = 1, iRight = 1; //# left/right parenthesis @@ -418,13 +416,13 @@ struct SParsedTree { for (I = 0; I < ptItem->m_vecChildren.size(); I++) fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[I], pGetHeadWord); - vector<string> vecRight; + std::vector<std::string> vecRight; if (ptItem->m_vecChildren.size() == 1) iHeadchild = 0; else { for (I = 0; I < ptItem->m_vecChildren.size(); I++) - vecRight.push_back(string(ptItem->m_vecChildren[I]->m_pszTerm)); + vecRight.push_back(std::string(ptItem->m_vecChildren[I]->m_pszTerm)); iHeadchild = pGetHeadWord->fnGetHeadWord(ptItem->m_pszTerm, vecRight); } @@ -433,7 +431,8 @@ struct SParsedTree { ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord; } - static void fnReadSyntactic(const char *pszSyn, vector<string> &vec) { + static void fnReadSyntactic(const char *pszSyn, + std::vector<std::string> &vec) { char *p; int I; @@ -481,29 +480,29 @@ struct SParsedTree { if ((pszTerm[0] == '(') || (pszTerm[strlen(pszTerm) - 1] == ')')) { if (pszTerm[0] == '(') { - vec.push_back(string("(")); + vec.push_back(std::string("(")); iLeftNum++; I = 1; while (pszTerm[I] == '(' && pszTerm[I] != '\0') { - vec.push_back(string("(")); + vec.push_back(std::string("(")); iLeftNum++; I++; } - if (strlen(pszTerm) > 1) vec.push_back(string(pszTerm + I)); + if (strlen(pszTerm) > 1) vec.push_back(std::string(pszTerm + I)); } else { char *pTmp; pTmp = pszTerm + strlen(pszTerm) - 1; while ((pTmp[0] == ')') && (pTmp >= pszTerm)) pTmp--; pTmp[1] = '\0'; - if (strlen(pszTerm) > 0) vec.push_back(string(pszTerm)); + if (strlen(pszTerm) > 0) vec.push_back(std::string(pszTerm)); pTmp += 2; for (I = 0; I <= (int)strlen(pTmp); I++) { - vec.push_back(string(")")); + vec.push_back(std::string(")")); iRightNum++; } } @@ -512,26 +511,26 @@ struct SParsedTree { q = strchr(pszTerm, ')'); if (q != NULL) { q[0] = '\0'; - if (pszTerm[0] != '\0') vec.push_back(string(pszTerm)); - vec.push_back(string(")")); + if (pszTerm[0] != '\0') vec.push_back(std::string(pszTerm)); + vec.push_back(std::string(")")); iRightNum++; q++; while (q[0] == ')') { - vec.push_back(string(")")); + vec.push_back(std::string(")")); q++; iRightNum++; } while (q[0] == '(') { - vec.push_back(string("(")); + vec.push_back(std::string("(")); q++; iLeftNum++; } - if (q[0] != '\0') vec.push_back(string(q)); + if (q[0] != '\0') vec.push_back(std::string(q)); } else - vec.push_back(string(pszTerm)); + vec.push_back(std::string(pszTerm)); } } @@ -547,10 +546,10 @@ struct SParsedTree { if (vec.size() >= 2 && strcmp(vec[1].c_str(), "(") == 0) { //( (IP..) ) - std::vector<string>::iterator it; + std::vector<std::string>::iterator it; it = vec.begin(); it++; - vec.insert(it, string("ROOT")); + vec.insert(it, std::string("ROOT")); } break; @@ -563,7 +562,7 @@ struct SParsedTree { public: STreeItem *m_ptRoot; - vector<STreeItem *> m_vecTerminals; // the leaf nodes + std::vector<STreeItem *> m_vecTerminals; // the leaf nodes }; struct SParseReader { @@ -645,7 +644,7 @@ struct SParseReader { for (size_t i = 0; i < pTreeItem->m_vecChildren.size(); i++) fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]); - vector<string> vecRight; + std::vector<std::string> vecRight; int iHeadchild; @@ -658,7 +657,7 @@ struct SParseReader { if (p[0] == '*' && p[strlen(p) - 1] == '*') { iHeadchild = i; p[strlen(p) - 1] = '\0'; - string str = p + 1; + std::string str = p + 1; strcpy(p, str.c_str()); // erase the "*..*" break; } diff --git a/utils/tsuruoka_maxent.h b/utils/tsuruoka_maxent.h index e6bef232..550a4b7f 100644 --- a/utils/tsuruoka_maxent.h +++ b/utils/tsuruoka_maxent.h @@ -6,17 +6,16 @@ #ifndef TSURUOKA_MAXENT_H_ #define TSURUOKA_MAXENT_H_ -#include "synutils.h" -#include "stringlib.h" -#include "maxent.h" - #include <assert.h> -#include <vector> -#include <string> #include <string.h> +#include <string> #include <unordered_map> +#include <utility> +#include <vector> -using namespace std; +#include "synutils.h" +#include "stringlib.h" +#include "maxent.h" typedef std::unordered_map<std::string, int> Map; typedef std::unordered_map<std::string, int>::iterator Iterator; @@ -35,7 +34,7 @@ struct Tsuruoka_Maxent { } void fnTrain(const char* pszInstanceFName, const char* pszAlgorithm, - const char* pszModelFName, int iNumIteration) { + const char* pszModelFName, int /*iNumIteration*/) { assert(strcmp(pszAlgorithm, "l1") == 0 || strcmp(pszAlgorithm, "l2") == 0 || strcmp(pszAlgorithm, "sgd") == 0 || strcmp(pszAlgorithm, "SGD") == 0); @@ -67,10 +66,10 @@ struct Tsuruoka_Maxent { assert(p != NULL); p[0] = '\0'; p++; - vector<string> vecContext; - SplitOnWhitespace(string(pszLine), &vecContext); + std::vector<std::string> vecContext; + SplitOnWhitespace(std::string(pszLine), &vecContext); - pmes->label = string(p); + pmes->label = std::string(p); for (size_t i = 0; i < vecContext.size(); i++) pmes->add_feature(vecContext[i]); pModel->add_training_sample((*pmes)); @@ -98,53 +97,53 @@ struct Tsuruoka_Maxent { } double fnEval(const char* pszContext, const char* pszOutcome) const { - vector<string> vecContext; + std::vector<std::string> vecContext; ME_Sample* pmes = new ME_Sample(); - SplitOnWhitespace(string(pszContext), &vecContext); + SplitOnWhitespace(std::string(pszContext), &vecContext); for (size_t i = 0; i < vecContext.size(); i++) pmes->add_feature(vecContext[i]); - vector<double> vecProb = m_pModel->classify(*pmes); + std::vector<double> vecProb = m_pModel->classify(*pmes); delete pmes; int iLableID = m_pModel->get_class_id(pszOutcome); return vecProb[iLableID]; } void fnEval(const char* pszContext, - vector<pair<string, double> >& vecOutput) const { - vector<string> vecContext; + std::vector<std::pair<std::string, double> >& vecOutput) const { + std::vector<std::string> vecContext; ME_Sample* pmes = new ME_Sample(); - SplitOnWhitespace(string(pszContext), &vecContext); + SplitOnWhitespace(std::string(pszContext), &vecContext); vecOutput.clear(); for (size_t i = 0; i < vecContext.size(); i++) pmes->add_feature(vecContext[i]); - vector<double> vecProb = m_pModel->classify(*pmes); + std::vector<double> vecProb = m_pModel->classify(*pmes); for (size_t i = 0; i < vecProb.size(); i++) { - string label = m_pModel->get_class_label(i); + std::string label = m_pModel->get_class_label(i); vecOutput.push_back(make_pair(label, vecProb[i])); } delete pmes; } - void fnEval(const char* pszContext, vector<double>& vecOutput) const { - vector<string> vecContext; + void fnEval(const char* pszContext, std::vector<double>& vecOutput) const { + std::vector<std::string> vecContext; ME_Sample* pmes = new ME_Sample(); - SplitOnWhitespace(string(pszContext), &vecContext); + SplitOnWhitespace(std::string(pszContext), &vecContext); vecOutput.clear(); for (size_t i = 0; i < vecContext.size(); i++) pmes->add_feature(vecContext[i]); - vector<double> vecProb = m_pModel->classify(*pmes); + std::vector<double> vecProb = m_pModel->classify(*pmes); for (size_t i = 0; i < vecProb.size(); i++) { - string label = m_pModel->get_class_label(i); + std::string label = m_pModel->get_class_label(i); vecOutput.push_back(vecProb[i]); } delete pmes; } - int fnGetClassId(const string& strLabel) const { + int fnGetClassId(const std::string& strLabel) const { return m_pModel->get_class_id(strLabel); } |