diff options
author | Wu, Ke <wuke@cs.umd.edu> | 2014-10-07 17:12:32 -0400 |
---|---|---|
committer | Wu, Ke <wuke@cs.umd.edu> | 2014-10-07 17:12:32 -0400 |
commit | cc9bfaeafad972cfe40e6cb804f60adba0c17be1 (patch) | |
tree | 87589f120385e2ded4337e4d3683431fe4ced5a2 /utils/synutils/constituent_reorder_model.cc | |
parent | b559c5db28f86b506fbfe152fde027d2fa408111 (diff) |
Import synutils
Diffstat (limited to 'utils/synutils/constituent_reorder_model.cc')
-rw-r--r-- | utils/synutils/constituent_reorder_model.cc | 796 |
1 files changed, 796 insertions, 0 deletions
diff --git a/utils/synutils/constituent_reorder_model.cc b/utils/synutils/constituent_reorder_model.cc new file mode 100644 index 00000000..485c9667 --- /dev/null +++ b/utils/synutils/constituent_reorder_model.cc @@ -0,0 +1,796 @@ +/* + * constituent_reorder_model.cc + * + * Created on: Jul 10, 2013 + * Author: junhuili + */ + + +#include <boost/program_options.hpp> + +#include "alignment.h" +#include "tree.h" +#include "utility.h" +#include "tsuruoka_maxent.h" + +#include <tr1/unordered_map> + +using namespace std; + + +typedef std::tr1::unordered_map<std::string, int> Map; +typedef std::tr1::unordered_map<std::string, int>::iterator Iterator; + +namespace po = boost::program_options; + +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { + SFReader *pFReader = new STxtFileReader(pszFName); + char *pszLine = new char[ 100001 ]; + int iLen; + Map hashPredicate; + while (pFReader->fnReadNextLine(pszLine, &iLen)) { + if (iLen == 0) + continue; + + vector<string> vecTerms; + SplitOnWhitespace(string(pszLine), &vecTerms); + + for (size_t i = 0; i < vecTerms.size() - 1; i++) { + Iterator iter = hashPredicate.find(vecTerms[i]); + if (iter == hashPredicate.end()) { + hashPredicate[vecTerms[i]] = 1; + + } else { + iter->second++; + } + } + } + delete pFReader; + + pFReader = new STxtFileReader(pszFName); + FILE *fpOut = fopen(pszNewFName, "w"); + while (pFReader->fnReadNextLine(pszLine, &iLen)) { + if (iLen == 0) + continue; + + vector<string> vecTerms; + SplitOnWhitespace(string(pszLine), &vecTerms); + ostringstream ostr; + for (size_t i = 0; i < vecTerms.size() - 1; i++) { + Iterator iter = hashPredicate.find(vecTerms[i]); + assert(iter != hashPredicate.end()); + if (iter->second >= iCutoff) { + ostr << vecTerms[i] << " "; + } + } + if (ostr.str().length() > 0) { + ostr << vecTerms[vecTerms.size() - 1]; + fprintf(fpOut, "%s\n", ostr.str().c_str()); + } + } + fclose(fpOut); + delete pFReader; + + + delete [] pszLine; +} + +struct SConstReorderTrainer{ + SConstReorderTrainer(const char* pszSynFname, //source-side flattened parse tree file name + const char* pszAlignFname, //alignment filename + const char* pszSourceFname, //source file name + const char* pszTargetFname, //target file name + const char* pszInstanceFname, //training instance file name + const char* pszModelPrefix, //classifier model file name prefix + int iClassifierType, //classifier type + int iCutoff, //feature count threshold + const char* pszOption //other classifier parameters (for svmlight) + ) { + fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); + + string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); + string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + + string strModelLeftFname = string(pszModelPrefix) + string(".left"); + string strModelRightFname = string(pszModelPrefix) + string(".right"); + + fprintf(stdout, "...Training the left ordering model\n"); + fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), iCutoff); + fprintf(stdout, "...Training the right ordering model\n"); + fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), iCutoff); + } + ~SConstReorderTrainer() { + + } + +private: + + void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { + char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; + if (iCutoff > 0) { + sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); + fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); + } else { + strcpy(pszNewInstanceFName, pszInstanceFname); + } + + /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); + pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); + delete pZhangleMaxent;*/ + + Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); + pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); + delete pMaxent; + + if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { + sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); + system(pszNewInstanceFName); + } + delete [] pszNewInstanceFName; + } + + inline bool fnIsVerbPOS(const char* pszTerm) { + if (strcmp(pszTerm, "VV") == 0 + || strcmp(pszTerm, "VA") == 0 + || strcmp(pszTerm, "VC") == 0 + || strcmp(pszTerm, "VE") == 0) + return true; + return false; + } + + inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, const SAlignment *pAlign, string& strOutcome) { + if (iL1 == -1 && iL2 == -1) + strOutcome = "BU"; //1. both are untranslated + else if (iL1 == -1) + strOutcome = "1U"; //2. XP1 is untranslated + else if (iL2 == -1) + strOutcome = "2U"; //3. XP2 is untranslated + else if (iL1 == iL2 && iR2 == iR2) + strOutcome = "SS"; //4. Have same scope + else if (iL1 <= iL2 && iR1 >= iR2) + strOutcome = "1C2"; //5. XP1's translation covers XP2's + else if (iL1 >= iL2 && iR1 <= iR2) + strOutcome = "2C1"; //6. XP2's translation covers XP1's + else if (iR1 < iL2) { + int i = iR1 + 1; + /*while (i < iL2) { + if (pAlign->fnIsAligned(i, false)) + break; + i++; + }*/ + if (i == iL2) + strOutcome = "M"; //7. Monotone + else + strOutcome = "DM"; //8. Discontinuous monotone + } else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) + strOutcome = "OM"; //9. Overlap monotone + else if (iR2 < iL1) { + int i = iR2 + 1; + /*while (i < iL1) { + if (pAlign->fnIsAligned(i, false)) + break; + i++; + }*/ + if (i == iL1) + strOutcome = "S"; //10. Swap + else + strOutcome = "DS"; //11. Discontinuous swap + } else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) + strOutcome = "OS"; //12. Overlap swap + else + assert(false); + } + + inline void fnGetOutcome(int i1, int i2, string& strOutcome) { + assert(i1 != i2); + if (i1 < i2) { + if (i2 > i1 + 1) strOutcome = string("DM"); + else strOutcome = string("M"); + } else { + if (i1 > i2 + 1) strOutcome = string("DS"); + else strOutcome = string("S"); + } + } + + inline void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) { + vecPosition.clear(); + + vector<float> vec; + for (size_t i = 0; i < vecLeft.size(); i++) { + if (vecLeft[i] == -1) { + if (i == 0) + vec.push_back(-1); + else + vec.push_back(vecLeft[i-1] + 0.1); + } else + vec.push_back(vecLeft[i]); + } + + for (size_t i = 0; i < vecLeft.size(); i++) { + int count = 0; + + for (size_t j = 0; j < vecLeft.size(); j++) { + if ( j == i) continue; + if (vec[j] < vec[i]) { + count++; + } else if (vec[j] == vec[i] && j < i) { + count++; + } + } + vecPosition.push_back(count); + } + } + + /* + * features: + * f1: (left_label, right_label, parent_label) + * f2: (left_label, right_label, parent_label, other_right_sibling_label) + * f3: (left_label, right_label, parent_label, other_left_sibling_label) + * f4: (left_label, right_label, left_head_pos) + * f5: (left_label, right_label, left_head_word) + * f6: (left_label, right_label, right_head_pos) + * f7: (left_label, right_label, right_head_word) + * f8: (left_label, right_label, left_chunk_status) + * f9: (left_label, right_label, right_chunk_status) + * f10: (left_label, parent_label) + * f11: (right_label, parent_label) + */ + void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, int iPos, const vector<string>& vecChunkStatus, const vector<int>& vecPosition, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { + STreeItem *pCon1, *pCon2; + pCon1 = pParent->m_vecChildren[iPos - 1]; + pCon2 = pParent->m_vecChildren[iPos]; + + fnGetOutcome(vecPosition[iPos-1], vecPosition[iPos], strOutcome); + + string left_label = string(pCon1->m_pszTerm); + string right_label = string(pCon2->m_pszTerm); + string parent_label = string(pParent->m_pszTerm); + + vector<string> vec_other_right_sibling; + for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) + vec_other_right_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); + if (vec_other_right_sibling.size() == 0) + vec_other_right_sibling.push_back(string("NULL")); + vector<string> vec_other_left_sibling; + for (int i = 0; i < iPos - 1; i++) + vec_other_left_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); + if (vec_other_left_sibling.size() == 0) + vec_other_left_sibling.push_back(string("NULL")); + + //generate features + //f1 + ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; + //f2 + for (int i = 0; i < vec_other_right_sibling.size(); i++) + ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_right_sibling[i]; + //f3 + for (int i = 0; i < vec_other_left_sibling.size(); i++) + ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_left_sibling[i]; + //f4 + ostr << " f4=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; + //f5 + ostr << " f5=" << left_label << "_" << right_label << "_" << vecSTerms[pCon1->m_iHeadWord]; + //f6 + ostr << " f6=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; + //f7 + ostr << " f7=" << left_label << "_" << right_label << "_" << vecSTerms[pCon2->m_iHeadWord]; + //f8 + ostr << " f8=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos - 1]; + //f9 + ostr << " f9=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos]; + //f10 + ostr << " f10=" << left_label << "_" << parent_label; + //f11 + ostr << " f11=" << right_label << "_" << parent_label; + } + + /* + * Source side (11 features): + * f1: the categories of XP1 and XP2 (f1_1, f1_2) + * f2: the head words of XP1 and XP2 (f2_1, f2_2) + * f3: the first and last word of XP1 (f3_f, f3_l) + * f4: the first and last word of XP2 (f4_f, f4_l) + * f5: is XP1 or XP2 the head node (f5_1, f5_2) + * f6: the category of the common parent + * Target side (6 features): + * f7: the first and the last word of XP1's translation (f7_f, f7_l) + * f8: the first and the last word of XP2's translation (f8_f, f8_l) + * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) + */ + void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, const STreeItem *pCon1, const STreeItem *pCon2, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { + + int iLeft1, iRight1, iLeft2, iRight2; + pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); + pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, iRight2); + + fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); + + //generate features + //f1 + ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; + //f2 + ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" << vecSTerms[pCon2->m_iHeadWord]; + //f3 + ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] << " f3_l=" << vecSTerms[pCon1->m_iEnd]; + //f4 + ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] << " f4_l=" << vecSTerms[pCon2->m_iEnd]; + //f5 + if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) + ostr << " f5_1=1"; + else + ostr << " f5_1=0"; + if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) + ostr << " f5_2=1"; + else + ostr << " f5_2=0"; + //f6 + ostr << " f6=" << pParent->m_pszTerm; + + /*//f7 + if (iLeft1 != -1) { + ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << vecTTerms[iRight1]; + } + if (iLeft2 != -1) { + ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << vecTTerms[iRight2]; + } + + const vector<int>* pvecTarget = pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); + string str = ""; + for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { + str += vecTTerms[(*pvecTarget)[i]] + "_"; + } + if (str.length() > 0) { + ostr << " f9_1=" << str.substr(0, str.size()-1); + } + pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); + str = ""; + for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { + str += vecTTerms[(*pvecTarget)[i]] + "_"; + } + if (str.length() > 0) { + ostr << " f9_2=" << str.substr(0, str.size()-1); + } */ + + } + + void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ + for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { + STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; + + while (pParent != NULL) { + //if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { + if (pParent->m_vecChildren.size() > 1) { + //do constituent reordering for all children of pParent + vecFocused.push_back(pParent); + } + if (pParent->m_iBrotherIndex != 0) break; + pParent = pParent->m_ptParent; + } + } + } + + void fnGenerateInstanceFile(const char* pszSynFname, //source-side flattened parse tree file name + const char* pszAlignFname, //alignment filename + const char* pszSourceFname, //source file name + const char* pszTargetFname, //target file name + const char* pszInstanceFname //training instance file name + ) { + SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); + SParseReader *pParseReader = new SParseReader(pszSynFname, false); + STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); + STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); + + string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); + string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + + FILE *fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); + assert(fpLeftOut != NULL); + + FILE *fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); + assert(fpRightOut != NULL); + + //read sentence by sentence + SAlignment *pAlign; + SParsedTree *pTree; + char *pszLine = new char[50001]; + int iSentNum = 0; + while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { + pTree = pParseReader->fnReadNextParseTree(); + assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecSTerms; + SplitOnWhitespace(string(pszLine), &vecSTerms); + assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecTTerms; + SplitOnWhitespace(string(pszLine), &vecTTerms); + + + if (pTree != NULL) { + + vector<STreeItem*> vecFocused; + fnGetFocusedParentNodes(pTree, vecFocused); + + for (size_t i = 0; i < vecFocused.size(); i++) { + + STreeItem *pParent = vecFocused[i]; + + vector<int> vecLeft, vecRight; + for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { + STreeItem *pCon1 = pParent->m_vecChildren[j]; + int iLeft1, iRight1; + pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); + vecLeft.push_back(iLeft1); + vecRight.push_back(iRight1); + } + vector<int> vecLeftPosition; + fnGetRelativePosition(vecLeft, vecLeftPosition); + vector<int> vecRightPosition; + fnGetRelativePosition(vecRight, vecRightPosition); + + vector<string> vecChunkStatus; + for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { + string strOutcome = pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, pParent->m_vecChildren[j]->m_iEnd); + vecChunkStatus.push_back(strOutcome); + } + + for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { + //children[j-1] vs. children[j] reordering + + string strLeftOutcome; + ostringstream ostr; + + fnGenerateInstance(pTree, pParent, j, vecChunkStatus, vecLeftPosition, vecSTerms, vecTTerms, strLeftOutcome, ostr); + + //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); + fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); + + string strRightOutcome; + fnGetOutcome(vecRightPosition[j-1], vecRightPosition[j], strRightOutcome); + fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str(), strRightOutcome.c_str()); + } + } + delete pTree; + } + + delete pAlign; + iSentNum++; + + if (iSentNum % 100000 == 0) + fprintf(stderr, "#%d\n", iSentNum); + } + + + fclose(fpLeftOut); + fclose(fpRightOut); + delete pAlignReader; + delete pParseReader; + delete pTxtSReader; + delete pTxtTReader; + delete [] pszLine; + } + + void fnGenerateInstanceFile2(const char* pszSynFname, //source-side flattened parse tree file name + const char* pszAlignFname, //alignment filename + const char* pszSourceFname, //source file name + const char* pszTargetFname, //target file name + const char* pszInstanceFname //training instance file name + ) { + SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); + SParseReader *pParseReader = new SParseReader(pszSynFname, false); + STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); + STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); + + FILE *fpOut = fopen(pszInstanceFname, "w"); + assert(fpOut != NULL); + + //read sentence by sentence + SAlignment *pAlign; + SParsedTree *pTree; + char *pszLine = new char[50001]; + int iSentNum = 0; + while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { + pTree = pParseReader->fnReadNextParseTree(); + assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecSTerms; + SplitOnWhitespace(string(pszLine), &vecSTerms); + assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecTTerms; + SplitOnWhitespace(string(pszLine), &vecTTerms); + + + if (pTree != NULL) { + + vector<STreeItem*> vecFocused; + fnGetFocusedParentNodes(pTree, vecFocused); + + for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + + STreeItem *pParent = vecFocused[i]; + + for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { + //children[j-1] vs. children[j] reordering + + string strOutcome; + ostringstream ostr; + + fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j-1], pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); + + //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + } + } + delete pTree; + } + + delete pAlign; + iSentNum++; + + if (iSentNum % 100000 == 0) + fprintf(stderr, "#%d\n", iSentNum); + } + + + fclose(fpOut); + delete pAlignReader; + delete pParseReader; + delete pTxtSReader; + delete pTxtTReader; + delete [] pszLine; + } +}; + +struct SConstContTrainer{ + SConstContTrainer(const char* pszFlattenedSynFname, //source-side flattened parse tree file name + const char* pszAlignFname, //alignment filename + const char* pszSourceFname, //source file name + const char* pszTargetFname, //target file name + const char* pszInstanceFname, //training instance file name + const char* pszModelPrefix, //classifier model file name prefix + int iClassifierType, //classifier type + int iCutoff, //feature count threshold + const char* pszOption //other classifier parameters (for svmlight) + ) { + fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); + //fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, pszOption); + fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); + } + ~SConstContTrainer() { + + } + +private: + + void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { + char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; + if (iCutoff > 0) { + sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); + fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); + } else { + strcpy(pszNewInstanceFName, pszInstanceFname); + } + + /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); + pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); + delete pZhangleMaxent;*/ + + Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); + pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); + delete pMaxent; + + if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { + sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); + system(pszNewInstanceFName); + } + delete [] pszNewInstanceFName; + } + + + void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ + for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { + STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; + + while (pParent != NULL) { + //if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { + if (pParent->m_vecChildren.size() > 1) { + //do constituent reordering for all children of pParent + vecFocused.push_back(pParent); + } + if (pParent->m_iBrotherIndex != 0) break; + pParent = pParent->m_ptParent; + } + } + } + + inline void fnGetOutcome(int iL1, int iR1, const SAlignment *pAlign, string& strOutcome) { + strOutcome = pAlign->fnIsContinuous(iL1, iR1); + } + + inline string fnGetLengthType(int iLen) { + if (iLen == 1) + return string("1"); + if (iLen == 2) + return string("2"); + if (iLen == 3) + return string("3"); + if (iLen < 6) + return string("4"); + if (iLen < 11) + return string("6"); + return string("11"); + } + + /* + * Source side (11 features): + * f1: the syntactic category + * f2: the syntactic category of its parent + * f3: the head word's pos + * f4: =1 if it's the head of its parent node + * or + * the head of its parent node + * f5: length type + */ + void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pCon1, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { + + fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); + + //generate features + //f1 + ostr << "f1=" << pCon1->m_pszTerm; + //f2 + ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; + //f3 + ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; + //f4 + if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { + ostr << " f4=1"; + } else { + ostr << " f4=" << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]->m_pszTerm; + } + //f5 + ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); + } + + void fnGenerateInstanceFile(const char* pszFlattenedSynFname, //source-side flattened parse tree file name + const char* pszAlignFname, //alignment filename + const char* pszSourceFname, //source file name + const char* pszTargetFname, //target file name + const char* pszInstanceFname //training instance file name + ) { + SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); + SParseReader *pParseReader = new SParseReader(pszFlattenedSynFname, true); + STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); + STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); + + FILE *fpOut = fopen(pszInstanceFname, "w"); + assert(fpOut != NULL); + + //read sentence by sentence + SAlignment *pAlign; + SParsedTree *pTree; + char *pszLine = new char[50001]; + int iSentNum = 0; + while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { + pTree = pParseReader->fnReadNextParseTree(); + assert(pTree != NULL); + assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecSTerms; + SplitOnWhitespace(string(pszLine), &vecSTerms); + assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecTTerms; + SplitOnWhitespace(string(pszLine), &vecTTerms); + + vector<STreeItem*> vecFocused; + fnGetFocusedParentNodes(pTree, vecFocused); + + for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + + STreeItem *pParent = vecFocused[i]; + + for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { + //children[j-1] vs. children[j] reordering + + string strOutcome; + ostringstream ostr; + + fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); + + //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + } + } + + delete pAlign; + delete pTree; + iSentNum++; + + if (iSentNum % 100000 == 0) + fprintf(stderr, "#%d\n", iSentNum); + } + + + fclose(fpOut); + delete pAlignReader; + delete pParseReader; + delete pTxtSReader; + delete pTxtTReader; + delete [] pszLine; + } +}; + +inline void print_options(std::ostream &out,po::options_description const& opts) { + typedef std::vector< boost::shared_ptr<po::option_description> > Ds; + Ds const& ds=opts.options(); + out << '"'; + for (unsigned i=0;i<ds.size();++i) { + if (i) out<<' '; + out<<"--"<<ds[i]->long_name(); + } + out << '\n'; +} +inline string str(char const* name,po::variables_map const& conf) { + return conf[name].as<string>(); +} + +//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file /scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file /scratch0/mt_exp/gq-ctb/data/train.cn --target_file /scratch0/mt_exp/gq-ctb/data/train.en --instance_file /scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix /scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 --classifier_type 1 +int main(int argc, char** argv) { + + po::options_description opts("Configuration options"); + opts.add_options() + ("parse_file",po::value<string>(),"parse file path (input)") + ("align_file",po::value<string>(),"Alignment file path (input)") + ("source_file",po::value<string>(),"Source text file path (input)") + ("target_file",po::value<string>(),"Target text file path (input)") + ("instance_file",po::value<string>(),"Instance file path (output)") + ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated") + ("classifier_type",po::value<int>()->default_value(1),"Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for SVMLight") + ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold") + ("svm_option",po::value<string>(),"Parameters for SVMLight classifier") + ("help", "produce help message"); + + po::variables_map vm; + if (argc) { + po::store(po::parse_command_line(argc, argv, opts), vm); + po::notify(vm); + } + + if (vm.count("help")) { + print_options(cout, opts); + return 1; + } + + if (!vm.count("parse_file") + || !vm.count("align_file") + || !vm.count("source_file") + || !vm.count("target_file") + || !vm.count("instance_file") + || !vm.count("model_prefix") + ) { + print_options(cout, opts); + if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; + if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; + if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; + if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; + if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; + if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; + exit(0); + } + + const char *pOption; + if (vm.count("svm_option")) + pOption = str("svm_option", vm).c_str(); + else + pOption = NULL; + + SConstReorderTrainer *pTrainer = new SConstReorderTrainer(str("parse_file", vm).c_str(), + str("align_file", vm).c_str(), + str("source_file", vm).c_str(), + str("target_file", vm).c_str(), + str("instance_file", vm).c_str(), + str("model_prefix", vm).c_str(), + vm["classifier_type"].as<int>(), + vm["feature_cutoff"].as<int>(), + pOption); + delete pTrainer; + + return 1; + +} |