diff options
Diffstat (limited to 'utils/synutils/constituent_reorder_model.cc')
-rw-r--r-- | utils/synutils/constituent_reorder_model.cc | 1562 |
1 files changed, 806 insertions, 756 deletions
diff --git a/utils/synutils/constituent_reorder_model.cc b/utils/synutils/constituent_reorder_model.cc index 485c9667..a4fb9627 100644 --- a/utils/synutils/constituent_reorder_model.cc +++ b/utils/synutils/constituent_reorder_model.cc @@ -5,7 +5,6 @@ * Author: junhuili */ - #include <boost/program_options.hpp> #include "alignment.h" @@ -17,780 +16,831 @@ using namespace std; - typedef std::tr1::unordered_map<std::string, int> Map; typedef std::tr1::unordered_map<std::string, int>::iterator Iterator; namespace po = boost::program_options; -inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { - SFReader *pFReader = new STxtFileReader(pszFName); - char *pszLine = new char[ 100001 ]; - int iLen; - Map hashPredicate; - while (pFReader->fnReadNextLine(pszLine, &iLen)) { - if (iLen == 0) - continue; - - vector<string> vecTerms; - SplitOnWhitespace(string(pszLine), &vecTerms); - - for (size_t i = 0; i < vecTerms.size() - 1; i++) { - Iterator iter = hashPredicate.find(vecTerms[i]); - if (iter == hashPredicate.end()) { - hashPredicate[vecTerms[i]] = 1; - - } else { - iter->second++; - } - } - } - delete pFReader; - - pFReader = new STxtFileReader(pszFName); - FILE *fpOut = fopen(pszNewFName, "w"); - while (pFReader->fnReadNextLine(pszLine, &iLen)) { - if (iLen == 0) - continue; - - vector<string> vecTerms; - SplitOnWhitespace(string(pszLine), &vecTerms); - ostringstream ostr; - for (size_t i = 0; i < vecTerms.size() - 1; i++) { - Iterator iter = hashPredicate.find(vecTerms[i]); - assert(iter != hashPredicate.end()); - if (iter->second >= iCutoff) { - ostr << vecTerms[i] << " "; - } - } - if (ostr.str().length() > 0) { - ostr << vecTerms[vecTerms.size() - 1]; - fprintf(fpOut, "%s\n", ostr.str().c_str()); - } - } - fclose(fpOut); - delete pFReader; - - - delete [] pszLine; +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, + const char* pszNewFName) { + SFReader* pFReader = new STxtFileReader(pszFName); + char* pszLine = new char[100001]; + int iLen; + Map hashPredicate; + while (pFReader->fnReadNextLine(pszLine, &iLen)) { + if (iLen == 0) continue; + + vector<string> vecTerms; + SplitOnWhitespace(string(pszLine), &vecTerms); + + for (size_t i = 0; i < vecTerms.size() - 1; i++) { + Iterator iter = hashPredicate.find(vecTerms[i]); + if (iter == hashPredicate.end()) { + hashPredicate[vecTerms[i]] = 1; + + } else { + iter->second++; + } + } + } + delete pFReader; + + pFReader = new STxtFileReader(pszFName); + FILE* fpOut = fopen(pszNewFName, "w"); + while (pFReader->fnReadNextLine(pszLine, &iLen)) { + if (iLen == 0) continue; + + vector<string> vecTerms; + SplitOnWhitespace(string(pszLine), &vecTerms); + ostringstream ostr; + for (size_t i = 0; i < vecTerms.size() - 1; i++) { + Iterator iter = hashPredicate.find(vecTerms[i]); + assert(iter != hashPredicate.end()); + if (iter->second >= iCutoff) { + ostr << vecTerms[i] << " "; + } + } + if (ostr.str().length() > 0) { + ostr << vecTerms[vecTerms.size() - 1]; + fprintf(fpOut, "%s\n", ostr.str().c_str()); + } + } + fclose(fpOut); + delete pFReader; + + delete[] pszLine; } -struct SConstReorderTrainer{ - SConstReorderTrainer(const char* pszSynFname, //source-side flattened parse tree file name - const char* pszAlignFname, //alignment filename - const char* pszSourceFname, //source file name - const char* pszTargetFname, //target file name - const char* pszInstanceFname, //training instance file name - const char* pszModelPrefix, //classifier model file name prefix - int iClassifierType, //classifier type - int iCutoff, //feature count threshold - const char* pszOption //other classifier parameters (for svmlight) - ) { - fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); - - string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); - string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - - string strModelLeftFname = string(pszModelPrefix) + string(".left"); - string strModelRightFname = string(pszModelPrefix) + string(".right"); - - fprintf(stdout, "...Training the left ordering model\n"); - fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), iCutoff); - fprintf(stdout, "...Training the right ordering model\n"); - fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), iCutoff); - } - ~SConstReorderTrainer() { - - } - -private: - - void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { - char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; - if (iCutoff > 0) { - sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); - fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); - } else { - strcpy(pszNewInstanceFName, pszInstanceFname); - } - - /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); - pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); - delete pZhangleMaxent;*/ - - Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); - pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); - delete pMaxent; - - if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { - sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); - system(pszNewInstanceFName); - } - delete [] pszNewInstanceFName; - } - - inline bool fnIsVerbPOS(const char* pszTerm) { - if (strcmp(pszTerm, "VV") == 0 - || strcmp(pszTerm, "VA") == 0 - || strcmp(pszTerm, "VC") == 0 - || strcmp(pszTerm, "VE") == 0) - return true; - return false; - } - - inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, const SAlignment *pAlign, string& strOutcome) { - if (iL1 == -1 && iL2 == -1) - strOutcome = "BU"; //1. both are untranslated - else if (iL1 == -1) - strOutcome = "1U"; //2. XP1 is untranslated - else if (iL2 == -1) - strOutcome = "2U"; //3. XP2 is untranslated - else if (iL1 == iL2 && iR2 == iR2) - strOutcome = "SS"; //4. Have same scope - else if (iL1 <= iL2 && iR1 >= iR2) - strOutcome = "1C2"; //5. XP1's translation covers XP2's - else if (iL1 >= iL2 && iR1 <= iR2) - strOutcome = "2C1"; //6. XP2's translation covers XP1's - else if (iR1 < iL2) { - int i = iR1 + 1; - /*while (i < iL2) { - if (pAlign->fnIsAligned(i, false)) - break; - i++; - }*/ - if (i == iL2) - strOutcome = "M"; //7. Monotone - else - strOutcome = "DM"; //8. Discontinuous monotone - } else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) - strOutcome = "OM"; //9. Overlap monotone - else if (iR2 < iL1) { - int i = iR2 + 1; - /*while (i < iL1) { - if (pAlign->fnIsAligned(i, false)) - break; - i++; - }*/ - if (i == iL1) - strOutcome = "S"; //10. Swap - else - strOutcome = "DS"; //11. Discontinuous swap - } else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) - strOutcome = "OS"; //12. Overlap swap - else - assert(false); - } - - inline void fnGetOutcome(int i1, int i2, string& strOutcome) { - assert(i1 != i2); - if (i1 < i2) { - if (i2 > i1 + 1) strOutcome = string("DM"); - else strOutcome = string("M"); - } else { - if (i1 > i2 + 1) strOutcome = string("DS"); - else strOutcome = string("S"); - } - } - - inline void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) { - vecPosition.clear(); - - vector<float> vec; - for (size_t i = 0; i < vecLeft.size(); i++) { - if (vecLeft[i] == -1) { - if (i == 0) - vec.push_back(-1); - else - vec.push_back(vecLeft[i-1] + 0.1); - } else - vec.push_back(vecLeft[i]); - } - - for (size_t i = 0; i < vecLeft.size(); i++) { - int count = 0; - - for (size_t j = 0; j < vecLeft.size(); j++) { - if ( j == i) continue; - if (vec[j] < vec[i]) { - count++; - } else if (vec[j] == vec[i] && j < i) { - count++; - } - } - vecPosition.push_back(count); - } - } - - /* - * features: - * f1: (left_label, right_label, parent_label) - * f2: (left_label, right_label, parent_label, other_right_sibling_label) - * f3: (left_label, right_label, parent_label, other_left_sibling_label) - * f4: (left_label, right_label, left_head_pos) - * f5: (left_label, right_label, left_head_word) - * f6: (left_label, right_label, right_head_pos) - * f7: (left_label, right_label, right_head_word) - * f8: (left_label, right_label, left_chunk_status) - * f9: (left_label, right_label, right_chunk_status) - * f10: (left_label, parent_label) - * f11: (right_label, parent_label) - */ - void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, int iPos, const vector<string>& vecChunkStatus, const vector<int>& vecPosition, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { - STreeItem *pCon1, *pCon2; - pCon1 = pParent->m_vecChildren[iPos - 1]; - pCon2 = pParent->m_vecChildren[iPos]; - - fnGetOutcome(vecPosition[iPos-1], vecPosition[iPos], strOutcome); - - string left_label = string(pCon1->m_pszTerm); - string right_label = string(pCon2->m_pszTerm); - string parent_label = string(pParent->m_pszTerm); - - vector<string> vec_other_right_sibling; - for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) - vec_other_right_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); - if (vec_other_right_sibling.size() == 0) - vec_other_right_sibling.push_back(string("NULL")); - vector<string> vec_other_left_sibling; - for (int i = 0; i < iPos - 1; i++) - vec_other_left_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); - if (vec_other_left_sibling.size() == 0) - vec_other_left_sibling.push_back(string("NULL")); - - //generate features - //f1 - ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; - //f2 - for (int i = 0; i < vec_other_right_sibling.size(); i++) - ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_right_sibling[i]; - //f3 - for (int i = 0; i < vec_other_left_sibling.size(); i++) - ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_left_sibling[i]; - //f4 - ostr << " f4=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; - //f5 - ostr << " f5=" << left_label << "_" << right_label << "_" << vecSTerms[pCon1->m_iHeadWord]; - //f6 - ostr << " f6=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; - //f7 - ostr << " f7=" << left_label << "_" << right_label << "_" << vecSTerms[pCon2->m_iHeadWord]; - //f8 - ostr << " f8=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos - 1]; - //f9 - ostr << " f9=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos]; - //f10 - ostr << " f10=" << left_label << "_" << parent_label; - //f11 - ostr << " f11=" << right_label << "_" << parent_label; - } - - /* - * Source side (11 features): - * f1: the categories of XP1 and XP2 (f1_1, f1_2) - * f2: the head words of XP1 and XP2 (f2_1, f2_2) - * f3: the first and last word of XP1 (f3_f, f3_l) - * f4: the first and last word of XP2 (f4_f, f4_l) - * f5: is XP1 or XP2 the head node (f5_1, f5_2) - * f6: the category of the common parent - * Target side (6 features): - * f7: the first and the last word of XP1's translation (f7_f, f7_l) - * f8: the first and the last word of XP2's translation (f8_f, f8_l) - * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) - */ - void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, const STreeItem *pCon1, const STreeItem *pCon2, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { - - int iLeft1, iRight1, iLeft2, iRight2; - pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); - pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, iRight2); - - fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); - - //generate features - //f1 - ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; - //f2 - ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" << vecSTerms[pCon2->m_iHeadWord]; - //f3 - ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] << " f3_l=" << vecSTerms[pCon1->m_iEnd]; - //f4 - ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] << " f4_l=" << vecSTerms[pCon2->m_iEnd]; - //f5 - if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) - ostr << " f5_1=1"; - else - ostr << " f5_1=0"; - if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) - ostr << " f5_2=1"; - else - ostr << " f5_2=0"; - //f6 - ostr << " f6=" << pParent->m_pszTerm; - - /*//f7 - if (iLeft1 != -1) { - ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << vecTTerms[iRight1]; - } - if (iLeft2 != -1) { - ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << vecTTerms[iRight2]; - } - - const vector<int>* pvecTarget = pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); - string str = ""; - for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { - str += vecTTerms[(*pvecTarget)[i]] + "_"; - } - if (str.length() > 0) { - ostr << " f9_1=" << str.substr(0, str.size()-1); - } - pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); - str = ""; - for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { - str += vecTTerms[(*pvecTarget)[i]] + "_"; - } - if (str.length() > 0) { - ostr << " f9_2=" << str.substr(0, str.size()-1); - } */ - - } - - void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ - for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { - STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; - - while (pParent != NULL) { - //if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { - if (pParent->m_vecChildren.size() > 1) { - //do constituent reordering for all children of pParent - vecFocused.push_back(pParent); - } - if (pParent->m_iBrotherIndex != 0) break; - pParent = pParent->m_ptParent; - } - } - } - - void fnGenerateInstanceFile(const char* pszSynFname, //source-side flattened parse tree file name - const char* pszAlignFname, //alignment filename - const char* pszSourceFname, //source file name - const char* pszTargetFname, //target file name - const char* pszInstanceFname //training instance file name - ) { - SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); - SParseReader *pParseReader = new SParseReader(pszSynFname, false); - STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - - string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); - string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - - FILE *fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); - assert(fpLeftOut != NULL); - - FILE *fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); - assert(fpRightOut != NULL); - - //read sentence by sentence - SAlignment *pAlign; - SParsedTree *pTree; - char *pszLine = new char[50001]; - int iSentNum = 0; - while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { - pTree = pParseReader->fnReadNextParseTree(); - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); - - - if (pTree != NULL) { - - vector<STreeItem*> vecFocused; - fnGetFocusedParentNodes(pTree, vecFocused); - - for (size_t i = 0; i < vecFocused.size(); i++) { - - STreeItem *pParent = vecFocused[i]; - - vector<int> vecLeft, vecRight; - for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { - STreeItem *pCon1 = pParent->m_vecChildren[j]; - int iLeft1, iRight1; - pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); - vecLeft.push_back(iLeft1); - vecRight.push_back(iRight1); - } - vector<int> vecLeftPosition; - fnGetRelativePosition(vecLeft, vecLeftPosition); - vector<int> vecRightPosition; - fnGetRelativePosition(vecRight, vecRightPosition); - - vector<string> vecChunkStatus; - for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { - string strOutcome = pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, pParent->m_vecChildren[j]->m_iEnd); - vecChunkStatus.push_back(strOutcome); - } - - for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { - //children[j-1] vs. children[j] reordering - - string strLeftOutcome; - ostringstream ostr; - - fnGenerateInstance(pTree, pParent, j, vecChunkStatus, vecLeftPosition, vecSTerms, vecTTerms, strLeftOutcome, ostr); - - //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); - fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); - - string strRightOutcome; - fnGetOutcome(vecRightPosition[j-1], vecRightPosition[j], strRightOutcome); - fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str(), strRightOutcome.c_str()); - } - } - delete pTree; - } - - delete pAlign; - iSentNum++; - - if (iSentNum % 100000 == 0) - fprintf(stderr, "#%d\n", iSentNum); - } - - - fclose(fpLeftOut); - fclose(fpRightOut); - delete pAlignReader; - delete pParseReader; - delete pTxtSReader; - delete pTxtTReader; - delete [] pszLine; - } - - void fnGenerateInstanceFile2(const char* pszSynFname, //source-side flattened parse tree file name - const char* pszAlignFname, //alignment filename - const char* pszSourceFname, //source file name - const char* pszTargetFname, //target file name - const char* pszInstanceFname //training instance file name - ) { - SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); - SParseReader *pParseReader = new SParseReader(pszSynFname, false); - STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - - FILE *fpOut = fopen(pszInstanceFname, "w"); - assert(fpOut != NULL); - - //read sentence by sentence - SAlignment *pAlign; - SParsedTree *pTree; - char *pszLine = new char[50001]; - int iSentNum = 0; - while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { - pTree = pParseReader->fnReadNextParseTree(); - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); - - - if (pTree != NULL) { - - vector<STreeItem*> vecFocused; - fnGetFocusedParentNodes(pTree, vecFocused); - - for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - - STreeItem *pParent = vecFocused[i]; - - for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { - //children[j-1] vs. children[j] reordering - - string strOutcome; - ostringstream ostr; - - fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j-1], pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); - - //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - } - } - delete pTree; - } - - delete pAlign; - iSentNum++; - - if (iSentNum % 100000 == 0) - fprintf(stderr, "#%d\n", iSentNum); - } - - - fclose(fpOut); - delete pAlignReader; - delete pParseReader; - delete pTxtSReader; - delete pTxtTReader; - delete [] pszLine; - } +struct SConstReorderTrainer { + SConstReorderTrainer( + const char* pszSynFname, // source-side flattened parse tree file name + const char* pszAlignFname, // alignment filename + const char* pszSourceFname, // source file name + const char* pszTargetFname, // target file name + const char* pszInstanceFname, // training instance file name + const char* pszModelPrefix, // classifier model file name prefix + int iClassifierType, // classifier type + int iCutoff, // feature count threshold + const char* pszOption // other classifier parameters (for svmlight) + ) { + fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, + pszTargetFname, pszInstanceFname); + + string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); + string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + + string strModelLeftFname = string(pszModelPrefix) + string(".left"); + string strModelRightFname = string(pszModelPrefix) + string(".right"); + + fprintf(stdout, "...Training the left ordering model\n"); + fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), + iCutoff); + fprintf(stdout, "...Training the right ordering model\n"); + fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), + iCutoff); + } + ~SConstReorderTrainer() {} + + private: + void fnTraining(const char* pszInstanceFname, const char* pszModelFname, + int iCutoff) { + char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; + if (iCutoff > 0) { + sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); + fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); + } else { + strcpy(pszNewInstanceFName, pszInstanceFname); + } + + /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); +delete pZhangleMaxent;*/ + + Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); + pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); + delete pMaxent; + + if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { + sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); + system(pszNewInstanceFName); + } + delete[] pszNewInstanceFName; + } + + inline bool fnIsVerbPOS(const char* pszTerm) { + if (strcmp(pszTerm, "VV") == 0 || strcmp(pszTerm, "VA") == 0 || + strcmp(pszTerm, "VC") == 0 || strcmp(pszTerm, "VE") == 0) + return true; + return false; + } + + inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, + const SAlignment* pAlign, string& strOutcome) { + if (iL1 == -1 && iL2 == -1) + strOutcome = "BU"; // 1. both are untranslated + else if (iL1 == -1) + strOutcome = "1U"; // 2. XP1 is untranslated + else if (iL2 == -1) + strOutcome = "2U"; // 3. XP2 is untranslated + else if (iL1 == iL2 && iR2 == iR2) + strOutcome = "SS"; // 4. Have same scope + else if (iL1 <= iL2 && iR1 >= iR2) + strOutcome = "1C2"; // 5. XP1's translation covers XP2's + else if (iL1 >= iL2 && iR1 <= iR2) + strOutcome = "2C1"; // 6. XP2's translation covers XP1's + else if (iR1 < iL2) { + int i = iR1 + 1; + /*while (i < iL2) { + if (pAlign->fnIsAligned(i, false)) + break; + i++; + }*/ + if (i == iL2) + strOutcome = "M"; // 7. Monotone + else + strOutcome = "DM"; // 8. Discontinuous monotone + } else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) + strOutcome = "OM"; // 9. Overlap monotone + else if (iR2 < iL1) { + int i = iR2 + 1; + /*while (i < iL1) { + if (pAlign->fnIsAligned(i, false)) + break; + i++; + }*/ + if (i == iL1) + strOutcome = "S"; // 10. Swap + else + strOutcome = "DS"; // 11. Discontinuous swap + } else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) + strOutcome = "OS"; // 12. Overlap swap + else + assert(false); + } + + inline void fnGetOutcome(int i1, int i2, string& strOutcome) { + assert(i1 != i2); + if (i1 < i2) { + if (i2 > i1 + 1) + strOutcome = string("DM"); + else + strOutcome = string("M"); + } else { + if (i1 > i2 + 1) + strOutcome = string("DS"); + else + strOutcome = string("S"); + } + } + + inline void fnGetRelativePosition(const vector<int>& vecLeft, + vector<int>& vecPosition) { + vecPosition.clear(); + + vector<float> vec; + for (size_t i = 0; i < vecLeft.size(); i++) { + if (vecLeft[i] == -1) { + if (i == 0) + vec.push_back(-1); + else + vec.push_back(vecLeft[i - 1] + 0.1); + } else + vec.push_back(vecLeft[i]); + } + + for (size_t i = 0; i < vecLeft.size(); i++) { + int count = 0; + + for (size_t j = 0; j < vecLeft.size(); j++) { + if (j == i) continue; + if (vec[j] < vec[i]) { + count++; + } else if (vec[j] == vec[i] && j < i) { + count++; + } + } + vecPosition.push_back(count); + } + } + + /* + * features: + * f1: (left_label, right_label, parent_label) + * f2: (left_label, right_label, parent_label, other_right_sibling_label) + * f3: (left_label, right_label, parent_label, other_left_sibling_label) + * f4: (left_label, right_label, left_head_pos) + * f5: (left_label, right_label, left_head_word) + * f6: (left_label, right_label, right_head_pos) + * f7: (left_label, right_label, right_head_word) + * f8: (left_label, right_label, left_chunk_status) + * f9: (left_label, right_label, right_chunk_status) + * f10: (left_label, parent_label) + * f11: (right_label, parent_label) + */ + void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent, + int iPos, const vector<string>& vecChunkStatus, + const vector<int>& vecPosition, + const vector<string>& vecSTerms, + const vector<string>& vecTTerms, string& strOutcome, + ostringstream& ostr) { + STreeItem* pCon1, *pCon2; + pCon1 = pParent->m_vecChildren[iPos - 1]; + pCon2 = pParent->m_vecChildren[iPos]; + + fnGetOutcome(vecPosition[iPos - 1], vecPosition[iPos], strOutcome); + + string left_label = string(pCon1->m_pszTerm); + string right_label = string(pCon2->m_pszTerm); + string parent_label = string(pParent->m_pszTerm); + + vector<string> vec_other_right_sibling; + for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) + vec_other_right_sibling.push_back( + string(pParent->m_vecChildren[i]->m_pszTerm)); + if (vec_other_right_sibling.size() == 0) + vec_other_right_sibling.push_back(string("NULL")); + vector<string> vec_other_left_sibling; + for (int i = 0; i < iPos - 1; i++) + vec_other_left_sibling.push_back( + string(pParent->m_vecChildren[i]->m_pszTerm)); + if (vec_other_left_sibling.size() == 0) + vec_other_left_sibling.push_back(string("NULL")); + + // generate features + // f1 + ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; + // f2 + for (int i = 0; i < vec_other_right_sibling.size(); i++) + ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label + << "_" << vec_other_right_sibling[i]; + // f3 + for (int i = 0; i < vec_other_left_sibling.size(); i++) + ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label + << "_" << vec_other_left_sibling[i]; + // f4 + ostr << " f4=" << left_label << "_" << right_label << "_" + << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; + // f5 + ostr << " f5=" << left_label << "_" << right_label << "_" + << vecSTerms[pCon1->m_iHeadWord]; + // f6 + ostr << " f6=" << left_label << "_" << right_label << "_" + << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; + // f7 + ostr << " f7=" << left_label << "_" << right_label << "_" + << vecSTerms[pCon2->m_iHeadWord]; + // f8 + ostr << " f8=" << left_label << "_" << right_label << "_" + << vecChunkStatus[iPos - 1]; + // f9 + ostr << " f9=" << left_label << "_" << right_label << "_" + << vecChunkStatus[iPos]; + // f10 + ostr << " f10=" << left_label << "_" << parent_label; + // f11 + ostr << " f11=" << right_label << "_" << parent_label; + } + + /* + * Source side (11 features): + * f1: the categories of XP1 and XP2 (f1_1, f1_2) + * f2: the head words of XP1 and XP2 (f2_1, f2_2) + * f3: the first and last word of XP1 (f3_f, f3_l) + * f4: the first and last word of XP2 (f4_f, f4_l) + * f5: is XP1 or XP2 the head node (f5_1, f5_2) + * f6: the category of the common parent + * Target side (6 features): + * f7: the first and the last word of XP1's translation (f7_f, f7_l) + * f8: the first and the last word of XP2's translation (f8_f, f8_l) + * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) + */ + void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent, + const STreeItem* pCon1, const STreeItem* pCon2, + const SAlignment* pAlign, + const vector<string>& vecSTerms, + const vector<string>& vecTTerms, string& strOutcome, + ostringstream& ostr) { + + int iLeft1, iRight1, iLeft2, iRight2; + pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, + iRight1); + pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, + iRight2); + + fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); + + // generate features + // f1 + ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; + // f2 + ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" + << vecSTerms[pCon2->m_iHeadWord]; + // f3 + ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] + << " f3_l=" << vecSTerms[pCon1->m_iEnd]; + // f4 + ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] + << " f4_l=" << vecSTerms[pCon2->m_iEnd]; + // f5 + if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) + ostr << " f5_1=1"; + else + ostr << " f5_1=0"; + if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) + ostr << " f5_2=1"; + else + ostr << " f5_2=0"; + // f6 + ostr << " f6=" << pParent->m_pszTerm; + + /*//f7 + if (iLeft1 != -1) { + ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << + vecTTerms[iRight1]; + } + if (iLeft2 != -1) { + ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << + vecTTerms[iRight2]; + } + + const vector<int>* pvecTarget = + pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); + string str = ""; + for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { + str += vecTTerms[(*pvecTarget)[i]] + "_"; + } + if (str.length() > 0) { + ostr << " f9_1=" << str.substr(0, str.size()-1); + } + pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); + str = ""; + for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { + str += vecTTerms[(*pvecTarget)[i]] + "_"; + } + if (str.length() > 0) { + ostr << " f9_2=" << str.substr(0, str.size()-1); + } */ + } + + void fnGetFocusedParentNodes(const SParsedTree* pTree, + vector<STreeItem*>& vecFocused) { + for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { + STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; + + while (pParent != NULL) { + // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - + // pParent->m_iBegin > 5) { + if (pParent->m_vecChildren.size() > 1) { + // do constituent reordering for all children of pParent + vecFocused.push_back(pParent); + } + if (pParent->m_iBrotherIndex != 0) break; + pParent = pParent->m_ptParent; + } + } + } + + void fnGenerateInstanceFile( + const char* pszSynFname, // source-side flattened parse tree file name + const char* pszAlignFname, // alignment filename + const char* pszSourceFname, // source file name + const char* pszTargetFname, // target file name + const char* pszInstanceFname // training instance file name + ) { + SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); + SParseReader* pParseReader = new SParseReader(pszSynFname, false); + STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); + STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + + string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); + string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + + FILE* fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); + assert(fpLeftOut != NULL); + + FILE* fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); + assert(fpRightOut != NULL); + + // read sentence by sentence + SAlignment* pAlign; + SParsedTree* pTree; + char* pszLine = new char[50001]; + int iSentNum = 0; + while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { + pTree = pParseReader->fnReadNextParseTree(); + assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecSTerms; + SplitOnWhitespace(string(pszLine), &vecSTerms); + assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecTTerms; + SplitOnWhitespace(string(pszLine), &vecTTerms); + + if (pTree != NULL) { + + vector<STreeItem*> vecFocused; + fnGetFocusedParentNodes(pTree, vecFocused); + + for (size_t i = 0; i < vecFocused.size(); i++) { + + STreeItem* pParent = vecFocused[i]; + + vector<int> vecLeft, vecRight; + for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { + STreeItem* pCon1 = pParent->m_vecChildren[j]; + int iLeft1, iRight1; + pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, + iLeft1, iRight1); + vecLeft.push_back(iLeft1); + vecRight.push_back(iRight1); + } + vector<int> vecLeftPosition; + fnGetRelativePosition(vecLeft, vecLeftPosition); + vector<int> vecRightPosition; + fnGetRelativePosition(vecRight, vecRightPosition); + + vector<string> vecChunkStatus; + for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { + string strOutcome = + pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, + pParent->m_vecChildren[j]->m_iEnd); + vecChunkStatus.push_back(strOutcome); + } + + for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { + // children[j-1] vs. children[j] reordering + + string strLeftOutcome; + ostringstream ostr; + + fnGenerateInstance(pTree, pParent, j, vecChunkStatus, + vecLeftPosition, vecSTerms, vecTTerms, + strLeftOutcome, ostr); + + // fprintf(stderr, "%s %s\n", ostr.str().c_str(), + // strLeftOutcome.c_str()); + fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), + strLeftOutcome.c_str()); + + string strRightOutcome; + fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j], + strRightOutcome); + fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), + strLeftOutcome.c_str(), strRightOutcome.c_str()); + } + } + delete pTree; + } + + delete pAlign; + iSentNum++; + + if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); + } + + fclose(fpLeftOut); + fclose(fpRightOut); + delete pAlignReader; + delete pParseReader; + delete pTxtSReader; + delete pTxtTReader; + delete[] pszLine; + } + + void fnGenerateInstanceFile2( + const char* pszSynFname, // source-side flattened parse tree file name + const char* pszAlignFname, // alignment filename + const char* pszSourceFname, // source file name + const char* pszTargetFname, // target file name + const char* pszInstanceFname // training instance file name + ) { + SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); + SParseReader* pParseReader = new SParseReader(pszSynFname, false); + STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); + STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + + FILE* fpOut = fopen(pszInstanceFname, "w"); + assert(fpOut != NULL); + + // read sentence by sentence + SAlignment* pAlign; + SParsedTree* pTree; + char* pszLine = new char[50001]; + int iSentNum = 0; + while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { + pTree = pParseReader->fnReadNextParseTree(); + assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecSTerms; + SplitOnWhitespace(string(pszLine), &vecSTerms); + assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecTTerms; + SplitOnWhitespace(string(pszLine), &vecTTerms); + + if (pTree != NULL) { + + vector<STreeItem*> vecFocused; + fnGetFocusedParentNodes(pTree, vecFocused); + + for (size_t i = 0; + i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + + STreeItem* pParent = vecFocused[i]; + + for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { + // children[j-1] vs. children[j] reordering + + string strOutcome; + ostringstream ostr; + + fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j - 1], + pParent->m_vecChildren[j], pAlign, vecSTerms, + vecTTerms, strOutcome, ostr); + + // fprintf(stderr, "%s %s\n", ostr.str().c_str(), + // strOutcome.c_str()); + fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + } + } + delete pTree; + } + + delete pAlign; + iSentNum++; + + if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); + } + + fclose(fpOut); + delete pAlignReader; + delete pParseReader; + delete pTxtSReader; + delete pTxtTReader; + delete[] pszLine; + } }; -struct SConstContTrainer{ - SConstContTrainer(const char* pszFlattenedSynFname, //source-side flattened parse tree file name - const char* pszAlignFname, //alignment filename - const char* pszSourceFname, //source file name - const char* pszTargetFname, //target file name - const char* pszInstanceFname, //training instance file name - const char* pszModelPrefix, //classifier model file name prefix - int iClassifierType, //classifier type - int iCutoff, //feature count threshold - const char* pszOption //other classifier parameters (for svmlight) - ) { - fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); - //fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, pszOption); - fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); - } - ~SConstContTrainer() { - - } - -private: - - void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { - char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; - if (iCutoff > 0) { - sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); - fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); - } else { - strcpy(pszNewInstanceFName, pszInstanceFname); - } - - /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); - pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); - delete pZhangleMaxent;*/ - - Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); - pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); - delete pMaxent; - - if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { - sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); - system(pszNewInstanceFName); - } - delete [] pszNewInstanceFName; - } - - - void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ - for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { - STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; - - while (pParent != NULL) { - //if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { - if (pParent->m_vecChildren.size() > 1) { - //do constituent reordering for all children of pParent - vecFocused.push_back(pParent); - } - if (pParent->m_iBrotherIndex != 0) break; - pParent = pParent->m_ptParent; - } - } - } - - inline void fnGetOutcome(int iL1, int iR1, const SAlignment *pAlign, string& strOutcome) { - strOutcome = pAlign->fnIsContinuous(iL1, iR1); - } - - inline string fnGetLengthType(int iLen) { - if (iLen == 1) - return string("1"); - if (iLen == 2) - return string("2"); - if (iLen == 3) - return string("3"); - if (iLen < 6) - return string("4"); - if (iLen < 11) - return string("6"); - return string("11"); - } - - /* - * Source side (11 features): - * f1: the syntactic category - * f2: the syntactic category of its parent - * f3: the head word's pos - * f4: =1 if it's the head of its parent node - * or - * the head of its parent node - * f5: length type - */ - void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pCon1, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { - - fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); - - //generate features - //f1 - ostr << "f1=" << pCon1->m_pszTerm; - //f2 - ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; - //f3 - ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; - //f4 - if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { - ostr << " f4=1"; - } else { - ostr << " f4=" << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]->m_pszTerm; - } - //f5 - ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); - } - - void fnGenerateInstanceFile(const char* pszFlattenedSynFname, //source-side flattened parse tree file name - const char* pszAlignFname, //alignment filename - const char* pszSourceFname, //source file name - const char* pszTargetFname, //target file name - const char* pszInstanceFname //training instance file name - ) { - SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); - SParseReader *pParseReader = new SParseReader(pszFlattenedSynFname, true); - STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - - FILE *fpOut = fopen(pszInstanceFname, "w"); - assert(fpOut != NULL); - - //read sentence by sentence - SAlignment *pAlign; - SParsedTree *pTree; - char *pszLine = new char[50001]; - int iSentNum = 0; - while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { - pTree = pParseReader->fnReadNextParseTree(); - assert(pTree != NULL); - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); - vector<string> vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); - - vector<STreeItem*> vecFocused; - fnGetFocusedParentNodes(pTree, vecFocused); - - for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - - STreeItem *pParent = vecFocused[i]; - - for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { - //children[j-1] vs. children[j] reordering - - string strOutcome; - ostringstream ostr; - - fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); - - //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); - } - } - - delete pAlign; - delete pTree; - iSentNum++; - - if (iSentNum % 100000 == 0) - fprintf(stderr, "#%d\n", iSentNum); - } - - - fclose(fpOut); - delete pAlignReader; - delete pParseReader; - delete pTxtSReader; - delete pTxtTReader; - delete [] pszLine; - } +struct SConstContTrainer { + SConstContTrainer( + const char* pszFlattenedSynFname, // source-side flattened parse tree + // file name + const char* pszAlignFname, // alignment filename + const char* pszSourceFname, // source file name + const char* pszTargetFname, // target file name + const char* pszInstanceFname, // training instance file name + const char* pszModelPrefix, // classifier model file name prefix + int iClassifierType, // classifier type + int iCutoff, // feature count threshold + const char* pszOption // other classifier parameters (for svmlight) + ) { + fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, + pszTargetFname, pszInstanceFname); + // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, + // pszOption); + fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); + } + ~SConstContTrainer() {} + + private: + void fnTraining(const char* pszInstanceFname, const char* pszModelFname, + int iCutoff) { + char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; + if (iCutoff > 0) { + sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); + fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); + } else { + strcpy(pszNewInstanceFName, pszInstanceFname); + } + + /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); + pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, + 2.0); + delete pZhangleMaxent;*/ + + Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); + pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); + delete pMaxent; + + if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { + sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); + system(pszNewInstanceFName); + } + delete[] pszNewInstanceFName; + } + + void fnGetFocusedParentNodes(const SParsedTree* pTree, + vector<STreeItem*>& vecFocused) { + for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { + STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; + + while (pParent != NULL) { + // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - + // pParent->m_iBegin > 5) { + if (pParent->m_vecChildren.size() > 1) { + // do constituent reordering for all children of pParent + vecFocused.push_back(pParent); + } + if (pParent->m_iBrotherIndex != 0) break; + pParent = pParent->m_ptParent; + } + } + } + + inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign, + string& strOutcome) { + strOutcome = pAlign->fnIsContinuous(iL1, iR1); + } + + inline string fnGetLengthType(int iLen) { + if (iLen == 1) return string("1"); + if (iLen == 2) return string("2"); + if (iLen == 3) return string("3"); + if (iLen < 6) return string("4"); + if (iLen < 11) return string("6"); + return string("11"); + } + + /* + * Source side (11 features): + * f1: the syntactic category + * f2: the syntactic category of its parent + * f3: the head word's pos + * f4: =1 if it's the head of its parent node + * or + * the head of its parent node + * f5: length type + */ + void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1, + const SAlignment* pAlign, + const vector<string>& vecSTerms, + const vector<string>& vecTTerms, string& strOutcome, + ostringstream& ostr) { + + fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); + + // generate features + // f1 + ostr << "f1=" << pCon1->m_pszTerm; + // f2 + ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; + // f3 + ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord] + ->m_ptParent->m_pszTerm; + // f4 + if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { + ostr << " f4=1"; + } else { + ostr << " f4=" + << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild] + ->m_pszTerm; + } + // f5 + ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); + } + + void fnGenerateInstanceFile( + const char* pszFlattenedSynFname, // source-side flattened parse tree + // file name + const char* pszAlignFname, // alignment filename + const char* pszSourceFname, // source file name + const char* pszTargetFname, // target file name + const char* pszInstanceFname // training instance file name + ) { + SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); + SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true); + STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); + STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + + FILE* fpOut = fopen(pszInstanceFname, "w"); + assert(fpOut != NULL); + + // read sentence by sentence + SAlignment* pAlign; + SParsedTree* pTree; + char* pszLine = new char[50001]; + int iSentNum = 0; + while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { + pTree = pParseReader->fnReadNextParseTree(); + assert(pTree != NULL); + assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecSTerms; + SplitOnWhitespace(string(pszLine), &vecSTerms); + assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + vector<string> vecTTerms; + SplitOnWhitespace(string(pszLine), &vecTTerms); + + vector<STreeItem*> vecFocused; + fnGetFocusedParentNodes(pTree, vecFocused); + + for (size_t i = 0; + i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + + STreeItem* pParent = vecFocused[i]; + + for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { + // children[j-1] vs. children[j] reordering + + string strOutcome; + ostringstream ostr; + + fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, + vecSTerms, vecTTerms, strOutcome, ostr); + + // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + } + } + + delete pAlign; + delete pTree; + iSentNum++; + + if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); + } + + fclose(fpOut); + delete pAlignReader; + delete pParseReader; + delete pTxtSReader; + delete pTxtTReader; + delete[] pszLine; + } }; -inline void print_options(std::ostream &out,po::options_description const& opts) { - typedef std::vector< boost::shared_ptr<po::option_description> > Ds; - Ds const& ds=opts.options(); +inline void print_options(std::ostream& out, + po::options_description const& opts) { + typedef std::vector<boost::shared_ptr<po::option_description> > Ds; + Ds const& ds = opts.options(); out << '"'; - for (unsigned i=0;i<ds.size();++i) { - if (i) out<<' '; - out<<"--"<<ds[i]->long_name(); + for (unsigned i = 0; i < ds.size(); ++i) { + if (i) out << ' '; + out << "--" << ds[i]->long_name(); } out << '\n'; } -inline string str(char const* name,po::variables_map const& conf) { +inline string str(char const* name, po::variables_map const& conf) { return conf[name].as<string>(); } -//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file /scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file /scratch0/mt_exp/gq-ctb/data/train.cn --target_file /scratch0/mt_exp/gq-ctb/data/train.en --instance_file /scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix /scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 --classifier_type 1 +//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file +///scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file +///scratch0/mt_exp/gq-ctb/data/train.cn --target_file +///scratch0/mt_exp/gq-ctb/data/train.en --instance_file +///scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix +///scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 +//--classifier_type 1 int main(int argc, char** argv) { - po::options_description opts("Configuration options"); - opts.add_options() - ("parse_file",po::value<string>(),"parse file path (input)") - ("align_file",po::value<string>(),"Alignment file path (input)") - ("source_file",po::value<string>(),"Source text file path (input)") - ("target_file",po::value<string>(),"Target text file path (input)") - ("instance_file",po::value<string>(),"Instance file path (output)") - ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated") - ("classifier_type",po::value<int>()->default_value(1),"Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for SVMLight") - ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold") - ("svm_option",po::value<string>(),"Parameters for SVMLight classifier") - ("help", "produce help message"); - - po::variables_map vm; - if (argc) { - po::store(po::parse_command_line(argc, argv, opts), vm); - po::notify(vm); - } - - if (vm.count("help")) { - print_options(cout, opts); - return 1; - } - - if (!vm.count("parse_file") - || !vm.count("align_file") - || !vm.count("source_file") - || !vm.count("target_file") - || !vm.count("instance_file") - || !vm.count("model_prefix") - ) { - print_options(cout, opts); - if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; - if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; - if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; - if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; - if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; - if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; - exit(0); - } - - const char *pOption; - if (vm.count("svm_option")) - pOption = str("svm_option", vm).c_str(); - else - pOption = NULL; - - SConstReorderTrainer *pTrainer = new SConstReorderTrainer(str("parse_file", vm).c_str(), - str("align_file", vm).c_str(), - str("source_file", vm).c_str(), - str("target_file", vm).c_str(), - str("instance_file", vm).c_str(), - str("model_prefix", vm).c_str(), - vm["classifier_type"].as<int>(), - vm["feature_cutoff"].as<int>(), - pOption); - delete pTrainer; - - return 1; + po::options_description opts("Configuration options"); + opts.add_options()("parse_file", po::value<string>(), + "parse file path (input)")( + "align_file", po::value<string>(), "Alignment file path (input)")( + "source_file", po::value<string>(), "Source text file path (input)")( + "target_file", po::value<string>(), "Target text file path (input)")( + "instance_file", po::value<string>(), "Instance file path (output)")( + "model_prefix", po::value<string>(), + "Model file path prefix (output): three files will be generated")( + "classifier_type", po::value<int>()->default_value(1), + "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for " + "SVMLight")("feature_cutoff", po::value<int>()->default_value(100), + "Feature cutoff threshold")( + "svm_option", po::value<string>(), "Parameters for SVMLight classifier")( + "help", "produce help message"); + + po::variables_map vm; + if (argc) { + po::store(po::parse_command_line(argc, argv, opts), vm); + po::notify(vm); + } + + if (vm.count("help")) { + print_options(cout, opts); + return 1; + } + + if (!vm.count("parse_file") || !vm.count("align_file") || + !vm.count("source_file") || !vm.count("target_file") || + !vm.count("instance_file") || !vm.count("model_prefix")) { + print_options(cout, opts); + if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; + if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; + if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; + if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; + if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; + if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; + exit(0); + } + + const char* pOption; + if (vm.count("svm_option")) + pOption = str("svm_option", vm).c_str(); + else + pOption = NULL; + + SConstReorderTrainer* pTrainer = new SConstReorderTrainer( + str("parse_file", vm).c_str(), str("align_file", vm).c_str(), + str("source_file", vm).c_str(), str("target_file", vm).c_str(), + str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), + vm["classifier_type"].as<int>(), vm["feature_cutoff"].as<int>(), pOption); + delete pTrainer; + return 1; } |