summaryrefslogtreecommitdiff
path: root/utils/synutils/constituent_reorder_model.cc
diff options
context:
space:
mode:
Diffstat (limited to 'utils/synutils/constituent_reorder_model.cc')
-rw-r--r--utils/synutils/constituent_reorder_model.cc1562
1 files changed, 806 insertions, 756 deletions
diff --git a/utils/synutils/constituent_reorder_model.cc b/utils/synutils/constituent_reorder_model.cc
index 485c9667..a4fb9627 100644
--- a/utils/synutils/constituent_reorder_model.cc
+++ b/utils/synutils/constituent_reorder_model.cc
@@ -5,7 +5,6 @@
* Author: junhuili
*/
-
#include <boost/program_options.hpp>
#include "alignment.h"
@@ -17,780 +16,831 @@
using namespace std;
-
typedef std::tr1::unordered_map<std::string, int> Map;
typedef std::tr1::unordered_map<std::string, int>::iterator Iterator;
namespace po = boost::program_options;
-inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) {
- SFReader *pFReader = new STxtFileReader(pszFName);
- char *pszLine = new char[ 100001 ];
- int iLen;
- Map hashPredicate;
- while (pFReader->fnReadNextLine(pszLine, &iLen)) {
- if (iLen == 0)
- continue;
-
- vector<string> vecTerms;
- SplitOnWhitespace(string(pszLine), &vecTerms);
-
- for (size_t i = 0; i < vecTerms.size() - 1; i++) {
- Iterator iter = hashPredicate.find(vecTerms[i]);
- if (iter == hashPredicate.end()) {
- hashPredicate[vecTerms[i]] = 1;
-
- } else {
- iter->second++;
- }
- }
- }
- delete pFReader;
-
- pFReader = new STxtFileReader(pszFName);
- FILE *fpOut = fopen(pszNewFName, "w");
- while (pFReader->fnReadNextLine(pszLine, &iLen)) {
- if (iLen == 0)
- continue;
-
- vector<string> vecTerms;
- SplitOnWhitespace(string(pszLine), &vecTerms);
- ostringstream ostr;
- for (size_t i = 0; i < vecTerms.size() - 1; i++) {
- Iterator iter = hashPredicate.find(vecTerms[i]);
- assert(iter != hashPredicate.end());
- if (iter->second >= iCutoff) {
- ostr << vecTerms[i] << " ";
- }
- }
- if (ostr.str().length() > 0) {
- ostr << vecTerms[vecTerms.size() - 1];
- fprintf(fpOut, "%s\n", ostr.str().c_str());
- }
- }
- fclose(fpOut);
- delete pFReader;
-
-
- delete [] pszLine;
+inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,
+ const char* pszNewFName) {
+ SFReader* pFReader = new STxtFileReader(pszFName);
+ char* pszLine = new char[100001];
+ int iLen;
+ Map hashPredicate;
+ while (pFReader->fnReadNextLine(pszLine, &iLen)) {
+ if (iLen == 0) continue;
+
+ vector<string> vecTerms;
+ SplitOnWhitespace(string(pszLine), &vecTerms);
+
+ for (size_t i = 0; i < vecTerms.size() - 1; i++) {
+ Iterator iter = hashPredicate.find(vecTerms[i]);
+ if (iter == hashPredicate.end()) {
+ hashPredicate[vecTerms[i]] = 1;
+
+ } else {
+ iter->second++;
+ }
+ }
+ }
+ delete pFReader;
+
+ pFReader = new STxtFileReader(pszFName);
+ FILE* fpOut = fopen(pszNewFName, "w");
+ while (pFReader->fnReadNextLine(pszLine, &iLen)) {
+ if (iLen == 0) continue;
+
+ vector<string> vecTerms;
+ SplitOnWhitespace(string(pszLine), &vecTerms);
+ ostringstream ostr;
+ for (size_t i = 0; i < vecTerms.size() - 1; i++) {
+ Iterator iter = hashPredicate.find(vecTerms[i]);
+ assert(iter != hashPredicate.end());
+ if (iter->second >= iCutoff) {
+ ostr << vecTerms[i] << " ";
+ }
+ }
+ if (ostr.str().length() > 0) {
+ ostr << vecTerms[vecTerms.size() - 1];
+ fprintf(fpOut, "%s\n", ostr.str().c_str());
+ }
+ }
+ fclose(fpOut);
+ delete pFReader;
+
+ delete[] pszLine;
}
-struct SConstReorderTrainer{
- SConstReorderTrainer(const char* pszSynFname, //source-side flattened parse tree file name
- const char* pszAlignFname, //alignment filename
- const char* pszSourceFname, //source file name
- const char* pszTargetFname, //target file name
- const char* pszInstanceFname, //training instance file name
- const char* pszModelPrefix, //classifier model file name prefix
- int iClassifierType, //classifier type
- int iCutoff, //feature count threshold
- const char* pszOption //other classifier parameters (for svmlight)
- ) {
- fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname);
-
- string strInstanceLeftFname = string(pszInstanceFname) + string(".left");
- string strInstanceRightFname = string(pszInstanceFname) + string(".right");
-
- string strModelLeftFname = string(pszModelPrefix) + string(".left");
- string strModelRightFname = string(pszModelPrefix) + string(".right");
-
- fprintf(stdout, "...Training the left ordering model\n");
- fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), iCutoff);
- fprintf(stdout, "...Training the right ordering model\n");
- fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), iCutoff);
- }
- ~SConstReorderTrainer() {
-
- }
-
-private:
-
- void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) {
- char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50];
- if (iCutoff > 0) {
- sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname);
- fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName);
- } else {
- strcpy(pszNewInstanceFName, pszInstanceFname);
- }
-
- /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL);
- pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0);
- delete pZhangleMaxent;*/
-
- Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL);
- pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300);
- delete pMaxent;
-
- if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) {
- sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname);
- system(pszNewInstanceFName);
- }
- delete [] pszNewInstanceFName;
- }
-
- inline bool fnIsVerbPOS(const char* pszTerm) {
- if (strcmp(pszTerm, "VV") == 0
- || strcmp(pszTerm, "VA") == 0
- || strcmp(pszTerm, "VC") == 0
- || strcmp(pszTerm, "VE") == 0)
- return true;
- return false;
- }
-
- inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, const SAlignment *pAlign, string& strOutcome) {
- if (iL1 == -1 && iL2 == -1)
- strOutcome = "BU"; //1. both are untranslated
- else if (iL1 == -1)
- strOutcome = "1U"; //2. XP1 is untranslated
- else if (iL2 == -1)
- strOutcome = "2U"; //3. XP2 is untranslated
- else if (iL1 == iL2 && iR2 == iR2)
- strOutcome = "SS"; //4. Have same scope
- else if (iL1 <= iL2 && iR1 >= iR2)
- strOutcome = "1C2"; //5. XP1's translation covers XP2's
- else if (iL1 >= iL2 && iR1 <= iR2)
- strOutcome = "2C1"; //6. XP2's translation covers XP1's
- else if (iR1 < iL2) {
- int i = iR1 + 1;
- /*while (i < iL2) {
- if (pAlign->fnIsAligned(i, false))
- break;
- i++;
- }*/
- if (i == iL2)
- strOutcome = "M"; //7. Monotone
- else
- strOutcome = "DM"; //8. Discontinuous monotone
- } else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2)
- strOutcome = "OM"; //9. Overlap monotone
- else if (iR2 < iL1) {
- int i = iR2 + 1;
- /*while (i < iL1) {
- if (pAlign->fnIsAligned(i, false))
- break;
- i++;
- }*/
- if (i == iL1)
- strOutcome = "S"; //10. Swap
- else
- strOutcome = "DS"; //11. Discontinuous swap
- } else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1)
- strOutcome = "OS"; //12. Overlap swap
- else
- assert(false);
- }
-
- inline void fnGetOutcome(int i1, int i2, string& strOutcome) {
- assert(i1 != i2);
- if (i1 < i2) {
- if (i2 > i1 + 1) strOutcome = string("DM");
- else strOutcome = string("M");
- } else {
- if (i1 > i2 + 1) strOutcome = string("DS");
- else strOutcome = string("S");
- }
- }
-
- inline void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) {
- vecPosition.clear();
-
- vector<float> vec;
- for (size_t i = 0; i < vecLeft.size(); i++) {
- if (vecLeft[i] == -1) {
- if (i == 0)
- vec.push_back(-1);
- else
- vec.push_back(vecLeft[i-1] + 0.1);
- } else
- vec.push_back(vecLeft[i]);
- }
-
- for (size_t i = 0; i < vecLeft.size(); i++) {
- int count = 0;
-
- for (size_t j = 0; j < vecLeft.size(); j++) {
- if ( j == i) continue;
- if (vec[j] < vec[i]) {
- count++;
- } else if (vec[j] == vec[i] && j < i) {
- count++;
- }
- }
- vecPosition.push_back(count);
- }
- }
-
- /*
- * features:
- * f1: (left_label, right_label, parent_label)
- * f2: (left_label, right_label, parent_label, other_right_sibling_label)
- * f3: (left_label, right_label, parent_label, other_left_sibling_label)
- * f4: (left_label, right_label, left_head_pos)
- * f5: (left_label, right_label, left_head_word)
- * f6: (left_label, right_label, right_head_pos)
- * f7: (left_label, right_label, right_head_word)
- * f8: (left_label, right_label, left_chunk_status)
- * f9: (left_label, right_label, right_chunk_status)
- * f10: (left_label, parent_label)
- * f11: (right_label, parent_label)
- */
- void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, int iPos, const vector<string>& vecChunkStatus, const vector<int>& vecPosition, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) {
- STreeItem *pCon1, *pCon2;
- pCon1 = pParent->m_vecChildren[iPos - 1];
- pCon2 = pParent->m_vecChildren[iPos];
-
- fnGetOutcome(vecPosition[iPos-1], vecPosition[iPos], strOutcome);
-
- string left_label = string(pCon1->m_pszTerm);
- string right_label = string(pCon2->m_pszTerm);
- string parent_label = string(pParent->m_pszTerm);
-
- vector<string> vec_other_right_sibling;
- for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++)
- vec_other_right_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm));
- if (vec_other_right_sibling.size() == 0)
- vec_other_right_sibling.push_back(string("NULL"));
- vector<string> vec_other_left_sibling;
- for (int i = 0; i < iPos - 1; i++)
- vec_other_left_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm));
- if (vec_other_left_sibling.size() == 0)
- vec_other_left_sibling.push_back(string("NULL"));
-
- //generate features
- //f1
- ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label;
- //f2
- for (int i = 0; i < vec_other_right_sibling.size(); i++)
- ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_right_sibling[i];
- //f3
- for (int i = 0; i < vec_other_left_sibling.size(); i++)
- ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_left_sibling[i];
- //f4
- ostr << " f4=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm;
- //f5
- ostr << " f5=" << left_label << "_" << right_label << "_" << vecSTerms[pCon1->m_iHeadWord];
- //f6
- ostr << " f6=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm;
- //f7
- ostr << " f7=" << left_label << "_" << right_label << "_" << vecSTerms[pCon2->m_iHeadWord];
- //f8
- ostr << " f8=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos - 1];
- //f9
- ostr << " f9=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos];
- //f10
- ostr << " f10=" << left_label << "_" << parent_label;
- //f11
- ostr << " f11=" << right_label << "_" << parent_label;
- }
-
- /*
- * Source side (11 features):
- * f1: the categories of XP1 and XP2 (f1_1, f1_2)
- * f2: the head words of XP1 and XP2 (f2_1, f2_2)
- * f3: the first and last word of XP1 (f3_f, f3_l)
- * f4: the first and last word of XP2 (f4_f, f4_l)
- * f5: is XP1 or XP2 the head node (f5_1, f5_2)
- * f6: the category of the common parent
- * Target side (6 features):
- * f7: the first and the last word of XP1's translation (f7_f, f7_l)
- * f8: the first and the last word of XP2's translation (f8_f, f8_l)
- * f9: the translation of XP1's and XP2's head word (f9_1, f9_2)
- */
- void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, const STreeItem *pCon1, const STreeItem *pCon2, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) {
-
- int iLeft1, iRight1, iLeft2, iRight2;
- pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1);
- pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, iRight2);
-
- fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome);
-
- //generate features
- //f1
- ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm;
- //f2
- ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" << vecSTerms[pCon2->m_iHeadWord];
- //f3
- ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] << " f3_l=" << vecSTerms[pCon1->m_iEnd];
- //f4
- ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] << " f4_l=" << vecSTerms[pCon2->m_iEnd];
- //f5
- if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex)
- ostr << " f5_1=1";
- else
- ostr << " f5_1=0";
- if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex)
- ostr << " f5_2=1";
- else
- ostr << " f5_2=0";
- //f6
- ostr << " f6=" << pParent->m_pszTerm;
-
- /*//f7
- if (iLeft1 != -1) {
- ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << vecTTerms[iRight1];
- }
- if (iLeft2 != -1) {
- ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << vecTTerms[iRight2];
- }
-
- const vector<int>* pvecTarget = pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true);
- string str = "";
- for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) {
- str += vecTTerms[(*pvecTarget)[i]] + "_";
- }
- if (str.length() > 0) {
- ostr << " f9_1=" << str.substr(0, str.size()-1);
- }
- pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true);
- str = "";
- for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) {
- str += vecTTerms[(*pvecTarget)[i]] + "_";
- }
- if (str.length() > 0) {
- ostr << " f9_2=" << str.substr(0, str.size()-1);
- } */
-
- }
-
- void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){
- for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) {
- STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent;
-
- while (pParent != NULL) {
- //if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) {
- if (pParent->m_vecChildren.size() > 1) {
- //do constituent reordering for all children of pParent
- vecFocused.push_back(pParent);
- }
- if (pParent->m_iBrotherIndex != 0) break;
- pParent = pParent->m_ptParent;
- }
- }
- }
-
- void fnGenerateInstanceFile(const char* pszSynFname, //source-side flattened parse tree file name
- const char* pszAlignFname, //alignment filename
- const char* pszSourceFname, //source file name
- const char* pszTargetFname, //target file name
- const char* pszInstanceFname //training instance file name
- ) {
- SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname);
- SParseReader *pParseReader = new SParseReader(pszSynFname, false);
- STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname);
-
- string strInstanceLeftFname = string(pszInstanceFname) + string(".left");
- string strInstanceRightFname = string(pszInstanceFname) + string(".right");
-
- FILE *fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w");
- assert(fpLeftOut != NULL);
-
- FILE *fpRightOut = fopen(strInstanceRightFname.c_str(), "w");
- assert(fpRightOut != NULL);
-
- //read sentence by sentence
- SAlignment *pAlign;
- SParsedTree *pTree;
- char *pszLine = new char[50001];
- int iSentNum = 0;
- while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
- pTree = pParseReader->fnReadNextParseTree();
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
-
-
- if (pTree != NULL) {
-
- vector<STreeItem*> vecFocused;
- fnGetFocusedParentNodes(pTree, vecFocused);
-
- for (size_t i = 0; i < vecFocused.size(); i++) {
-
- STreeItem *pParent = vecFocused[i];
-
- vector<int> vecLeft, vecRight;
- for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
- STreeItem *pCon1 = pParent->m_vecChildren[j];
- int iLeft1, iRight1;
- pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1);
- vecLeft.push_back(iLeft1);
- vecRight.push_back(iRight1);
- }
- vector<int> vecLeftPosition;
- fnGetRelativePosition(vecLeft, vecLeftPosition);
- vector<int> vecRightPosition;
- fnGetRelativePosition(vecRight, vecRightPosition);
-
- vector<string> vecChunkStatus;
- for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
- string strOutcome = pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, pParent->m_vecChildren[j]->m_iEnd);
- vecChunkStatus.push_back(strOutcome);
- }
-
- for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) {
- //children[j-1] vs. children[j] reordering
-
- string strLeftOutcome;
- ostringstream ostr;
-
- fnGenerateInstance(pTree, pParent, j, vecChunkStatus, vecLeftPosition, vecSTerms, vecTTerms, strLeftOutcome, ostr);
-
- //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str());
- fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str());
-
- string strRightOutcome;
- fnGetOutcome(vecRightPosition[j-1], vecRightPosition[j], strRightOutcome);
- fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str(), strRightOutcome.c_str());
- }
- }
- delete pTree;
- }
-
- delete pAlign;
- iSentNum++;
-
- if (iSentNum % 100000 == 0)
- fprintf(stderr, "#%d\n", iSentNum);
- }
-
-
- fclose(fpLeftOut);
- fclose(fpRightOut);
- delete pAlignReader;
- delete pParseReader;
- delete pTxtSReader;
- delete pTxtTReader;
- delete [] pszLine;
- }
-
- void fnGenerateInstanceFile2(const char* pszSynFname, //source-side flattened parse tree file name
- const char* pszAlignFname, //alignment filename
- const char* pszSourceFname, //source file name
- const char* pszTargetFname, //target file name
- const char* pszInstanceFname //training instance file name
- ) {
- SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname);
- SParseReader *pParseReader = new SParseReader(pszSynFname, false);
- STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname);
-
- FILE *fpOut = fopen(pszInstanceFname, "w");
- assert(fpOut != NULL);
-
- //read sentence by sentence
- SAlignment *pAlign;
- SParsedTree *pTree;
- char *pszLine = new char[50001];
- int iSentNum = 0;
- while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
- pTree = pParseReader->fnReadNextParseTree();
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
-
-
- if (pTree != NULL) {
-
- vector<STreeItem*> vecFocused;
- fnGetFocusedParentNodes(pTree, vecFocused);
-
- for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) {
-
- STreeItem *pParent = vecFocused[i];
-
- for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) {
- //children[j-1] vs. children[j] reordering
-
- string strOutcome;
- ostringstream ostr;
-
- fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j-1], pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr);
-
- //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- }
- }
- delete pTree;
- }
-
- delete pAlign;
- iSentNum++;
-
- if (iSentNum % 100000 == 0)
- fprintf(stderr, "#%d\n", iSentNum);
- }
-
-
- fclose(fpOut);
- delete pAlignReader;
- delete pParseReader;
- delete pTxtSReader;
- delete pTxtTReader;
- delete [] pszLine;
- }
+struct SConstReorderTrainer {
+ SConstReorderTrainer(
+ const char* pszSynFname, // source-side flattened parse tree file name
+ const char* pszAlignFname, // alignment filename
+ const char* pszSourceFname, // source file name
+ const char* pszTargetFname, // target file name
+ const char* pszInstanceFname, // training instance file name
+ const char* pszModelPrefix, // classifier model file name prefix
+ int iClassifierType, // classifier type
+ int iCutoff, // feature count threshold
+ const char* pszOption // other classifier parameters (for svmlight)
+ ) {
+ fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname,
+ pszTargetFname, pszInstanceFname);
+
+ string strInstanceLeftFname = string(pszInstanceFname) + string(".left");
+ string strInstanceRightFname = string(pszInstanceFname) + string(".right");
+
+ string strModelLeftFname = string(pszModelPrefix) + string(".left");
+ string strModelRightFname = string(pszModelPrefix) + string(".right");
+
+ fprintf(stdout, "...Training the left ordering model\n");
+ fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(),
+ iCutoff);
+ fprintf(stdout, "...Training the right ordering model\n");
+ fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(),
+ iCutoff);
+ }
+ ~SConstReorderTrainer() {}
+
+ private:
+ void fnTraining(const char* pszInstanceFname, const char* pszModelFname,
+ int iCutoff) {
+ char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50];
+ if (iCutoff > 0) {
+ sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname);
+ fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName);
+ } else {
+ strcpy(pszNewInstanceFName, pszInstanceFname);
+ }
+
+ /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL);
+pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0);
+delete pZhangleMaxent;*/
+
+ Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL);
+ pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300);
+ delete pMaxent;
+
+ if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) {
+ sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname);
+ system(pszNewInstanceFName);
+ }
+ delete[] pszNewInstanceFName;
+ }
+
+ inline bool fnIsVerbPOS(const char* pszTerm) {
+ if (strcmp(pszTerm, "VV") == 0 || strcmp(pszTerm, "VA") == 0 ||
+ strcmp(pszTerm, "VC") == 0 || strcmp(pszTerm, "VE") == 0)
+ return true;
+ return false;
+ }
+
+ inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2,
+ const SAlignment* pAlign, string& strOutcome) {
+ if (iL1 == -1 && iL2 == -1)
+ strOutcome = "BU"; // 1. both are untranslated
+ else if (iL1 == -1)
+ strOutcome = "1U"; // 2. XP1 is untranslated
+ else if (iL2 == -1)
+ strOutcome = "2U"; // 3. XP2 is untranslated
+ else if (iL1 == iL2 && iR2 == iR2)
+ strOutcome = "SS"; // 4. Have same scope
+ else if (iL1 <= iL2 && iR1 >= iR2)
+ strOutcome = "1C2"; // 5. XP1's translation covers XP2's
+ else if (iL1 >= iL2 && iR1 <= iR2)
+ strOutcome = "2C1"; // 6. XP2's translation covers XP1's
+ else if (iR1 < iL2) {
+ int i = iR1 + 1;
+ /*while (i < iL2) {
+ if (pAlign->fnIsAligned(i, false))
+ break;
+ i++;
+ }*/
+ if (i == iL2)
+ strOutcome = "M"; // 7. Monotone
+ else
+ strOutcome = "DM"; // 8. Discontinuous monotone
+ } else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2)
+ strOutcome = "OM"; // 9. Overlap monotone
+ else if (iR2 < iL1) {
+ int i = iR2 + 1;
+ /*while (i < iL1) {
+ if (pAlign->fnIsAligned(i, false))
+ break;
+ i++;
+ }*/
+ if (i == iL1)
+ strOutcome = "S"; // 10. Swap
+ else
+ strOutcome = "DS"; // 11. Discontinuous swap
+ } else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1)
+ strOutcome = "OS"; // 12. Overlap swap
+ else
+ assert(false);
+ }
+
+ inline void fnGetOutcome(int i1, int i2, string& strOutcome) {
+ assert(i1 != i2);
+ if (i1 < i2) {
+ if (i2 > i1 + 1)
+ strOutcome = string("DM");
+ else
+ strOutcome = string("M");
+ } else {
+ if (i1 > i2 + 1)
+ strOutcome = string("DS");
+ else
+ strOutcome = string("S");
+ }
+ }
+
+ inline void fnGetRelativePosition(const vector<int>& vecLeft,
+ vector<int>& vecPosition) {
+ vecPosition.clear();
+
+ vector<float> vec;
+ for (size_t i = 0; i < vecLeft.size(); i++) {
+ if (vecLeft[i] == -1) {
+ if (i == 0)
+ vec.push_back(-1);
+ else
+ vec.push_back(vecLeft[i - 1] + 0.1);
+ } else
+ vec.push_back(vecLeft[i]);
+ }
+
+ for (size_t i = 0; i < vecLeft.size(); i++) {
+ int count = 0;
+
+ for (size_t j = 0; j < vecLeft.size(); j++) {
+ if (j == i) continue;
+ if (vec[j] < vec[i]) {
+ count++;
+ } else if (vec[j] == vec[i] && j < i) {
+ count++;
+ }
+ }
+ vecPosition.push_back(count);
+ }
+ }
+
+ /*
+ * features:
+ * f1: (left_label, right_label, parent_label)
+ * f2: (left_label, right_label, parent_label, other_right_sibling_label)
+ * f3: (left_label, right_label, parent_label, other_left_sibling_label)
+ * f4: (left_label, right_label, left_head_pos)
+ * f5: (left_label, right_label, left_head_word)
+ * f6: (left_label, right_label, right_head_pos)
+ * f7: (left_label, right_label, right_head_word)
+ * f8: (left_label, right_label, left_chunk_status)
+ * f9: (left_label, right_label, right_chunk_status)
+ * f10: (left_label, parent_label)
+ * f11: (right_label, parent_label)
+ */
+ void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent,
+ int iPos, const vector<string>& vecChunkStatus,
+ const vector<int>& vecPosition,
+ const vector<string>& vecSTerms,
+ const vector<string>& vecTTerms, string& strOutcome,
+ ostringstream& ostr) {
+ STreeItem* pCon1, *pCon2;
+ pCon1 = pParent->m_vecChildren[iPos - 1];
+ pCon2 = pParent->m_vecChildren[iPos];
+
+ fnGetOutcome(vecPosition[iPos - 1], vecPosition[iPos], strOutcome);
+
+ string left_label = string(pCon1->m_pszTerm);
+ string right_label = string(pCon2->m_pszTerm);
+ string parent_label = string(pParent->m_pszTerm);
+
+ vector<string> vec_other_right_sibling;
+ for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++)
+ vec_other_right_sibling.push_back(
+ string(pParent->m_vecChildren[i]->m_pszTerm));
+ if (vec_other_right_sibling.size() == 0)
+ vec_other_right_sibling.push_back(string("NULL"));
+ vector<string> vec_other_left_sibling;
+ for (int i = 0; i < iPos - 1; i++)
+ vec_other_left_sibling.push_back(
+ string(pParent->m_vecChildren[i]->m_pszTerm));
+ if (vec_other_left_sibling.size() == 0)
+ vec_other_left_sibling.push_back(string("NULL"));
+
+ // generate features
+ // f1
+ ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label;
+ // f2
+ for (int i = 0; i < vec_other_right_sibling.size(); i++)
+ ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label
+ << "_" << vec_other_right_sibling[i];
+ // f3
+ for (int i = 0; i < vec_other_left_sibling.size(); i++)
+ ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label
+ << "_" << vec_other_left_sibling[i];
+ // f4
+ ostr << " f4=" << left_label << "_" << right_label << "_"
+ << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm;
+ // f5
+ ostr << " f5=" << left_label << "_" << right_label << "_"
+ << vecSTerms[pCon1->m_iHeadWord];
+ // f6
+ ostr << " f6=" << left_label << "_" << right_label << "_"
+ << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm;
+ // f7
+ ostr << " f7=" << left_label << "_" << right_label << "_"
+ << vecSTerms[pCon2->m_iHeadWord];
+ // f8
+ ostr << " f8=" << left_label << "_" << right_label << "_"
+ << vecChunkStatus[iPos - 1];
+ // f9
+ ostr << " f9=" << left_label << "_" << right_label << "_"
+ << vecChunkStatus[iPos];
+ // f10
+ ostr << " f10=" << left_label << "_" << parent_label;
+ // f11
+ ostr << " f11=" << right_label << "_" << parent_label;
+ }
+
+ /*
+ * Source side (11 features):
+ * f1: the categories of XP1 and XP2 (f1_1, f1_2)
+ * f2: the head words of XP1 and XP2 (f2_1, f2_2)
+ * f3: the first and last word of XP1 (f3_f, f3_l)
+ * f4: the first and last word of XP2 (f4_f, f4_l)
+ * f5: is XP1 or XP2 the head node (f5_1, f5_2)
+ * f6: the category of the common parent
+ * Target side (6 features):
+ * f7: the first and the last word of XP1's translation (f7_f, f7_l)
+ * f8: the first and the last word of XP2's translation (f8_f, f8_l)
+ * f9: the translation of XP1's and XP2's head word (f9_1, f9_2)
+ */
+ void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent,
+ const STreeItem* pCon1, const STreeItem* pCon2,
+ const SAlignment* pAlign,
+ const vector<string>& vecSTerms,
+ const vector<string>& vecTTerms, string& strOutcome,
+ ostringstream& ostr) {
+
+ int iLeft1, iRight1, iLeft2, iRight2;
+ pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1,
+ iRight1);
+ pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2,
+ iRight2);
+
+ fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome);
+
+ // generate features
+ // f1
+ ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm;
+ // f2
+ ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2"
+ << vecSTerms[pCon2->m_iHeadWord];
+ // f3
+ ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin]
+ << " f3_l=" << vecSTerms[pCon1->m_iEnd];
+ // f4
+ ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin]
+ << " f4_l=" << vecSTerms[pCon2->m_iEnd];
+ // f5
+ if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex)
+ ostr << " f5_1=1";
+ else
+ ostr << " f5_1=0";
+ if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex)
+ ostr << " f5_2=1";
+ else
+ ostr << " f5_2=0";
+ // f6
+ ostr << " f6=" << pParent->m_pszTerm;
+
+ /*//f7
+ if (iLeft1 != -1) {
+ ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" <<
+ vecTTerms[iRight1];
+ }
+ if (iLeft2 != -1) {
+ ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" <<
+ vecTTerms[iRight2];
+ }
+
+ const vector<int>* pvecTarget =
+ pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true);
+ string str = "";
+ for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) {
+ str += vecTTerms[(*pvecTarget)[i]] + "_";
+ }
+ if (str.length() > 0) {
+ ostr << " f9_1=" << str.substr(0, str.size()-1);
+ }
+ pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true);
+ str = "";
+ for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) {
+ str += vecTTerms[(*pvecTarget)[i]] + "_";
+ }
+ if (str.length() > 0) {
+ ostr << " f9_2=" << str.substr(0, str.size()-1);
+ } */
+ }
+
+ void fnGetFocusedParentNodes(const SParsedTree* pTree,
+ vector<STreeItem*>& vecFocused) {
+ for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) {
+ STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent;
+
+ while (pParent != NULL) {
+ // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd -
+ // pParent->m_iBegin > 5) {
+ if (pParent->m_vecChildren.size() > 1) {
+ // do constituent reordering for all children of pParent
+ vecFocused.push_back(pParent);
+ }
+ if (pParent->m_iBrotherIndex != 0) break;
+ pParent = pParent->m_ptParent;
+ }
+ }
+ }
+
+ void fnGenerateInstanceFile(
+ const char* pszSynFname, // source-side flattened parse tree file name
+ const char* pszAlignFname, // alignment filename
+ const char* pszSourceFname, // source file name
+ const char* pszTargetFname, // target file name
+ const char* pszInstanceFname // training instance file name
+ ) {
+ SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
+ SParseReader* pParseReader = new SParseReader(pszSynFname, false);
+ STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
+ STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
+
+ string strInstanceLeftFname = string(pszInstanceFname) + string(".left");
+ string strInstanceRightFname = string(pszInstanceFname) + string(".right");
+
+ FILE* fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w");
+ assert(fpLeftOut != NULL);
+
+ FILE* fpRightOut = fopen(strInstanceRightFname.c_str(), "w");
+ assert(fpRightOut != NULL);
+
+ // read sentence by sentence
+ SAlignment* pAlign;
+ SParsedTree* pTree;
+ char* pszLine = new char[50001];
+ int iSentNum = 0;
+ while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
+ pTree = pParseReader->fnReadNextParseTree();
+ assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
+ vector<string> vecSTerms;
+ SplitOnWhitespace(string(pszLine), &vecSTerms);
+ assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
+ vector<string> vecTTerms;
+ SplitOnWhitespace(string(pszLine), &vecTTerms);
+
+ if (pTree != NULL) {
+
+ vector<STreeItem*> vecFocused;
+ fnGetFocusedParentNodes(pTree, vecFocused);
+
+ for (size_t i = 0; i < vecFocused.size(); i++) {
+
+ STreeItem* pParent = vecFocused[i];
+
+ vector<int> vecLeft, vecRight;
+ for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
+ STreeItem* pCon1 = pParent->m_vecChildren[j];
+ int iLeft1, iRight1;
+ pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true,
+ iLeft1, iRight1);
+ vecLeft.push_back(iLeft1);
+ vecRight.push_back(iRight1);
+ }
+ vector<int> vecLeftPosition;
+ fnGetRelativePosition(vecLeft, vecLeftPosition);
+ vector<int> vecRightPosition;
+ fnGetRelativePosition(vecRight, vecRightPosition);
+
+ vector<string> vecChunkStatus;
+ for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
+ string strOutcome =
+ pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin,
+ pParent->m_vecChildren[j]->m_iEnd);
+ vecChunkStatus.push_back(strOutcome);
+ }
+
+ for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) {
+ // children[j-1] vs. children[j] reordering
+
+ string strLeftOutcome;
+ ostringstream ostr;
+
+ fnGenerateInstance(pTree, pParent, j, vecChunkStatus,
+ vecLeftPosition, vecSTerms, vecTTerms,
+ strLeftOutcome, ostr);
+
+ // fprintf(stderr, "%s %s\n", ostr.str().c_str(),
+ // strLeftOutcome.c_str());
+ fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(),
+ strLeftOutcome.c_str());
+
+ string strRightOutcome;
+ fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j],
+ strRightOutcome);
+ fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(),
+ strLeftOutcome.c_str(), strRightOutcome.c_str());
+ }
+ }
+ delete pTree;
+ }
+
+ delete pAlign;
+ iSentNum++;
+
+ if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
+ }
+
+ fclose(fpLeftOut);
+ fclose(fpRightOut);
+ delete pAlignReader;
+ delete pParseReader;
+ delete pTxtSReader;
+ delete pTxtTReader;
+ delete[] pszLine;
+ }
+
+ void fnGenerateInstanceFile2(
+ const char* pszSynFname, // source-side flattened parse tree file name
+ const char* pszAlignFname, // alignment filename
+ const char* pszSourceFname, // source file name
+ const char* pszTargetFname, // target file name
+ const char* pszInstanceFname // training instance file name
+ ) {
+ SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
+ SParseReader* pParseReader = new SParseReader(pszSynFname, false);
+ STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
+ STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
+
+ FILE* fpOut = fopen(pszInstanceFname, "w");
+ assert(fpOut != NULL);
+
+ // read sentence by sentence
+ SAlignment* pAlign;
+ SParsedTree* pTree;
+ char* pszLine = new char[50001];
+ int iSentNum = 0;
+ while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
+ pTree = pParseReader->fnReadNextParseTree();
+ assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
+ vector<string> vecSTerms;
+ SplitOnWhitespace(string(pszLine), &vecSTerms);
+ assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
+ vector<string> vecTTerms;
+ SplitOnWhitespace(string(pszLine), &vecTTerms);
+
+ if (pTree != NULL) {
+
+ vector<STreeItem*> vecFocused;
+ fnGetFocusedParentNodes(pTree, vecFocused);
+
+ for (size_t i = 0;
+ i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) {
+
+ STreeItem* pParent = vecFocused[i];
+
+ for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) {
+ // children[j-1] vs. children[j] reordering
+
+ string strOutcome;
+ ostringstream ostr;
+
+ fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j - 1],
+ pParent->m_vecChildren[j], pAlign, vecSTerms,
+ vecTTerms, strOutcome, ostr);
+
+ // fprintf(stderr, "%s %s\n", ostr.str().c_str(),
+ // strOutcome.c_str());
+ fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
+ }
+ }
+ delete pTree;
+ }
+
+ delete pAlign;
+ iSentNum++;
+
+ if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
+ }
+
+ fclose(fpOut);
+ delete pAlignReader;
+ delete pParseReader;
+ delete pTxtSReader;
+ delete pTxtTReader;
+ delete[] pszLine;
+ }
};
-struct SConstContTrainer{
- SConstContTrainer(const char* pszFlattenedSynFname, //source-side flattened parse tree file name
- const char* pszAlignFname, //alignment filename
- const char* pszSourceFname, //source file name
- const char* pszTargetFname, //target file name
- const char* pszInstanceFname, //training instance file name
- const char* pszModelPrefix, //classifier model file name prefix
- int iClassifierType, //classifier type
- int iCutoff, //feature count threshold
- const char* pszOption //other classifier parameters (for svmlight)
- ) {
- fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname);
- //fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, pszOption);
- fnTraining(pszInstanceFname, pszModelPrefix, iCutoff);
- }
- ~SConstContTrainer() {
-
- }
-
-private:
-
- void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) {
- char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50];
- if (iCutoff > 0) {
- sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname);
- fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName);
- } else {
- strcpy(pszNewInstanceFName, pszInstanceFname);
- }
-
- /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL);
- pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0);
- delete pZhangleMaxent;*/
-
- Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL);
- pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300);
- delete pMaxent;
-
- if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) {
- sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname);
- system(pszNewInstanceFName);
- }
- delete [] pszNewInstanceFName;
- }
-
-
- void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){
- for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) {
- STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent;
-
- while (pParent != NULL) {
- //if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) {
- if (pParent->m_vecChildren.size() > 1) {
- //do constituent reordering for all children of pParent
- vecFocused.push_back(pParent);
- }
- if (pParent->m_iBrotherIndex != 0) break;
- pParent = pParent->m_ptParent;
- }
- }
- }
-
- inline void fnGetOutcome(int iL1, int iR1, const SAlignment *pAlign, string& strOutcome) {
- strOutcome = pAlign->fnIsContinuous(iL1, iR1);
- }
-
- inline string fnGetLengthType(int iLen) {
- if (iLen == 1)
- return string("1");
- if (iLen == 2)
- return string("2");
- if (iLen == 3)
- return string("3");
- if (iLen < 6)
- return string("4");
- if (iLen < 11)
- return string("6");
- return string("11");
- }
-
- /*
- * Source side (11 features):
- * f1: the syntactic category
- * f2: the syntactic category of its parent
- * f3: the head word's pos
- * f4: =1 if it's the head of its parent node
- * or
- * the head of its parent node
- * f5: length type
- */
- void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pCon1, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) {
-
- fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome);
-
- //generate features
- //f1
- ostr << "f1=" << pCon1->m_pszTerm;
- //f2
- ostr << " f2=" << pCon1->m_ptParent->m_pszTerm;
- //f3
- ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm;
- //f4
- if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) {
- ostr << " f4=1";
- } else {
- ostr << " f4=" << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]->m_pszTerm;
- }
- //f5
- ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1);
- }
-
- void fnGenerateInstanceFile(const char* pszFlattenedSynFname, //source-side flattened parse tree file name
- const char* pszAlignFname, //alignment filename
- const char* pszSourceFname, //source file name
- const char* pszTargetFname, //target file name
- const char* pszInstanceFname //training instance file name
- ) {
- SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname);
- SParseReader *pParseReader = new SParseReader(pszFlattenedSynFname, true);
- STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname);
-
- FILE *fpOut = fopen(pszInstanceFname, "w");
- assert(fpOut != NULL);
-
- //read sentence by sentence
- SAlignment *pAlign;
- SParsedTree *pTree;
- char *pszLine = new char[50001];
- int iSentNum = 0;
- while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
- pTree = pParseReader->fnReadNextParseTree();
- assert(pTree != NULL);
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
-
- vector<STreeItem*> vecFocused;
- fnGetFocusedParentNodes(pTree, vecFocused);
-
- for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) {
-
- STreeItem *pParent = vecFocused[i];
-
- for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
- //children[j-1] vs. children[j] reordering
-
- string strOutcome;
- ostringstream ostr;
-
- fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr);
-
- //fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- }
- }
-
- delete pAlign;
- delete pTree;
- iSentNum++;
-
- if (iSentNum % 100000 == 0)
- fprintf(stderr, "#%d\n", iSentNum);
- }
-
-
- fclose(fpOut);
- delete pAlignReader;
- delete pParseReader;
- delete pTxtSReader;
- delete pTxtTReader;
- delete [] pszLine;
- }
+struct SConstContTrainer {
+ SConstContTrainer(
+ const char* pszFlattenedSynFname, // source-side flattened parse tree
+ // file name
+ const char* pszAlignFname, // alignment filename
+ const char* pszSourceFname, // source file name
+ const char* pszTargetFname, // target file name
+ const char* pszInstanceFname, // training instance file name
+ const char* pszModelPrefix, // classifier model file name prefix
+ int iClassifierType, // classifier type
+ int iCutoff, // feature count threshold
+ const char* pszOption // other classifier parameters (for svmlight)
+ ) {
+ fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname,
+ pszTargetFname, pszInstanceFname);
+ // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff,
+ // pszOption);
+ fnTraining(pszInstanceFname, pszModelPrefix, iCutoff);
+ }
+ ~SConstContTrainer() {}
+
+ private:
+ void fnTraining(const char* pszInstanceFname, const char* pszModelFname,
+ int iCutoff) {
+ char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50];
+ if (iCutoff > 0) {
+ sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname);
+ fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName);
+ } else {
+ strcpy(pszNewInstanceFName, pszInstanceFname);
+ }
+
+ /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL);
+ pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100,
+ 2.0);
+ delete pZhangleMaxent;*/
+
+ Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL);
+ pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300);
+ delete pMaxent;
+
+ if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) {
+ sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname);
+ system(pszNewInstanceFName);
+ }
+ delete[] pszNewInstanceFName;
+ }
+
+ void fnGetFocusedParentNodes(const SParsedTree* pTree,
+ vector<STreeItem*>& vecFocused) {
+ for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) {
+ STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent;
+
+ while (pParent != NULL) {
+ // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd -
+ // pParent->m_iBegin > 5) {
+ if (pParent->m_vecChildren.size() > 1) {
+ // do constituent reordering for all children of pParent
+ vecFocused.push_back(pParent);
+ }
+ if (pParent->m_iBrotherIndex != 0) break;
+ pParent = pParent->m_ptParent;
+ }
+ }
+ }
+
+ inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign,
+ string& strOutcome) {
+ strOutcome = pAlign->fnIsContinuous(iL1, iR1);
+ }
+
+ inline string fnGetLengthType(int iLen) {
+ if (iLen == 1) return string("1");
+ if (iLen == 2) return string("2");
+ if (iLen == 3) return string("3");
+ if (iLen < 6) return string("4");
+ if (iLen < 11) return string("6");
+ return string("11");
+ }
+
+ /*
+ * Source side (11 features):
+ * f1: the syntactic category
+ * f2: the syntactic category of its parent
+ * f3: the head word's pos
+ * f4: =1 if it's the head of its parent node
+ * or
+ * the head of its parent node
+ * f5: length type
+ */
+ void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1,
+ const SAlignment* pAlign,
+ const vector<string>& vecSTerms,
+ const vector<string>& vecTTerms, string& strOutcome,
+ ostringstream& ostr) {
+
+ fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome);
+
+ // generate features
+ // f1
+ ostr << "f1=" << pCon1->m_pszTerm;
+ // f2
+ ostr << " f2=" << pCon1->m_ptParent->m_pszTerm;
+ // f3
+ ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]
+ ->m_ptParent->m_pszTerm;
+ // f4
+ if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) {
+ ostr << " f4=1";
+ } else {
+ ostr << " f4="
+ << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]
+ ->m_pszTerm;
+ }
+ // f5
+ ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1);
+ }
+
+ void fnGenerateInstanceFile(
+ const char* pszFlattenedSynFname, // source-side flattened parse tree
+ // file name
+ const char* pszAlignFname, // alignment filename
+ const char* pszSourceFname, // source file name
+ const char* pszTargetFname, // target file name
+ const char* pszInstanceFname // training instance file name
+ ) {
+ SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
+ SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true);
+ STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
+ STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
+
+ FILE* fpOut = fopen(pszInstanceFname, "w");
+ assert(fpOut != NULL);
+
+ // read sentence by sentence
+ SAlignment* pAlign;
+ SParsedTree* pTree;
+ char* pszLine = new char[50001];
+ int iSentNum = 0;
+ while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
+ pTree = pParseReader->fnReadNextParseTree();
+ assert(pTree != NULL);
+ assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
+ vector<string> vecSTerms;
+ SplitOnWhitespace(string(pszLine), &vecSTerms);
+ assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
+ vector<string> vecTTerms;
+ SplitOnWhitespace(string(pszLine), &vecTTerms);
+
+ vector<STreeItem*> vecFocused;
+ fnGetFocusedParentNodes(pTree, vecFocused);
+
+ for (size_t i = 0;
+ i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) {
+
+ STreeItem* pParent = vecFocused[i];
+
+ for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
+ // children[j-1] vs. children[j] reordering
+
+ string strOutcome;
+ ostringstream ostr;
+
+ fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign,
+ vecSTerms, vecTTerms, strOutcome, ostr);
+
+ // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
+ fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
+ }
+ }
+
+ delete pAlign;
+ delete pTree;
+ iSentNum++;
+
+ if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
+ }
+
+ fclose(fpOut);
+ delete pAlignReader;
+ delete pParseReader;
+ delete pTxtSReader;
+ delete pTxtTReader;
+ delete[] pszLine;
+ }
};
-inline void print_options(std::ostream &out,po::options_description const& opts) {
- typedef std::vector< boost::shared_ptr<po::option_description> > Ds;
- Ds const& ds=opts.options();
+inline void print_options(std::ostream& out,
+ po::options_description const& opts) {
+ typedef std::vector<boost::shared_ptr<po::option_description> > Ds;
+ Ds const& ds = opts.options();
out << '"';
- for (unsigned i=0;i<ds.size();++i) {
- if (i) out<<' ';
- out<<"--"<<ds[i]->long_name();
+ for (unsigned i = 0; i < ds.size(); ++i) {
+ if (i) out << ' ';
+ out << "--" << ds[i]->long_name();
}
out << '\n';
}
-inline string str(char const* name,po::variables_map const& conf) {
+inline string str(char const* name, po::variables_map const& conf) {
return conf[name].as<string>();
}
-//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file /scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file /scratch0/mt_exp/gq-ctb/data/train.cn --target_file /scratch0/mt_exp/gq-ctb/data/train.en --instance_file /scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix /scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 --classifier_type 1
+//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file
+///scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file
+///scratch0/mt_exp/gq-ctb/data/train.cn --target_file
+///scratch0/mt_exp/gq-ctb/data/train.en --instance_file
+///scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix
+///scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10
+//--classifier_type 1
int main(int argc, char** argv) {
- po::options_description opts("Configuration options");
- opts.add_options()
- ("parse_file",po::value<string>(),"parse file path (input)")
- ("align_file",po::value<string>(),"Alignment file path (input)")
- ("source_file",po::value<string>(),"Source text file path (input)")
- ("target_file",po::value<string>(),"Target text file path (input)")
- ("instance_file",po::value<string>(),"Instance file path (output)")
- ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated")
- ("classifier_type",po::value<int>()->default_value(1),"Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for SVMLight")
- ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold")
- ("svm_option",po::value<string>(),"Parameters for SVMLight classifier")
- ("help", "produce help message");
-
- po::variables_map vm;
- if (argc) {
- po::store(po::parse_command_line(argc, argv, opts), vm);
- po::notify(vm);
- }
-
- if (vm.count("help")) {
- print_options(cout, opts);
- return 1;
- }
-
- if (!vm.count("parse_file")
- || !vm.count("align_file")
- || !vm.count("source_file")
- || !vm.count("target_file")
- || !vm.count("instance_file")
- || !vm.count("model_prefix")
- ) {
- print_options(cout, opts);
- if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n";
- if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n";
- if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n";
- if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n";
- if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n";
- if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n";
- exit(0);
- }
-
- const char *pOption;
- if (vm.count("svm_option"))
- pOption = str("svm_option", vm).c_str();
- else
- pOption = NULL;
-
- SConstReorderTrainer *pTrainer = new SConstReorderTrainer(str("parse_file", vm).c_str(),
- str("align_file", vm).c_str(),
- str("source_file", vm).c_str(),
- str("target_file", vm).c_str(),
- str("instance_file", vm).c_str(),
- str("model_prefix", vm).c_str(),
- vm["classifier_type"].as<int>(),
- vm["feature_cutoff"].as<int>(),
- pOption);
- delete pTrainer;
-
- return 1;
+ po::options_description opts("Configuration options");
+ opts.add_options()("parse_file", po::value<string>(),
+ "parse file path (input)")(
+ "align_file", po::value<string>(), "Alignment file path (input)")(
+ "source_file", po::value<string>(), "Source text file path (input)")(
+ "target_file", po::value<string>(), "Target text file path (input)")(
+ "instance_file", po::value<string>(), "Instance file path (output)")(
+ "model_prefix", po::value<string>(),
+ "Model file path prefix (output): three files will be generated")(
+ "classifier_type", po::value<int>()->default_value(1),
+ "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for "
+ "SVMLight")("feature_cutoff", po::value<int>()->default_value(100),
+ "Feature cutoff threshold")(
+ "svm_option", po::value<string>(), "Parameters for SVMLight classifier")(
+ "help", "produce help message");
+
+ po::variables_map vm;
+ if (argc) {
+ po::store(po::parse_command_line(argc, argv, opts), vm);
+ po::notify(vm);
+ }
+
+ if (vm.count("help")) {
+ print_options(cout, opts);
+ return 1;
+ }
+
+ if (!vm.count("parse_file") || !vm.count("align_file") ||
+ !vm.count("source_file") || !vm.count("target_file") ||
+ !vm.count("instance_file") || !vm.count("model_prefix")) {
+ print_options(cout, opts);
+ if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n";
+ if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n";
+ if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n";
+ if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n";
+ if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n";
+ if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n";
+ exit(0);
+ }
+
+ const char* pOption;
+ if (vm.count("svm_option"))
+ pOption = str("svm_option", vm).c_str();
+ else
+ pOption = NULL;
+
+ SConstReorderTrainer* pTrainer = new SConstReorderTrainer(
+ str("parse_file", vm).c_str(), str("align_file", vm).c_str(),
+ str("source_file", vm).c_str(), str("target_file", vm).c_str(),
+ str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(),
+ vm["classifier_type"].as<int>(), vm["feature_cutoff"].as<int>(), pOption);
+ delete pTrainer;
+ return 1;
}