diff options
Diffstat (limited to 'utils/synutils/constituent_reorder_model.cc')
| -rw-r--r-- | utils/synutils/constituent_reorder_model.cc | 1562 | 
1 files changed, 806 insertions, 756 deletions
| diff --git a/utils/synutils/constituent_reorder_model.cc b/utils/synutils/constituent_reorder_model.cc index 485c9667..a4fb9627 100644 --- a/utils/synutils/constituent_reorder_model.cc +++ b/utils/synutils/constituent_reorder_model.cc @@ -5,7 +5,6 @@   *      Author: junhuili   */ -  #include <boost/program_options.hpp>  #include "alignment.h" @@ -17,780 +16,831 @@  using namespace std; -  typedef std::tr1::unordered_map<std::string, int> Map;  typedef std::tr1::unordered_map<std::string, int>::iterator Iterator;  namespace po = boost::program_options; -inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { -	SFReader *pFReader = new STxtFileReader(pszFName); -	char *pszLine = new char[ 100001 ]; -	int iLen; -	Map hashPredicate; -	while (pFReader->fnReadNextLine(pszLine, &iLen)) { -		if (iLen == 0) -			continue; - -		vector<string> vecTerms; -		SplitOnWhitespace(string(pszLine), &vecTerms); - -		for (size_t i = 0; i < vecTerms.size() - 1; i++) { -			Iterator iter = hashPredicate.find(vecTerms[i]); -			if (iter == hashPredicate.end()) { -				hashPredicate[vecTerms[i]] = 1; - -			} else { -				iter->second++; -			} -		} -	} -	delete pFReader; - -	pFReader = new STxtFileReader(pszFName); -	FILE *fpOut = fopen(pszNewFName, "w"); -	while (pFReader->fnReadNextLine(pszLine, &iLen)) { -		if (iLen == 0) -			continue; - -		vector<string> vecTerms; -		SplitOnWhitespace(string(pszLine), &vecTerms); -		ostringstream ostr; -		for (size_t i = 0; i < vecTerms.size() - 1; i++) { -			Iterator iter = hashPredicate.find(vecTerms[i]); -			assert(iter != hashPredicate.end()); -			if (iter->second >= iCutoff) { -				ostr << vecTerms[i] << " "; -			} -		} -		if (ostr.str().length() > 0) { -			ostr << vecTerms[vecTerms.size() - 1]; -			fprintf(fpOut, "%s\n", ostr.str().c_str()); -		} -	} -	fclose(fpOut); -	delete pFReader; - - -	delete [] pszLine; +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, +                                    const char* pszNewFName) { +  SFReader* pFReader = new STxtFileReader(pszFName); +  char* pszLine = new char[100001]; +  int iLen; +  Map hashPredicate; +  while (pFReader->fnReadNextLine(pszLine, &iLen)) { +    if (iLen == 0) continue; + +    vector<string> vecTerms; +    SplitOnWhitespace(string(pszLine), &vecTerms); + +    for (size_t i = 0; i < vecTerms.size() - 1; i++) { +      Iterator iter = hashPredicate.find(vecTerms[i]); +      if (iter == hashPredicate.end()) { +        hashPredicate[vecTerms[i]] = 1; + +      } else { +        iter->second++; +      } +    } +  } +  delete pFReader; + +  pFReader = new STxtFileReader(pszFName); +  FILE* fpOut = fopen(pszNewFName, "w"); +  while (pFReader->fnReadNextLine(pszLine, &iLen)) { +    if (iLen == 0) continue; + +    vector<string> vecTerms; +    SplitOnWhitespace(string(pszLine), &vecTerms); +    ostringstream ostr; +    for (size_t i = 0; i < vecTerms.size() - 1; i++) { +      Iterator iter = hashPredicate.find(vecTerms[i]); +      assert(iter != hashPredicate.end()); +      if (iter->second >= iCutoff) { +        ostr << vecTerms[i] << " "; +      } +    } +    if (ostr.str().length() > 0) { +      ostr << vecTerms[vecTerms.size() - 1]; +      fprintf(fpOut, "%s\n", ostr.str().c_str()); +    } +  } +  fclose(fpOut); +  delete pFReader; + +  delete[] pszLine;  } -struct SConstReorderTrainer{ -	SConstReorderTrainer(const char* pszSynFname,        //source-side flattened parse tree file name -                         const char* pszAlignFname,               //alignment filename -                         const char* pszSourceFname,              //source file name -                         const char* pszTargetFname,              //target file name -                         const char* pszInstanceFname,            //training instance file name -                         const char* pszModelPrefix,              //classifier model file name prefix -                         int iClassifierType,                     //classifier type -                         int iCutoff,                             //feature count threshold -                         const char* pszOption                    //other classifier parameters (for svmlight) -                         ) { -		fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); - -		string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); -		string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - -		string strModelLeftFname = string(pszModelPrefix) + string(".left"); -		string strModelRightFname = string(pszModelPrefix) + string(".right"); - -		fprintf(stdout, "...Training the left ordering model\n"); -		fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), iCutoff); -		fprintf(stdout, "...Training the right ordering model\n"); -		fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), iCutoff); -	} -	~SConstReorderTrainer() { - -	} - -private: - -	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { -		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; -		if (iCutoff > 0) { -			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); -			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); -		} else { -			strcpy(pszNewInstanceFName, pszInstanceFname); -		} - -		/*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); -	    pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); -	    delete pZhangleMaxent;*/ - -		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); -		pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); -		delete pMaxent; - -		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { -			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); -			system(pszNewInstanceFName); -		} -		delete [] pszNewInstanceFName; -	} - -	inline bool fnIsVerbPOS(const char* pszTerm) { -		if (strcmp(pszTerm, "VV") == 0 -			|| strcmp(pszTerm, "VA") == 0 -			|| strcmp(pszTerm, "VC") == 0 -			|| strcmp(pszTerm, "VE") == 0) -			return true; -		return false; -	} - -	inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, const SAlignment *pAlign, string& strOutcome) { -		if (iL1 == -1 && iL2 == -1) -			strOutcome = "BU"; //1. both are untranslated -		else if (iL1 == -1) -			strOutcome = "1U"; //2. XP1 is untranslated -		else if (iL2 == -1) -			strOutcome = "2U"; //3. XP2 is untranslated -		else if (iL1 == iL2 && iR2 == iR2) -			strOutcome = "SS"; //4. Have same scope -		else if (iL1 <= iL2 && iR1 >= iR2) -			strOutcome = "1C2"; //5. XP1's translation covers XP2's -		else if (iL1 >= iL2 && iR1 <= iR2) -			strOutcome = "2C1"; //6. XP2's translation covers XP1's -		else if (iR1 < iL2) { -			int i = iR1 + 1; -			/*while (i < iL2) { -				if (pAlign->fnIsAligned(i, false)) -					break; -				i++; -			}*/ -			if (i == iL2) -				strOutcome = "M"; //7. Monotone -			else -				strOutcome = "DM"; //8. Discontinuous monotone -		} else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) -			strOutcome = "OM"; //9. Overlap monotone -		else if (iR2 < iL1) { -			int i = iR2 + 1; -			/*while (i < iL1) { -				if (pAlign->fnIsAligned(i, false)) -					break; -				i++; -			}*/ -			if (i == iL1) -				strOutcome = "S"; //10. Swap -			else -				strOutcome = "DS"; //11. Discontinuous swap -		} else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) -			strOutcome = "OS"; //12. Overlap swap -		else -			assert(false); -	} - -	inline void fnGetOutcome(int i1, int i2, string& strOutcome) { -		assert(i1 != i2); -		if (i1 < i2) { -			if (i2 > i1 + 1) strOutcome = string("DM"); -			else strOutcome = string("M"); -		} else { -			if (i1 > i2 + 1) strOutcome = string("DS"); -			else strOutcome = string("S"); -		} -	} - -	inline void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) { -		vecPosition.clear(); - -		vector<float> vec; -		for (size_t i = 0; i < vecLeft.size(); i++) { -			if (vecLeft[i] == -1) { -				if (i == 0) -					vec.push_back(-1); -				else -					vec.push_back(vecLeft[i-1] + 0.1); -			} else -				vec.push_back(vecLeft[i]); -		} - -		for (size_t i = 0; i < vecLeft.size(); i++) { -			int count = 0; - -			for (size_t j = 0; j < vecLeft.size(); j++) { -				if ( j == i) continue; -				if (vec[j] < vec[i]) { -					count++; -				} else if (vec[j] == vec[i] && j < i) { -					count++; -				} -			} -			vecPosition.push_back(count); -		} -	} - -	/* -	 * features: -	 * f1: (left_label, right_label, parent_label) -	 * f2: (left_label, right_label, parent_label, other_right_sibling_label) -	 * f3: (left_label, right_label, parent_label, other_left_sibling_label) -	 * f4: (left_label, right_label, left_head_pos) -	 * f5: (left_label, right_label, left_head_word) -	 * f6: (left_label, right_label, right_head_pos) -	 * f7: (left_label, right_label, right_head_word) -	 * f8: (left_label, right_label, left_chunk_status) -	 * f9: (left_label, right_label, right_chunk_status) -	 * f10: (left_label, parent_label) -	 * f11: (right_label, parent_label) -	 */ -	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, int iPos, const vector<string>& vecChunkStatus, const vector<int>& vecPosition, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { -		STreeItem *pCon1, *pCon2; -		pCon1 = pParent->m_vecChildren[iPos - 1]; -		pCon2 = pParent->m_vecChildren[iPos]; - -		fnGetOutcome(vecPosition[iPos-1], vecPosition[iPos], strOutcome); - -		string left_label = string(pCon1->m_pszTerm); -		string right_label = string(pCon2->m_pszTerm); -		string parent_label = string(pParent->m_pszTerm); - -		vector<string> vec_other_right_sibling; -		for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) -			vec_other_right_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); -		if (vec_other_right_sibling.size() == 0) -			vec_other_right_sibling.push_back(string("NULL")); -		vector<string> vec_other_left_sibling; -		for (int i = 0; i < iPos - 1; i++) -			vec_other_left_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); -		if (vec_other_left_sibling.size() == 0) -			vec_other_left_sibling.push_back(string("NULL")); - -		//generate features -		//f1 -		ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; -		//f2 -		for (int i = 0; i < vec_other_right_sibling.size(); i++) -			ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_right_sibling[i]; -		//f3 -		for (int i = 0; i < vec_other_left_sibling.size(); i++) -			ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_left_sibling[i]; -		//f4 -		ostr << " f4=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f5 -		ostr << " f5=" << left_label << "_" << right_label << "_" << vecSTerms[pCon1->m_iHeadWord]; -		//f6 -		ostr << " f6=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f7 -		ostr << " f7=" << left_label << "_" << right_label << "_" << vecSTerms[pCon2->m_iHeadWord]; -		//f8 -		ostr << " f8=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos - 1]; -		//f9 -		ostr << " f9=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos]; -		//f10 -		ostr << " f10=" << left_label << "_" << parent_label; -		//f11 -		ostr << " f11=" << right_label << "_" << parent_label; -	} - -	/* -	 * Source side (11 features): -	 * f1: the categories of XP1 and XP2 (f1_1, f1_2) -	 * f2: the head words of XP1 and XP2 (f2_1, f2_2) -	 * f3: the first and last word of XP1 (f3_f, f3_l) -	 * f4: the first and last word of XP2 (f4_f, f4_l) -	 * f5: is XP1 or XP2 the head node (f5_1, f5_2) -	 * f6: the category of the common parent -	 * Target side (6 features): -	 * f7: the first and the last word of XP1's translation (f7_f, f7_l) -	 * f8: the first and the last word of XP2's translation (f8_f, f8_l) -	 * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) -	 */ -	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, const STreeItem *pCon1, const STreeItem *pCon2, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { - -		int iLeft1, iRight1, iLeft2, iRight2; -		pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); -		pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, iRight2); - -		fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); - -		//generate features -		//f1 -		ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; -		//f2 -		ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" << vecSTerms[pCon2->m_iHeadWord]; -		//f3 -		ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] << " f3_l=" << vecSTerms[pCon1->m_iEnd]; -		//f4 -		ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] << " f4_l=" << vecSTerms[pCon2->m_iEnd]; -		//f5 -		if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) -			ostr << " f5_1=1"; -		else -			ostr << " f5_1=0"; -		if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) -			ostr << " f5_2=1"; -		else -			ostr << " f5_2=0"; -		//f6 -		ostr << " f6=" << pParent->m_pszTerm; - -		/*//f7 -		if (iLeft1 != -1) { -			ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << vecTTerms[iRight1]; -		} -		if (iLeft2 != -1) { -			ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << vecTTerms[iRight2]; -		} - -		const vector<int>* pvecTarget = pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); -		string str = ""; -		for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { -			str += vecTTerms[(*pvecTarget)[i]] + "_"; -		} -		if (str.length() > 0) { -			ostr << " f9_1=" << str.substr(0, str.size()-1); -		} -		pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); -		str = ""; -		for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { -			str += vecTTerms[(*pvecTarget)[i]] + "_"; -		} -		if (str.length() > 0) { -			ostr << " f9_2=" << str.substr(0, str.size()-1); -		} */ - -	} - -	void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ -		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { -			STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; - -			while (pParent != NULL) { -				//if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { -				if (pParent->m_vecChildren.size() > 1) { -					//do constituent reordering for all children of pParent -					vecFocused.push_back(pParent); -				} -				if (pParent->m_iBrotherIndex != 0) break; -				pParent = pParent->m_ptParent; -			} -		} -	} - -	void fnGenerateInstanceFile(const char* pszSynFname,        //source-side flattened parse tree file name -                                const char* pszAlignFname,               //alignment filename -                                const char* pszSourceFname,              //source file name -                                const char* pszTargetFname,              //target file name -                                const char* pszInstanceFname             //training instance file name -                                ) { -		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); -		SParseReader *pParseReader = new SParseReader(pszSynFname, false); -		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); -		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - -		string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); -		string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - -		FILE *fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); -		assert(fpLeftOut != NULL); - -		FILE *fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); -		assert(fpRightOut != NULL); - -		//read sentence by sentence -		SAlignment *pAlign; -		SParsedTree *pTree; -		char *pszLine = new char[50001]; -		int iSentNum = 0; -		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { -			pTree = pParseReader->fnReadNextParseTree(); -			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecSTerms; -			SplitOnWhitespace(string(pszLine), &vecSTerms); -			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecTTerms; -			SplitOnWhitespace(string(pszLine), &vecTTerms); - - -			if (pTree != NULL) { - -				vector<STreeItem*> vecFocused; -				fnGetFocusedParentNodes(pTree, vecFocused); - -				for (size_t i = 0; i < vecFocused.size(); i++) { - -					STreeItem *pParent = vecFocused[i]; - -	            	vector<int> vecLeft, vecRight; -	            	for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { -	                    STreeItem *pCon1 = pParent->m_vecChildren[j]; -	            		int iLeft1, iRight1; -	                    pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); -	                    vecLeft.push_back(iLeft1); -	                    vecRight.push_back(iRight1); -	            	} -	            	vector<int> vecLeftPosition; -	            	fnGetRelativePosition(vecLeft, vecLeftPosition); -	            	vector<int> vecRightPosition; -	            	fnGetRelativePosition(vecRight, vecRightPosition); - -	            	vector<string> vecChunkStatus; -	            	for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { -	            		string strOutcome = pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, pParent->m_vecChildren[j]->m_iEnd); -	            		vecChunkStatus.push_back(strOutcome); -	            	} - -					for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { -						//children[j-1] vs. children[j] reordering - -						string strLeftOutcome; -						ostringstream ostr; - -						fnGenerateInstance(pTree, pParent, j, vecChunkStatus, vecLeftPosition, vecSTerms, vecTTerms, strLeftOutcome, ostr); - -						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); -						fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); - -						string strRightOutcome; -						fnGetOutcome(vecRightPosition[j-1], vecRightPosition[j], strRightOutcome); -						fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str(), strRightOutcome.c_str()); -					} -				} -				delete pTree; -			} - -			delete pAlign; -			iSentNum++; - -			if (iSentNum % 100000 == 0) -				fprintf(stderr, "#%d\n", iSentNum); -		} - - -		fclose(fpLeftOut); -		fclose(fpRightOut); -		delete pAlignReader; -		delete pParseReader; -		delete pTxtSReader; -		delete pTxtTReader; -		delete [] pszLine; -	} - -	void fnGenerateInstanceFile2(const char* pszSynFname,        //source-side flattened parse tree file name -                                const char* pszAlignFname,               //alignment filename -                                const char* pszSourceFname,              //source file name -                                const char* pszTargetFname,              //target file name -                                const char* pszInstanceFname             //training instance file name -                                ) { -		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); -		SParseReader *pParseReader = new SParseReader(pszSynFname, false); -		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); -		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - -		FILE *fpOut = fopen(pszInstanceFname, "w"); -		assert(fpOut != NULL); - -		//read sentence by sentence -		SAlignment *pAlign; -		SParsedTree *pTree; -		char *pszLine = new char[50001]; -		int iSentNum = 0; -		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { -			pTree = pParseReader->fnReadNextParseTree(); -			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecSTerms; -			SplitOnWhitespace(string(pszLine), &vecSTerms); -			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecTTerms; -			SplitOnWhitespace(string(pszLine), &vecTTerms); - - -			if (pTree != NULL) { - -				vector<STreeItem*> vecFocused; -				fnGetFocusedParentNodes(pTree, vecFocused); - -				for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - -					STreeItem *pParent = vecFocused[i]; - -					for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { -						//children[j-1] vs. children[j] reordering - -						string strOutcome; -						ostringstream ostr; - -						fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j-1], pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); - -						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -						fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -					} -				} -				delete pTree; -			} - -			delete pAlign; -			iSentNum++; - -			if (iSentNum % 100000 == 0) -				fprintf(stderr, "#%d\n", iSentNum); -		} - - -		fclose(fpOut); -		delete pAlignReader; -		delete pParseReader; -		delete pTxtSReader; -		delete pTxtTReader; -		delete [] pszLine; -	} +struct SConstReorderTrainer { +  SConstReorderTrainer( +      const char* pszSynFname,  // source-side flattened parse tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname,  // training instance file name +      const char* pszModelPrefix,  // classifier model file name prefix +      int iClassifierType,  // classifier type +      int iCutoff,  // feature count threshold +      const char* pszOption  // other classifier parameters (for svmlight) +      ) { +    fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, +                           pszTargetFname, pszInstanceFname); + +    string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +    string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + +    string strModelLeftFname = string(pszModelPrefix) + string(".left"); +    string strModelRightFname = string(pszModelPrefix) + string(".right"); + +    fprintf(stdout, "...Training the left ordering model\n"); +    fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), +               iCutoff); +    fprintf(stdout, "...Training the right ordering model\n"); +    fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), +               iCutoff); +  } +  ~SConstReorderTrainer() {} + + private: +  void fnTraining(const char* pszInstanceFname, const char* pszModelFname, +                  int iCutoff) { +    char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +    if (iCutoff > 0) { +      sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +      fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +    } else { +      strcpy(pszNewInstanceFName, pszInstanceFname); +    } + +    /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); +delete pZhangleMaxent;*/ + +    Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); +    pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); +    delete pMaxent; + +    if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +      sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +      system(pszNewInstanceFName); +    } +    delete[] pszNewInstanceFName; +  } + +  inline bool fnIsVerbPOS(const char* pszTerm) { +    if (strcmp(pszTerm, "VV") == 0 || strcmp(pszTerm, "VA") == 0 || +        strcmp(pszTerm, "VC") == 0 || strcmp(pszTerm, "VE") == 0) +      return true; +    return false; +  } + +  inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, +                           const SAlignment* pAlign, string& strOutcome) { +    if (iL1 == -1 && iL2 == -1) +      strOutcome = "BU";  // 1. both are untranslated +    else if (iL1 == -1) +      strOutcome = "1U";  // 2. XP1 is untranslated +    else if (iL2 == -1) +      strOutcome = "2U";  // 3. XP2 is untranslated +    else if (iL1 == iL2 && iR2 == iR2) +      strOutcome = "SS";  // 4. Have same scope +    else if (iL1 <= iL2 && iR1 >= iR2) +      strOutcome = "1C2";  // 5. XP1's translation covers XP2's +    else if (iL1 >= iL2 && iR1 <= iR2) +      strOutcome = "2C1";  // 6. XP2's translation covers XP1's +    else if (iR1 < iL2) { +      int i = iR1 + 1; +      /*while (i < iL2) { +              if (pAlign->fnIsAligned(i, false)) +                      break; +              i++; +      }*/ +      if (i == iL2) +        strOutcome = "M";  // 7. Monotone +      else +        strOutcome = "DM";  // 8. Discontinuous monotone +    } else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) +      strOutcome = "OM";  // 9. Overlap monotone +    else if (iR2 < iL1) { +      int i = iR2 + 1; +      /*while (i < iL1) { +              if (pAlign->fnIsAligned(i, false)) +                      break; +              i++; +      }*/ +      if (i == iL1) +        strOutcome = "S";  // 10. Swap +      else +        strOutcome = "DS";  // 11. Discontinuous swap +    } else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) +      strOutcome = "OS";  // 12. Overlap swap +    else +      assert(false); +  } + +  inline void fnGetOutcome(int i1, int i2, string& strOutcome) { +    assert(i1 != i2); +    if (i1 < i2) { +      if (i2 > i1 + 1) +        strOutcome = string("DM"); +      else +        strOutcome = string("M"); +    } else { +      if (i1 > i2 + 1) +        strOutcome = string("DS"); +      else +        strOutcome = string("S"); +    } +  } + +  inline void fnGetRelativePosition(const vector<int>& vecLeft, +                                    vector<int>& vecPosition) { +    vecPosition.clear(); + +    vector<float> vec; +    for (size_t i = 0; i < vecLeft.size(); i++) { +      if (vecLeft[i] == -1) { +        if (i == 0) +          vec.push_back(-1); +        else +          vec.push_back(vecLeft[i - 1] + 0.1); +      } else +        vec.push_back(vecLeft[i]); +    } + +    for (size_t i = 0; i < vecLeft.size(); i++) { +      int count = 0; + +      for (size_t j = 0; j < vecLeft.size(); j++) { +        if (j == i) continue; +        if (vec[j] < vec[i]) { +          count++; +        } else if (vec[j] == vec[i] && j < i) { +          count++; +        } +      } +      vecPosition.push_back(count); +    } +  } + +  /* +   * features: +   * f1: (left_label, right_label, parent_label) +   * f2: (left_label, right_label, parent_label, other_right_sibling_label) +   * f3: (left_label, right_label, parent_label, other_left_sibling_label) +   * f4: (left_label, right_label, left_head_pos) +   * f5: (left_label, right_label, left_head_word) +   * f6: (left_label, right_label, right_head_pos) +   * f7: (left_label, right_label, right_head_word) +   * f8: (left_label, right_label, left_chunk_status) +   * f9: (left_label, right_label, right_chunk_status) +   * f10: (left_label, parent_label) +   * f11: (right_label, parent_label) +   */ +  void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent, +                          int iPos, const vector<string>& vecChunkStatus, +                          const vector<int>& vecPosition, +                          const vector<string>& vecSTerms, +                          const vector<string>& vecTTerms, string& strOutcome, +                          ostringstream& ostr) { +    STreeItem* pCon1, *pCon2; +    pCon1 = pParent->m_vecChildren[iPos - 1]; +    pCon2 = pParent->m_vecChildren[iPos]; + +    fnGetOutcome(vecPosition[iPos - 1], vecPosition[iPos], strOutcome); + +    string left_label = string(pCon1->m_pszTerm); +    string right_label = string(pCon2->m_pszTerm); +    string parent_label = string(pParent->m_pszTerm); + +    vector<string> vec_other_right_sibling; +    for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) +      vec_other_right_sibling.push_back( +          string(pParent->m_vecChildren[i]->m_pszTerm)); +    if (vec_other_right_sibling.size() == 0) +      vec_other_right_sibling.push_back(string("NULL")); +    vector<string> vec_other_left_sibling; +    for (int i = 0; i < iPos - 1; i++) +      vec_other_left_sibling.push_back( +          string(pParent->m_vecChildren[i]->m_pszTerm)); +    if (vec_other_left_sibling.size() == 0) +      vec_other_left_sibling.push_back(string("NULL")); + +    // generate features +    // f1 +    ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; +    // f2 +    for (int i = 0; i < vec_other_right_sibling.size(); i++) +      ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label +           << "_" << vec_other_right_sibling[i]; +    // f3 +    for (int i = 0; i < vec_other_left_sibling.size(); i++) +      ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label +           << "_" << vec_other_left_sibling[i]; +    // f4 +    ostr << " f4=" << left_label << "_" << right_label << "_" +         << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f5 +    ostr << " f5=" << left_label << "_" << right_label << "_" +         << vecSTerms[pCon1->m_iHeadWord]; +    // f6 +    ostr << " f6=" << left_label << "_" << right_label << "_" +         << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f7 +    ostr << " f7=" << left_label << "_" << right_label << "_" +         << vecSTerms[pCon2->m_iHeadWord]; +    // f8 +    ostr << " f8=" << left_label << "_" << right_label << "_" +         << vecChunkStatus[iPos - 1]; +    // f9 +    ostr << " f9=" << left_label << "_" << right_label << "_" +         << vecChunkStatus[iPos]; +    // f10 +    ostr << " f10=" << left_label << "_" << parent_label; +    // f11 +    ostr << " f11=" << right_label << "_" << parent_label; +  } + +  /* +   * Source side (11 features): +   * f1: the categories of XP1 and XP2 (f1_1, f1_2) +   * f2: the head words of XP1 and XP2 (f2_1, f2_2) +   * f3: the first and last word of XP1 (f3_f, f3_l) +   * f4: the first and last word of XP2 (f4_f, f4_l) +   * f5: is XP1 or XP2 the head node (f5_1, f5_2) +   * f6: the category of the common parent +   * Target side (6 features): +   * f7: the first and the last word of XP1's translation (f7_f, f7_l) +   * f8: the first and the last word of XP2's translation (f8_f, f8_l) +   * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) +   */ +  void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent, +                          const STreeItem* pCon1, const STreeItem* pCon2, +                          const SAlignment* pAlign, +                          const vector<string>& vecSTerms, +                          const vector<string>& vecTTerms, string& strOutcome, +                          ostringstream& ostr) { + +    int iLeft1, iRight1, iLeft2, iRight2; +    pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, +                               iRight1); +    pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, +                               iRight2); + +    fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); + +    // generate features +    // f1 +    ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; +    // f2 +    ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" +         << vecSTerms[pCon2->m_iHeadWord]; +    // f3 +    ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] +         << " f3_l=" << vecSTerms[pCon1->m_iEnd]; +    // f4 +    ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] +         << " f4_l=" << vecSTerms[pCon2->m_iEnd]; +    // f5 +    if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) +      ostr << " f5_1=1"; +    else +      ostr << " f5_1=0"; +    if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) +      ostr << " f5_2=1"; +    else +      ostr << " f5_2=0"; +    // f6 +    ostr << " f6=" << pParent->m_pszTerm; + +    /*//f7 +    if (iLeft1 != -1) { +            ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << +    vecTTerms[iRight1]; +    } +    if (iLeft2 != -1) { +            ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << +    vecTTerms[iRight2]; +    } + +    const vector<int>* pvecTarget = +    pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); +    string str = ""; +    for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +            str += vecTTerms[(*pvecTarget)[i]] + "_"; +    } +    if (str.length() > 0) { +            ostr << " f9_1=" << str.substr(0, str.size()-1); +    } +    pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); +    str = ""; +    for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +            str += vecTTerms[(*pvecTarget)[i]] + "_"; +    } +    if (str.length() > 0) { +            ostr << " f9_2=" << str.substr(0, str.size()-1); +    } */ +  } + +  void fnGetFocusedParentNodes(const SParsedTree* pTree, +                               vector<STreeItem*>& vecFocused) { +    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { +      STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; + +      while (pParent != NULL) { +        // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - +        // pParent->m_iBegin > 5) { +        if (pParent->m_vecChildren.size() > 1) { +          // do constituent reordering for all children of pParent +          vecFocused.push_back(pParent); +        } +        if (pParent->m_iBrotherIndex != 0) break; +        pParent = pParent->m_ptParent; +      } +    } +  } + +  void fnGenerateInstanceFile( +      const char* pszSynFname,  // source-side flattened parse tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszSynFname, false); +    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); +    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + +    string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +    string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + +    FILE* fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); +    assert(fpLeftOut != NULL); + +    FILE* fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); +    assert(fpRightOut != NULL); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    char* pszLine = new char[50001]; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = pParseReader->fnReadNextParseTree(); +      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecSTerms; +      SplitOnWhitespace(string(pszLine), &vecSTerms); +      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecTTerms; +      SplitOnWhitespace(string(pszLine), &vecTTerms); + +      if (pTree != NULL) { + +        vector<STreeItem*> vecFocused; +        fnGetFocusedParentNodes(pTree, vecFocused); + +        for (size_t i = 0; i < vecFocused.size(); i++) { + +          STreeItem* pParent = vecFocused[i]; + +          vector<int> vecLeft, vecRight; +          for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +            STreeItem* pCon1 = pParent->m_vecChildren[j]; +            int iLeft1, iRight1; +            pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, +                                       iLeft1, iRight1); +            vecLeft.push_back(iLeft1); +            vecRight.push_back(iRight1); +          } +          vector<int> vecLeftPosition; +          fnGetRelativePosition(vecLeft, vecLeftPosition); +          vector<int> vecRightPosition; +          fnGetRelativePosition(vecRight, vecRightPosition); + +          vector<string> vecChunkStatus; +          for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +            string strOutcome = +                pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, +                                       pParent->m_vecChildren[j]->m_iEnd); +            vecChunkStatus.push_back(strOutcome); +          } + +          for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +            // children[j-1] vs. children[j] reordering + +            string strLeftOutcome; +            ostringstream ostr; + +            fnGenerateInstance(pTree, pParent, j, vecChunkStatus, +                               vecLeftPosition, vecSTerms, vecTTerms, +                               strLeftOutcome, ostr); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strLeftOutcome.c_str()); +            fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), +                    strLeftOutcome.c_str()); + +            string strRightOutcome; +            fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j], +                         strRightOutcome); +            fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), +                    strLeftOutcome.c_str(), strRightOutcome.c_str()); +          } +        } +        delete pTree; +      } + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    fclose(fpLeftOut); +    fclose(fpRightOut); +    delete pAlignReader; +    delete pParseReader; +    delete pTxtSReader; +    delete pTxtTReader; +    delete[] pszLine; +  } + +  void fnGenerateInstanceFile2( +      const char* pszSynFname,  // source-side flattened parse tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszSynFname, false); +    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); +    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + +    FILE* fpOut = fopen(pszInstanceFname, "w"); +    assert(fpOut != NULL); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    char* pszLine = new char[50001]; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = pParseReader->fnReadNextParseTree(); +      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecSTerms; +      SplitOnWhitespace(string(pszLine), &vecSTerms); +      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecTTerms; +      SplitOnWhitespace(string(pszLine), &vecTTerms); + +      if (pTree != NULL) { + +        vector<STreeItem*> vecFocused; +        fnGetFocusedParentNodes(pTree, vecFocused); + +        for (size_t i = 0; +             i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + +          STreeItem* pParent = vecFocused[i]; + +          for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +            // children[j-1] vs. children[j] reordering + +            string strOutcome; +            ostringstream ostr; + +            fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j - 1], +                               pParent->m_vecChildren[j], pAlign, vecSTerms, +                               vecTTerms, strOutcome, ostr); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strOutcome.c_str()); +            fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +          } +        } +        delete pTree; +      } + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    fclose(fpOut); +    delete pAlignReader; +    delete pParseReader; +    delete pTxtSReader; +    delete pTxtTReader; +    delete[] pszLine; +  }  }; -struct SConstContTrainer{ -	SConstContTrainer(const char* pszFlattenedSynFname,        //source-side flattened parse tree file name -	                  const char* pszAlignFname,               //alignment filename -	                  const char* pszSourceFname,              //source file name -	                  const char* pszTargetFname,              //target file name -	                  const char* pszInstanceFname,            //training instance file name -	                  const char* pszModelPrefix,              //classifier model file name prefix -	                  int iClassifierType,                     //classifier type -	                  int iCutoff,                             //feature count threshold -	                  const char* pszOption                    //other classifier parameters (for svmlight) -	                 ) { -		fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); -		//fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, pszOption); -		fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); -	} -	~SConstContTrainer() { - -	} - -private: - -	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { -		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; -		if (iCutoff > 0) { -			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); -			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); -		} else { -			strcpy(pszNewInstanceFName, pszInstanceFname); -		} - -		/*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); -		   pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); -		   delete pZhangleMaxent;*/ - -		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); -		pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); -		delete pMaxent; - -		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { -			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); -			system(pszNewInstanceFName); -		} -		delete [] pszNewInstanceFName; -	} - - -	void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ -		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { -			STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; - -			while (pParent != NULL) { -				//if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { -				if (pParent->m_vecChildren.size() > 1) { -					//do constituent reordering for all children of pParent -					vecFocused.push_back(pParent); -				} -				if (pParent->m_iBrotherIndex != 0) break; -				pParent = pParent->m_ptParent; -			} -		} -	} - -	inline void fnGetOutcome(int iL1, int iR1, const SAlignment *pAlign, string& strOutcome) { -		strOutcome = pAlign->fnIsContinuous(iL1, iR1); -	} - -	inline string fnGetLengthType(int iLen) { -		if (iLen == 1) -			return string("1"); -		if (iLen == 2) -			return string("2"); -		if (iLen == 3) -			return string("3"); -		if (iLen < 6) -			return string("4"); -		if (iLen < 11) -			return string("6"); -		return string("11"); -	} - -	/* -	 * Source side (11 features): -	 * f1: the syntactic category -	 * f2: the syntactic category of its parent -	 * f3: the head word's pos -	 * f4: =1 if it's the head of its parent node -	 *     or -	 *     the head of its parent node -	 * f5: length type -	 */ -	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pCon1, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { - -		fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); - -		//generate features -		//f1 -		ostr << "f1=" << pCon1->m_pszTerm; -		//f2 -		ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; -		//f3 -		ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f4 -		if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { -			ostr << " f4=1"; -		} else { -			ostr << " f4=" << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]->m_pszTerm; -		} -		//f5 -		ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); -	} - -	void fnGenerateInstanceFile(const char* pszFlattenedSynFname,        //source-side flattened parse tree file name -                                const char* pszAlignFname,               //alignment filename -                                const char* pszSourceFname,              //source file name -                                const char* pszTargetFname,              //target file name -                                const char* pszInstanceFname             //training instance file name -                                ) { -		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); -		SParseReader *pParseReader = new SParseReader(pszFlattenedSynFname, true); -		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); -		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - -		FILE *fpOut = fopen(pszInstanceFname, "w"); -		assert(fpOut != NULL); - -		//read sentence by sentence -		SAlignment *pAlign; -		SParsedTree *pTree; -		char *pszLine = new char[50001]; -		int iSentNum = 0; -		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { -			pTree = pParseReader->fnReadNextParseTree(); -			assert(pTree != NULL); -			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecSTerms; -			SplitOnWhitespace(string(pszLine), &vecSTerms); -			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecTTerms; -			SplitOnWhitespace(string(pszLine), &vecTTerms); - -			vector<STreeItem*> vecFocused; -			fnGetFocusedParentNodes(pTree, vecFocused); - -			for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - -				STreeItem *pParent = vecFocused[i]; - -				for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { -					//children[j-1] vs. children[j] reordering - -					string strOutcome; -					ostringstream ostr; - -					fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); - -					//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -					fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -				} -			} - -			delete pAlign; -			delete pTree; -			iSentNum++; - -			if (iSentNum % 100000 == 0) -				fprintf(stderr, "#%d\n", iSentNum); -		} - - -		fclose(fpOut); -		delete pAlignReader; -		delete pParseReader; -		delete pTxtSReader; -		delete pTxtTReader; -		delete [] pszLine; -	} +struct SConstContTrainer { +  SConstContTrainer( +      const char* pszFlattenedSynFname,  // source-side flattened parse tree +                                         // file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname,  // training instance file name +      const char* pszModelPrefix,  // classifier model file name prefix +      int iClassifierType,  // classifier type +      int iCutoff,  // feature count threshold +      const char* pszOption  // other classifier parameters (for svmlight) +      ) { +    fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, +                           pszTargetFname, pszInstanceFname); +    // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, +    // pszOption); +    fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); +  } +  ~SConstContTrainer() {} + + private: +  void fnTraining(const char* pszInstanceFname, const char* pszModelFname, +                  int iCutoff) { +    char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +    if (iCutoff > 0) { +      sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +      fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +    } else { +      strcpy(pszNewInstanceFName, pszInstanceFname); +    } + +    /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +       pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, +       2.0); +       delete pZhangleMaxent;*/ + +    Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); +    pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); +    delete pMaxent; + +    if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +      sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +      system(pszNewInstanceFName); +    } +    delete[] pszNewInstanceFName; +  } + +  void fnGetFocusedParentNodes(const SParsedTree* pTree, +                               vector<STreeItem*>& vecFocused) { +    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { +      STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; + +      while (pParent != NULL) { +        // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - +        // pParent->m_iBegin > 5) { +        if (pParent->m_vecChildren.size() > 1) { +          // do constituent reordering for all children of pParent +          vecFocused.push_back(pParent); +        } +        if (pParent->m_iBrotherIndex != 0) break; +        pParent = pParent->m_ptParent; +      } +    } +  } + +  inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign, +                           string& strOutcome) { +    strOutcome = pAlign->fnIsContinuous(iL1, iR1); +  } + +  inline string fnGetLengthType(int iLen) { +    if (iLen == 1) return string("1"); +    if (iLen == 2) return string("2"); +    if (iLen == 3) return string("3"); +    if (iLen < 6) return string("4"); +    if (iLen < 11) return string("6"); +    return string("11"); +  } + +  /* +   * Source side (11 features): +   * f1: the syntactic category +   * f2: the syntactic category of its parent +   * f3: the head word's pos +   * f4: =1 if it's the head of its parent node +   *     or +   *     the head of its parent node +   * f5: length type +   */ +  void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1, +                          const SAlignment* pAlign, +                          const vector<string>& vecSTerms, +                          const vector<string>& vecTTerms, string& strOutcome, +                          ostringstream& ostr) { + +    fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); + +    // generate features +    // f1 +    ostr << "f1=" << pCon1->m_pszTerm; +    // f2 +    ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; +    // f3 +    ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord] +                          ->m_ptParent->m_pszTerm; +    // f4 +    if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { +      ostr << " f4=1"; +    } else { +      ostr << " f4=" +           << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild] +                  ->m_pszTerm; +    } +    // f5 +    ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); +  } + +  void fnGenerateInstanceFile( +      const char* pszFlattenedSynFname,  // source-side flattened parse tree +                                         // file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true); +    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); +    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + +    FILE* fpOut = fopen(pszInstanceFname, "w"); +    assert(fpOut != NULL); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    char* pszLine = new char[50001]; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = pParseReader->fnReadNextParseTree(); +      assert(pTree != NULL); +      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecSTerms; +      SplitOnWhitespace(string(pszLine), &vecSTerms); +      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecTTerms; +      SplitOnWhitespace(string(pszLine), &vecTTerms); + +      vector<STreeItem*> vecFocused; +      fnGetFocusedParentNodes(pTree, vecFocused); + +      for (size_t i = 0; +           i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + +        STreeItem* pParent = vecFocused[i]; + +        for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +          // children[j-1] vs. children[j] reordering + +          string strOutcome; +          ostringstream ostr; + +          fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, +                             vecSTerms, vecTTerms, strOutcome, ostr); + +          // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +          fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +        } +      } + +      delete pAlign; +      delete pTree; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    fclose(fpOut); +    delete pAlignReader; +    delete pParseReader; +    delete pTxtSReader; +    delete pTxtTReader; +    delete[] pszLine; +  }  }; -inline void print_options(std::ostream &out,po::options_description const& opts) { -  typedef std::vector< boost::shared_ptr<po::option_description> > Ds; -  Ds const& ds=opts.options(); +inline void print_options(std::ostream& out, +                          po::options_description const& opts) { +  typedef std::vector<boost::shared_ptr<po::option_description> > Ds; +  Ds const& ds = opts.options();    out << '"'; -  for (unsigned i=0;i<ds.size();++i) { -    if (i) out<<' '; -    out<<"--"<<ds[i]->long_name(); +  for (unsigned i = 0; i < ds.size(); ++i) { +    if (i) out << ' '; +    out << "--" << ds[i]->long_name();    }    out << '\n';  } -inline string str(char const* name,po::variables_map const& conf) { +inline string str(char const* name, po::variables_map const& conf) {    return conf[name].as<string>();  } -//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file /scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file /scratch0/mt_exp/gq-ctb/data/train.cn --target_file /scratch0/mt_exp/gq-ctb/data/train.en --instance_file /scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix /scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 --classifier_type 1 +//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file +///scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file +///scratch0/mt_exp/gq-ctb/data/train.cn --target_file +///scratch0/mt_exp/gq-ctb/data/train.en --instance_file +///scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix +///scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 +//--classifier_type 1  int main(int argc, char** argv) { -	po::options_description opts("Configuration options"); -	opts.add_options() -                ("parse_file",po::value<string>(),"parse file path (input)") -                ("align_file",po::value<string>(),"Alignment file path (input)") -                ("source_file",po::value<string>(),"Source text file path (input)") -                ("target_file",po::value<string>(),"Target text file path (input)") -                ("instance_file",po::value<string>(),"Instance file path (output)") -                ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated") -                ("classifier_type",po::value<int>()->default_value(1),"Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for SVMLight") -                ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold") -                ("svm_option",po::value<string>(),"Parameters for SVMLight classifier") -                ("help", "produce help message"); - -	po::variables_map vm; -	if (argc) { -		po::store(po::parse_command_line(argc, argv, opts), vm); -		po::notify(vm); -	} - -	if (vm.count("help")) { -		print_options(cout, opts); -		return 1; -	} - -	if (!vm.count("parse_file") -			|| !vm.count("align_file") -			|| !vm.count("source_file") -			|| !vm.count("target_file") -			|| !vm.count("instance_file") -			|| !vm.count("model_prefix") -			) { -		print_options(cout, opts); -		if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; -		if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; -		if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; -		if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; -		if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; -		if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; -		exit(0); -	} - -	const char *pOption; -	if (vm.count("svm_option")) -		pOption = str("svm_option", vm).c_str(); -	else -		pOption = NULL; - -	SConstReorderTrainer *pTrainer = new SConstReorderTrainer(str("parse_file", vm).c_str(), -			                                                  str("align_file", vm).c_str(), -			                                                  str("source_file", vm).c_str(), -			                                                  str("target_file", vm).c_str(), -			                                                  str("instance_file", vm).c_str(), -			                                                  str("model_prefix", vm).c_str(), -			                                                  vm["classifier_type"].as<int>(), -			                                                  vm["feature_cutoff"].as<int>(), -			                                                  pOption); -	delete pTrainer; - -	return 1; +  po::options_description opts("Configuration options"); +  opts.add_options()("parse_file", po::value<string>(), +                     "parse file path (input)")( +      "align_file", po::value<string>(), "Alignment file path (input)")( +      "source_file", po::value<string>(), "Source text file path (input)")( +      "target_file", po::value<string>(), "Target text file path (input)")( +      "instance_file", po::value<string>(), "Instance file path (output)")( +      "model_prefix", po::value<string>(), +      "Model file path prefix (output): three files will be generated")( +      "classifier_type", po::value<int>()->default_value(1), +      "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for " +      "SVMLight")("feature_cutoff", po::value<int>()->default_value(100), +                  "Feature cutoff threshold")( +      "svm_option", po::value<string>(), "Parameters for SVMLight classifier")( +      "help", "produce help message"); + +  po::variables_map vm; +  if (argc) { +    po::store(po::parse_command_line(argc, argv, opts), vm); +    po::notify(vm); +  } + +  if (vm.count("help")) { +    print_options(cout, opts); +    return 1; +  } + +  if (!vm.count("parse_file") || !vm.count("align_file") || +      !vm.count("source_file") || !vm.count("target_file") || +      !vm.count("instance_file") || !vm.count("model_prefix")) { +    print_options(cout, opts); +    if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; +    if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; +    if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; +    if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; +    if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; +    if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; +    exit(0); +  } + +  const char* pOption; +  if (vm.count("svm_option")) +    pOption = str("svm_option", vm).c_str(); +  else +    pOption = NULL; + +  SConstReorderTrainer* pTrainer = new SConstReorderTrainer( +      str("parse_file", vm).c_str(), str("align_file", vm).c_str(), +      str("source_file", vm).c_str(), str("target_file", vm).c_str(), +      str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), +      vm["classifier_type"].as<int>(), vm["feature_cutoff"].as<int>(), pOption); +  delete pTrainer; +  return 1;  } | 
