diff options
| author | Wu, Ke <wuke@cs.umd.edu> | 2014-10-07 17:22:11 -0400 | 
|---|---|---|
| committer | Wu, Ke <wuke@cs.umd.edu> | 2014-10-07 17:22:11 -0400 | 
| commit | 00968d1ba03c0603440fe5a765b87869b99a0a93 (patch) | |
| tree | f264c07d9851b47b117839fc9345d7e1d5e880a2 | |
| parent | f762dbbf10a8204d0d0b82e9acb29feacd3b3bb4 (diff) | |
Apply clang-format
| -rw-r--r-- | utils/synutils/alignment.h | 348 | ||||
| -rw-r--r-- | utils/synutils/argument_reorder_model.cc | 572 | ||||
| -rw-r--r-- | utils/synutils/argument_reorder_model.h | 417 | ||||
| -rw-r--r-- | utils/synutils/constituent_reorder_model.cc | 1562 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/lbfgs.cpp | 48 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/lbfgs.h | 24 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/mathvec.h | 48 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/maxent.cpp | 291 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/maxent.h | 187 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/owlqn.cpp | 46 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/sgd.cpp | 147 | ||||
| -rw-r--r-- | utils/synutils/srl_sentence.h | 376 | ||||
| -rw-r--r-- | utils/synutils/tree.h | 1377 | ||||
| -rw-r--r-- | utils/synutils/tsuruoka_maxent.h | 263 | ||||
| -rw-r--r-- | utils/synutils/utility.h | 188 | 
15 files changed, 2961 insertions, 2933 deletions
| diff --git a/utils/synutils/alignment.h b/utils/synutils/alignment.h index f8fe1993..c0648aab 100644 --- a/utils/synutils/alignment.h +++ b/utils/synutils/alignment.h @@ -19,190 +19,184 @@ using namespace std;  /*   * Note: - *      m_vec_s_align.size() may not be equal to the length of source side sentence + *      m_vec_s_align.size() may not be equal to the length of source side + *sentence   *                           due to the last words may not be aligned   *   */  struct SAlignment { -	typedef vector<int> SingleAlign; -	SAlignment(const char* pszAlign) { -		fnInitializeAlignment(pszAlign); -	} -	~SAlignment() { - -	} - -	bool fnIsAligned(int i, bool s) const { -		const vector<SingleAlign>* palign; -		if (s == true) -			palign = &m_vec_s_align; -		else -			palign = &m_vec_t_align; -		if ((*palign)[i].size() == 0) -			return false; -		return true; -	} - -	/* -	 * return true if [b, e] is aligned phrases on source side (if s==true) or on the target side (if s==false); -	 * return false, otherwise. -	 */ -	bool fnIsAlignedPhrase(int b, int e, bool s, int *pob, int *poe) const { -		int ob, oe; //[b, e] on the other side -		if (s == true) -			fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); -		else -			fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); - -		if (ob == -1) { -			if (pob != NULL) (*pob) = -1; -			if (poe != NULL) (*poe) = -1; -			return false; //no aligned word among [b, e] -		} -		if (pob != NULL) (*pob) = ob; -		if (poe != NULL) (*poe) = oe; - -		int bb, be; //[b, e] back given [ob, oe] on the other side -		if (s == true) -			fnGetLeftRightMost(ob, oe, m_vec_t_align, bb, be); -		else -			fnGetLeftRightMost(ob, oe, m_vec_s_align, bb, be); - -		if (bb < b || be > e) -			return false; -		return true; -	} - -	bool fnIsAlignedTightPhrase(int b, int e, bool s, int *pob, int *poe) const { -		const vector<SingleAlign>* palign; -		if (s == true) -			palign = &m_vec_s_align; -		else -			palign = &m_vec_t_align; - -		if ((*palign).size() <= e || (*palign)[b].size() == 0 || (*palign)[e].size() == 0) -			return false; - -		return fnIsAlignedPhrase(b, e, s, pob, poe); -	} - -	void fnGetLeftRightMost(int b, int e, bool s, int& ob, int& oe) const { -		if (s == true) -			fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); -		else -			fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); -	} - -	/* -	 * look the translation of source[b, e] is continuous or not -	 * 1) return "Unaligned": if the source[b, e] is translated silently; -	 * 2) return "Con't": if none of target words in target[.., ..] is exclusively aligned to any word outside source[b, e] -	 * 3) return "Discon't": otherwise; -	 */ -	string fnIsContinuous(int b, int e) const { -		int ob, oe; -		fnGetLeftRightMost(b, e, true, ob, oe); -		if (ob == -1) return "Unaligned"; - -		for (int i = ob; i <= oe; i++) { -			if (!fnIsAligned(i, false)) continue; -			const SingleAlign& a = m_vec_t_align[i]; -			int j; -			for (j = 0; j < a.size(); j++) -				if (a[j] >= b && a[j] <= e) break; -			if (j == a.size()) return "Discon't"; -		} -		return "Con't"; -	} - -	const SingleAlign* fnGetSingleWordAlign(int i, bool s) const { -		if (s == true) { -			if (i >= m_vec_s_align.size()) -				return NULL; -			return &(m_vec_s_align[i]); -		} else { -			if (i >= m_vec_t_align.size()) -				return NULL; -			return &(m_vec_t_align[i]); -		} -	} -private: -	void fnGetLeftRightMost(int b, int e, const vector<SingleAlign>& align, int& ob, int& oe) const { -		ob = oe = -1; -		for (int i = b; i <= e && i < align.size(); i++) { -			if (align[i].size() > 0) { -				if (align[i][0] < ob || ob == -1) -					ob = align[i][0]; -				if (oe < align[i][align[i].size() - 1]) -					oe = align[i][align[i].size() - 1]; -			} -		} -	} -	void fnInitializeAlignment(const char* pszAlign) { -		m_vec_s_align.clear(); -		m_vec_t_align.clear(); - -		vector<string> terms = SplitOnWhitespace(string(pszAlign)); -		int si, ti; -		for (size_t i = 0; i < terms.size(); i++) { -			sscanf(terms[i].c_str(), "%d-%d", &si, &ti); - -			while (m_vec_s_align.size() <= si) { -				SingleAlign sa; -				m_vec_s_align.push_back(sa); -			} -			while (m_vec_t_align.size() <= ti) { -				SingleAlign sa; -				m_vec_t_align.push_back(sa); -			} - -			m_vec_s_align[si].push_back(ti); -			m_vec_t_align[ti].push_back(si); -		} - -		//sort -		for (size_t i = 0; i < m_vec_s_align.size(); i++) { -			std::sort(m_vec_s_align[i].begin(), m_vec_s_align[i].end()); -		} -		for (size_t i = 0; i < m_vec_t_align.size(); i++) { -			std::sort(m_vec_t_align[i].begin(), m_vec_t_align[i].end()); -		} -	} - -private: -	vector<SingleAlign> m_vec_s_align; //source side words' alignment -	vector<SingleAlign> m_vec_t_align; //target side words' alignment +  typedef vector<int> SingleAlign; +  SAlignment(const char* pszAlign) { fnInitializeAlignment(pszAlign); } +  ~SAlignment() {} + +  bool fnIsAligned(int i, bool s) const { +    const vector<SingleAlign>* palign; +    if (s == true) +      palign = &m_vec_s_align; +    else +      palign = &m_vec_t_align; +    if ((*palign)[i].size() == 0) return false; +    return true; +  } + +  /* +   * return true if [b, e] is aligned phrases on source side (if s==true) or on +   * the target side (if s==false); +   * return false, otherwise. +   */ +  bool fnIsAlignedPhrase(int b, int e, bool s, int* pob, int* poe) const { +    int ob, oe;  //[b, e] on the other side +    if (s == true) +      fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); +    else +      fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); + +    if (ob == -1) { +      if (pob != NULL) (*pob) = -1; +      if (poe != NULL) (*poe) = -1; +      return false;  // no aligned word among [b, e] +    } +    if (pob != NULL) (*pob) = ob; +    if (poe != NULL) (*poe) = oe; + +    int bb, be;  //[b, e] back given [ob, oe] on the other side +    if (s == true) +      fnGetLeftRightMost(ob, oe, m_vec_t_align, bb, be); +    else +      fnGetLeftRightMost(ob, oe, m_vec_s_align, bb, be); + +    if (bb < b || be > e) return false; +    return true; +  } + +  bool fnIsAlignedTightPhrase(int b, int e, bool s, int* pob, int* poe) const { +    const vector<SingleAlign>* palign; +    if (s == true) +      palign = &m_vec_s_align; +    else +      palign = &m_vec_t_align; + +    if ((*palign).size() <= e || (*palign)[b].size() == 0 || +        (*palign)[e].size() == 0) +      return false; + +    return fnIsAlignedPhrase(b, e, s, pob, poe); +  } + +  void fnGetLeftRightMost(int b, int e, bool s, int& ob, int& oe) const { +    if (s == true) +      fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); +    else +      fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); +  } + +  /* +   * look the translation of source[b, e] is continuous or not +   * 1) return "Unaligned": if the source[b, e] is translated silently; +   * 2) return "Con't": if none of target words in target[.., ..] is exclusively +   * aligned to any word outside source[b, e] +   * 3) return "Discon't": otherwise; +   */ +  string fnIsContinuous(int b, int e) const { +    int ob, oe; +    fnGetLeftRightMost(b, e, true, ob, oe); +    if (ob == -1) return "Unaligned"; + +    for (int i = ob; i <= oe; i++) { +      if (!fnIsAligned(i, false)) continue; +      const SingleAlign& a = m_vec_t_align[i]; +      int j; +      for (j = 0; j < a.size(); j++) +        if (a[j] >= b && a[j] <= e) break; +      if (j == a.size()) return "Discon't"; +    } +    return "Con't"; +  } + +  const SingleAlign* fnGetSingleWordAlign(int i, bool s) const { +    if (s == true) { +      if (i >= m_vec_s_align.size()) return NULL; +      return &(m_vec_s_align[i]); +    } else { +      if (i >= m_vec_t_align.size()) return NULL; +      return &(m_vec_t_align[i]); +    } +  } + + private: +  void fnGetLeftRightMost(int b, int e, const vector<SingleAlign>& align, +                          int& ob, int& oe) const { +    ob = oe = -1; +    for (int i = b; i <= e && i < align.size(); i++) { +      if (align[i].size() > 0) { +        if (align[i][0] < ob || ob == -1) ob = align[i][0]; +        if (oe < align[i][align[i].size() - 1]) +          oe = align[i][align[i].size() - 1]; +      } +    } +  } +  void fnInitializeAlignment(const char* pszAlign) { +    m_vec_s_align.clear(); +    m_vec_t_align.clear(); + +    vector<string> terms = SplitOnWhitespace(string(pszAlign)); +    int si, ti; +    for (size_t i = 0; i < terms.size(); i++) { +      sscanf(terms[i].c_str(), "%d-%d", &si, &ti); + +      while (m_vec_s_align.size() <= si) { +        SingleAlign sa; +        m_vec_s_align.push_back(sa); +      } +      while (m_vec_t_align.size() <= ti) { +        SingleAlign sa; +        m_vec_t_align.push_back(sa); +      } + +      m_vec_s_align[si].push_back(ti); +      m_vec_t_align[ti].push_back(si); +    } + +    // sort +    for (size_t i = 0; i < m_vec_s_align.size(); i++) { +      std::sort(m_vec_s_align[i].begin(), m_vec_s_align[i].end()); +    } +    for (size_t i = 0; i < m_vec_t_align.size(); i++) { +      std::sort(m_vec_t_align[i].begin(), m_vec_t_align[i].end()); +    } +  } + + private: +  vector<SingleAlign> m_vec_s_align;  // source side words' alignment +  vector<SingleAlign> m_vec_t_align;  // target side words' alignment  };  struct SAlignmentReader { -	SAlignmentReader(const char *pszFname) { -		m_fpIn = fopen(pszFname, "r"); -		assert(m_fpIn != NULL); -	} -	~SAlignmentReader() { -		if (m_fpIn != NULL) -			fclose(m_fpIn); -	} -	SAlignment* fnReadNextAlignment() { -		if (feof(m_fpIn) == true) -			return NULL; -		char *pszLine = new char[100001]; -		pszLine[0] = '\0'; -		fgets(pszLine, 10001, m_fpIn); -		int iLen = strlen(pszLine); -		if (iLen == 0) -			return NULL; -		while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { -			pszLine[ iLen - 1 ] = '\0'; -			iLen--; -		} -		SAlignment *pAlign = new SAlignment(pszLine); -		delete [] pszLine; -		return pAlign; -	} -private: -	FILE *m_fpIn; +  SAlignmentReader(const char* pszFname) { +    m_fpIn = fopen(pszFname, "r"); +    assert(m_fpIn != NULL); +  } +  ~SAlignmentReader() { +    if (m_fpIn != NULL) fclose(m_fpIn); +  } +  SAlignment* fnReadNextAlignment() { +    if (feof(m_fpIn) == true) return NULL; +    char* pszLine = new char[100001]; +    pszLine[0] = '\0'; +    fgets(pszLine, 10001, m_fpIn); +    int iLen = strlen(pszLine); +    if (iLen == 0) return NULL; +    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { +      pszLine[iLen - 1] = '\0'; +      iLen--; +    } +    SAlignment* pAlign = new SAlignment(pszLine); +    delete[] pszLine; +    return pAlign; +  } + + private: +  FILE* m_fpIn;  }; -  #endif /* ALIGNMENT_H_ */ diff --git a/utils/synutils/argument_reorder_model.cc b/utils/synutils/argument_reorder_model.cc index e30ee971..2b6bcda1 100644 --- a/utils/synutils/argument_reorder_model.cc +++ b/utils/synutils/argument_reorder_model.cc @@ -12,300 +12,312 @@  #include "utility.h"  #include "tsuruoka_maxent.h" -inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { -	SFReader *pFReader = new STxtFileReader(pszFName); -	char *pszLine = new char[ 100001 ]; -	int iLen; -	Map hashPredicate; -	while (pFReader->fnReadNextLine(pszLine, &iLen)) { -		if (iLen == 0) -			continue; - -		vector<string> vecTerms; -		SplitOnWhitespace(string(pszLine), &vecTerms); - -		for (size_t i = 0; i < vecTerms.size() - 1; i++) { -			Iterator iter = hashPredicate.find(vecTerms[i]); -			if (iter == hashPredicate.end()) { -				hashPredicate[vecTerms[i]] = 1; - -			} else { -				iter->second++; -			} -		} -	} -	delete pFReader; - -	pFReader = new STxtFileReader(pszFName); -	FILE *fpOut = fopen(pszNewFName, "w"); -	while (pFReader->fnReadNextLine(pszLine, &iLen)) { -		if (iLen == 0) -			continue; - -		vector<string> vecTerms; -		SplitOnWhitespace(string(pszLine), &vecTerms); -		ostringstream ostr; -		for (size_t i = 0; i < vecTerms.size() - 1; i++) { -			Iterator iter = hashPredicate.find(vecTerms[i]); -			assert(iter != hashPredicate.end()); -			if (iter->second >= iCutoff) { -				ostr << vecTerms[i] << " "; -			} -		} -		if (ostr.str().length() > 0) { -			ostr << vecTerms[vecTerms.size() - 1]; -			fprintf(fpOut, "%s\n", ostr.str().c_str()); -		} -	} -	fclose(fpOut); -	delete pFReader; - - -	delete [] pszLine; +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, +                                    const char* pszNewFName) { +  SFReader* pFReader = new STxtFileReader(pszFName); +  char* pszLine = new char[100001]; +  int iLen; +  Map hashPredicate; +  while (pFReader->fnReadNextLine(pszLine, &iLen)) { +    if (iLen == 0) continue; + +    vector<string> vecTerms; +    SplitOnWhitespace(string(pszLine), &vecTerms); + +    for (size_t i = 0; i < vecTerms.size() - 1; i++) { +      Iterator iter = hashPredicate.find(vecTerms[i]); +      if (iter == hashPredicate.end()) { +        hashPredicate[vecTerms[i]] = 1; + +      } else { +        iter->second++; +      } +    } +  } +  delete pFReader; + +  pFReader = new STxtFileReader(pszFName); +  FILE* fpOut = fopen(pszNewFName, "w"); +  while (pFReader->fnReadNextLine(pszLine, &iLen)) { +    if (iLen == 0) continue; + +    vector<string> vecTerms; +    SplitOnWhitespace(string(pszLine), &vecTerms); +    ostringstream ostr; +    for (size_t i = 0; i < vecTerms.size() - 1; i++) { +      Iterator iter = hashPredicate.find(vecTerms[i]); +      assert(iter != hashPredicate.end()); +      if (iter->second >= iCutoff) { +        ostr << vecTerms[i] << " "; +      } +    } +    if (ostr.str().length() > 0) { +      ostr << vecTerms[vecTerms.size() - 1]; +      fprintf(fpOut, "%s\n", ostr.str().c_str()); +    } +  } +  fclose(fpOut); +  delete pFReader; + +  delete[] pszLine;  } -struct SArgumentReorderTrainer{ -	SArgumentReorderTrainer(const char* pszSRLFname,        //source-side srl tree file name -				const char* pszAlignFname,               //alignment filename -	            const char* pszSourceFname,              //source file name -	            const char* pszTargetFname,              //target file name -	            const char* pszTopPredicateFname,        //target file name -	            const char* pszInstanceFname,            //training instance file name -	            const char* pszModelFname,              //classifier model file name -	            int iCutoff) { -		fnGenerateInstanceFiles(pszSRLFname, pszAlignFname, pszSourceFname, pszTargetFname, pszTopPredicateFname, pszInstanceFname); - -		string strInstanceFname, strModelFname; -		strInstanceFname = string(pszInstanceFname) + string(".left"); -		strModelFname = string(pszModelFname) + string(".left"); -		fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); -		strInstanceFname = string(pszInstanceFname) + string(".right"); -		strModelFname = string(pszModelFname) + string(".right"); -		fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); -	} - -	~SArgumentReorderTrainer() { - -	} - -private: -	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { -		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; -		if (iCutoff > 0) { -			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); -			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); -		} else { -			strcpy(pszNewInstanceFName, pszInstanceFname); -		} - -		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); -		pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); -		delete pMaxent; - -		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { -			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); -			system(pszNewInstanceFName); -		} -		delete [] pszNewInstanceFName; -	} - -	void fnGenerateInstanceFiles(const char* pszSRLFname,        //source-side flattened parse tree file name -				const char* pszAlignFname,               //alignment filename -				const char* pszSourceFname,              //source file name -				const char* pszTargetFname,              //target file name -				const char* pszTopPredicateFname,        //top predicate file name (we only consider predicates with 100+ occurrences -				const char* pszInstanceFname             //training instance file name -				) { -		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); -		SSrlSentenceReader *pSRLReader = new SSrlSentenceReader(pszSRLFname); -		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); -		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - -		Map *pMapPredicate; -		if (pszTopPredicateFname != NULL) -			pMapPredicate = fnLoadTopPredicates(pszTopPredicateFname); -		else -			pMapPredicate = NULL; - -		char *pszLine = new char[50001]; - -		FILE *fpLeftOut, *fpRightOut; -		sprintf(pszLine, "%s.left", pszInstanceFname); -		fpLeftOut = fopen(pszLine, "w"); -		sprintf(pszLine, "%s.right", pszInstanceFname); -		fpRightOut = fopen(pszLine, "w"); - -		//read sentence by sentence -		SAlignment *pAlign; -		SSrlSentence *pSRL; -		SParsedTree *pTree; -		int iSentNum = 0; -		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { -			pSRL = pSRLReader->fnReadNextSrlSentence(); -			assert(pSRL != NULL); -			pTree = pSRL->m_pTree; -			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecSTerms; -			SplitOnWhitespace(string(pszLine), &vecSTerms); -			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecTTerms; -			SplitOnWhitespace(string(pszLine), &vecTTerms); -			//vecTPOSTerms.size() == 0, given the case when an english sentence fails parsing - -			if (pTree != NULL) { -				for (size_t i = 0; i < pSRL->m_vecPred.size(); i++) { -					SPredicate *pPred = pSRL->m_vecPred[i]; -					if (strcmp(pTree->m_vecTerminals[pPred->m_iPosition]->m_ptParent->m_pszTerm, "VA") == 0) continue; -					string strPred = string(pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm); -					if (pMapPredicate != NULL) { -						Map::iterator iter_map = pMapPredicate->find(strPred); -						if (pMapPredicate != NULL && iter_map == pMapPredicate->end()) -							continue; -					} - -					SPredicateItem *pPredItem = new SPredicateItem(pTree, pPred); - -					vector<string> vecStrBlock; -					for (size_t j = 0; j < pPredItem->vec_items_.size(); j++) { -						SSRLItem *pItem1 = pPredItem->vec_items_[j]; -						vecStrBlock.push_back(SArgumentReorderModel::fnGetBlockOutcome(pItem1->tree_item_->m_iBegin, pItem1->tree_item_->m_iEnd, pAlign)); -					} - -					vector<string> vecStrLeftReorderType; -					vector<string> vecStrRightReorderType; -					SArgumentReorderModel::fnGetReorderType(pPredItem, pAlign, vecStrLeftReorderType, vecStrRightReorderType); -					for (int j = 1; j < pPredItem->vec_items_.size(); j++) { -						string strLeftOutcome, strRightOutcome; -						strLeftOutcome = vecStrLeftReorderType[j - 1]; -						strRightOutcome = vecStrRightReorderType[j - 1]; -						ostringstream ostr; -						SArgumentReorderModel::fnGenerateFeature(pTree, pPred, pPredItem, j, vecStrBlock[j - 1], vecStrBlock[j], ostr); - -						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -						//fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(), strOutcome.c_str()); -						fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); -						fprintf(fpRightOut, "%s %s\n", ostr.str().c_str(), strRightOutcome.c_str()); -					} -				} -			} -			delete pSRL; - -			delete pAlign; -			iSentNum++; - -			if (iSentNum % 100000 == 0) -				fprintf(stderr, "#%d\n", iSentNum); -		} -		delete [] pszLine; - -		fclose(fpLeftOut); -		fclose(fpRightOut); - - -		delete pAlignReader; -		delete pSRLReader; -		delete pTxtSReader; -		delete pTxtTReader; -	} - - - -	Map* fnLoadTopPredicates(const char* pszTopPredicateFname) { -		if (pszTopPredicateFname == NULL) -			return NULL; - -		Map* pMapPredicate  = new Map(); -		STxtFileReader *pReader = new STxtFileReader(pszTopPredicateFname); -		char *pszLine = new char[50001]; -		int iNumCount = 0; -		while (pReader->fnReadNextLine(pszLine, NULL)) { -			if (pszLine[0] == '#') -				continue; -			char *p = strchr(pszLine, ' '); -			assert(p != NULL); -			p[0] = '\0'; -			p++; -			int iCount = atoi(p); -			if (iCount < 100) -				break; -			(*pMapPredicate)[string(pszLine)] = iNumCount++; -		} -		delete pReader; -		delete [] pszLine; -		return pMapPredicate; -	} -}; +struct SArgumentReorderTrainer { +  SArgumentReorderTrainer( +      const char* pszSRLFname,  // source-side srl tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszTopPredicateFname,  // target file name +      const char* pszInstanceFname,  // training instance file name +      const char* pszModelFname,  // classifier model file name +      int iCutoff) { +    fnGenerateInstanceFiles(pszSRLFname, pszAlignFname, pszSourceFname, +                            pszTargetFname, pszTopPredicateFname, +                            pszInstanceFname); + +    string strInstanceFname, strModelFname; +    strInstanceFname = string(pszInstanceFname) + string(".left"); +    strModelFname = string(pszModelFname) + string(".left"); +    fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); +    strInstanceFname = string(pszInstanceFname) + string(".right"); +    strModelFname = string(pszModelFname) + string(".right"); +    fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); +  } +  ~SArgumentReorderTrainer() {} + + private: +  void fnTraining(const char* pszInstanceFname, const char* pszModelFname, +                  int iCutoff) { +    char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +    if (iCutoff > 0) { +      sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +      fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +    } else { +      strcpy(pszNewInstanceFName, pszInstanceFname); +    } + +    Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); +    pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); +    delete pMaxent; + +    if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +      sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +      system(pszNewInstanceFName); +    } +    delete[] pszNewInstanceFName; +  } +  void fnGenerateInstanceFiles( +      const char* pszSRLFname,  // source-side flattened parse tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszTopPredicateFname,  // top predicate file name (we only +                                         // consider predicates with 100+ +                                         // occurrences +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SSrlSentenceReader* pSRLReader = new SSrlSentenceReader(pszSRLFname); +    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); +    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + +    Map* pMapPredicate; +    if (pszTopPredicateFname != NULL) +      pMapPredicate = fnLoadTopPredicates(pszTopPredicateFname); +    else +      pMapPredicate = NULL; + +    char* pszLine = new char[50001]; + +    FILE* fpLeftOut, *fpRightOut; +    sprintf(pszLine, "%s.left", pszInstanceFname); +    fpLeftOut = fopen(pszLine, "w"); +    sprintf(pszLine, "%s.right", pszInstanceFname); +    fpRightOut = fopen(pszLine, "w"); + +    // read sentence by sentence +    SAlignment* pAlign; +    SSrlSentence* pSRL; +    SParsedTree* pTree; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pSRL = pSRLReader->fnReadNextSrlSentence(); +      assert(pSRL != NULL); +      pTree = pSRL->m_pTree; +      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecSTerms; +      SplitOnWhitespace(string(pszLine), &vecSTerms); +      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecTTerms; +      SplitOnWhitespace(string(pszLine), &vecTTerms); +      // vecTPOSTerms.size() == 0, given the case when an english sentence fails +      // parsing + +      if (pTree != NULL) { +        for (size_t i = 0; i < pSRL->m_vecPred.size(); i++) { +          SPredicate* pPred = pSRL->m_vecPred[i]; +          if (strcmp(pTree->m_vecTerminals[pPred->m_iPosition] +                         ->m_ptParent->m_pszTerm, +                     "VA") == 0) +            continue; +          string strPred = +              string(pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm); +          if (pMapPredicate != NULL) { +            Map::iterator iter_map = pMapPredicate->find(strPred); +            if (pMapPredicate != NULL && iter_map == pMapPredicate->end()) +              continue; +          } + +          SPredicateItem* pPredItem = new SPredicateItem(pTree, pPred); + +          vector<string> vecStrBlock; +          for (size_t j = 0; j < pPredItem->vec_items_.size(); j++) { +            SSRLItem* pItem1 = pPredItem->vec_items_[j]; +            vecStrBlock.push_back(SArgumentReorderModel::fnGetBlockOutcome( +                pItem1->tree_item_->m_iBegin, pItem1->tree_item_->m_iEnd, +                pAlign)); +          } + +          vector<string> vecStrLeftReorderType; +          vector<string> vecStrRightReorderType; +          SArgumentReorderModel::fnGetReorderType( +              pPredItem, pAlign, vecStrLeftReorderType, vecStrRightReorderType); +          for (int j = 1; j < pPredItem->vec_items_.size(); j++) { +            string strLeftOutcome, strRightOutcome; +            strLeftOutcome = vecStrLeftReorderType[j - 1]; +            strRightOutcome = vecStrRightReorderType[j - 1]; +            ostringstream ostr; +            SArgumentReorderModel::fnGenerateFeature(pTree, pPred, pPredItem, j, +                                                     vecStrBlock[j - 1], +                                                     vecStrBlock[j], ostr); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strOutcome.c_str()); +            // fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(), +            // strOutcome.c_str()); +            fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), +                    strLeftOutcome.c_str()); +            fprintf(fpRightOut, "%s %s\n", ostr.str().c_str(), +                    strRightOutcome.c_str()); +          } +        } +      } +      delete pSRL; + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } +    delete[] pszLine; + +    fclose(fpLeftOut); +    fclose(fpRightOut); + +    delete pAlignReader; +    delete pSRLReader; +    delete pTxtSReader; +    delete pTxtTReader; +  } + +  Map* fnLoadTopPredicates(const char* pszTopPredicateFname) { +    if (pszTopPredicateFname == NULL) return NULL; + +    Map* pMapPredicate = new Map(); +    STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname); +    char* pszLine = new char[50001]; +    int iNumCount = 0; +    while (pReader->fnReadNextLine(pszLine, NULL)) { +      if (pszLine[0] == '#') continue; +      char* p = strchr(pszLine, ' '); +      assert(p != NULL); +      p[0] = '\0'; +      p++; +      int iCount = atoi(p); +      if (iCount < 100) break; +      (*pMapPredicate)[string(pszLine)] = iNumCount++; +    } +    delete pReader; +    delete[] pszLine; +    return pMapPredicate; +  } +};  namespace po = boost::program_options; -inline void print_options(std::ostream &out,po::options_description const& opts) { -  typedef std::vector< boost::shared_ptr<po::option_description> > Ds; -  Ds const& ds=opts.options(); +inline void print_options(std::ostream& out, +                          po::options_description const& opts) { +  typedef std::vector<boost::shared_ptr<po::option_description> > Ds; +  Ds const& ds = opts.options();    out << '"'; -  for (unsigned i=0;i<ds.size();++i) { -    if (i) out<<' '; -    out<<"--"<<ds[i]->long_name(); +  for (unsigned i = 0; i < ds.size(); ++i) { +    if (i) out << ' '; +    out << "--" << ds[i]->long_name();    }    out << '"\n';  } -inline string str(char const* name,po::variables_map const& conf) { +inline string str(char const* name, po::variables_map const& conf) {    return conf[name].as<string>();  } -//--srl_file /scratch0/mt_exp/gale-align/gale-align.nw.srl.cn --align_file /scratch0/mt_exp/gale-align/gale-align.nw.al --source_file /scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file /scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2 -//--srl_file /scratch0/mt_exp/gale-ctb/gale-ctb.srl.cn --align_file /scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file /scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file /scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2 +//--srl_file /scratch0/mt_exp/gale-align/gale-align.nw.srl.cn --align_file +///scratch0/mt_exp/gale-align/gale-align.nw.al --source_file +///scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file +///scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file +///scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix +///scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2 +//--srl_file /scratch0/mt_exp/gale-ctb/gale-ctb.srl.cn --align_file +///scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file +///scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file +///scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file +///scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix +///scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2  int main(int argc, char** argv) { -	po::options_description opts("Configuration options"); -	opts.add_options() -                ("srl_file",po::value<string>(),"srl file path (input)") -                ("align_file",po::value<string>(),"Alignment file path (input)") -                ("source_file",po::value<string>(),"Source text file path (input)") -                ("target_file",po::value<string>(),"Target text file path (input)") -                ("instance_file",po::value<string>(),"Instance file path (output)") -                ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated") -                ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold") -                ("help", "produce help message"); - -	po::variables_map vm; -	if (argc) { -		po::store(po::parse_command_line(argc, argv, opts), vm); -		po::notify(vm); -	} - -	if (vm.count("help")) { -		print_options(cout, opts); -		return 1; -	} - -	if (!vm.count("srl_file") -			|| !vm.count("align_file") -			|| !vm.count("source_file") -			|| !vm.count("target_file") -			|| !vm.count("instance_file") -			|| !vm.count("model_prefix") -			) { -		print_options(cout, opts); -		if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; -		if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; -		if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; -		if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; -		if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; -		if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; -		exit(0); -	} - -	SArgumentReorderTrainer *pTrainer = new SArgumentReorderTrainer(str("srl_file", vm).c_str(), -			                                                  str("align_file", vm).c_str(), -			                                                  str("source_file", vm).c_str(), -			                                                  str("target_file", vm).c_str(), -			                                                  NULL, -			                                                  str("instance_file", vm).c_str(), -			                                                  str("model_prefix", vm).c_str(), -			                                                  vm["feature_cutoff"].as<int>()); -	delete pTrainer; - -	return 1; +  po::options_description opts("Configuration options"); +  opts.add_options()("srl_file", po::value<string>(), "srl file path (input)")( +      "align_file", po::value<string>(), "Alignment file path (input)")( +      "source_file", po::value<string>(), "Source text file path (input)")( +      "target_file", po::value<string>(), "Target text file path (input)")( +      "instance_file", po::value<string>(), "Instance file path (output)")( +      "model_prefix", po::value<string>(), +      "Model file path prefix (output): three files will be generated")( +      "feature_cutoff", po::value<int>()->default_value(100), +      "Feature cutoff threshold")("help", "produce help message"); + +  po::variables_map vm; +  if (argc) { +    po::store(po::parse_command_line(argc, argv, opts), vm); +    po::notify(vm); +  } + +  if (vm.count("help")) { +    print_options(cout, opts); +    return 1; +  } + +  if (!vm.count("srl_file") || !vm.count("align_file") || +      !vm.count("source_file") || !vm.count("target_file") || +      !vm.count("instance_file") || !vm.count("model_prefix")) { +    print_options(cout, opts); +    if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; +    if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; +    if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; +    if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; +    if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; +    if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; +    exit(0); +  } + +  SArgumentReorderTrainer* pTrainer = new SArgumentReorderTrainer( +      str("srl_file", vm).c_str(), str("align_file", vm).c_str(), +      str("source_file", vm).c_str(), str("target_file", vm).c_str(), NULL, +      str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), +      vm["feature_cutoff"].as<int>()); +  delete pTrainer; + +  return 1;  } diff --git a/utils/synutils/argument_reorder_model.h b/utils/synutils/argument_reorder_model.h index b553b34b..062b8841 100644 --- a/utils/synutils/argument_reorder_model.h +++ b/utils/synutils/argument_reorder_model.h @@ -12,211 +12,228 @@  #include "tree.h"  #include "srl_sentence.h" - -//an argument item or a predicate item (the verb itself) -struct SSRLItem{ -        SSRLItem(const STreeItem *tree_item, string role): -        tree_item_(tree_item), -        role_(role) { - -        } -        ~SSRLItem() { - -        } -        const STreeItem *tree_item_; -        const string role_; +// an argument item or a predicate item (the verb itself) +struct SSRLItem { +  SSRLItem(const STreeItem *tree_item, string role) +      : tree_item_(tree_item), role_(role) {} +  ~SSRLItem() {} +  const STreeItem *tree_item_; +  const string role_;  }; -struct SPredicateItem{ -	SPredicateItem(const SParsedTree *tree, const SPredicate *pred): -                pred_(pred) { -                vec_items_.reserve(pred->m_vecArgt.size() + 1); -                for (int i = 0; i < pred->m_vecArgt.size(); i++) { -                        vec_items_.push_back(new SSRLItem(pred->m_vecArgt[i]->m_pTreeItem, string(pred->m_vecArgt[i]->m_pszRole))); -                } -                vec_items_.push_back(new SSRLItem(tree->m_vecTerminals[pred->m_iPosition]->m_ptParent, string("Pred"))); -                sort(vec_items_.begin(), vec_items_.end(), SortFunction); - -                begin_ = vec_items_[0]->tree_item_->m_iBegin; -                end_ = vec_items_[vec_items_.size() - 1]->tree_item_->m_iEnd; -        } - -        ~SPredicateItem() { -                vec_items_.clear(); -        } - -        static bool SortFunction (SSRLItem *i, SSRLItem* j) { return (i->tree_item_->m_iBegin < j->tree_item_->m_iBegin); } - -        vector<SSRLItem*> vec_items_; -        int begin_; -        int end_; -        const SPredicate *pred_; +struct SPredicateItem { +  SPredicateItem(const SParsedTree *tree, const SPredicate *pred) +      : pred_(pred) { +    vec_items_.reserve(pred->m_vecArgt.size() + 1); +    for (int i = 0; i < pred->m_vecArgt.size(); i++) { +      vec_items_.push_back(new SSRLItem(pred->m_vecArgt[i]->m_pTreeItem, +                                        string(pred->m_vecArgt[i]->m_pszRole))); +    } +    vec_items_.push_back(new SSRLItem( +        tree->m_vecTerminals[pred->m_iPosition]->m_ptParent, string("Pred"))); +    sort(vec_items_.begin(), vec_items_.end(), SortFunction); + +    begin_ = vec_items_[0]->tree_item_->m_iBegin; +    end_ = vec_items_[vec_items_.size() - 1]->tree_item_->m_iEnd; +  } + +  ~SPredicateItem() { vec_items_.clear(); } + +  static bool SortFunction(SSRLItem *i, SSRLItem *j) { +    return (i->tree_item_->m_iBegin < j->tree_item_->m_iBegin); +  } + +  vector<SSRLItem *> vec_items_; +  int begin_; +  int end_; +  const SPredicate *pred_;  };  struct SArgumentReorderModel { -public: -	static string fnGetBlockOutcome(int iBegin, int iEnd, SAlignment *pAlign) { -		return pAlign->fnIsContinuous(iBegin, iEnd); -	} -	static void fnGetReorderType(SPredicateItem *pPredItem, SAlignment *pAlign, vector<string>& vecStrLeftReorder, vector<string>& vecStrRightReorder) { -    	vector<int> vecLeft, vecRight; -    	for (int i = 0; i < pPredItem->vec_items_.size(); i++) { -            const STreeItem *pCon1 = pPredItem->vec_items_[i]->tree_item_; -    		int iLeft1, iRight1; -            pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); -            vecLeft.push_back(iLeft1); -            vecRight.push_back(iRight1); -    	} -    	vector<int> vecLeftPosition; -    	fnGetRelativePosition(vecLeft, vecLeftPosition); -    	vector<int> vecRightPosition; -    	fnGetRelativePosition(vecRight, vecRightPosition); - -    	vecStrLeftReorder.clear(); -    	vecStrRightReorder.clear(); -    	for (int i = 1; i < vecLeftPosition.size(); i++) { -    		string strOutcome; -    		fnGetOutcome(vecLeftPosition[i - 1], vecLeftPosition[i], strOutcome); -    		vecStrLeftReorder.push_back(strOutcome); -    		fnGetOutcome(vecRightPosition[i - 1], vecRightPosition[i], strOutcome); -    		vecStrRightReorder.push_back(strOutcome); -    	} -	} - -	/* -	 * features: -	 * f1: (left_label, right_label, parent_label) -	 * f2: (left_label, right_label, parent_label, other_right_sibling_label) -	 * f3: (left_label, right_label, parent_label, other_left_sibling_label) -	 * f4: (left_label, right_label, left_head_pos) -	 * f5: (left_label, right_label, left_head_word) -	 * f6: (left_label, right_label, right_head_pos) -	 * f7: (left_label, right_label, right_head_word) -	 * f8: (left_label, right_label, left_chunk_status) -	 * f9: (left_label, right_label, right_chunk_status) -	 * f10: (left_label, parent_label) -	 * f11: (right_label, parent_label) -	 * -	 * f1: (left_role, right_role, predicate_term) -	 * f2: (left_role, right_role, predicate_term, other_right_role) -	 * f3: (left_role, right_role, predicate_term, other_left_role) -	 * f4: (left_role, right_role, left_head_pos) -	 * f5: (left_role, right_role, left_head_word) -	 * f6: (left_role, right_role, left_syntactic_label) -	 * f7: (left_role, right_role, right_head_pos) -	 * f8: (left_role, right_role, right_head_word) -	 * f8: (left_role, right_role, right_syntactic_label) -	 * f8: (left_role, right_role, left_chunk_status) -	 * f9: (left_role, right_role, right_chunk_status) -	 * f10: (left_role, right_role, left_chunk_status) -	 * f11: (left_role, right_role, right_chunk_status) -	 * f12: (left_label, parent_label) -	 * f13: (right_label, parent_label) -	 */ -	static void fnGenerateFeature(const SParsedTree *pTree, const SPredicate *pPred, const SPredicateItem *pPredItem, int iPos, const string& strBlock1, const string& strBlock2, ostringstream& ostr) { -		SSRLItem *pSRLItem1 = pPredItem->vec_items_[iPos - 1]; -		SSRLItem *pSRLItem2 = pPredItem->vec_items_[iPos]; -		const STreeItem *pCon1 = pSRLItem1->tree_item_; -		const STreeItem *pCon2 = pSRLItem2->tree_item_; - -		string left_role = pSRLItem1->role_; -		string right_role = pSRLItem2->role_; - -		string predicate_term = pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm; - -		vector<string> vec_other_right_sibling; -		for (int i = iPos + 1; i < pPredItem->vec_items_.size(); i++) -			vec_other_right_sibling.push_back(string(pPredItem->vec_items_[i]->role_)); -		if (vec_other_right_sibling.size() == 0) -			vec_other_right_sibling.push_back(string("NULL")); - -		vector<string> vec_other_left_sibling; -		for (int i = 0; i < iPos - 1; i++) -			vec_other_right_sibling.push_back(string(pPredItem->vec_items_[i]->role_)); -		if (vec_other_left_sibling.size() == 0) -			vec_other_left_sibling.push_back(string("NULL")); - - -		//generate features -		//f1 -		ostr << "f1=" << left_role << "_" << right_role << "_" << predicate_term; -		ostr << "f1=" << left_role << "_" << right_role; - -		//f2 -		for (int i = 0; i < vec_other_right_sibling.size(); i++) { -			ostr << " f2=" << left_role << "_" << right_role << "_" << predicate_term << "_" << vec_other_right_sibling[i]; -			ostr << " f2=" << left_role << "_" << right_role << "_" << vec_other_right_sibling[i]; -		} -		//f3 -		for (int i = 0; i < vec_other_left_sibling.size(); i++) { -			ostr << " f3=" << left_role << "_" << right_role << "_" << predicate_term << "_" << vec_other_left_sibling[i]; -			ostr << " f3=" << left_role << "_" << right_role << "_" << vec_other_left_sibling[i]; -		} -		//f4 -		ostr << " f4=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f5 -		ostr << " f5=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_pszTerm; -		//f6 -		ostr << " f6=" << left_role << "_" << right_role << "_" << pCon2->m_pszTerm; -		//f7 -		ostr << " f7=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f8 -		ostr << " f8=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_pszTerm; -		//f9 -		ostr << " f9=" << left_role << "_" << right_role << "_" << pCon2->m_pszTerm; -		//f10 -		ostr << " f10=" << left_role << "_" << right_role << "_" << strBlock1; -		//f11 -		ostr << " f11=" << left_role << "_" << right_role << "_" << strBlock2; -		//f12 -		ostr << " f12=" << left_role << "_" << predicate_term; -		ostr << " f12=" << left_role; -		//f13 -		ostr << " f13=" << right_role << "_" << predicate_term; -		ostr << " f13=" << right_role; -	} - -private: -	static void fnGetOutcome(int i1, int i2, string& strOutcome) { -		assert(i1 != i2); -		if (i1 < i2) { -			if (i2 > i1 + 1) strOutcome = string("DM"); -			else strOutcome = string("M"); -		} else { -			if (i1 > i2 + 1) strOutcome = string("DS"); -			else strOutcome = string("S"); -		} -	} - -	static void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) { -		vecPosition.clear(); - -		vector<float> vec; -		for (int i = 0; i < vecLeft.size(); i++) { -			if (vecLeft[i] == -1) { -				if (i == 0) -					vec.push_back(-1); -				else -					vec.push_back(vecLeft[i-1] + 0.1); -			} else -				vec.push_back(vecLeft[i]); -		} - -		for (int i = 0; i < vecLeft.size(); i++) { -			int count = 0; - -			for (int j = 0; j < vecLeft.size(); j++) { -				if ( j == i) continue; -				if (vec[j] < vec[i]) { -					count++; -				} else if (vec[j] == vec[i] && j < i) { -					count++; -				} -			} -			vecPosition.push_back(count); -		} -	} + public: +  static string fnGetBlockOutcome(int iBegin, int iEnd, SAlignment *pAlign) { +    return pAlign->fnIsContinuous(iBegin, iEnd); +  } +  static void fnGetReorderType(SPredicateItem *pPredItem, SAlignment *pAlign, +                               vector<string> &vecStrLeftReorder, +                               vector<string> &vecStrRightReorder) { +    vector<int> vecLeft, vecRight; +    for (int i = 0; i < pPredItem->vec_items_.size(); i++) { +      const STreeItem *pCon1 = pPredItem->vec_items_[i]->tree_item_; +      int iLeft1, iRight1; +      pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, +                                 iRight1); +      vecLeft.push_back(iLeft1); +      vecRight.push_back(iRight1); +    } +    vector<int> vecLeftPosition; +    fnGetRelativePosition(vecLeft, vecLeftPosition); +    vector<int> vecRightPosition; +    fnGetRelativePosition(vecRight, vecRightPosition); + +    vecStrLeftReorder.clear(); +    vecStrRightReorder.clear(); +    for (int i = 1; i < vecLeftPosition.size(); i++) { +      string strOutcome; +      fnGetOutcome(vecLeftPosition[i - 1], vecLeftPosition[i], strOutcome); +      vecStrLeftReorder.push_back(strOutcome); +      fnGetOutcome(vecRightPosition[i - 1], vecRightPosition[i], strOutcome); +      vecStrRightReorder.push_back(strOutcome); +    } +  } + +  /* +   * features: +   * f1: (left_label, right_label, parent_label) +   * f2: (left_label, right_label, parent_label, other_right_sibling_label) +   * f3: (left_label, right_label, parent_label, other_left_sibling_label) +   * f4: (left_label, right_label, left_head_pos) +   * f5: (left_label, right_label, left_head_word) +   * f6: (left_label, right_label, right_head_pos) +   * f7: (left_label, right_label, right_head_word) +   * f8: (left_label, right_label, left_chunk_status) +   * f9: (left_label, right_label, right_chunk_status) +   * f10: (left_label, parent_label) +   * f11: (right_label, parent_label) +   * +   * f1: (left_role, right_role, predicate_term) +   * f2: (left_role, right_role, predicate_term, other_right_role) +   * f3: (left_role, right_role, predicate_term, other_left_role) +   * f4: (left_role, right_role, left_head_pos) +   * f5: (left_role, right_role, left_head_word) +   * f6: (left_role, right_role, left_syntactic_label) +   * f7: (left_role, right_role, right_head_pos) +   * f8: (left_role, right_role, right_head_word) +   * f8: (left_role, right_role, right_syntactic_label) +   * f8: (left_role, right_role, left_chunk_status) +   * f9: (left_role, right_role, right_chunk_status) +   * f10: (left_role, right_role, left_chunk_status) +   * f11: (left_role, right_role, right_chunk_status) +   * f12: (left_label, parent_label) +   * f13: (right_label, parent_label) +   */ +  static void fnGenerateFeature(const SParsedTree *pTree, +                                const SPredicate *pPred, +                                const SPredicateItem *pPredItem, int iPos, +                                const string &strBlock1, +                                const string &strBlock2, ostringstream &ostr) { +    SSRLItem *pSRLItem1 = pPredItem->vec_items_[iPos - 1]; +    SSRLItem *pSRLItem2 = pPredItem->vec_items_[iPos]; +    const STreeItem *pCon1 = pSRLItem1->tree_item_; +    const STreeItem *pCon2 = pSRLItem2->tree_item_; + +    string left_role = pSRLItem1->role_; +    string right_role = pSRLItem2->role_; + +    string predicate_term = +        pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm; + +    vector<string> vec_other_right_sibling; +    for (int i = iPos + 1; i < pPredItem->vec_items_.size(); i++) +      vec_other_right_sibling.push_back( +          string(pPredItem->vec_items_[i]->role_)); +    if (vec_other_right_sibling.size() == 0) +      vec_other_right_sibling.push_back(string("NULL")); + +    vector<string> vec_other_left_sibling; +    for (int i = 0; i < iPos - 1; i++) +      vec_other_right_sibling.push_back( +          string(pPredItem->vec_items_[i]->role_)); +    if (vec_other_left_sibling.size() == 0) +      vec_other_left_sibling.push_back(string("NULL")); + +    // generate features +    // f1 +    ostr << "f1=" << left_role << "_" << right_role << "_" << predicate_term; +    ostr << "f1=" << left_role << "_" << right_role; + +    // f2 +    for (int i = 0; i < vec_other_right_sibling.size(); i++) { +      ostr << " f2=" << left_role << "_" << right_role << "_" << predicate_term +           << "_" << vec_other_right_sibling[i]; +      ostr << " f2=" << left_role << "_" << right_role << "_" +           << vec_other_right_sibling[i]; +    } +    // f3 +    for (int i = 0; i < vec_other_left_sibling.size(); i++) { +      ostr << " f3=" << left_role << "_" << right_role << "_" << predicate_term +           << "_" << vec_other_left_sibling[i]; +      ostr << " f3=" << left_role << "_" << right_role << "_" +           << vec_other_left_sibling[i]; +    } +    // f4 +    ostr << " f4=" << left_role << "_" << right_role << "_" +         << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f5 +    ostr << " f5=" << left_role << "_" << right_role << "_" +         << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_pszTerm; +    // f6 +    ostr << " f6=" << left_role << "_" << right_role << "_" << pCon2->m_pszTerm; +    // f7 +    ostr << " f7=" << left_role << "_" << right_role << "_" +         << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f8 +    ostr << " f8=" << left_role << "_" << right_role << "_" +         << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_pszTerm; +    // f9 +    ostr << " f9=" << left_role << "_" << right_role << "_" << pCon2->m_pszTerm; +    // f10 +    ostr << " f10=" << left_role << "_" << right_role << "_" << strBlock1; +    // f11 +    ostr << " f11=" << left_role << "_" << right_role << "_" << strBlock2; +    // f12 +    ostr << " f12=" << left_role << "_" << predicate_term; +    ostr << " f12=" << left_role; +    // f13 +    ostr << " f13=" << right_role << "_" << predicate_term; +    ostr << " f13=" << right_role; +  } + + private: +  static void fnGetOutcome(int i1, int i2, string &strOutcome) { +    assert(i1 != i2); +    if (i1 < i2) { +      if (i2 > i1 + 1) +        strOutcome = string("DM"); +      else +        strOutcome = string("M"); +    } else { +      if (i1 > i2 + 1) +        strOutcome = string("DS"); +      else +        strOutcome = string("S"); +    } +  } + +  static void fnGetRelativePosition(const vector<int> &vecLeft, +                                    vector<int> &vecPosition) { +    vecPosition.clear(); + +    vector<float> vec; +    for (int i = 0; i < vecLeft.size(); i++) { +      if (vecLeft[i] == -1) { +        if (i == 0) +          vec.push_back(-1); +        else +          vec.push_back(vecLeft[i - 1] + 0.1); +      } else +        vec.push_back(vecLeft[i]); +    } + +    for (int i = 0; i < vecLeft.size(); i++) { +      int count = 0; + +      for (int j = 0; j < vecLeft.size(); j++) { +        if (j == i) continue; +        if (vec[j] < vec[i]) { +          count++; +        } else if (vec[j] == vec[i] && j < i) { +          count++; +        } +      } +      vecPosition.push_back(count); +    } +  }  }; -  #endif /* ARGUMENT_REORDER_MODEL_H_ */ diff --git a/utils/synutils/constituent_reorder_model.cc b/utils/synutils/constituent_reorder_model.cc index 485c9667..a4fb9627 100644 --- a/utils/synutils/constituent_reorder_model.cc +++ b/utils/synutils/constituent_reorder_model.cc @@ -5,7 +5,6 @@   *      Author: junhuili   */ -  #include <boost/program_options.hpp>  #include "alignment.h" @@ -17,780 +16,831 @@  using namespace std; -  typedef std::tr1::unordered_map<std::string, int> Map;  typedef std::tr1::unordered_map<std::string, int>::iterator Iterator;  namespace po = boost::program_options; -inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { -	SFReader *pFReader = new STxtFileReader(pszFName); -	char *pszLine = new char[ 100001 ]; -	int iLen; -	Map hashPredicate; -	while (pFReader->fnReadNextLine(pszLine, &iLen)) { -		if (iLen == 0) -			continue; - -		vector<string> vecTerms; -		SplitOnWhitespace(string(pszLine), &vecTerms); - -		for (size_t i = 0; i < vecTerms.size() - 1; i++) { -			Iterator iter = hashPredicate.find(vecTerms[i]); -			if (iter == hashPredicate.end()) { -				hashPredicate[vecTerms[i]] = 1; - -			} else { -				iter->second++; -			} -		} -	} -	delete pFReader; - -	pFReader = new STxtFileReader(pszFName); -	FILE *fpOut = fopen(pszNewFName, "w"); -	while (pFReader->fnReadNextLine(pszLine, &iLen)) { -		if (iLen == 0) -			continue; - -		vector<string> vecTerms; -		SplitOnWhitespace(string(pszLine), &vecTerms); -		ostringstream ostr; -		for (size_t i = 0; i < vecTerms.size() - 1; i++) { -			Iterator iter = hashPredicate.find(vecTerms[i]); -			assert(iter != hashPredicate.end()); -			if (iter->second >= iCutoff) { -				ostr << vecTerms[i] << " "; -			} -		} -		if (ostr.str().length() > 0) { -			ostr << vecTerms[vecTerms.size() - 1]; -			fprintf(fpOut, "%s\n", ostr.str().c_str()); -		} -	} -	fclose(fpOut); -	delete pFReader; - - -	delete [] pszLine; +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, +                                    const char* pszNewFName) { +  SFReader* pFReader = new STxtFileReader(pszFName); +  char* pszLine = new char[100001]; +  int iLen; +  Map hashPredicate; +  while (pFReader->fnReadNextLine(pszLine, &iLen)) { +    if (iLen == 0) continue; + +    vector<string> vecTerms; +    SplitOnWhitespace(string(pszLine), &vecTerms); + +    for (size_t i = 0; i < vecTerms.size() - 1; i++) { +      Iterator iter = hashPredicate.find(vecTerms[i]); +      if (iter == hashPredicate.end()) { +        hashPredicate[vecTerms[i]] = 1; + +      } else { +        iter->second++; +      } +    } +  } +  delete pFReader; + +  pFReader = new STxtFileReader(pszFName); +  FILE* fpOut = fopen(pszNewFName, "w"); +  while (pFReader->fnReadNextLine(pszLine, &iLen)) { +    if (iLen == 0) continue; + +    vector<string> vecTerms; +    SplitOnWhitespace(string(pszLine), &vecTerms); +    ostringstream ostr; +    for (size_t i = 0; i < vecTerms.size() - 1; i++) { +      Iterator iter = hashPredicate.find(vecTerms[i]); +      assert(iter != hashPredicate.end()); +      if (iter->second >= iCutoff) { +        ostr << vecTerms[i] << " "; +      } +    } +    if (ostr.str().length() > 0) { +      ostr << vecTerms[vecTerms.size() - 1]; +      fprintf(fpOut, "%s\n", ostr.str().c_str()); +    } +  } +  fclose(fpOut); +  delete pFReader; + +  delete[] pszLine;  } -struct SConstReorderTrainer{ -	SConstReorderTrainer(const char* pszSynFname,        //source-side flattened parse tree file name -                         const char* pszAlignFname,               //alignment filename -                         const char* pszSourceFname,              //source file name -                         const char* pszTargetFname,              //target file name -                         const char* pszInstanceFname,            //training instance file name -                         const char* pszModelPrefix,              //classifier model file name prefix -                         int iClassifierType,                     //classifier type -                         int iCutoff,                             //feature count threshold -                         const char* pszOption                    //other classifier parameters (for svmlight) -                         ) { -		fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); - -		string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); -		string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - -		string strModelLeftFname = string(pszModelPrefix) + string(".left"); -		string strModelRightFname = string(pszModelPrefix) + string(".right"); - -		fprintf(stdout, "...Training the left ordering model\n"); -		fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), iCutoff); -		fprintf(stdout, "...Training the right ordering model\n"); -		fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), iCutoff); -	} -	~SConstReorderTrainer() { - -	} - -private: - -	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { -		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; -		if (iCutoff > 0) { -			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); -			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); -		} else { -			strcpy(pszNewInstanceFName, pszInstanceFname); -		} - -		/*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); -	    pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); -	    delete pZhangleMaxent;*/ - -		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); -		pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); -		delete pMaxent; - -		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { -			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); -			system(pszNewInstanceFName); -		} -		delete [] pszNewInstanceFName; -	} - -	inline bool fnIsVerbPOS(const char* pszTerm) { -		if (strcmp(pszTerm, "VV") == 0 -			|| strcmp(pszTerm, "VA") == 0 -			|| strcmp(pszTerm, "VC") == 0 -			|| strcmp(pszTerm, "VE") == 0) -			return true; -		return false; -	} - -	inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, const SAlignment *pAlign, string& strOutcome) { -		if (iL1 == -1 && iL2 == -1) -			strOutcome = "BU"; //1. both are untranslated -		else if (iL1 == -1) -			strOutcome = "1U"; //2. XP1 is untranslated -		else if (iL2 == -1) -			strOutcome = "2U"; //3. XP2 is untranslated -		else if (iL1 == iL2 && iR2 == iR2) -			strOutcome = "SS"; //4. Have same scope -		else if (iL1 <= iL2 && iR1 >= iR2) -			strOutcome = "1C2"; //5. XP1's translation covers XP2's -		else if (iL1 >= iL2 && iR1 <= iR2) -			strOutcome = "2C1"; //6. XP2's translation covers XP1's -		else if (iR1 < iL2) { -			int i = iR1 + 1; -			/*while (i < iL2) { -				if (pAlign->fnIsAligned(i, false)) -					break; -				i++; -			}*/ -			if (i == iL2) -				strOutcome = "M"; //7. Monotone -			else -				strOutcome = "DM"; //8. Discontinuous monotone -		} else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) -			strOutcome = "OM"; //9. Overlap monotone -		else if (iR2 < iL1) { -			int i = iR2 + 1; -			/*while (i < iL1) { -				if (pAlign->fnIsAligned(i, false)) -					break; -				i++; -			}*/ -			if (i == iL1) -				strOutcome = "S"; //10. Swap -			else -				strOutcome = "DS"; //11. Discontinuous swap -		} else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) -			strOutcome = "OS"; //12. Overlap swap -		else -			assert(false); -	} - -	inline void fnGetOutcome(int i1, int i2, string& strOutcome) { -		assert(i1 != i2); -		if (i1 < i2) { -			if (i2 > i1 + 1) strOutcome = string("DM"); -			else strOutcome = string("M"); -		} else { -			if (i1 > i2 + 1) strOutcome = string("DS"); -			else strOutcome = string("S"); -		} -	} - -	inline void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) { -		vecPosition.clear(); - -		vector<float> vec; -		for (size_t i = 0; i < vecLeft.size(); i++) { -			if (vecLeft[i] == -1) { -				if (i == 0) -					vec.push_back(-1); -				else -					vec.push_back(vecLeft[i-1] + 0.1); -			} else -				vec.push_back(vecLeft[i]); -		} - -		for (size_t i = 0; i < vecLeft.size(); i++) { -			int count = 0; - -			for (size_t j = 0; j < vecLeft.size(); j++) { -				if ( j == i) continue; -				if (vec[j] < vec[i]) { -					count++; -				} else if (vec[j] == vec[i] && j < i) { -					count++; -				} -			} -			vecPosition.push_back(count); -		} -	} - -	/* -	 * features: -	 * f1: (left_label, right_label, parent_label) -	 * f2: (left_label, right_label, parent_label, other_right_sibling_label) -	 * f3: (left_label, right_label, parent_label, other_left_sibling_label) -	 * f4: (left_label, right_label, left_head_pos) -	 * f5: (left_label, right_label, left_head_word) -	 * f6: (left_label, right_label, right_head_pos) -	 * f7: (left_label, right_label, right_head_word) -	 * f8: (left_label, right_label, left_chunk_status) -	 * f9: (left_label, right_label, right_chunk_status) -	 * f10: (left_label, parent_label) -	 * f11: (right_label, parent_label) -	 */ -	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, int iPos, const vector<string>& vecChunkStatus, const vector<int>& vecPosition, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { -		STreeItem *pCon1, *pCon2; -		pCon1 = pParent->m_vecChildren[iPos - 1]; -		pCon2 = pParent->m_vecChildren[iPos]; - -		fnGetOutcome(vecPosition[iPos-1], vecPosition[iPos], strOutcome); - -		string left_label = string(pCon1->m_pszTerm); -		string right_label = string(pCon2->m_pszTerm); -		string parent_label = string(pParent->m_pszTerm); - -		vector<string> vec_other_right_sibling; -		for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) -			vec_other_right_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); -		if (vec_other_right_sibling.size() == 0) -			vec_other_right_sibling.push_back(string("NULL")); -		vector<string> vec_other_left_sibling; -		for (int i = 0; i < iPos - 1; i++) -			vec_other_left_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); -		if (vec_other_left_sibling.size() == 0) -			vec_other_left_sibling.push_back(string("NULL")); - -		//generate features -		//f1 -		ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; -		//f2 -		for (int i = 0; i < vec_other_right_sibling.size(); i++) -			ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_right_sibling[i]; -		//f3 -		for (int i = 0; i < vec_other_left_sibling.size(); i++) -			ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_left_sibling[i]; -		//f4 -		ostr << " f4=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f5 -		ostr << " f5=" << left_label << "_" << right_label << "_" << vecSTerms[pCon1->m_iHeadWord]; -		//f6 -		ostr << " f6=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f7 -		ostr << " f7=" << left_label << "_" << right_label << "_" << vecSTerms[pCon2->m_iHeadWord]; -		//f8 -		ostr << " f8=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos - 1]; -		//f9 -		ostr << " f9=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos]; -		//f10 -		ostr << " f10=" << left_label << "_" << parent_label; -		//f11 -		ostr << " f11=" << right_label << "_" << parent_label; -	} - -	/* -	 * Source side (11 features): -	 * f1: the categories of XP1 and XP2 (f1_1, f1_2) -	 * f2: the head words of XP1 and XP2 (f2_1, f2_2) -	 * f3: the first and last word of XP1 (f3_f, f3_l) -	 * f4: the first and last word of XP2 (f4_f, f4_l) -	 * f5: is XP1 or XP2 the head node (f5_1, f5_2) -	 * f6: the category of the common parent -	 * Target side (6 features): -	 * f7: the first and the last word of XP1's translation (f7_f, f7_l) -	 * f8: the first and the last word of XP2's translation (f8_f, f8_l) -	 * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) -	 */ -	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, const STreeItem *pCon1, const STreeItem *pCon2, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { - -		int iLeft1, iRight1, iLeft2, iRight2; -		pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); -		pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, iRight2); - -		fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); - -		//generate features -		//f1 -		ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; -		//f2 -		ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" << vecSTerms[pCon2->m_iHeadWord]; -		//f3 -		ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] << " f3_l=" << vecSTerms[pCon1->m_iEnd]; -		//f4 -		ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] << " f4_l=" << vecSTerms[pCon2->m_iEnd]; -		//f5 -		if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) -			ostr << " f5_1=1"; -		else -			ostr << " f5_1=0"; -		if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) -			ostr << " f5_2=1"; -		else -			ostr << " f5_2=0"; -		//f6 -		ostr << " f6=" << pParent->m_pszTerm; - -		/*//f7 -		if (iLeft1 != -1) { -			ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << vecTTerms[iRight1]; -		} -		if (iLeft2 != -1) { -			ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << vecTTerms[iRight2]; -		} - -		const vector<int>* pvecTarget = pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); -		string str = ""; -		for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { -			str += vecTTerms[(*pvecTarget)[i]] + "_"; -		} -		if (str.length() > 0) { -			ostr << " f9_1=" << str.substr(0, str.size()-1); -		} -		pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); -		str = ""; -		for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { -			str += vecTTerms[(*pvecTarget)[i]] + "_"; -		} -		if (str.length() > 0) { -			ostr << " f9_2=" << str.substr(0, str.size()-1); -		} */ - -	} - -	void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ -		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { -			STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; - -			while (pParent != NULL) { -				//if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { -				if (pParent->m_vecChildren.size() > 1) { -					//do constituent reordering for all children of pParent -					vecFocused.push_back(pParent); -				} -				if (pParent->m_iBrotherIndex != 0) break; -				pParent = pParent->m_ptParent; -			} -		} -	} - -	void fnGenerateInstanceFile(const char* pszSynFname,        //source-side flattened parse tree file name -                                const char* pszAlignFname,               //alignment filename -                                const char* pszSourceFname,              //source file name -                                const char* pszTargetFname,              //target file name -                                const char* pszInstanceFname             //training instance file name -                                ) { -		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); -		SParseReader *pParseReader = new SParseReader(pszSynFname, false); -		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); -		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - -		string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); -		string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - -		FILE *fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); -		assert(fpLeftOut != NULL); - -		FILE *fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); -		assert(fpRightOut != NULL); - -		//read sentence by sentence -		SAlignment *pAlign; -		SParsedTree *pTree; -		char *pszLine = new char[50001]; -		int iSentNum = 0; -		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { -			pTree = pParseReader->fnReadNextParseTree(); -			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecSTerms; -			SplitOnWhitespace(string(pszLine), &vecSTerms); -			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecTTerms; -			SplitOnWhitespace(string(pszLine), &vecTTerms); - - -			if (pTree != NULL) { - -				vector<STreeItem*> vecFocused; -				fnGetFocusedParentNodes(pTree, vecFocused); - -				for (size_t i = 0; i < vecFocused.size(); i++) { - -					STreeItem *pParent = vecFocused[i]; - -	            	vector<int> vecLeft, vecRight; -	            	for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { -	                    STreeItem *pCon1 = pParent->m_vecChildren[j]; -	            		int iLeft1, iRight1; -	                    pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); -	                    vecLeft.push_back(iLeft1); -	                    vecRight.push_back(iRight1); -	            	} -	            	vector<int> vecLeftPosition; -	            	fnGetRelativePosition(vecLeft, vecLeftPosition); -	            	vector<int> vecRightPosition; -	            	fnGetRelativePosition(vecRight, vecRightPosition); - -	            	vector<string> vecChunkStatus; -	            	for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { -	            		string strOutcome = pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, pParent->m_vecChildren[j]->m_iEnd); -	            		vecChunkStatus.push_back(strOutcome); -	            	} - -					for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { -						//children[j-1] vs. children[j] reordering - -						string strLeftOutcome; -						ostringstream ostr; - -						fnGenerateInstance(pTree, pParent, j, vecChunkStatus, vecLeftPosition, vecSTerms, vecTTerms, strLeftOutcome, ostr); - -						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); -						fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); - -						string strRightOutcome; -						fnGetOutcome(vecRightPosition[j-1], vecRightPosition[j], strRightOutcome); -						fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str(), strRightOutcome.c_str()); -					} -				} -				delete pTree; -			} - -			delete pAlign; -			iSentNum++; - -			if (iSentNum % 100000 == 0) -				fprintf(stderr, "#%d\n", iSentNum); -		} - - -		fclose(fpLeftOut); -		fclose(fpRightOut); -		delete pAlignReader; -		delete pParseReader; -		delete pTxtSReader; -		delete pTxtTReader; -		delete [] pszLine; -	} - -	void fnGenerateInstanceFile2(const char* pszSynFname,        //source-side flattened parse tree file name -                                const char* pszAlignFname,               //alignment filename -                                const char* pszSourceFname,              //source file name -                                const char* pszTargetFname,              //target file name -                                const char* pszInstanceFname             //training instance file name -                                ) { -		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); -		SParseReader *pParseReader = new SParseReader(pszSynFname, false); -		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); -		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - -		FILE *fpOut = fopen(pszInstanceFname, "w"); -		assert(fpOut != NULL); - -		//read sentence by sentence -		SAlignment *pAlign; -		SParsedTree *pTree; -		char *pszLine = new char[50001]; -		int iSentNum = 0; -		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { -			pTree = pParseReader->fnReadNextParseTree(); -			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecSTerms; -			SplitOnWhitespace(string(pszLine), &vecSTerms); -			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecTTerms; -			SplitOnWhitespace(string(pszLine), &vecTTerms); - - -			if (pTree != NULL) { - -				vector<STreeItem*> vecFocused; -				fnGetFocusedParentNodes(pTree, vecFocused); - -				for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - -					STreeItem *pParent = vecFocused[i]; - -					for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { -						//children[j-1] vs. children[j] reordering - -						string strOutcome; -						ostringstream ostr; - -						fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j-1], pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); - -						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -						fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -					} -				} -				delete pTree; -			} - -			delete pAlign; -			iSentNum++; - -			if (iSentNum % 100000 == 0) -				fprintf(stderr, "#%d\n", iSentNum); -		} - - -		fclose(fpOut); -		delete pAlignReader; -		delete pParseReader; -		delete pTxtSReader; -		delete pTxtTReader; -		delete [] pszLine; -	} +struct SConstReorderTrainer { +  SConstReorderTrainer( +      const char* pszSynFname,  // source-side flattened parse tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname,  // training instance file name +      const char* pszModelPrefix,  // classifier model file name prefix +      int iClassifierType,  // classifier type +      int iCutoff,  // feature count threshold +      const char* pszOption  // other classifier parameters (for svmlight) +      ) { +    fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, +                           pszTargetFname, pszInstanceFname); + +    string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +    string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + +    string strModelLeftFname = string(pszModelPrefix) + string(".left"); +    string strModelRightFname = string(pszModelPrefix) + string(".right"); + +    fprintf(stdout, "...Training the left ordering model\n"); +    fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), +               iCutoff); +    fprintf(stdout, "...Training the right ordering model\n"); +    fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), +               iCutoff); +  } +  ~SConstReorderTrainer() {} + + private: +  void fnTraining(const char* pszInstanceFname, const char* pszModelFname, +                  int iCutoff) { +    char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +    if (iCutoff > 0) { +      sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +      fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +    } else { +      strcpy(pszNewInstanceFName, pszInstanceFname); +    } + +    /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); +delete pZhangleMaxent;*/ + +    Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); +    pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); +    delete pMaxent; + +    if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +      sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +      system(pszNewInstanceFName); +    } +    delete[] pszNewInstanceFName; +  } + +  inline bool fnIsVerbPOS(const char* pszTerm) { +    if (strcmp(pszTerm, "VV") == 0 || strcmp(pszTerm, "VA") == 0 || +        strcmp(pszTerm, "VC") == 0 || strcmp(pszTerm, "VE") == 0) +      return true; +    return false; +  } + +  inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, +                           const SAlignment* pAlign, string& strOutcome) { +    if (iL1 == -1 && iL2 == -1) +      strOutcome = "BU";  // 1. both are untranslated +    else if (iL1 == -1) +      strOutcome = "1U";  // 2. XP1 is untranslated +    else if (iL2 == -1) +      strOutcome = "2U";  // 3. XP2 is untranslated +    else if (iL1 == iL2 && iR2 == iR2) +      strOutcome = "SS";  // 4. Have same scope +    else if (iL1 <= iL2 && iR1 >= iR2) +      strOutcome = "1C2";  // 5. XP1's translation covers XP2's +    else if (iL1 >= iL2 && iR1 <= iR2) +      strOutcome = "2C1";  // 6. XP2's translation covers XP1's +    else if (iR1 < iL2) { +      int i = iR1 + 1; +      /*while (i < iL2) { +              if (pAlign->fnIsAligned(i, false)) +                      break; +              i++; +      }*/ +      if (i == iL2) +        strOutcome = "M";  // 7. Monotone +      else +        strOutcome = "DM";  // 8. Discontinuous monotone +    } else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) +      strOutcome = "OM";  // 9. Overlap monotone +    else if (iR2 < iL1) { +      int i = iR2 + 1; +      /*while (i < iL1) { +              if (pAlign->fnIsAligned(i, false)) +                      break; +              i++; +      }*/ +      if (i == iL1) +        strOutcome = "S";  // 10. Swap +      else +        strOutcome = "DS";  // 11. Discontinuous swap +    } else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) +      strOutcome = "OS";  // 12. Overlap swap +    else +      assert(false); +  } + +  inline void fnGetOutcome(int i1, int i2, string& strOutcome) { +    assert(i1 != i2); +    if (i1 < i2) { +      if (i2 > i1 + 1) +        strOutcome = string("DM"); +      else +        strOutcome = string("M"); +    } else { +      if (i1 > i2 + 1) +        strOutcome = string("DS"); +      else +        strOutcome = string("S"); +    } +  } + +  inline void fnGetRelativePosition(const vector<int>& vecLeft, +                                    vector<int>& vecPosition) { +    vecPosition.clear(); + +    vector<float> vec; +    for (size_t i = 0; i < vecLeft.size(); i++) { +      if (vecLeft[i] == -1) { +        if (i == 0) +          vec.push_back(-1); +        else +          vec.push_back(vecLeft[i - 1] + 0.1); +      } else +        vec.push_back(vecLeft[i]); +    } + +    for (size_t i = 0; i < vecLeft.size(); i++) { +      int count = 0; + +      for (size_t j = 0; j < vecLeft.size(); j++) { +        if (j == i) continue; +        if (vec[j] < vec[i]) { +          count++; +        } else if (vec[j] == vec[i] && j < i) { +          count++; +        } +      } +      vecPosition.push_back(count); +    } +  } + +  /* +   * features: +   * f1: (left_label, right_label, parent_label) +   * f2: (left_label, right_label, parent_label, other_right_sibling_label) +   * f3: (left_label, right_label, parent_label, other_left_sibling_label) +   * f4: (left_label, right_label, left_head_pos) +   * f5: (left_label, right_label, left_head_word) +   * f6: (left_label, right_label, right_head_pos) +   * f7: (left_label, right_label, right_head_word) +   * f8: (left_label, right_label, left_chunk_status) +   * f9: (left_label, right_label, right_chunk_status) +   * f10: (left_label, parent_label) +   * f11: (right_label, parent_label) +   */ +  void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent, +                          int iPos, const vector<string>& vecChunkStatus, +                          const vector<int>& vecPosition, +                          const vector<string>& vecSTerms, +                          const vector<string>& vecTTerms, string& strOutcome, +                          ostringstream& ostr) { +    STreeItem* pCon1, *pCon2; +    pCon1 = pParent->m_vecChildren[iPos - 1]; +    pCon2 = pParent->m_vecChildren[iPos]; + +    fnGetOutcome(vecPosition[iPos - 1], vecPosition[iPos], strOutcome); + +    string left_label = string(pCon1->m_pszTerm); +    string right_label = string(pCon2->m_pszTerm); +    string parent_label = string(pParent->m_pszTerm); + +    vector<string> vec_other_right_sibling; +    for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) +      vec_other_right_sibling.push_back( +          string(pParent->m_vecChildren[i]->m_pszTerm)); +    if (vec_other_right_sibling.size() == 0) +      vec_other_right_sibling.push_back(string("NULL")); +    vector<string> vec_other_left_sibling; +    for (int i = 0; i < iPos - 1; i++) +      vec_other_left_sibling.push_back( +          string(pParent->m_vecChildren[i]->m_pszTerm)); +    if (vec_other_left_sibling.size() == 0) +      vec_other_left_sibling.push_back(string("NULL")); + +    // generate features +    // f1 +    ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; +    // f2 +    for (int i = 0; i < vec_other_right_sibling.size(); i++) +      ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label +           << "_" << vec_other_right_sibling[i]; +    // f3 +    for (int i = 0; i < vec_other_left_sibling.size(); i++) +      ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label +           << "_" << vec_other_left_sibling[i]; +    // f4 +    ostr << " f4=" << left_label << "_" << right_label << "_" +         << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f5 +    ostr << " f5=" << left_label << "_" << right_label << "_" +         << vecSTerms[pCon1->m_iHeadWord]; +    // f6 +    ostr << " f6=" << left_label << "_" << right_label << "_" +         << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; +    // f7 +    ostr << " f7=" << left_label << "_" << right_label << "_" +         << vecSTerms[pCon2->m_iHeadWord]; +    // f8 +    ostr << " f8=" << left_label << "_" << right_label << "_" +         << vecChunkStatus[iPos - 1]; +    // f9 +    ostr << " f9=" << left_label << "_" << right_label << "_" +         << vecChunkStatus[iPos]; +    // f10 +    ostr << " f10=" << left_label << "_" << parent_label; +    // f11 +    ostr << " f11=" << right_label << "_" << parent_label; +  } + +  /* +   * Source side (11 features): +   * f1: the categories of XP1 and XP2 (f1_1, f1_2) +   * f2: the head words of XP1 and XP2 (f2_1, f2_2) +   * f3: the first and last word of XP1 (f3_f, f3_l) +   * f4: the first and last word of XP2 (f4_f, f4_l) +   * f5: is XP1 or XP2 the head node (f5_1, f5_2) +   * f6: the category of the common parent +   * Target side (6 features): +   * f7: the first and the last word of XP1's translation (f7_f, f7_l) +   * f8: the first and the last word of XP2's translation (f8_f, f8_l) +   * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) +   */ +  void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent, +                          const STreeItem* pCon1, const STreeItem* pCon2, +                          const SAlignment* pAlign, +                          const vector<string>& vecSTerms, +                          const vector<string>& vecTTerms, string& strOutcome, +                          ostringstream& ostr) { + +    int iLeft1, iRight1, iLeft2, iRight2; +    pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, +                               iRight1); +    pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, +                               iRight2); + +    fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); + +    // generate features +    // f1 +    ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; +    // f2 +    ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" +         << vecSTerms[pCon2->m_iHeadWord]; +    // f3 +    ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] +         << " f3_l=" << vecSTerms[pCon1->m_iEnd]; +    // f4 +    ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] +         << " f4_l=" << vecSTerms[pCon2->m_iEnd]; +    // f5 +    if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) +      ostr << " f5_1=1"; +    else +      ostr << " f5_1=0"; +    if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) +      ostr << " f5_2=1"; +    else +      ostr << " f5_2=0"; +    // f6 +    ostr << " f6=" << pParent->m_pszTerm; + +    /*//f7 +    if (iLeft1 != -1) { +            ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << +    vecTTerms[iRight1]; +    } +    if (iLeft2 != -1) { +            ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << +    vecTTerms[iRight2]; +    } + +    const vector<int>* pvecTarget = +    pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); +    string str = ""; +    for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +            str += vecTTerms[(*pvecTarget)[i]] + "_"; +    } +    if (str.length() > 0) { +            ostr << " f9_1=" << str.substr(0, str.size()-1); +    } +    pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); +    str = ""; +    for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +            str += vecTTerms[(*pvecTarget)[i]] + "_"; +    } +    if (str.length() > 0) { +            ostr << " f9_2=" << str.substr(0, str.size()-1); +    } */ +  } + +  void fnGetFocusedParentNodes(const SParsedTree* pTree, +                               vector<STreeItem*>& vecFocused) { +    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { +      STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; + +      while (pParent != NULL) { +        // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - +        // pParent->m_iBegin > 5) { +        if (pParent->m_vecChildren.size() > 1) { +          // do constituent reordering for all children of pParent +          vecFocused.push_back(pParent); +        } +        if (pParent->m_iBrotherIndex != 0) break; +        pParent = pParent->m_ptParent; +      } +    } +  } + +  void fnGenerateInstanceFile( +      const char* pszSynFname,  // source-side flattened parse tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszSynFname, false); +    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); +    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + +    string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +    string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + +    FILE* fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); +    assert(fpLeftOut != NULL); + +    FILE* fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); +    assert(fpRightOut != NULL); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    char* pszLine = new char[50001]; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = pParseReader->fnReadNextParseTree(); +      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecSTerms; +      SplitOnWhitespace(string(pszLine), &vecSTerms); +      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecTTerms; +      SplitOnWhitespace(string(pszLine), &vecTTerms); + +      if (pTree != NULL) { + +        vector<STreeItem*> vecFocused; +        fnGetFocusedParentNodes(pTree, vecFocused); + +        for (size_t i = 0; i < vecFocused.size(); i++) { + +          STreeItem* pParent = vecFocused[i]; + +          vector<int> vecLeft, vecRight; +          for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +            STreeItem* pCon1 = pParent->m_vecChildren[j]; +            int iLeft1, iRight1; +            pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, +                                       iLeft1, iRight1); +            vecLeft.push_back(iLeft1); +            vecRight.push_back(iRight1); +          } +          vector<int> vecLeftPosition; +          fnGetRelativePosition(vecLeft, vecLeftPosition); +          vector<int> vecRightPosition; +          fnGetRelativePosition(vecRight, vecRightPosition); + +          vector<string> vecChunkStatus; +          for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +            string strOutcome = +                pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, +                                       pParent->m_vecChildren[j]->m_iEnd); +            vecChunkStatus.push_back(strOutcome); +          } + +          for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +            // children[j-1] vs. children[j] reordering + +            string strLeftOutcome; +            ostringstream ostr; + +            fnGenerateInstance(pTree, pParent, j, vecChunkStatus, +                               vecLeftPosition, vecSTerms, vecTTerms, +                               strLeftOutcome, ostr); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strLeftOutcome.c_str()); +            fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), +                    strLeftOutcome.c_str()); + +            string strRightOutcome; +            fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j], +                         strRightOutcome); +            fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), +                    strLeftOutcome.c_str(), strRightOutcome.c_str()); +          } +        } +        delete pTree; +      } + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    fclose(fpLeftOut); +    fclose(fpRightOut); +    delete pAlignReader; +    delete pParseReader; +    delete pTxtSReader; +    delete pTxtTReader; +    delete[] pszLine; +  } + +  void fnGenerateInstanceFile2( +      const char* pszSynFname,  // source-side flattened parse tree file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszSynFname, false); +    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); +    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + +    FILE* fpOut = fopen(pszInstanceFname, "w"); +    assert(fpOut != NULL); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    char* pszLine = new char[50001]; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = pParseReader->fnReadNextParseTree(); +      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecSTerms; +      SplitOnWhitespace(string(pszLine), &vecSTerms); +      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecTTerms; +      SplitOnWhitespace(string(pszLine), &vecTTerms); + +      if (pTree != NULL) { + +        vector<STreeItem*> vecFocused; +        fnGetFocusedParentNodes(pTree, vecFocused); + +        for (size_t i = 0; +             i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + +          STreeItem* pParent = vecFocused[i]; + +          for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +            // children[j-1] vs. children[j] reordering + +            string strOutcome; +            ostringstream ostr; + +            fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j - 1], +                               pParent->m_vecChildren[j], pAlign, vecSTerms, +                               vecTTerms, strOutcome, ostr); + +            // fprintf(stderr, "%s %s\n", ostr.str().c_str(), +            // strOutcome.c_str()); +            fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +          } +        } +        delete pTree; +      } + +      delete pAlign; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    fclose(fpOut); +    delete pAlignReader; +    delete pParseReader; +    delete pTxtSReader; +    delete pTxtTReader; +    delete[] pszLine; +  }  }; -struct SConstContTrainer{ -	SConstContTrainer(const char* pszFlattenedSynFname,        //source-side flattened parse tree file name -	                  const char* pszAlignFname,               //alignment filename -	                  const char* pszSourceFname,              //source file name -	                  const char* pszTargetFname,              //target file name -	                  const char* pszInstanceFname,            //training instance file name -	                  const char* pszModelPrefix,              //classifier model file name prefix -	                  int iClassifierType,                     //classifier type -	                  int iCutoff,                             //feature count threshold -	                  const char* pszOption                    //other classifier parameters (for svmlight) -	                 ) { -		fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); -		//fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, pszOption); -		fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); -	} -	~SConstContTrainer() { - -	} - -private: - -	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { -		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; -		if (iCutoff > 0) { -			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); -			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); -		} else { -			strcpy(pszNewInstanceFName, pszInstanceFname); -		} - -		/*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); -		   pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); -		   delete pZhangleMaxent;*/ - -		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); -		pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); -		delete pMaxent; - -		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { -			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); -			system(pszNewInstanceFName); -		} -		delete [] pszNewInstanceFName; -	} - - -	void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ -		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { -			STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; - -			while (pParent != NULL) { -				//if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { -				if (pParent->m_vecChildren.size() > 1) { -					//do constituent reordering for all children of pParent -					vecFocused.push_back(pParent); -				} -				if (pParent->m_iBrotherIndex != 0) break; -				pParent = pParent->m_ptParent; -			} -		} -	} - -	inline void fnGetOutcome(int iL1, int iR1, const SAlignment *pAlign, string& strOutcome) { -		strOutcome = pAlign->fnIsContinuous(iL1, iR1); -	} - -	inline string fnGetLengthType(int iLen) { -		if (iLen == 1) -			return string("1"); -		if (iLen == 2) -			return string("2"); -		if (iLen == 3) -			return string("3"); -		if (iLen < 6) -			return string("4"); -		if (iLen < 11) -			return string("6"); -		return string("11"); -	} - -	/* -	 * Source side (11 features): -	 * f1: the syntactic category -	 * f2: the syntactic category of its parent -	 * f3: the head word's pos -	 * f4: =1 if it's the head of its parent node -	 *     or -	 *     the head of its parent node -	 * f5: length type -	 */ -	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pCon1, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { - -		fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); - -		//generate features -		//f1 -		ostr << "f1=" << pCon1->m_pszTerm; -		//f2 -		ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; -		//f3 -		ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; -		//f4 -		if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { -			ostr << " f4=1"; -		} else { -			ostr << " f4=" << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]->m_pszTerm; -		} -		//f5 -		ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); -	} - -	void fnGenerateInstanceFile(const char* pszFlattenedSynFname,        //source-side flattened parse tree file name -                                const char* pszAlignFname,               //alignment filename -                                const char* pszSourceFname,              //source file name -                                const char* pszTargetFname,              //target file name -                                const char* pszInstanceFname             //training instance file name -                                ) { -		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); -		SParseReader *pParseReader = new SParseReader(pszFlattenedSynFname, true); -		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); -		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); - -		FILE *fpOut = fopen(pszInstanceFname, "w"); -		assert(fpOut != NULL); - -		//read sentence by sentence -		SAlignment *pAlign; -		SParsedTree *pTree; -		char *pszLine = new char[50001]; -		int iSentNum = 0; -		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { -			pTree = pParseReader->fnReadNextParseTree(); -			assert(pTree != NULL); -			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecSTerms; -			SplitOnWhitespace(string(pszLine), &vecSTerms); -			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); -			vector<string> vecTTerms; -			SplitOnWhitespace(string(pszLine), &vecTTerms); - -			vector<STreeItem*> vecFocused; -			fnGetFocusedParentNodes(pTree, vecFocused); - -			for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { - -				STreeItem *pParent = vecFocused[i]; - -				for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { -					//children[j-1] vs. children[j] reordering - -					string strOutcome; -					ostringstream ostr; - -					fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); - -					//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -					fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); -				} -			} - -			delete pAlign; -			delete pTree; -			iSentNum++; - -			if (iSentNum % 100000 == 0) -				fprintf(stderr, "#%d\n", iSentNum); -		} - - -		fclose(fpOut); -		delete pAlignReader; -		delete pParseReader; -		delete pTxtSReader; -		delete pTxtTReader; -		delete [] pszLine; -	} +struct SConstContTrainer { +  SConstContTrainer( +      const char* pszFlattenedSynFname,  // source-side flattened parse tree +                                         // file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname,  // training instance file name +      const char* pszModelPrefix,  // classifier model file name prefix +      int iClassifierType,  // classifier type +      int iCutoff,  // feature count threshold +      const char* pszOption  // other classifier parameters (for svmlight) +      ) { +    fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, +                           pszTargetFname, pszInstanceFname); +    // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, +    // pszOption); +    fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); +  } +  ~SConstContTrainer() {} + + private: +  void fnTraining(const char* pszInstanceFname, const char* pszModelFname, +                  int iCutoff) { +    char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +    if (iCutoff > 0) { +      sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +      fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +    } else { +      strcpy(pszNewInstanceFName, pszInstanceFname); +    } + +    /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +       pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, +       2.0); +       delete pZhangleMaxent;*/ + +    Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL); +    pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); +    delete pMaxent; + +    if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +      sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +      system(pszNewInstanceFName); +    } +    delete[] pszNewInstanceFName; +  } + +  void fnGetFocusedParentNodes(const SParsedTree* pTree, +                               vector<STreeItem*>& vecFocused) { +    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { +      STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent; + +      while (pParent != NULL) { +        // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - +        // pParent->m_iBegin > 5) { +        if (pParent->m_vecChildren.size() > 1) { +          // do constituent reordering for all children of pParent +          vecFocused.push_back(pParent); +        } +        if (pParent->m_iBrotherIndex != 0) break; +        pParent = pParent->m_ptParent; +      } +    } +  } + +  inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign, +                           string& strOutcome) { +    strOutcome = pAlign->fnIsContinuous(iL1, iR1); +  } + +  inline string fnGetLengthType(int iLen) { +    if (iLen == 1) return string("1"); +    if (iLen == 2) return string("2"); +    if (iLen == 3) return string("3"); +    if (iLen < 6) return string("4"); +    if (iLen < 11) return string("6"); +    return string("11"); +  } + +  /* +   * Source side (11 features): +   * f1: the syntactic category +   * f2: the syntactic category of its parent +   * f3: the head word's pos +   * f4: =1 if it's the head of its parent node +   *     or +   *     the head of its parent node +   * f5: length type +   */ +  void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1, +                          const SAlignment* pAlign, +                          const vector<string>& vecSTerms, +                          const vector<string>& vecTTerms, string& strOutcome, +                          ostringstream& ostr) { + +    fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); + +    // generate features +    // f1 +    ostr << "f1=" << pCon1->m_pszTerm; +    // f2 +    ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; +    // f3 +    ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord] +                          ->m_ptParent->m_pszTerm; +    // f4 +    if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { +      ostr << " f4=1"; +    } else { +      ostr << " f4=" +           << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild] +                  ->m_pszTerm; +    } +    // f5 +    ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); +  } + +  void fnGenerateInstanceFile( +      const char* pszFlattenedSynFname,  // source-side flattened parse tree +                                         // file name +      const char* pszAlignFname,  // alignment filename +      const char* pszSourceFname,  // source file name +      const char* pszTargetFname,  // target file name +      const char* pszInstanceFname  // training instance file name +      ) { +    SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); +    SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true); +    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); +    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + +    FILE* fpOut = fopen(pszInstanceFname, "w"); +    assert(fpOut != NULL); + +    // read sentence by sentence +    SAlignment* pAlign; +    SParsedTree* pTree; +    char* pszLine = new char[50001]; +    int iSentNum = 0; +    while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +      pTree = pParseReader->fnReadNextParseTree(); +      assert(pTree != NULL); +      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecSTerms; +      SplitOnWhitespace(string(pszLine), &vecSTerms); +      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      vector<string> vecTTerms; +      SplitOnWhitespace(string(pszLine), &vecTTerms); + +      vector<STreeItem*> vecFocused; +      fnGetFocusedParentNodes(pTree, vecFocused); + +      for (size_t i = 0; +           i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + +        STreeItem* pParent = vecFocused[i]; + +        for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +          // children[j-1] vs. children[j] reordering + +          string strOutcome; +          ostringstream ostr; + +          fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, +                             vecSTerms, vecTTerms, strOutcome, ostr); + +          // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +          fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +        } +      } + +      delete pAlign; +      delete pTree; +      iSentNum++; + +      if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); +    } + +    fclose(fpOut); +    delete pAlignReader; +    delete pParseReader; +    delete pTxtSReader; +    delete pTxtTReader; +    delete[] pszLine; +  }  }; -inline void print_options(std::ostream &out,po::options_description const& opts) { -  typedef std::vector< boost::shared_ptr<po::option_description> > Ds; -  Ds const& ds=opts.options(); +inline void print_options(std::ostream& out, +                          po::options_description const& opts) { +  typedef std::vector<boost::shared_ptr<po::option_description> > Ds; +  Ds const& ds = opts.options();    out << '"'; -  for (unsigned i=0;i<ds.size();++i) { -    if (i) out<<' '; -    out<<"--"<<ds[i]->long_name(); +  for (unsigned i = 0; i < ds.size(); ++i) { +    if (i) out << ' '; +    out << "--" << ds[i]->long_name();    }    out << '\n';  } -inline string str(char const* name,po::variables_map const& conf) { +inline string str(char const* name, po::variables_map const& conf) {    return conf[name].as<string>();  } -//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file /scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file /scratch0/mt_exp/gq-ctb/data/train.cn --target_file /scratch0/mt_exp/gq-ctb/data/train.en --instance_file /scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix /scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 --classifier_type 1 +//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file +///scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file +///scratch0/mt_exp/gq-ctb/data/train.cn --target_file +///scratch0/mt_exp/gq-ctb/data/train.en --instance_file +///scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix +///scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 +//--classifier_type 1  int main(int argc, char** argv) { -	po::options_description opts("Configuration options"); -	opts.add_options() -                ("parse_file",po::value<string>(),"parse file path (input)") -                ("align_file",po::value<string>(),"Alignment file path (input)") -                ("source_file",po::value<string>(),"Source text file path (input)") -                ("target_file",po::value<string>(),"Target text file path (input)") -                ("instance_file",po::value<string>(),"Instance file path (output)") -                ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated") -                ("classifier_type",po::value<int>()->default_value(1),"Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for SVMLight") -                ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold") -                ("svm_option",po::value<string>(),"Parameters for SVMLight classifier") -                ("help", "produce help message"); - -	po::variables_map vm; -	if (argc) { -		po::store(po::parse_command_line(argc, argv, opts), vm); -		po::notify(vm); -	} - -	if (vm.count("help")) { -		print_options(cout, opts); -		return 1; -	} - -	if (!vm.count("parse_file") -			|| !vm.count("align_file") -			|| !vm.count("source_file") -			|| !vm.count("target_file") -			|| !vm.count("instance_file") -			|| !vm.count("model_prefix") -			) { -		print_options(cout, opts); -		if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; -		if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; -		if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; -		if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; -		if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; -		if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; -		exit(0); -	} - -	const char *pOption; -	if (vm.count("svm_option")) -		pOption = str("svm_option", vm).c_str(); -	else -		pOption = NULL; - -	SConstReorderTrainer *pTrainer = new SConstReorderTrainer(str("parse_file", vm).c_str(), -			                                                  str("align_file", vm).c_str(), -			                                                  str("source_file", vm).c_str(), -			                                                  str("target_file", vm).c_str(), -			                                                  str("instance_file", vm).c_str(), -			                                                  str("model_prefix", vm).c_str(), -			                                                  vm["classifier_type"].as<int>(), -			                                                  vm["feature_cutoff"].as<int>(), -			                                                  pOption); -	delete pTrainer; - -	return 1; +  po::options_description opts("Configuration options"); +  opts.add_options()("parse_file", po::value<string>(), +                     "parse file path (input)")( +      "align_file", po::value<string>(), "Alignment file path (input)")( +      "source_file", po::value<string>(), "Source text file path (input)")( +      "target_file", po::value<string>(), "Target text file path (input)")( +      "instance_file", po::value<string>(), "Instance file path (output)")( +      "model_prefix", po::value<string>(), +      "Model file path prefix (output): three files will be generated")( +      "classifier_type", po::value<int>()->default_value(1), +      "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for " +      "SVMLight")("feature_cutoff", po::value<int>()->default_value(100), +                  "Feature cutoff threshold")( +      "svm_option", po::value<string>(), "Parameters for SVMLight classifier")( +      "help", "produce help message"); + +  po::variables_map vm; +  if (argc) { +    po::store(po::parse_command_line(argc, argv, opts), vm); +    po::notify(vm); +  } + +  if (vm.count("help")) { +    print_options(cout, opts); +    return 1; +  } + +  if (!vm.count("parse_file") || !vm.count("align_file") || +      !vm.count("source_file") || !vm.count("target_file") || +      !vm.count("instance_file") || !vm.count("model_prefix")) { +    print_options(cout, opts); +    if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; +    if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; +    if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; +    if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; +    if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; +    if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; +    exit(0); +  } + +  const char* pOption; +  if (vm.count("svm_option")) +    pOption = str("svm_option", vm).c_str(); +  else +    pOption = NULL; + +  SConstReorderTrainer* pTrainer = new SConstReorderTrainer( +      str("parse_file", vm).c_str(), str("align_file", vm).c_str(), +      str("source_file", vm).c_str(), str("target_file", vm).c_str(), +      str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(), +      vm["classifier_type"].as<int>(), vm["feature_cutoff"].as<int>(), pOption); +  delete pTrainer; +  return 1;  } diff --git a/utils/synutils/maxent-3.0/lbfgs.cpp b/utils/synutils/maxent-3.0/lbfgs.cpp index 9eb04bef..bd26f048 100644 --- a/utils/synutils/maxent-3.0/lbfgs.cpp +++ b/utils/synutils/maxent-3.0/lbfgs.cpp @@ -8,20 +8,17 @@  using namespace std; -const static int    M = LBFGS_M; +const static int M = LBFGS_M;  const static double LINE_SEARCH_ALPHA = 0.1; -const static double LINE_SEARCH_BETA  = 0.5; +const static double LINE_SEARCH_BETA = 0.5;  // stopping criteria -int    LBFGS_MAX_ITER      = 300; +int LBFGS_MAX_ITER = 300;  const static double MIN_GRAD_NORM = 0.0001; - -double  -ME_Model::backtracking_line_search( -			 const Vec & x0, const Vec & grad0, const double f0,  -			 const Vec & dx, Vec & x, Vec & grad1) -{ +double ME_Model::backtracking_line_search(const Vec& x0, const Vec& grad0, +                                          const double f0, const Vec& dx, +                                          Vec& x, Vec& grad1) {    double t = 1.0 / LINE_SEARCH_BETA;    double f; @@ -39,20 +36,23 @@ ME_Model::backtracking_line_search(  // Jorge Nocedal, "Updating Quasi-Newton Matrices With Limited Storage",  // Mathematics of Computation, Vol. 35, No. 151, pp. 773-782, 1980.  // -Vec  -approximate_Hg(const int iter, const Vec & grad, -	       const Vec s[],  const Vec y[], const double z[]) -{ +Vec approximate_Hg(const int iter, const Vec& grad, const Vec s[], +                   const Vec y[], const double z[]) {    int offset, bound; -  if (iter <= M) { offset = 0;        bound = iter; } -  else           { offset = iter - M; bound = M;    } +  if (iter <= M) { +    offset = 0; +    bound = iter; +  } else { +    offset = iter - M; +    bound = M; +  }    Vec q = grad;    double alpha[M], beta[M];    for (int i = bound - 1; i >= 0; i--) {      const int j = (i + offset) % M; -    alpha[i]    = z[j]   * dot_product(s[j], q); -    q          += -alpha[i] * y[j]; +    alpha[i] = z[j] * dot_product(s[j], q); +    q += -alpha[i] * y[j];    }    if (iter > 0) {      const int j = (iter - 1) % M; @@ -63,16 +63,14 @@ approximate_Hg(const int iter, const Vec & grad,    }    for (int i = 0; i <= bound - 1; i++) {      const int j = (i + offset) % M; -    beta[i]     = z[j] * dot_product(y[j], q); -    q          += s[j] * (alpha[i] - beta[i]); +    beta[i] = z[j] * dot_product(y[j], q); +    q += s[j] * (alpha[i] - beta[i]);    }    return q;  } -vector<double>  -ME_Model::perform_LBFGS(const vector<double> & x0) -{ +vector<double> ME_Model::perform_LBFGS(const vector<double>& x0) {    const size_t dim = x0.size();    Vec x = x0; @@ -84,10 +82,11 @@ ME_Model::perform_LBFGS(const vector<double> & x0)    for (int iter = 0; iter < LBFGS_MAX_ITER; iter++) { -    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter+1, -f, _train_error); +    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, -f, _train_error);      if (_nheldout > 0) {        const double heldout_logl = heldout_likelihood(); -      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, _heldout_error); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, +              _heldout_error);      }      fprintf(stderr, "\n"); @@ -107,4 +106,3 @@ ME_Model::perform_LBFGS(const vector<double> & x0)    return x.STLVec();  } - diff --git a/utils/synutils/maxent-3.0/lbfgs.h b/utils/synutils/maxent-3.0/lbfgs.h index 113919a3..ed5cd944 100644 --- a/utils/synutils/maxent-3.0/lbfgs.h +++ b/utils/synutils/maxent-3.0/lbfgs.h @@ -1,21 +1,19 @@  #ifndef _LBFGS_H_  #define _LBFGS_H_ -//template<class FuncGrad> -//std::vector<double>  -//perform_LBFGS(FuncGrad func_grad, const std::vector<double> & x0); +// template<class FuncGrad> +// std::vector<double> +// perform_LBFGS(FuncGrad func_grad, const std::vector<double> & x0); -std::vector<double>  -perform_LBFGS(double (*func_grad)(const std::vector<double> &, std::vector<double> &),  -	      const std::vector<double> & x0); +std::vector<double> perform_LBFGS( +    double (*func_grad)(const std::vector<double> &, std::vector<double> &), +    const std::vector<double> &x0); +std::vector<double> perform_OWLQN( +    double (*func_grad)(const std::vector<double> &, std::vector<double> &), +    const std::vector<double> &x0, const double C); -std::vector<double>  -perform_OWLQN(double (*func_grad)(const std::vector<double> &, std::vector<double> &),  -	      const std::vector<double> & x0, -	      const double C); - -//const int    LBFGS_M = 7; -const int    LBFGS_M = 10; +// const int    LBFGS_M = 7; +const int LBFGS_M = 10;  #endif diff --git a/utils/synutils/maxent-3.0/mathvec.h b/utils/synutils/maxent-3.0/mathvec.h index 4ec82797..f8c60e5d 100644 --- a/utils/synutils/maxent-3.0/mathvec.h +++ b/utils/synutils/maxent-3.0/mathvec.h @@ -5,41 +5,40 @@  #include <iostream>  #include <cassert> -class Vec -{ -private: +class Vec { + private:    std::vector<double> _v; -public: + + public:    Vec(const size_t n = 0, const double val = 0) { _v.resize(n, val); } -  Vec(const std::vector<double> & v) : _v(v)    {} -  const std::vector<double> & STLVec() const { return _v; } -  std::vector<double>       & STLVec()       { return _v; } +  Vec(const std::vector<double>& v) : _v(v) {} +  const std::vector<double>& STLVec() const { return _v; } +  std::vector<double>& STLVec() { return _v; }    size_t Size() const { return _v.size(); } -  double       & operator[](int i)       { return _v[i]; } -  const double & operator[](int i) const { return _v[i]; } -  Vec & operator+=(const Vec & b) { +  double& operator[](int i) { return _v[i]; } +  const double& operator[](int i) const { return _v[i]; } +  Vec& operator+=(const Vec& b) {      assert(b.Size() == _v.size());      for (size_t i = 0; i < _v.size(); i++) {        _v[i] += b[i];      }      return *this;    } -  Vec & operator*=(const double c) { +  Vec& operator*=(const double c) {      for (size_t i = 0; i < _v.size(); i++) {        _v[i] *= c;      }      return *this;    } -  void Project(const Vec & y) { +  void Project(const Vec& y) {      for (size_t i = 0; i < _v.size(); i++) {        //      if (sign(_v[i]) != sign(y[i])) _v[i] = 0; -      if (_v[i] * y[i] <=0) _v[i] = 0; +      if (_v[i] * y[i] <= 0) _v[i] = 0;      }    }  }; -inline double dot_product(const Vec & a, const Vec & b) -{ +inline double dot_product(const Vec& a, const Vec& b) {    double sum = 0;    for (size_t i = 0; i < a.Size(); i++) {      sum += a[i] * b[i]; @@ -47,8 +46,7 @@ inline double dot_product(const Vec & a, const Vec & b)    return sum;  } -inline std::ostream & operator<<(std::ostream & s, const Vec & a) -{ +inline std::ostream& operator<<(std::ostream& s, const Vec& a) {    s << "(";    for (size_t i = 0; i < a.Size(); i++) {      if (i != 0) s << ", "; @@ -58,8 +56,7 @@ inline std::ostream & operator<<(std::ostream & s, const Vec & a)    return s;  } -inline const Vec operator+(const Vec & a, const Vec & b) -{ +inline const Vec operator+(const Vec& a, const Vec& b) {    Vec v(a.Size());    assert(a.Size() == b.Size());    for (size_t i = 0; i < a.Size(); i++) { @@ -68,8 +65,7 @@ inline const Vec operator+(const Vec & a, const Vec & b)    return v;  } -inline const Vec operator-(const Vec & a, const Vec & b) -{ +inline const Vec operator-(const Vec& a, const Vec& b) {    Vec v(a.Size());    assert(a.Size() == b.Size());    for (size_t i = 0; i < a.Size(); i++) { @@ -78,8 +74,7 @@ inline const Vec operator-(const Vec & a, const Vec & b)    return v;  } -inline const Vec operator*(const Vec & a, const double c) -{ +inline const Vec operator*(const Vec& a, const double c) {    Vec v(a.Size());    for (size_t i = 0; i < a.Size(); i++) {      v[i] = a[i] * c; @@ -87,11 +82,6 @@ inline const Vec operator*(const Vec & a, const double c)    return v;  } -inline const Vec operator*(const double c, const Vec & a) -{ -  return a * c; -} - - +inline const Vec operator*(const double c, const Vec& a) { return a * c; }  #endif diff --git a/utils/synutils/maxent-3.0/maxent.cpp b/utils/synutils/maxent-3.0/maxent.cpp index feb0efdc..8d00ac1d 100644 --- a/utils/synutils/maxent-3.0/maxent.cpp +++ b/utils/synutils/maxent-3.0/maxent.cpp @@ -9,14 +9,13 @@  using namespace std; -double -ME_Model::FunctionGradient(const vector<double> & x, vector<double> & grad) -{ +double ME_Model::FunctionGradient(const vector<double>& x, +                                  vector<double>& grad) {    assert((int)_fb.Size() == x.size());    for (size_t i = 0; i < x.size(); i++) {      _vl[i] = x[i];    } -   +    double score = update_model_expectation();    if (_l2reg == 0) { @@ -33,9 +32,7 @@ ME_Model::FunctionGradient(const vector<double> & x, vector<double> & grad)    return -score;  } -int -ME_Model::perform_GIS(int C) -{ +int ME_Model::perform_GIS(int C) {    cerr << "C = " << C << endl;    C = 1;    cerr << "performing AGIS" << endl; @@ -43,11 +40,13 @@ ME_Model::perform_GIS(int C)    double pre_logl = -999999;    for (int iter = 0; iter < 200; iter++) { -    double logl =  update_model_expectation(); -    fprintf(stderr, "iter = %2d  C = %d  f = %10.7f  train_err = %7.5f", iter, C, logl, _train_error); +    double logl = update_model_expectation(); +    fprintf(stderr, "iter = %2d  C = %d  f = %10.7f  train_err = %7.5f", iter, +            C, logl, _train_error);      if (_heldout.size() > 0) {        double hlogl = heldout_likelihood(); -      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", hlogl, _heldout_error); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", hlogl, +              _heldout_error);      }      cerr << endl; @@ -71,13 +70,13 @@ ME_Model::perform_GIS(int C)    return 0;  } -int -ME_Model::perform_QUASI_NEWTON() -{ +int ME_Model::perform_QUASI_NEWTON() {    const int dim = _fb.Size();    vector<double> x0(dim); -  for (int i = 0; i < dim; i++) { x0[i] = _vl[i]; } +  for (int i = 0; i < dim; i++) { +    x0[i] = _vl[i]; +  }    vector<double> x;    if (_l1reg > 0) { @@ -88,34 +87,39 @@ ME_Model::perform_QUASI_NEWTON()      x = perform_LBFGS(x0);    } -  for (int i = 0; i < dim; i++) { _vl[i] = x[i]; } +  for (int i = 0; i < dim; i++) { +    _vl[i] = x[i]; +  }    return 0;  } -int -ME_Model::conditional_probability(const Sample & s, -                                  std::vector<double> & membp) const -{ -  //int num_classes = membp.size(); +int ME_Model::conditional_probability(const Sample& s, +                                      std::vector<double>& membp) const { +  // int num_classes = membp.size();    double sum = 0;    int max_label = -1;    //  double maxp = 0;    vector<double> powv(_num_classes, 0.0); -  for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ -    for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { +  for (vector<int>::const_iterator j = s.positive_features.begin(); +       j != s.positive_features.end(); j++) { +    for (vector<int>::const_iterator k = _feature2mef[*j].begin(); +         k != _feature2mef[*j].end(); k++) {        powv[_fb.Feature(*k).label()] += _vl[*k];      }    } -  for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin(); j != s.rvfeatures.end(); j++) { -    for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { +  for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin(); +       j != s.rvfeatures.end(); j++) { +    for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); +         k != _feature2mef[j->first].end(); k++) {        powv[_fb.Feature(*k).label()] += _vl[*k] * j->second;      }    } -  std::vector<double>::const_iterator pmax = max_element(powv.begin(), powv.end()); -  double offset = max(0.0, *pmax - 700); // to avoid overflow +  std::vector<double>::const_iterator pmax = +      max_element(powv.begin(), powv.end()); +  double offset = max(0.0, *pmax - 700);  // to avoid overflow    for (int label = 0; label < _num_classes; label++) {      double pow = powv[label] - offset;      double prod = exp(pow); @@ -134,33 +138,38 @@ ME_Model::conditional_probability(const Sample & s,    return max_label;  } -int -ME_Model::make_feature_bag(const int cutoff) -{ +int ME_Model::make_feature_bag(const int cutoff) {    int max_num_features = 0; -  // count the occurrences of features +// count the occurrences of features  #ifdef USE_HASH_MAP    typedef __gnu_cxx::hash_map<unsigned int, int> map_type; -#else     +#else    typedef std::map<unsigned int, int> map_type;  #endif    map_type count;    if (cutoff > 0) { -    for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++) { -      for (std::vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++) { +    for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); +         i++) { +      for (std::vector<int>::const_iterator j = i->positive_features.begin(); +           j != i->positive_features.end(); j++) {          count[ME_Feature(i->label, *j).body()]++;        } -      for (std::vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { +      for (std::vector<pair<int, double> >::const_iterator j = +               i->rvfeatures.begin(); +           j != i->rvfeatures.end(); j++) {          count[ME_Feature(i->label, j->first).body()]++;        }      }    } -  int n = 0;  -  for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++, n++) { -    max_num_features = max(max_num_features, (int)(i->positive_features.size())); -    for (std::vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++) { +  int n = 0; +  for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); +       i++, n++) { +    max_num_features = +        max(max_num_features, (int)(i->positive_features.size())); +    for (std::vector<int>::const_iterator j = i->positive_features.begin(); +         j != i->positive_features.end(); j++) {        const ME_Feature feature(i->label, *j);        //      if (cutoff > 0 && count[feature.body()] < cutoff) continue;        if (cutoff > 0 && count[feature.body()] <= cutoff) continue; @@ -168,7 +177,9 @@ ME_Model::make_feature_bag(const int cutoff)        //      cout << i->label << "\t" << *j << "\t" << id << endl;        //      feature2sample[id].push_back(n);      } -    for (std::vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { +    for (std::vector<pair<int, double> >::const_iterator j = +             i->rvfeatures.begin(); +         j != i->rvfeatures.end(); j++) {        const ME_Feature feature(i->label, j->first);        //      if (cutoff > 0 && count[feature.body()] < cutoff) continue;        if (cutoff > 0 && count[feature.body()] <= cutoff) continue; @@ -176,71 +187,72 @@ ME_Model::make_feature_bag(const int cutoff)      }    }    count.clear(); -   +    //  cerr << "num_classes = " << _num_classes << endl;    //  cerr << "max_num_features = " << max_num_features << endl;    init_feature2mef(); -   +    return max_num_features;  } -double -ME_Model::heldout_likelihood() -{ +double ME_Model::heldout_likelihood() {    double logl = 0;    int ncorrect = 0; -  for (std::vector<Sample>::const_iterator i = _heldout.begin(); i != _heldout.end(); i++) { +  for (std::vector<Sample>::const_iterator i = _heldout.begin(); +       i != _heldout.end(); i++) {      vector<double> membp(_num_classes);      int l = classify(*i, membp);      logl += log(membp[i->label]);      if (l == i->label) ncorrect++;    }    _heldout_error = 1 - (double)ncorrect / _heldout.size(); -   +    return logl /= _heldout.size();  } -double -ME_Model::update_model_expectation() -{ +double ME_Model::update_model_expectation() {    double logl = 0;    int ncorrect = 0;    _vme.resize(_fb.Size());    for (int i = 0; i < _fb.Size(); i++) _vme[i] = 0; -   +    int n = 0; -  for (vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++, n++) { +  for (vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); +       i++, n++) {      vector<double> membp(_num_classes);      int max_label = conditional_probability(*i, membp); -     +      logl += log(membp[i->label]);      //    cout << membp[*i] << " " << logl << " ";      if (max_label == i->label) ncorrect++;      // model_expectation -    for (vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++){ -      for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { -	_vme[*k] += membp[_fb.Feature(*k).label()]; +    for (vector<int>::const_iterator j = i->positive_features.begin(); +         j != i->positive_features.end(); j++) { +      for (vector<int>::const_iterator k = _feature2mef[*j].begin(); +           k != _feature2mef[*j].end(); k++) { +        _vme[*k] += membp[_fb.Feature(*k).label()];        }      } -    for (vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { -      for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { -	_vme[*k] += membp[_fb.Feature(*k).label()] * j->second; +    for (vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); +         j != i->rvfeatures.end(); j++) { +      for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); +           k != _feature2mef[j->first].end(); k++) { +        _vme[*k] += membp[_fb.Feature(*k).label()] * j->second;        }      } -        }    for (int i = 0; i < _fb.Size(); i++) {      _vme[i] /= _vs.size();    } -   +    _train_error = 1 - (double)ncorrect / _vs.size();    logl /= _vs.size(); -   +    if (_l2reg > 0) {      const double c = _l2reg;      for (int i = 0; i < _fb.Size(); i++) { @@ -248,17 +260,17 @@ ME_Model::update_model_expectation()      }    } -  //logl /= _vs.size(); -   -  //  fprintf(stderr, "iter =%3d  logl = %10.7f  train_acc = %7.5f\n", iter, logl, (double)ncorrect/train.size()); -  //  fprintf(stderr, "logl = %10.7f  train_acc = %7.5f\n", logl, (double)ncorrect/_train.size()); +  // logl /= _vs.size(); + +  //  fprintf(stderr, "iter =%3d  logl = %10.7f  train_acc = %7.5f\n", iter, +  // logl, (double)ncorrect/train.size()); +  //  fprintf(stderr, "logl = %10.7f  train_acc = %7.5f\n", logl, +  // (double)ncorrect/_train.size());    return logl;  } -int -ME_Model::train(const vector<ME_Sample> & vms) -{ +int ME_Model::train(const vector<ME_Sample>& vms) {    _vs.clear();    for (vector<ME_Sample>::const_iterator i = vms.begin(); i != vms.end(); i++) {      add_training_sample(*i); @@ -267,39 +279,41 @@ ME_Model::train(const vector<ME_Sample> & vms)    return train();  } -void -ME_Model::add_training_sample(const ME_Sample & mes) -{ +void ME_Model::add_training_sample(const ME_Sample& mes) {    Sample s;    s.label = _label_bag.Put(mes.label);    if (s.label > ME_Feature::MAX_LABEL_TYPES) {      cerr << "error: too many types of labels." << endl;      exit(1);    } -  for (vector<string>::const_iterator j = mes.features.begin(); j != mes.features.end(); j++) { +  for (vector<string>::const_iterator j = mes.features.begin(); +       j != mes.features.end(); j++) {      s.positive_features.push_back(_featurename_bag.Put(*j));    } -  for (vector<pair<string, double> >::const_iterator j = mes.rvfeatures.begin(); j != mes.rvfeatures.end(); j++) { -    s.rvfeatures.push_back(pair<int, double>(_featurename_bag.Put(j->first), j->second)); +  for (vector<pair<string, double> >::const_iterator j = mes.rvfeatures.begin(); +       j != mes.rvfeatures.end(); j++) { +    s.rvfeatures.push_back( +        pair<int, double>(_featurename_bag.Put(j->first), j->second));    }    if (_ref_modelp != NULL) { -    ME_Sample tmp = mes;; +    ME_Sample tmp = mes; +    ;      s.ref_pd = _ref_modelp->classify(tmp);    }    //  cout << s.label << "\t"; -  //  for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ +  //  for (vector<int>::const_iterator j = s.positive_features.begin(); j != +  // s.positive_features.end(); j++){    //    cout << *j << " ";    //  }    //  cout << endl; -   +    _vs.push_back(s);  } -int -ME_Model::train() -{ +int ME_Model::train() {    if (_l1reg > 0 && _l2reg > 0) { -    cerr << "error: L1 and L2 regularizers cannot be used simultaneously." << endl; +    cerr << "error: L1 and L2 regularizers cannot be used simultaneously." +         << endl;      return 0;    }    if (_vs.size() == 0) { @@ -307,20 +321,22 @@ ME_Model::train()      return 0;    }    if (_nheldout >= (int)_vs.size()) { -    cerr << "error: too much heldout data. no training data is available." << endl; +    cerr << "error: too much heldout data. no training data is available." +         << endl;      return 0;    }    //  if (_nheldout > 0) random_shuffle(_vs.begin(), _vs.end());    int max_label = 0; -  for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++) { +  for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); +       i++) {      max_label = max(max_label, i->label);    }    _num_classes = max_label + 1;    if (_num_classes != _label_bag.Size()) {      cerr << "warning: _num_class != _label_bag.Size()" << endl;    } -   +    if (_ref_modelp != NULL) {      cerr << "setting reference distribution...";      for (int i = 0; i < _ref_modelp->num_classes(); i++) { @@ -332,7 +348,7 @@ ME_Model::train()      }      cerr << "done" << endl;    } -   +    for (int i = 0; i < _nheldout; i++) {      _heldout.push_back(_vs.back());      _vs.pop_back(); @@ -362,25 +378,28 @@ ME_Model::train()      _vee[i] = 0;    }    for (int n = 0; n < (int)_vs.size(); n++) { -    const Sample * i = &_vs[n]; -    for (vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++){ -      for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { -	if (_fb.Feature(*k).label() == i->label) _vee[*k] += 1.0; +    const Sample* i = &_vs[n]; +    for (vector<int>::const_iterator j = i->positive_features.begin(); +         j != i->positive_features.end(); j++) { +      for (vector<int>::const_iterator k = _feature2mef[*j].begin(); +           k != _feature2mef[*j].end(); k++) { +        if (_fb.Feature(*k).label() == i->label) _vee[*k] += 1.0;        }      } -    for (vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { -      for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { -	if (_fb.Feature(*k).label() == i->label) _vee[*k] += j->second; +    for (vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); +         j != i->rvfeatures.end(); j++) { +      for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); +           k != _feature2mef[j->first].end(); k++) { +        if (_fb.Feature(*k).label() == i->label) _vee[*k] += j->second;        }      } -    }    for (int i = 0; i < _fb.Size(); i++) {      _vee[i] /= _vs.size();    }    cerr << "done" << endl; -   +    _vl.resize(_fb.Size());    for (int i = 0; i < _fb.Size(); i++) _vl[i] = 0.0; @@ -399,13 +418,12 @@ ME_Model::train()    return 0;  } -void -ME_Model::get_features(list< pair< pair<string, string>, double> > & fl) -{ +void ME_Model::get_features(list<pair<pair<string, string>, double> >& fl) {    fl.clear();    //  for (int i = 0; i < _fb.Size(); i++) {    //    ME_Feature f = _fb.Feature(i); -  //    fl.push_back( make_pair(make_pair(_label_bag.Str(f.label()), _featurename_bag.Str(f.feature())), _vl[i])); +  //    fl.push_back( make_pair(make_pair(_label_bag.Str(f.label()), +  // _featurename_bag.Str(f.feature())), _vl[i]));    //  }    for (MiniStringBag::map_type::const_iterator i = _featurename_bag.begin();         i != _featurename_bag.end(); i++) { @@ -414,14 +432,12 @@ ME_Model::get_features(list< pair< pair<string, string>, double> > & fl)        string history = i->first;        int id = _fb.Id(ME_Feature(j, i->second));        if (id < 0) continue; -      fl.push_back( make_pair(make_pair(label, history), _vl[id]) ); +      fl.push_back(make_pair(make_pair(label, history), _vl[id]));      }    }  } -void -ME_Model::clear() -{ +void ME_Model::clear() {    _vl.clear();    _label_bag.Clear();    _featurename_bag.Clear(); @@ -433,10 +449,8 @@ ME_Model::clear()    _heldout.clear();  } -bool -ME_Model::load_from_file(const string & filename) -{ -  FILE * fp = fopen(filename.c_str(), "r"); +bool ME_Model::load_from_file(const string& filename) { +  FILE* fp = fopen(filename.c_str(), "r");    if (!fp) {      cerr << "error: cannot open " << filename << "!" << endl;      return false; @@ -447,22 +461,22 @@ ME_Model::load_from_file(const string & filename)    _featurename_bag.Clear();    _fb.Clear();    char buf[1024]; -  while(fgets(buf, 1024, fp)) { +  while (fgets(buf, 1024, fp)) {      string line(buf);      string::size_type t1 = line.find_first_of('\t');      string::size_type t2 = line.find_last_of('\t');      string classname = line.substr(0, t1); -    string featurename = line.substr(t1 + 1, t2 - (t1 + 1) ); +    string featurename = line.substr(t1 + 1, t2 - (t1 + 1));      float lambda; -    string w = line.substr(t2+1); +    string w = line.substr(t2 + 1);      sscanf(w.c_str(), "%f", &lambda); -       +      int label = _label_bag.Put(classname);      int feature = _featurename_bag.Put(featurename);      _fb.Put(ME_Feature(label, feature));      _vl.push_back(lambda);    } -     +    _num_classes = _label_bag.Size();    init_feature2mef(); @@ -472,9 +486,7 @@ ME_Model::load_from_file(const string & filename)    return true;  } -void -ME_Model::init_feature2mef() -{ +void ME_Model::init_feature2mef() {    _feature2mef.clear();    for (int i = 0; i < _featurename_bag.Size(); i++) {      vector<int> vi; @@ -486,9 +498,7 @@ ME_Model::init_feature2mef()    }  } -bool -ME_Model::load_from_array(const ME_Model_Data data[]) -{ +bool ME_Model::load_from_array(const ME_Model_Data data[]) {    _vl.clear();    for (int i = 0;; i++) {      if (string(data[i].label) == "///") break; @@ -500,14 +510,12 @@ ME_Model::load_from_array(const ME_Model_Data data[])    _num_classes = _label_bag.Size();    init_feature2mef(); -   +    return true;  } -bool -ME_Model::save_to_file(const string & filename, const double th) const -{ -  FILE * fp = fopen(filename.c_str(), "w"); +bool ME_Model::save_to_file(const string& filename, const double th) const { +  FILE* fp = fopen(filename.c_str(), "w");    if (!fp) {      cerr << "error: cannot open " << filename << "!" << endl;      return false; @@ -516,7 +524,8 @@ ME_Model::save_to_file(const string & filename, const double th) const    //  for (int i = 0; i < _fb.Size(); i++) {    //    if (_vl[i] == 0) continue; // ignore zero-weight features    //    ME_Feature f = _fb.Feature(i); -  //    fprintf(fp, "%s\t%s\t%f\n", _label_bag.Str(f.label()).c_str(), _featurename_bag.Str(f.feature()).c_str(), _vl[i]); +  //    fprintf(fp, "%s\t%s\t%f\n", _label_bag.Str(f.label()).c_str(), +  // _featurename_bag.Str(f.feature()).c_str(), _vl[i]);    //  }    for (MiniStringBag::map_type::const_iterator i = _featurename_bag.begin();         i != _featurename_bag.end(); i++) { @@ -525,8 +534,8 @@ ME_Model::save_to_file(const string & filename, const double th) const        string history = i->first;        int id = _fb.Id(ME_Feature(j, i->second));        if (id < 0) continue; -      if (_vl[id] == 0) continue; // ignore zero-weight features -      if (fabs(_vl[id]) < th) continue; // cut off low-weight features +      if (_vl[id] == 0) continue;        // ignore zero-weight features +      if (fabs(_vl[id]) < th) continue;  // cut off low-weight features        fprintf(fp, "%s\t%s\t%f\n", label.c_str(), history.c_str(), _vl[id]);      }    } @@ -536,9 +545,7 @@ ME_Model::save_to_file(const string & filename, const double th) const    return true;  } -void -ME_Model::set_ref_dist(Sample & s) const -{ +void ME_Model::set_ref_dist(Sample& s) const {    vector<double> v0 = s.ref_pd;    vector<double> v(_num_classes);    for (unsigned int i = 0; i < v.size(); i++) { @@ -548,14 +555,12 @@ ME_Model::set_ref_dist(Sample & s) const      if (id_ref != -1) {        v[i] = v0[id_ref];      } -    if (v[i] == 0) v[i] = 0.001; // to avoid -inf logl +    if (v[i] == 0) v[i] = 0.001;  // to avoid -inf logl    }    s.ref_pd = v;  } -   -int -ME_Model::classify(const Sample & nbs, vector<double> & membp) const -{ + +int ME_Model::classify(const Sample& nbs, vector<double>& membp) const {    //  vector<double> membp(_num_classes);    assert(_num_classes == (int)membp.size());    conditional_probability(nbs, membp); @@ -563,22 +568,24 @@ ME_Model::classify(const Sample & nbs, vector<double> & membp) const    double max = 0.0;    for (int i = 0; i < (int)membp.size(); i++) {      //    cout << membp[i] << " "; -    if (membp[i] > max) { max_label = i; max = membp[i]; } +    if (membp[i] > max) { +      max_label = i; +      max = membp[i]; +    }    }    //  cout << endl;    return max_label;  } -vector<double> -ME_Model::classify(ME_Sample & mes) const -{ +vector<double> ME_Model::classify(ME_Sample& mes) const {    Sample s; -  for (vector<string>::const_iterator j = mes.features.begin(); j != mes.features.end(); j++) { +  for (vector<string>::const_iterator j = mes.features.begin(); +       j != mes.features.end(); j++) {      int id = _featurename_bag.Id(*j); -    if (id >= 0) -      s.positive_features.push_back(id); +    if (id >= 0) s.positive_features.push_back(id);    } -  for (vector<pair<string, double> >::const_iterator j = mes.rvfeatures.begin(); j != mes.rvfeatures.end(); j++) { +  for (vector<pair<string, double> >::const_iterator j = mes.rvfeatures.begin(); +       j != mes.rvfeatures.end(); j++) {      int id = _featurename_bag.Id(j->first);      if (id >= 0) {        s.rvfeatures.push_back(pair<int, double>(id, j->second)); @@ -595,7 +602,6 @@ ME_Model::classify(ME_Sample & mes) const    return vp;  } -  /*   * $Log: maxent.cpp,v $   * Revision 1.1.1.1  2007/05/15 08:30:35  kyoshida @@ -695,4 +701,3 @@ ME_Model::classify(ME_Sample & mes) const   * remove some comments   *   */ - diff --git a/utils/synutils/maxent-3.0/maxent.h b/utils/synutils/maxent-3.0/maxent.h index a4391ead..88a04e25 100644 --- a/utils/synutils/maxent-3.0/maxent.h +++ b/utils/synutils/maxent-3.0/maxent.h @@ -15,7 +15,8 @@  #include <cassert>  #include "mathvec.h" -#define USE_HASH_MAP  // if you encounter errors with hash, try commenting out this line. (the program will be a bit slower, though) +#define USE_HASH_MAP  // if you encounter errors with hash, try commenting out +                      // this line. (the program will be a bit slower, though)  #ifdef USE_HASH_MAP  #include <ext/hash_map>  #endif @@ -23,68 +24,68 @@  //  // data format for each sample for training/testing  // -struct ME_Sample -{ -public: +struct ME_Sample { + public:    ME_Sample() : label("") {}; -  ME_Sample(const std::string & l) : label(l) {}; -  void set_label(const std::string & l) { label = l; } +  ME_Sample(const std::string& l) : label(l) {}; +  void set_label(const std::string& l) { label = l; }    // to add a binary feature -  void add_feature(const std::string & f) { -    features.push_back(f);    -  } +  void add_feature(const std::string& f) { features.push_back(f); }    // to add a real-valued feature -  void add_feature(const std::string & s, const double d) { -    rvfeatures.push_back(std::pair<std::string, double>(s, d));  +  void add_feature(const std::string& s, const double d) { +    rvfeatures.push_back(std::pair<std::string, double>(s, d));    } -public: + public:    std::string label;    std::vector<std::string> features;    std::vector<std::pair<std::string, double> > rvfeatures;    // obsolete -  void add_feature(const std::pair<std::string, double> & f) {   -    rvfeatures.push_back(f); // real-valued features +  void add_feature(const std::pair<std::string, double>& f) { +    rvfeatures.push_back(f);  // real-valued features    }  }; -  //  // for those who want to use load_from_array()  // -typedef struct ME_Model_Data -{ -  char * label; -  char * feature; +typedef struct ME_Model_Data { +  char* label; +  char* feature;    double weight;  } ME_Model_Data; - -class ME_Model -{ -public: - -  void add_training_sample(const ME_Sample & s); +class ME_Model { + public: +  void add_training_sample(const ME_Sample& s);    int train(); -  std::vector<double> classify(ME_Sample & s) const; -  bool load_from_file(const std::string & filename); -  bool save_to_file(const std::string & filename, const double th = 0) const; +  std::vector<double> classify(ME_Sample& s) const; +  bool load_from_file(const std::string& filename); +  bool save_to_file(const std::string& filename, const double th = 0) const;    int num_classes() const { return _num_classes; }    std::string get_class_label(int i) const { return _label_bag.Str(i); } -  int get_class_id(const std::string & s) const { return _label_bag.Id(s); } -  void get_features(std::list< std::pair< std::pair<std::string, std::string>, double> > & fl); -  void set_heldout(const int h, const int n = 0) { _nheldout = h; _early_stopping_n = n; }; +  int get_class_id(const std::string& s) const { return _label_bag.Id(s); } +  void get_features( +      std::list<std::pair<std::pair<std::string, std::string>, double> >& fl); +  void set_heldout(const int h, const int n = 0) { +    _nheldout = h; +    _early_stopping_n = n; +  };    void use_l1_regularizer(const double v) { _l1reg = v; }    void use_l2_regularizer(const double v) { _l2reg = v; }    void use_SGD(int iter = 30, double eta0 = 1, double alpha = 0.85) {      _optimization_method = SGD; -    SGD_ITER = iter; SGD_ETA0 = eta0; SGD_ALPHA = alpha; +    SGD_ITER = iter; +    SGD_ETA0 = eta0; +    SGD_ALPHA = alpha;    }    bool load_from_array(const ME_Model_Data data[]); -  void set_reference_model(const ME_Model & ref_model) { _ref_modelp = &ref_model; }; +  void set_reference_model(const ME_Model& ref_model) { +    _ref_modelp = &ref_model; +  };    void clear();    ME_Model() { @@ -95,13 +96,16 @@ public:      _optimization_method = LBFGS;    } -public: + public:    // obsolete. just for downward compatibility -  int train(const std::vector<ME_Sample> & train); - -private:   - -  enum OPTIMIZATION_METHOD { LBFGS, OWLQN, SGD } _optimization_method; +  int train(const std::vector<ME_Sample>& train); + + private: +  enum OPTIMIZATION_METHOD { +    LBFGS, +    OWLQN, +    SGD +  } _optimization_method;    // OWLQN and SGD are available only for L1-regularization    int SGD_ITER; @@ -109,13 +113,13 @@ private:    double SGD_ALPHA;    double _l1reg, _l2reg; -   +    struct Sample {      int label;      std::vector<int> positive_features;      std::vector<std::pair<int, double> > rvfeatures; -    std::vector<double> ref_pd; // reference probability distribution -    bool operator<(const Sample & x) const { +    std::vector<double> ref_pd;  // reference probability distribution +    bool operator<(const Sample& x) const {        for (unsigned int i = 0; i < positive_features.size(); i++) {          if (i >= x.positive_features.size()) return false;          int v0 = positive_features[i]; @@ -127,10 +131,11 @@ private:      }    }; -  struct ME_Feature -  { -    enum { MAX_LABEL_TYPES = 255 }; -       +  struct ME_Feature { +    enum { +      MAX_LABEL_TYPES = 255 +    }; +      //    ME_Feature(const int l, const int f) : _body((l << 24) + f) {      //      assert(l >= 0 && l < 256);      //      assert(f >= 0 && f <= 0xffffff); @@ -144,20 +149,20 @@ private:      int label() const { return _body & 0xff; }      int feature() const { return _body >> 8; }      unsigned int body() const { return _body; } -  private: + +   private:      unsigned int _body;    }; -  struct ME_FeatureBag -  { +  struct ME_FeatureBag {  #ifdef USE_HASH_MAP      typedef __gnu_cxx::hash_map<unsigned int, int> map_type; -#else     +#else      typedef std::map<unsigned int, int> map_type;  #endif      map_type mef2id;      std::vector<ME_Feature> id2mef; -    int Put(const ME_Feature & i) { +    int Put(const ME_Feature& i) {        map_type::const_iterator j = mef2id.find(i.body());        if (j == mef2id.end()) {          int id = id2mef.size(); @@ -167,7 +172,7 @@ private:        }        return j->second;      } -    int Id(const ME_Feature & i) const { +    int Id(const ME_Feature& i) const {        map_type::const_iterator j = mef2id.find(i.body());        if (j == mef2id.end()) {          return -1; @@ -178,17 +183,14 @@ private:        assert(id >= 0 && id < (int)id2mef.size());        return id2mef[id];      } -    int Size() const { -      return id2mef.size(); -    } +    int Size() const { return id2mef.size(); }      void Clear() {        mef2id.clear();        id2mef.clear();      }    }; -  struct hashfun_str -  { +  struct hashfun_str {      size_t operator()(const std::string& s) const {        assert(sizeof(int) == 4 && sizeof(char) == 1);        const int* p = reinterpret_cast<const int*>(s.c_str()); @@ -196,7 +198,7 @@ private:        int n = s.size() / 4;        for (int i = 0; i < n; i++, p++) {          //      v ^= *p; -        v ^= *p << (4 * (i % 2)); // note) 0 <= char < 128 +        v ^= *p << (4 * (i % 2));  // note) 0 <= char < 128        }        int m = s.size() % 4;        for (int i = 0; i < m; i++) { @@ -206,17 +208,16 @@ private:      }    }; -  struct MiniStringBag -  { +  struct MiniStringBag {  #ifdef USE_HASH_MAP      typedef __gnu_cxx::hash_map<std::string, int, hashfun_str> map_type; -#else     +#else      typedef std::map<std::string, int> map_type;  #endif      int _size;      map_type str2id;      MiniStringBag() : _size(0) {} -    int Put(const std::string & i) { +    int Put(const std::string& i) {        map_type::const_iterator j = str2id.find(i);        if (j == str2id.end()) {          int id = _size; @@ -226,21 +227,23 @@ private:        }        return j->second;      } -    int Id(const std::string & i) const { +    int Id(const std::string& i) const {        map_type::const_iterator j = str2id.find(i); -      if (j == str2id.end())  return -1; +      if (j == str2id.end()) return -1;        return j->second;      }      int Size() const { return _size; } -    void Clear() { str2id.clear(); _size = 0; } +    void Clear() { +      str2id.clear(); +      _size = 0; +    }      map_type::const_iterator begin() const { return str2id.begin(); } -    map_type::const_iterator end()   const { return str2id.end(); } +    map_type::const_iterator end() const { return str2id.end(); }    }; -  struct StringBag : public MiniStringBag -  { +  struct StringBag : public MiniStringBag {      std::vector<std::string> id2str; -    int Put(const std::string & i) { +    int Put(const std::string& i) {        map_type::const_iterator j = str2id.find(i);        if (j == str2id.end()) {          int id = id2str.size(); @@ -261,7 +264,7 @@ private:      }    }; -  std::vector<Sample> _vs; // vector of training_samples +  std::vector<Sample> _vs;  // vector of training_samples    StringBag _label_bag;    MiniStringBag _featurename_bag;    std::vector<double> _vl;  // vector of lambda @@ -269,42 +272,46 @@ private:    int _num_classes;    std::vector<double> _vee;  // empirical expectation    std::vector<double> _vme;  // empirical expectation -  std::vector< std::vector< int > > _feature2mef; -  std::vector< Sample > _heldout; -  double _train_error;   // current error rate on the training data -  double _heldout_error; // current error rate on the heldout data +  std::vector<std::vector<int> > _feature2mef; +  std::vector<Sample> _heldout; +  double _train_error;    // current error rate on the training data +  double _heldout_error;  // current error rate on the heldout data    int _nheldout;    int _early_stopping_n;    std::vector<double> _vhlogl; -  const ME_Model * _ref_modelp; +  const ME_Model* _ref_modelp;    double heldout_likelihood(); -  int conditional_probability(const Sample & nbs, std::vector<double> & membp) const; +  int conditional_probability(const Sample& nbs, +                              std::vector<double>& membp) const;    int make_feature_bag(const int cutoff); -  int classify(const Sample & nbs, std::vector<double> & membp) const; +  int classify(const Sample& nbs, std::vector<double>& membp) const;    double update_model_expectation();    int perform_QUASI_NEWTON();    int perform_SGD();    int perform_GIS(int C); -  std::vector<double> perform_LBFGS(const std::vector<double> & x0); -  std::vector<double> perform_OWLQN(const std::vector<double> & x0, const double C); -  double backtracking_line_search(const Vec & x0, const Vec & grad0, const double f0, const Vec & dx, Vec & x, Vec & grad1); -  double regularized_func_grad(const double C, const Vec & x, Vec & grad); -  double constrained_line_search(double C, const Vec & x0, const Vec & grad0, const double f0, const Vec & dx, Vec & x, Vec & grad1); - - -  void set_ref_dist(Sample & s) const; +  std::vector<double> perform_LBFGS(const std::vector<double>& x0); +  std::vector<double> perform_OWLQN(const std::vector<double>& x0, +                                    const double C); +  double backtracking_line_search(const Vec& x0, const Vec& grad0, +                                  const double f0, const Vec& dx, Vec& x, +                                  Vec& grad1); +  double regularized_func_grad(const double C, const Vec& x, Vec& grad); +  double constrained_line_search(double C, const Vec& x0, const Vec& grad0, +                                 const double f0, const Vec& dx, Vec& x, +                                 Vec& grad1); + +  void set_ref_dist(Sample& s) const;    void init_feature2mef(); -  double FunctionGradient(const std::vector<double> & x, std::vector<double> & grad); -  static double FunctionGradientWrapper(const std::vector<double> & x, std::vector<double> & grad); -   +  double FunctionGradient(const std::vector<double>& x, +                          std::vector<double>& grad); +  static double FunctionGradientWrapper(const std::vector<double>& x, +                                        std::vector<double>& grad);  }; -  #endif -  /*   * $Log: maxent.h,v $   * Revision 1.1.1.1  2007/05/15 08:30:35  kyoshida diff --git a/utils/synutils/maxent-3.0/owlqn.cpp b/utils/synutils/maxent-3.0/owlqn.cpp index 7b2cea7d..c3a0f0da 100644 --- a/utils/synutils/maxent-3.0/owlqn.cpp +++ b/utils/synutils/maxent-3.0/owlqn.cpp @@ -8,29 +8,24 @@  using namespace std; -const static int    M = LBFGS_M; +const static int M = LBFGS_M;  const static double LINE_SEARCH_ALPHA = 0.1; -const static double LINE_SEARCH_BETA  = 0.5; +const static double LINE_SEARCH_BETA = 0.5;  // stopping criteria -int    OWLQN_MAX_ITER      = 300; +int OWLQN_MAX_ITER = 300;  const static double MIN_GRAD_NORM = 0.0001; +Vec approximate_Hg(const int iter, const Vec& grad, const Vec s[], +                   const Vec y[], const double z[]); -Vec approximate_Hg(const int iter, const Vec & grad, -		   const Vec s[],  const Vec y[], const double z[]); - - -inline int sign(double x)  -{ +inline int sign(double x) {    if (x > 0) return 1;    if (x < 0) return -1;    return 0;  }; -static Vec -pseudo_gradient(const Vec & x, const Vec & grad0, const double C) -{ +static Vec pseudo_gradient(const Vec& x, const Vec& grad0, const double C) {    Vec grad = grad0;    for (size_t i = 0; i < x.Size(); i++) {      if (x[i] != 0) { @@ -53,9 +48,8 @@ pseudo_gradient(const Vec & x, const Vec & grad0, const double C)    return grad;  } -double  -ME_Model::regularized_func_grad(const double C, const Vec & x, Vec & grad) -{ +double ME_Model::regularized_func_grad(const double C, const Vec& x, +                                       Vec& grad) {    double f = FunctionGradient(x.STLVec(), grad.STLVec());    for (size_t i = 0; i < x.Size(); i++) {      f += C * fabs(x[i]); @@ -64,11 +58,9 @@ ME_Model::regularized_func_grad(const double C, const Vec & x, Vec & grad)    return f;  } -double  -ME_Model::constrained_line_search(double C, -			const Vec & x0, const Vec & grad0, const double f0,  -			const Vec & dx, Vec & x, Vec & grad1) -{ +double ME_Model::constrained_line_search(double C, const Vec& x0, +                                         const Vec& grad0, const double f0, +                                         const Vec& dx, Vec& x, Vec& grad1) {    // compute the orthant to explore    Vec orthant = x0;    for (size_t i = 0; i < orthant.Size(); i++) { @@ -93,9 +85,8 @@ ME_Model::constrained_line_search(double C,    return f;  } -vector<double>  -ME_Model::perform_OWLQN(const vector<double> & x0, const double C) -{ +vector<double> ME_Model::perform_OWLQN(const vector<double>& x0, +                                       const double C) {    const size_t dim = x0.size();    Vec x = x0; @@ -108,18 +99,18 @@ ME_Model::perform_OWLQN(const vector<double> & x0, const double C)    for (int iter = 0; iter < OWLQN_MAX_ITER; iter++) {      Vec pg = pseudo_gradient(x, grad, C); -    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter+1, -f, _train_error); +    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, -f, _train_error);      if (_nheldout > 0) {        const double heldout_logl = heldout_likelihood(); -      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, _heldout_error); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, +              _heldout_error);      }      fprintf(stderr, "\n");      if (sqrt(dot_product(pg, pg)) < MIN_GRAD_NORM) break;      dx = -1 * approximate_Hg(iter, pg, s, y, z); -    if (dot_product(dx, pg) >= 0) -      dx.Project(-1 * pg); +    if (dot_product(dx, pg) >= 0) dx.Project(-1 * pg);      Vec x1(dim), grad1(dim);      f = constrained_line_search(C, x, pg, f, dx, x1, grad1); @@ -134,4 +125,3 @@ ME_Model::perform_OWLQN(const vector<double> & x0, const double C)    return x.STLVec();  } - diff --git a/utils/synutils/maxent-3.0/sgd.cpp b/utils/synutils/maxent-3.0/sgd.cpp index 6d28c23f..8613edca 100644 --- a/utils/synutils/maxent-3.0/sgd.cpp +++ b/utils/synutils/maxent-3.0/sgd.cpp @@ -4,23 +4,20 @@  using namespace std; -//const double SGD_ETA0 = 1; -//const double SGD_ITER = 30; -//const double SGD_ALPHA = 0.85; - +// const double SGD_ETA0 = 1; +// const double SGD_ITER = 30; +// const double SGD_ALPHA = 0.85;  //#define FOLOS_NAIVE  //#define FOLOS_LAZY  #define SGD_CP -inline void -apply_l1_penalty(const int i, const double u, -		 vector<double> & _vl, vector<double> & q) -{ -  double & w = _vl[i]; +inline void apply_l1_penalty(const int i, const double u, vector<double>& _vl, +                             vector<double>& q) { +  double& w = _vl[i];    const double z = w; -  double & qi = q[i]; -  if (w > 0) {  +  double& qi = q[i]; +  if (w > 0) {      w = max(0.0, w - (u + qi));    } else if (w < 0) {      w = min(0.0, w + (u - qi)); @@ -28,31 +25,29 @@ apply_l1_penalty(const int i, const double u,    qi += w - z;  } -static double -l1norm(const vector<double>& v) -{ +static double l1norm(const vector<double>& v) {    double sum = 0;    for (size_t i = 0; i < v.size(); i++) sum += abs(v[i]);    return sum;  } -inline void -update_folos_lazy(const int iter_sample, -		  const int k, vector<double> & _vl, const vector<double> & sum_eta, -		  vector<int> & last_updated) -{ +inline void update_folos_lazy(const int iter_sample, const int k, +                              vector<double>& _vl, +                              const vector<double>& sum_eta, +                              vector<int>& last_updated) {    const double penalty = sum_eta[iter_sample] - sum_eta[last_updated[k]]; -  double & x = _vl[k]; -  if (x > 0) x = max(0.0, x - penalty); -  else       x = min(0.0, x + penalty); +  double& x = _vl[k]; +  if (x > 0) +    x = max(0.0, x - penalty); +  else +    x = min(0.0, x + penalty);    last_updated[k] = iter_sample;  } -int -ME_Model::perform_SGD() -{ +int ME_Model::perform_SGD() {    if (_l2reg > 0) { -    cerr << "error: L2 regularization is currently not supported in SGD mode." << endl; +    cerr << "error: L2 regularization is currently not supported in SGD mode." +         << endl;      exit(1);    } @@ -85,95 +80,113 @@ ME_Model::perform_SGD()      double logl = 0;      int ncorrect = 0, ntotal = 0;      for (size_t i = 0; i < _vs.size(); i++, ntotal++, iter_sample++) { -      const Sample & s = _vs[ri[i]]; +      const Sample& s = _vs[ri[i]];  #ifdef FOLOS_LAZY -      for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ -	for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { -	  update_folos_lazy(iter_sample, *k, _vl, sum_eta, last_updated); -	} +      for (vector<int>::const_iterator j = s.positive_features.begin(); +           j != s.positive_features.end(); j++) { +        for (vector<int>::const_iterator k = _feature2mef[*j].begin(); +             k != _feature2mef[*j].end(); k++) { +          update_folos_lazy(iter_sample, *k, _vl, sum_eta, last_updated); +        }        }  #endif        vector<double> membp(_num_classes);        const int max_label = conditional_probability(s, membp); -      const double eta = eta0 * pow(SGD_ALPHA, (double)iter_sample / _vs.size()); // exponential decay -      //      const double eta = eta0 / (1.0 + (double)iter_sample / _vs.size()); +      const double eta = +          eta0 * pow(SGD_ALPHA, +                     (double)iter_sample / _vs.size());  // exponential decay +      //      const double eta = eta0 / (1.0 + (double)iter_sample / +      // _vs.size()); -      //      if (iter_sample % _vs.size() == 0) cerr << "eta = " << eta << endl; +      //      if (iter_sample % _vs.size() == 0) cerr << "eta = " << eta << +      // endl;        u += eta * l1param;        sum_eta.push_back(sum_eta.back() + eta * l1param); -     +        logl += log(membp[s.label]);        if (max_label == s.label) ncorrect++;        // binary features -      for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ -	for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { -	  const double me = membp[_fb.Feature(*k).label()]; -	  const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0); -	  const double grad = (me - ee); -	  _vl[*k] -= eta * grad; +      for (vector<int>::const_iterator j = s.positive_features.begin(); +           j != s.positive_features.end(); j++) { +        for (vector<int>::const_iterator k = _feature2mef[*j].begin(); +             k != _feature2mef[*j].end(); k++) { +          const double me = membp[_fb.Feature(*k).label()]; +          const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0); +          const double grad = (me - ee); +          _vl[*k] -= eta * grad;  #ifdef SGD_CP -	  apply_l1_penalty(*k, u, _vl, q); +          apply_l1_penalty(*k, u, _vl, q);  #endif -	} +        }        }        // real-valued features -      for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin(); j != s.rvfeatures.end(); j++) { -	for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { -	  const double me = membp[_fb.Feature(*k).label()]; -	  const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0); -	  const double grad = (me - ee) * j->second; -	  _vl[*k] -= eta * grad; +      for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin(); +           j != s.rvfeatures.end(); j++) { +        for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); +             k != _feature2mef[j->first].end(); k++) { +          const double me = membp[_fb.Feature(*k).label()]; +          const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0); +          const double grad = (me - ee) * j->second; +          _vl[*k] -= eta * grad;  #ifdef SGD_CP -	  apply_l1_penalty(*k, u, _vl, q); +          apply_l1_penalty(*k, u, _vl, q);  #endif -	} +        }        }  #ifdef FOLOS_NAIVE        for (size_t j = 0; j < d; j++) { -	double & x = _vl[j]; -	if (x > 0) x = max(0.0, x - eta * l1param); -	else       x = min(0.0, x + eta * l1param); +        double& x = _vl[j]; +        if (x > 0) +          x = max(0.0, x - eta * l1param); +        else +          x = min(0.0, x + eta * l1param);        }  #endif -      }      logl /= _vs.size(); -    //    fprintf(stderr, "%4d logl = %8.3f acc = %6.4f ", iter, logl, (double)ncorrect / ntotal); +//    fprintf(stderr, "%4d logl = %8.3f acc = %6.4f ", iter, logl, +// (double)ncorrect / ntotal);  #ifdef FOLOS_LAZY      if (l1param > 0) {        for (size_t j = 0; j < d; j++) -	update_folos_lazy(iter_sample, j, _vl, sum_eta, last_updated); +        update_folos_lazy(iter_sample, j, _vl, sum_eta, last_updated);      }  #endif      double f = logl;      if (l1param > 0) { -      const double l1 = l1norm(_vl); // this is not accurate when lazy update is used -      //      cerr << "f0 = " <<  update_model_expectation() - l1param * l1 << " "; +      const double l1 = +          l1norm(_vl);  // this is not accurate when lazy update is used +      //      cerr << "f0 = " <<  update_model_expectation() - l1param * l1 << " +      // ";        f -= l1param * l1;        int nonzero = 0; -      for (int j = 0; j < d; j++) if (_vl[j] != 0) nonzero++; -      //      cerr << " f = " << f << " l1 = " << l1 << " nonzero_features = " << nonzero << endl; +      for (int j = 0; j < d; j++) +        if (_vl[j] != 0) nonzero++; +      //      cerr << " f = " << f << " l1 = " << l1 << " nonzero_features = " +      // << nonzero << endl;      } -    //    fprintf(stderr, "%4d  obj = %7.3f acc = %6.4f", iter+1, f, (double)ncorrect/ntotal); +    //    fprintf(stderr, "%4d  obj = %7.3f acc = %6.4f", iter+1, f, +    // (double)ncorrect/ntotal);      //    fprintf(stderr, "%4d  obj = %f", iter+1, f); -    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter+1, f, 1 - (double)ncorrect/ntotal); +    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter + 1, f, +            1 - (double)ncorrect / ntotal);      if (_nheldout > 0) {        double heldout_logl = heldout_likelihood(); -      //      fprintf(stderr, "  heldout_logl = %f  acc = %6.4f\n", heldout_logl, 1 - _heldout_error); -      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, _heldout_error); +      //      fprintf(stderr, "  heldout_logl = %f  acc = %6.4f\n", +      // heldout_logl, 1 - _heldout_error); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, +              _heldout_error);      }      fprintf(stderr, "\n"); - -        }    return 0; diff --git a/utils/synutils/srl_sentence.h b/utils/synutils/srl_sentence.h index 6f00267a..c08cc065 100644 --- a/utils/synutils/srl_sentence.h +++ b/utils/synutils/srl_sentence.h @@ -14,202 +14,198 @@  using namespace std; -struct SArgument{ -	SArgument(const char* pszRole, int iBegin, int iEnd, float fProb) { -		m_pszRole = new char[strlen(pszRole) + 1]; -		strcpy(m_pszRole, pszRole); -		m_iBegin = iBegin; -		m_iEnd = iEnd; -		m_fProb = fProb; -		m_pTreeItem = NULL; -	} -	~SArgument() { -		delete [] m_pszRole; -	} - -	void fnSetTreeItem(STreeItem *pTreeItem) { -		m_pTreeItem = pTreeItem; -		if (m_pTreeItem != NULL && m_pTreeItem->m_iBegin != -1) { -			assert(m_pTreeItem->m_iBegin == m_iBegin); -			assert(m_pTreeItem->m_iEnd == m_iEnd); -		} -	} - -	char *m_pszRole; //argument rule, e.g., ARG0, ARGM-TMP -	int m_iBegin; -	int m_iEnd;  //the span of the argument, [m_iBegin, m_iEnd] -	float m_fProb; //the probability of this role, -	STreeItem *m_pTreeItem; +struct SArgument { +  SArgument(const char* pszRole, int iBegin, int iEnd, float fProb) { +    m_pszRole = new char[strlen(pszRole) + 1]; +    strcpy(m_pszRole, pszRole); +    m_iBegin = iBegin; +    m_iEnd = iEnd; +    m_fProb = fProb; +    m_pTreeItem = NULL; +  } +  ~SArgument() { delete[] m_pszRole; } + +  void fnSetTreeItem(STreeItem* pTreeItem) { +    m_pTreeItem = pTreeItem; +    if (m_pTreeItem != NULL && m_pTreeItem->m_iBegin != -1) { +      assert(m_pTreeItem->m_iBegin == m_iBegin); +      assert(m_pTreeItem->m_iEnd == m_iEnd); +    } +  } + +  char* m_pszRole;  // argument rule, e.g., ARG0, ARGM-TMP +  int m_iBegin; +  int m_iEnd;  // the span of the argument, [m_iBegin, m_iEnd] +  float m_fProb;  // the probability of this role, +  STreeItem* m_pTreeItem;  }; -struct SPredicate{ -	SPredicate(const char* pszLemma, int iPosition) { -		if (pszLemma != NULL) { -			m_pszLemma = new char[strlen(pszLemma) + 1]; -			strcpy(m_pszLemma, pszLemma); -		} else -			m_pszLemma = NULL; -		m_iPosition = iPosition; -	} -	~SPredicate() { -		if (m_pszLemma != NULL) -			delete [] m_pszLemma; -		for (size_t i = 0; i < m_vecArgt.size(); i++) -			delete m_vecArgt[i]; -	} -	int fnAppend(const char* pszRole, int iBegin, int iEnd) { -		SArgument *pArgt = new SArgument(pszRole, iBegin, iEnd, 1.0); -		return fnAppend(pArgt); -	} -	int fnAppend(SArgument *pArgt) { -		m_vecArgt.push_back(pArgt); -		int iPosition = m_vecArgt.size() - 1; -		return iPosition; -	} - -	char *m_pszLemma; //lemma of the predicate, for Chinese, it's always as same as the predicate itself -	int m_iPosition; //the position in sentence -	vector<SArgument*> m_vecArgt; //arguments associated to the predicate +struct SPredicate { +  SPredicate(const char* pszLemma, int iPosition) { +    if (pszLemma != NULL) { +      m_pszLemma = new char[strlen(pszLemma) + 1]; +      strcpy(m_pszLemma, pszLemma); +    } else +      m_pszLemma = NULL; +    m_iPosition = iPosition; +  } +  ~SPredicate() { +    if (m_pszLemma != NULL) delete[] m_pszLemma; +    for (size_t i = 0; i < m_vecArgt.size(); i++) delete m_vecArgt[i]; +  } +  int fnAppend(const char* pszRole, int iBegin, int iEnd) { +    SArgument* pArgt = new SArgument(pszRole, iBegin, iEnd, 1.0); +    return fnAppend(pArgt); +  } +  int fnAppend(SArgument* pArgt) { +    m_vecArgt.push_back(pArgt); +    int iPosition = m_vecArgt.size() - 1; +    return iPosition; +  } + +  char* m_pszLemma;  // lemma of the predicate, for Chinese, it's always as same +                     // as the predicate itself +  int m_iPosition;  // the position in sentence +  vector<SArgument*> m_vecArgt;  // arguments associated to the predicate  }; -struct SSrlSentence{ -	SSrlSentence() { -		m_pTree = NULL; -	} -	~SSrlSentence() { -		if (m_pTree != NULL) -			delete m_pTree; - -		for (size_t i = 0; i < m_vecPred.size(); i++) -			delete m_vecPred[i]; -	} -	int fnAppend(const char* pszLemma, int iPosition) { -		SPredicate *pPred = new SPredicate(pszLemma, iPosition); -		return fnAppend(pPred); -	} -	int fnAppend(SPredicate* pPred) { -		m_vecPred.push_back(pPred); -		int iPosition = m_vecPred.size() - 1; -		return iPosition; -	} -	int GetPredicateNum() { -		return m_vecPred.size(); -	} - -	SParsedTree *m_pTree; -	vector<SPredicate*> m_vecPred; +struct SSrlSentence { +  SSrlSentence() { m_pTree = NULL; } +  ~SSrlSentence() { +    if (m_pTree != NULL) delete m_pTree; + +    for (size_t i = 0; i < m_vecPred.size(); i++) delete m_vecPred[i]; +  } +  int fnAppend(const char* pszLemma, int iPosition) { +    SPredicate* pPred = new SPredicate(pszLemma, iPosition); +    return fnAppend(pPred); +  } +  int fnAppend(SPredicate* pPred) { +    m_vecPred.push_back(pPred); +    int iPosition = m_vecPred.size() - 1; +    return iPosition; +  } +  int GetPredicateNum() { return m_vecPred.size(); } + +  SParsedTree* m_pTree; +  vector<SPredicate*> m_vecPred;  };  struct SSrlSentenceReader { -	SSrlSentenceReader(const char* pszSrlFname) { -		m_fpIn = fopen(pszSrlFname, "r"); -		assert(m_fpIn != NULL); -	} -	~SSrlSentenceReader() { -		if (m_fpIn != NULL) -			fclose(m_fpIn); -	} - -	inline void fnReplaceAll(std::string& str, const std::string& from, const std::string& to) { -	    size_t start_pos = 0; -	    while((start_pos = str.find(from, start_pos)) != std::string::npos) { -	             str.replace(start_pos, from.length(), to); -	             start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx' -	    } -	} - -	//TODO: here only considers flat predicate-argument structure -	//      i.e., no overlap among them -	SSrlSentence* fnReadNextSrlSentence() { -		vector<vector<string> > vecContent; -		if (fnReadNextContent(vecContent) == false) -			return NULL; - -		SSrlSentence *pSrlSentence = new SSrlSentence(); -		int iSize = vecContent.size(); -		//put together syntactic text -		std::ostringstream ostr; -		for (int i = 0; i < iSize; i++) { -			string strSynSeg = vecContent[i][5]; //the 5th column is the syntactic segment -			size_t iPosition = strSynSeg.find_first_of('*'); -			assert(iPosition != string::npos); -			ostringstream ostrTmp; -			ostrTmp << "(" << vecContent[i][2] << " " << vecContent[i][0] << ")"; //the 2th column is POS-tag, and the 0th column is word -			strSynSeg.replace(iPosition, 1, ostrTmp.str()); -			fnReplaceAll(strSynSeg, "(", " ("); -			ostr << strSynSeg; -		} -		string strSyn = ostr.str(); -		pSrlSentence->m_pTree = SParsedTree::fnConvertFromString(strSyn.c_str()); -		pSrlSentence->m_pTree->fnSetHeadWord(); -		pSrlSentence->m_pTree->fnSetSpanInfo(); - -		//read predicate-argument structure -		int iNumPred = vecContent[0].size() - 8; -		for (int i = 0; i < iNumPred; i++) { -			vector<string> vecRole; -			vector<int> vecBegin; -			vector<int> vecEnd; -			int iPred = -1; -			for (int j = 0; j < iSize; j++) { -				const char* p = vecContent[j][i + 8].c_str(); -				const char* q; -				if (p[0] == '(') { -					//starting position of an argument(or predicate) -					vecBegin.push_back(j); -					q = strchr(p, '*'); -					assert(q != NULL); -					vecRole.push_back(vecContent[j][i + 8].substr(1, q - p - 1)); -					if (vecRole.back().compare("V") == 0) { -						assert(iPred == -1); -						iPred = vecRole.size() - 1; -					} -				} -				if (p[strlen(p) - 1] == ')') { -					//end position of an argument(or predicate) -					vecEnd.push_back(j); -					assert(vecBegin.size() == vecEnd.size()); -				} -			} -			assert(iPred != -1); -			SPredicate *pPred =  new SPredicate(pSrlSentence->m_pTree->m_vecTerminals[vecBegin[iPred]]->m_pszTerm, vecBegin[iPred]); -			pSrlSentence->fnAppend(pPred); -			for (size_t j = 0; j < vecBegin.size(); j++) { -				if (j == iPred) -					continue; -				pPred->fnAppend(vecRole[j].c_str(), vecBegin[j], vecEnd[j]); -				pPred->m_vecArgt.back()->fnSetTreeItem(pSrlSentence->m_pTree->fnFindNodeForSpan(vecBegin[j], vecEnd[j], false)); -			} -		} -		return pSrlSentence; -	} -private: -	bool fnReadNextContent(vector<vector<string> >& vecContent) { -		vecContent.clear(); -		if (feof(m_fpIn) == true) -			return false; -		char *pszLine; -		pszLine = new char[100001]; -		pszLine[0] = '\0'; -		int iLen; -		while (!feof(m_fpIn)) { -			fgets(pszLine, 10001, m_fpIn); -			iLen = strlen(pszLine); -			while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { -				pszLine[ iLen - 1 ] = '\0'; -				iLen--; -			} -			if (iLen == 0) -				break; //end of this sentence - -			vector<string> terms = SplitOnWhitespace(string(pszLine)); -			assert(terms.size() > 7); -			vecContent.push_back(terms); -		} -		delete [] pszLine; -		return true; -	} -private: -	FILE *m_fpIn; +  SSrlSentenceReader(const char* pszSrlFname) { +    m_fpIn = fopen(pszSrlFname, "r"); +    assert(m_fpIn != NULL); +  } +  ~SSrlSentenceReader() { +    if (m_fpIn != NULL) fclose(m_fpIn); +  } + +  inline void fnReplaceAll(std::string& str, const std::string& from, +                           const std::string& to) { +    size_t start_pos = 0; +    while ((start_pos = str.find(from, start_pos)) != std::string::npos) { +      str.replace(start_pos, from.length(), to); +      start_pos += to.length();  // In case 'to' contains 'from', like replacing +                                 // 'x' with 'yx' +    } +  } + +  // TODO: here only considers flat predicate-argument structure +  //      i.e., no overlap among them +  SSrlSentence* fnReadNextSrlSentence() { +    vector<vector<string> > vecContent; +    if (fnReadNextContent(vecContent) == false) return NULL; + +    SSrlSentence* pSrlSentence = new SSrlSentence(); +    int iSize = vecContent.size(); +    // put together syntactic text +    std::ostringstream ostr; +    for (int i = 0; i < iSize; i++) { +      string strSynSeg = +          vecContent[i][5];  // the 5th column is the syntactic segment +      size_t iPosition = strSynSeg.find_first_of('*'); +      assert(iPosition != string::npos); +      ostringstream ostrTmp; +      ostrTmp << "(" << vecContent[i][2] << " " << vecContent[i][0] +              << ")";  // the 2th column is POS-tag, and the 0th column is word +      strSynSeg.replace(iPosition, 1, ostrTmp.str()); +      fnReplaceAll(strSynSeg, "(", " ("); +      ostr << strSynSeg; +    } +    string strSyn = ostr.str(); +    pSrlSentence->m_pTree = SParsedTree::fnConvertFromString(strSyn.c_str()); +    pSrlSentence->m_pTree->fnSetHeadWord(); +    pSrlSentence->m_pTree->fnSetSpanInfo(); + +    // read predicate-argument structure +    int iNumPred = vecContent[0].size() - 8; +    for (int i = 0; i < iNumPred; i++) { +      vector<string> vecRole; +      vector<int> vecBegin; +      vector<int> vecEnd; +      int iPred = -1; +      for (int j = 0; j < iSize; j++) { +        const char* p = vecContent[j][i + 8].c_str(); +        const char* q; +        if (p[0] == '(') { +          // starting position of an argument(or predicate) +          vecBegin.push_back(j); +          q = strchr(p, '*'); +          assert(q != NULL); +          vecRole.push_back(vecContent[j][i + 8].substr(1, q - p - 1)); +          if (vecRole.back().compare("V") == 0) { +            assert(iPred == -1); +            iPred = vecRole.size() - 1; +          } +        } +        if (p[strlen(p) - 1] == ')') { +          // end position of an argument(or predicate) +          vecEnd.push_back(j); +          assert(vecBegin.size() == vecEnd.size()); +        } +      } +      assert(iPred != -1); +      SPredicate* pPred = new SPredicate( +          pSrlSentence->m_pTree->m_vecTerminals[vecBegin[iPred]]->m_pszTerm, +          vecBegin[iPred]); +      pSrlSentence->fnAppend(pPred); +      for (size_t j = 0; j < vecBegin.size(); j++) { +        if (j == iPred) continue; +        pPred->fnAppend(vecRole[j].c_str(), vecBegin[j], vecEnd[j]); +        pPred->m_vecArgt.back()->fnSetTreeItem( +            pSrlSentence->m_pTree->fnFindNodeForSpan(vecBegin[j], vecEnd[j], +                                                     false)); +      } +    } +    return pSrlSentence; +  } + + private: +  bool fnReadNextContent(vector<vector<string> >& vecContent) { +    vecContent.clear(); +    if (feof(m_fpIn) == true) return false; +    char* pszLine; +    pszLine = new char[100001]; +    pszLine[0] = '\0'; +    int iLen; +    while (!feof(m_fpIn)) { +      fgets(pszLine, 10001, m_fpIn); +      iLen = strlen(pszLine); +      while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { +        pszLine[iLen - 1] = '\0'; +        iLen--; +      } +      if (iLen == 0) break;  // end of this sentence + +      vector<string> terms = SplitOnWhitespace(string(pszLine)); +      assert(terms.size() > 7); +      vecContent.push_back(terms); +    } +    delete[] pszLine; +    return true; +  } + + private: +  FILE* m_fpIn;  };  #endif /* SRL_SENTENCE_H_ */ diff --git a/utils/synutils/tree.h b/utils/synutils/tree.h index 658a470d..32521a3a 100644 --- a/utils/synutils/tree.h +++ b/utils/synutils/tree.h @@ -14,718 +14,685 @@  using namespace std; -struct STreeItem{ -	STreeItem(const char* pszTerm) { -		m_pszTerm = new char[ strlen( pszTerm ) + 1 ]; -		strcpy( m_pszTerm, pszTerm ); - -		m_ptParent = NULL; -		m_iBegin = -1; -		m_iEnd = -1; -		m_iHeadChild = -1; -		m_iHeadWord = -1; -		m_iBrotherIndex = -1; -	} -	~STreeItem( ) { -		delete [] m_pszTerm; -		for (size_t i = 0; i < m_vecChildren.size(); i++) -			delete m_vecChildren[i]; -	} -	int fnAppend(STreeItem *ptChild) { -		m_vecChildren.push_back(ptChild); -		ptChild->m_iBrotherIndex = m_vecChildren.size() - 1; -		ptChild->m_ptParent = this; -		return m_vecChildren.size() - 1; -	} -	int fnGetChildrenNum() { -		return m_vecChildren.size(); -	} - -	bool fnIsPreTerminal( void ) { -		int I; -		if ( this == NULL || m_vecChildren.size() == 0 ) -			return false; - -		for ( I = 0; I < m_vecChildren.size(); I++ ) -			if (m_vecChildren[I]->m_vecChildren.size() > 0 ) -				return false; - -		return true; -	} -public: -	char *m_pszTerm; - -	vector<STreeItem*> m_vecChildren;//children items -	STreeItem *m_ptParent;//the parent item - -	int m_iBegin; -	int m_iEnd; //the node span words[m_iBegin, m_iEnd] -	int m_iHeadChild; //the index of its head child -	int m_iHeadWord; //the index of its head word -	int m_iBrotherIndex;//the index in his brothers +struct STreeItem { +  STreeItem(const char *pszTerm) { +    m_pszTerm = new char[strlen(pszTerm) + 1]; +    strcpy(m_pszTerm, pszTerm); + +    m_ptParent = NULL; +    m_iBegin = -1; +    m_iEnd = -1; +    m_iHeadChild = -1; +    m_iHeadWord = -1; +    m_iBrotherIndex = -1; +  } +  ~STreeItem() { +    delete[] m_pszTerm; +    for (size_t i = 0; i < m_vecChildren.size(); i++) delete m_vecChildren[i]; +  } +  int fnAppend(STreeItem *ptChild) { +    m_vecChildren.push_back(ptChild); +    ptChild->m_iBrotherIndex = m_vecChildren.size() - 1; +    ptChild->m_ptParent = this; +    return m_vecChildren.size() - 1; +  } +  int fnGetChildrenNum() { return m_vecChildren.size(); } + +  bool fnIsPreTerminal(void) { +    int I; +    if (this == NULL || m_vecChildren.size() == 0) return false; + +    for (I = 0; I < m_vecChildren.size(); I++) +      if (m_vecChildren[I]->m_vecChildren.size() > 0) return false; + +    return true; +  } + + public: +  char *m_pszTerm; + +  vector<STreeItem *> m_vecChildren;  // children items +  STreeItem *m_ptParent;  // the parent item + +  int m_iBegin; +  int m_iEnd;  // the node span words[m_iBegin, m_iEnd] +  int m_iHeadChild;  // the index of its head child +  int m_iHeadWord;  // the index of its head word +  int m_iBrotherIndex;  // the index in his brothers  }; - -struct SGetHeadWord{ -	typedef vector<string> CVectorStr; -	SGetHeadWord() { - -	} -	~SGetHeadWord() { - -	} -	int fnGetHeadWord( char *pszCFGLeft, CVectorStr vectRight ) { -		//0 indicating from right to left while 1 indicating from left to right -		char szaHeadLists[ 201 ] = "0"; - -		/*  //head rules for Egnlish -		if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) -			strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" ); -		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) -			strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1" ); -		else if( strcmp( pszCFGLeft, "CONJP" ) == 0 ) -			strcpy( szaHeadLists, "1CC 1RB 1IN 1" ); -		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) -			strcpy( szaHeadLists, "1" ); -		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) -			strcpy( szaHeadLists, "0" ); -		else if( strcmp( pszCFGLeft, "LST" ) == 0 ) -			strcpy( szaHeadLists, "1LS 1: 1CLN 1" ); -		else if( strcmp( pszCFGLeft, "NAC" ) == 0 ) -			strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" ); -		else if( strcmp( pszCFGLeft, "PP" ) == 0 ) -			strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" ); -		else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) -			strcpy( szaHeadLists, "1" ); -		else if( strcmp( pszCFGLeft, "PRT" ) == 0 ) -			strcpy( szaHeadLists, "1RP 1" ); -		else if( strcmp( pszCFGLeft, "QP" ) == 0 ) -			strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0" ); -		else if( strcmp( pszCFGLeft, "RRC" ) == 0 ) -			strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" ); -		else if( strcmp( pszCFGLeft, "S" ) == 0 ) -			strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" ); -		else if( strcmp( pszCFGLeft, "SBAR" ) == 0 ) -			strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR 0FRAG 0" ); -		else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 ) -			strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" ); -		else if( strcmp( pszCFGLeft, "SINV" ) == 0 ) -			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0" ); -		else if( strcmp( pszCFGLeft, "SQ" ) == 0 ) -			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" ); -		else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) -			strcpy( szaHeadLists, "1" ); -		else if( strcmp( pszCFGLeft, "VP" ) == 0 ) -			strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS 0NP 0" ); -		else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 ) -			strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" ); -		else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 ) -			strcpy( szaHeadLists, "1CC 1WRB 1" ); -		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) -			strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" ); -		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) -			strcpy( szaHeadLists, "1IN 1TO FW 1" ); -		else if( strcmp( pszCFGLeft, "NP" ) == 0 ) -			strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB QP 0" ); -		*/ - -		if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) -			strcpy( szaHeadLists, "0ADJP JJ 0AD NN CS 0" ); -		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) -			strcpy( szaHeadLists, "0ADVP AD 0" ); -		else if( strcmp( pszCFGLeft, "CLP" ) == 0 ) -			strcpy( szaHeadLists, "0CLP M 0" ); -		else if( strcmp( pszCFGLeft, "CP" ) == 0 ) -			strcpy( szaHeadLists, "0DEC SP 1ADVP CS 0CP IP 0" ); -		else if( strcmp( pszCFGLeft, "DNP" ) == 0 ) -			strcpy( szaHeadLists, "0DNP DEG 0DEC 0" ); -		else if( strcmp( pszCFGLeft, "DVP" ) == 0 ) -			strcpy( szaHeadLists, "0DVP DEV 0" ); -		else if( strcmp( pszCFGLeft, "DP" ) == 0 ) -			strcpy( szaHeadLists, "1DP DT 1" ); -		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) -			strcpy( szaHeadLists, "0VV NR NN 0" ); -		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) -			strcpy( szaHeadLists, "0INTJ IJ 0" ); -		else if( strcmp( pszCFGLeft, "LST" ) == 0 ) -			strcpy( szaHeadLists, "1LST CD OD 1" ); -		else if( strcmp( pszCFGLeft, "IP" ) == 0 ) -			strcpy( szaHeadLists, "0IP VP 0VV 0" ); -			//strcpy( szaHeadLists, "0VP 0VV 1IP 0" ); -		else if( strcmp( pszCFGLeft, "LCP" ) == 0 ) -			strcpy( szaHeadLists, "0LCP LC 0" ); -		else if( strcmp( pszCFGLeft, "NP" ) == 0 ) -			strcpy( szaHeadLists, "0NP NN NT NR QP 0" ); -		else if( strcmp( pszCFGLeft, "PP" ) == 0 ) -			strcpy( szaHeadLists, "1PP P 1" ); -		else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) -			strcpy( szaHeadLists, "0 NP IP VP NT NR NN 0" ); -		else if( strcmp( pszCFGLeft, "QP" ) == 0 ) -			strcpy( szaHeadLists, "0QP CLP CD OD 0" ); -		else if( strcmp( pszCFGLeft, "VP" ) == 0 ) -			strcpy( szaHeadLists, "1VP VA VC VE VV BA LB VCD VSB VRD VNV VCP 1" ); -		else if( strcmp( pszCFGLeft, "VCD" ) == 0 ) -			strcpy( szaHeadLists, "0VCD VV VA VC VE 0" ); -		if( strcmp( pszCFGLeft, "VRD" ) == 0 ) -			strcpy( szaHeadLists, "0VRD VV VA VC VE 0" ); -		else if( strcmp( pszCFGLeft, "VSB" ) == 0 ) -			strcpy( szaHeadLists, "0VSB VV VA VC VE 0" ); -		else if( strcmp( pszCFGLeft, "VCP" ) == 0 ) -			strcpy( szaHeadLists, "0VCP VV VA VC VE 0" ); -		else if( strcmp( pszCFGLeft, "VNV" ) == 0 ) -			strcpy( szaHeadLists, "0VNV VV VA VC VE 0" ); -		else if( strcmp( pszCFGLeft, "VPT" ) == 0 ) -			strcpy( szaHeadLists, "0VNV VV VA VC VE 0" ); -		else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) -			strcpy( szaHeadLists, "0" ); -		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) -			strcpy( szaHeadLists, "0WHNP NP NN NT NR QP 0" ); -		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) -			strcpy( szaHeadLists, "1WHPP PP P 1" ); - -		/*  //head rules for GENIA corpus -		if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) -			strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" ); -		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) -			strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1" ); -		else if( strcmp( pszCFGLeft, "CONJP" ) == 0 ) -			strcpy( szaHeadLists, "1CC 1RB 1IN 1" ); -		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) -			strcpy( szaHeadLists, "1" ); -		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) -			strcpy( szaHeadLists, "0" ); -		else if( strcmp( pszCFGLeft, "LST" ) == 0 ) -			strcpy( szaHeadLists, "1LS 1: 1CLN 1" ); -		else if( strcmp( pszCFGLeft, "NAC" ) == 0 ) -			strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" ); -		else if( strcmp( pszCFGLeft, "PP" ) == 0 ) -			strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" ); -		else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) -			strcpy( szaHeadLists, "1" ); -		else if( strcmp( pszCFGLeft, "PRT" ) == 0 ) -			strcpy( szaHeadLists, "1RP 1" ); -		else if( strcmp( pszCFGLeft, "QP" ) == 0 ) -			strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0" ); -		else if( strcmp( pszCFGLeft, "RRC" ) == 0 ) -			strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" ); -		else if( strcmp( pszCFGLeft, "S" ) == 0 ) -			strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" ); -		else if( strcmp( pszCFGLeft, "SBAR" ) == 0 ) -			strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR 0FRAG 0" ); -		else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 ) -			strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" ); -		else if( strcmp( pszCFGLeft, "SINV" ) == 0 ) -			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0" ); -		else if( strcmp( pszCFGLeft, "SQ" ) == 0 ) -			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" ); -		else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) -			strcpy( szaHeadLists, "1" ); -		else if( strcmp( pszCFGLeft, "VP" ) == 0 ) -			strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS 0NP 0" ); -		else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 ) -			strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" ); -		else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 ) -			strcpy( szaHeadLists, "1CC 1WRB 1" ); -		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) -			strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" ); -		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) -			strcpy( szaHeadLists, "1IN 1TO FW 1" ); -		else if( strcmp( pszCFGLeft, "NP" ) == 0 ) -			strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB QP 0" ); -		*/ - -		return fnMyOwnHeadWordRule( szaHeadLists, vectRight ); -	} - -private: -	int fnMyOwnHeadWordRule( char *pszaHeadLists, CVectorStr vectRight ) { -		char szHeadList[ 201 ], *p; -		char szTerm[ 101 ]; -		int J; - -		p = pszaHeadLists; - -		int iCountRight; - -		iCountRight = vectRight.size( ); - -		szHeadList[ 0 ] = '\0'; -		while( 1 ){ -			szTerm[ 0 ] = '\0'; -			sscanf( p, "%s", szTerm ); -			if( strlen( szHeadList ) == 0 ){ -				if( strcmp( szTerm, "0" ) == 0 ){ -					return iCountRight - 1; -				} -				if( strcmp( szTerm, "1" ) == 0 ){ -					return 0; -				} - -				sprintf( szHeadList, "%c %s ", szTerm[ 0 ], szTerm + 1 ); -				p = strstr( p, szTerm ); -				p += strlen( szTerm ); -			} -			else{ -				if(   ( szTerm[ 0 ] == '0' ) -					||( szTerm[ 0 ] == '1' ) ){ -					if( szHeadList[ 0 ] == '0' ){ -						for( J = iCountRight - 1; J >= 0; J -- ){ -							sprintf( szTerm, " %s ", vectRight.at( J ).c_str( ) ); -							if( strstr( szHeadList, szTerm ) != NULL ) -								return J; -						} -					} -					else{ -						for( J = 0; J < iCountRight; J ++ ){ -							sprintf( szTerm, " %s ", vectRight.at( J ).c_str( ) ); -							if(	strstr( szHeadList, szTerm ) != NULL ) -								return J; -						} -					} - -					szHeadList[ 0 ] = '\0'; -				} -				else{ -					strcat( szHeadList, szTerm ); -					strcat( szHeadList, " " ); - -					p = strstr( p, szTerm ); -					p += strlen( szTerm ); -				} -			} -		} - -		return 0; -	} - +struct SGetHeadWord { +  typedef vector<string> CVectorStr; +  SGetHeadWord() {} +  ~SGetHeadWord() {} +  int fnGetHeadWord(char *pszCFGLeft, CVectorStr vectRight) { +    // 0 indicating from right to left while 1 indicating from left to right +    char szaHeadLists[201] = "0"; + +    /*  //head rules for Egnlish +    if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) +            strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP +    0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" ); +    else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) +            strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN +    1NP 1JJS 1NN 1" ); +    else if( strcmp( pszCFGLeft, "CONJP" ) == 0 ) +            strcpy( szaHeadLists, "1CC 1RB 1IN 1" ); +    else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) +            strcpy( szaHeadLists, "1" ); +    else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) +            strcpy( szaHeadLists, "0" ); +    else if( strcmp( pszCFGLeft, "LST" ) == 0 ) +            strcpy( szaHeadLists, "1LS 1: 1CLN 1" ); +    else if( strcmp( pszCFGLeft, "NAC" ) == 0 ) +            strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP +    0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" ); +    else if( strcmp( pszCFGLeft, "PP" ) == 0 ) +            strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" ); +    else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) +            strcpy( szaHeadLists, "1" ); +    else if( strcmp( pszCFGLeft, "PRT" ) == 0 ) +            strcpy( szaHeadLists, "1RP 1" ); +    else if( strcmp( pszCFGLeft, "QP" ) == 0 ) +            strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR +    0JJS 0" ); +    else if( strcmp( pszCFGLeft, "RRC" ) == 0 ) +            strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" ); +    else if( strcmp( pszCFGLeft, "S" ) == 0 ) +            strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" ); +    else if( strcmp( pszCFGLeft, "SBAR" ) == 0 ) +            strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ +    0SINV 0SBAR 0FRAG 0" ); +    else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 ) +            strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" ); +    else if( strcmp( pszCFGLeft, "SINV" ) == 0 ) +            strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP +    0" ); +    else if( strcmp( pszCFGLeft, "SQ" ) == 0 ) +            strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" ); +    else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) +            strcpy( szaHeadLists, "1" ); +    else if( strcmp( pszCFGLeft, "VP" ) == 0 ) +            strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP +    0ADJP 0NN 0NNS 0NP 0" ); +    else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 ) +            strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" ); +    else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 ) +            strcpy( szaHeadLists, "1CC 1WRB 1" ); +    else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) +            strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" ); +    else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) +            strcpy( szaHeadLists, "1IN 1TO FW 1" ); +    else if( strcmp( pszCFGLeft, "NP" ) == 0 ) +            strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN +    0CD 0JJ JJS RB QP 0" ); +    */ + +    if (strcmp(pszCFGLeft, "ADJP") == 0) +      strcpy(szaHeadLists, "0ADJP JJ 0AD NN CS 0"); +    else if (strcmp(pszCFGLeft, "ADVP") == 0) +      strcpy(szaHeadLists, "0ADVP AD 0"); +    else if (strcmp(pszCFGLeft, "CLP") == 0) +      strcpy(szaHeadLists, "0CLP M 0"); +    else if (strcmp(pszCFGLeft, "CP") == 0) +      strcpy(szaHeadLists, "0DEC SP 1ADVP CS 0CP IP 0"); +    else if (strcmp(pszCFGLeft, "DNP") == 0) +      strcpy(szaHeadLists, "0DNP DEG 0DEC 0"); +    else if (strcmp(pszCFGLeft, "DVP") == 0) +      strcpy(szaHeadLists, "0DVP DEV 0"); +    else if (strcmp(pszCFGLeft, "DP") == 0) +      strcpy(szaHeadLists, "1DP DT 1"); +    else if (strcmp(pszCFGLeft, "FRAG") == 0) +      strcpy(szaHeadLists, "0VV NR NN 0"); +    else if (strcmp(pszCFGLeft, "INTJ") == 0) +      strcpy(szaHeadLists, "0INTJ IJ 0"); +    else if (strcmp(pszCFGLeft, "LST") == 0) +      strcpy(szaHeadLists, "1LST CD OD 1"); +    else if (strcmp(pszCFGLeft, "IP") == 0) +      strcpy(szaHeadLists, "0IP VP 0VV 0"); +    // strcpy( szaHeadLists, "0VP 0VV 1IP 0" ); +    else if (strcmp(pszCFGLeft, "LCP") == 0) +      strcpy(szaHeadLists, "0LCP LC 0"); +    else if (strcmp(pszCFGLeft, "NP") == 0) +      strcpy(szaHeadLists, "0NP NN NT NR QP 0"); +    else if (strcmp(pszCFGLeft, "PP") == 0) +      strcpy(szaHeadLists, "1PP P 1"); +    else if (strcmp(pszCFGLeft, "PRN") == 0) +      strcpy(szaHeadLists, "0 NP IP VP NT NR NN 0"); +    else if (strcmp(pszCFGLeft, "QP") == 0) +      strcpy(szaHeadLists, "0QP CLP CD OD 0"); +    else if (strcmp(pszCFGLeft, "VP") == 0) +      strcpy(szaHeadLists, "1VP VA VC VE VV BA LB VCD VSB VRD VNV VCP 1"); +    else if (strcmp(pszCFGLeft, "VCD") == 0) +      strcpy(szaHeadLists, "0VCD VV VA VC VE 0"); +    if (strcmp(pszCFGLeft, "VRD") == 0) +      strcpy(szaHeadLists, "0VRD VV VA VC VE 0"); +    else if (strcmp(pszCFGLeft, "VSB") == 0) +      strcpy(szaHeadLists, "0VSB VV VA VC VE 0"); +    else if (strcmp(pszCFGLeft, "VCP") == 0) +      strcpy(szaHeadLists, "0VCP VV VA VC VE 0"); +    else if (strcmp(pszCFGLeft, "VNV") == 0) +      strcpy(szaHeadLists, "0VNV VV VA VC VE 0"); +    else if (strcmp(pszCFGLeft, "VPT") == 0) +      strcpy(szaHeadLists, "0VNV VV VA VC VE 0"); +    else if (strcmp(pszCFGLeft, "UCP") == 0) +      strcpy(szaHeadLists, "0"); +    else if (strcmp(pszCFGLeft, "WHNP") == 0) +      strcpy(szaHeadLists, "0WHNP NP NN NT NR QP 0"); +    else if (strcmp(pszCFGLeft, "WHPP") == 0) +      strcpy(szaHeadLists, "1WHPP PP P 1"); + +    /*  //head rules for GENIA corpus +    if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) +            strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP +    0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" ); +    else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) +            strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN +    1NP 1JJS 1NN 1" ); +    else if( strcmp( pszCFGLeft, "CONJP" ) == 0 ) +            strcpy( szaHeadLists, "1CC 1RB 1IN 1" ); +    else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) +            strcpy( szaHeadLists, "1" ); +    else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) +            strcpy( szaHeadLists, "0" ); +    else if( strcmp( pszCFGLeft, "LST" ) == 0 ) +            strcpy( szaHeadLists, "1LS 1: 1CLN 1" ); +    else if( strcmp( pszCFGLeft, "NAC" ) == 0 ) +            strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP +    0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" ); +    else if( strcmp( pszCFGLeft, "PP" ) == 0 ) +            strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" ); +    else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) +            strcpy( szaHeadLists, "1" ); +    else if( strcmp( pszCFGLeft, "PRT" ) == 0 ) +            strcpy( szaHeadLists, "1RP 1" ); +    else if( strcmp( pszCFGLeft, "QP" ) == 0 ) +            strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR +    0JJS 0" ); +    else if( strcmp( pszCFGLeft, "RRC" ) == 0 ) +            strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" ); +    else if( strcmp( pszCFGLeft, "S" ) == 0 ) +            strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" ); +    else if( strcmp( pszCFGLeft, "SBAR" ) == 0 ) +            strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ +    0SINV 0SBAR 0FRAG 0" ); +    else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 ) +            strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" ); +    else if( strcmp( pszCFGLeft, "SINV" ) == 0 ) +            strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP +    0" ); +    else if( strcmp( pszCFGLeft, "SQ" ) == 0 ) +            strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" ); +    else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) +            strcpy( szaHeadLists, "1" ); +    else if( strcmp( pszCFGLeft, "VP" ) == 0 ) +            strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP +    0ADJP 0NN 0NNS 0NP 0" ); +    else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 ) +            strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" ); +    else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 ) +            strcpy( szaHeadLists, "1CC 1WRB 1" ); +    else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) +            strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" ); +    else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) +            strcpy( szaHeadLists, "1IN 1TO FW 1" ); +    else if( strcmp( pszCFGLeft, "NP" ) == 0 ) +            strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN +    0CD 0JJ JJS RB QP 0" ); +    */ + +    return fnMyOwnHeadWordRule(szaHeadLists, vectRight); +  } + + private: +  int fnMyOwnHeadWordRule(char *pszaHeadLists, CVectorStr vectRight) { +    char szHeadList[201], *p; +    char szTerm[101]; +    int J; + +    p = pszaHeadLists; + +    int iCountRight; + +    iCountRight = vectRight.size(); + +    szHeadList[0] = '\0'; +    while (1) { +      szTerm[0] = '\0'; +      sscanf(p, "%s", szTerm); +      if (strlen(szHeadList) == 0) { +        if (strcmp(szTerm, "0") == 0) { +          return iCountRight - 1; +        } +        if (strcmp(szTerm, "1") == 0) { +          return 0; +        } + +        sprintf(szHeadList, "%c %s ", szTerm[0], szTerm + 1); +        p = strstr(p, szTerm); +        p += strlen(szTerm); +      } else { +        if ((szTerm[0] == '0') || (szTerm[0] == '1')) { +          if (szHeadList[0] == '0') { +            for (J = iCountRight - 1; J >= 0; J--) { +              sprintf(szTerm, " %s ", vectRight.at(J).c_str()); +              if (strstr(szHeadList, szTerm) != NULL) return J; +            } +          } else { +            for (J = 0; J < iCountRight; J++) { +              sprintf(szTerm, " %s ", vectRight.at(J).c_str()); +              if (strstr(szHeadList, szTerm) != NULL) return J; +            } +          } + +          szHeadList[0] = '\0'; +        } else { +          strcat(szHeadList, szTerm); +          strcat(szHeadList, " "); + +          p = strstr(p, szTerm); +          p += strlen(szTerm); +        } +      } +    } + +    return 0; +  }  }; -struct SParsedTree{ -	SParsedTree( ) { -		m_ptRoot = NULL; -	} -	~SParsedTree( ) { -		if (m_ptRoot != NULL) -			delete m_ptRoot; -	} -	static SParsedTree* fnConvertFromString(const char* pszStr) { -		if (strcmp(pszStr, "(())") == 0) -			return NULL; -		SParsedTree* pTree = new SParsedTree(); - -		vector<string> vecSyn; -		fnReadSyntactic(pszStr, vecSyn); - -		int iLeft = 1, iRight = 1; //# left/right parenthesis - -		STreeItem *pcurrent; - -		pTree->m_ptRoot = new STreeItem(vecSyn[1].c_str()); - -		pcurrent = pTree->m_ptRoot; - -		for (size_t i = 2; i < vecSyn.size() - 1; i++) { -			if ( strcmp(vecSyn[i].c_str(), "(") == 0 ) -					iLeft++; -			else if (strcmp(vecSyn[i].c_str(), ")") == 0 ) { -				iRight++; -				if (pcurrent == NULL) { -					//error -					fprintf(stderr, "ERROR in ConvertFromString\n"); -					fprintf(stderr, "%s\n", pszStr); -					return NULL; -				} -				pcurrent = pcurrent->m_ptParent; -			} else { -				STreeItem *ptNewItem = new STreeItem(vecSyn[i].c_str()); -				pcurrent->fnAppend( ptNewItem ); -				pcurrent = ptNewItem; - -				if (strcmp(vecSyn[i - 1].c_str(), "(" ) != 0 -						&& strcmp(vecSyn[i - 1].c_str(), ")" ) != 0 ) { -					pTree->m_vecTerminals.push_back(ptNewItem); -					pcurrent = pcurrent->m_ptParent; -				} -			} -		} - -		if ( iLeft != iRight ) { -			//error -			fprintf(stderr, "the left and right parentheses are not matched!"); -			fprintf(stderr, "ERROR in ConvertFromString\n"); -			fprintf(stderr, "%s\n", pszStr); -			return NULL; -		} - -		return pTree; -	} - -	int fnGetNumWord() { -		return m_vecTerminals.size(); -	} - -	void fnSetSpanInfo() { -		int iNextNum = 0; -		fnSuffixTraverseSetSpanInfo(m_ptRoot, iNextNum); -	} - -	void fnSetHeadWord() { -		for (size_t i = 0; i < m_vecTerminals.size(); i++) -			m_vecTerminals[i]->m_iHeadWord = i; -		SGetHeadWord *pGetHeadWord = new SGetHeadWord(); -		fnSuffixTraverseSetHeadWord(m_ptRoot, pGetHeadWord); -		delete pGetHeadWord; -	} - -	STreeItem *fnFindNodeForSpan(int iLeft, int iRight, bool bLowest) { -		STreeItem *pTreeItem = m_vecTerminals[iLeft]; - -		while (pTreeItem->m_iEnd < iRight) { -			pTreeItem = pTreeItem->m_ptParent; -			if (pTreeItem == NULL) break; -		} -		if (pTreeItem == NULL) -			return NULL; -		if (pTreeItem->m_iEnd > iRight) -			return NULL; - -		assert(pTreeItem->m_iEnd == iRight); -		if (bLowest) -			return pTreeItem; - -		while (pTreeItem->m_ptParent != NULL && pTreeItem->m_ptParent->fnGetChildrenNum() == 1) -			pTreeItem = pTreeItem->m_ptParent; - -		return pTreeItem; -	} - -private: -	void fnSuffixTraverseSetSpanInfo(STreeItem *ptItem, int& iNextNum) { -		int I; -		int iNumChildren = ptItem->fnGetChildrenNum(); -		for ( I = 0; I < iNumChildren; I++ ) -			fnSuffixTraverseSetSpanInfo(ptItem->m_vecChildren[ I ], iNextNum); - -		if ( I == 0 ) -		{ -			ptItem->m_iBegin = iNextNum; -			ptItem->m_iEnd = iNextNum++; -		} -		else -		{ -			ptItem->m_iBegin = ptItem->m_vecChildren[0]->m_iBegin; -			ptItem->m_iEnd = ptItem->m_vecChildren[I - 1]->m_iEnd; -		} -	} - - -	void fnSuffixTraverseSetHeadWord(STreeItem *ptItem, SGetHeadWord *pGetHeadWord) { -		int I, iHeadchild; - -		if ( ptItem->m_vecChildren.size() == 0 ) -			return; - -		for ( I = 0; I < ptItem->m_vecChildren.size(); I++ ) -			fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[I], pGetHeadWord); - -		vector<string> vecRight; - - -		if ( ptItem->m_vecChildren.size() == 1 ) -			iHeadchild = 0; -		else -		{ -			for ( I = 0; I < ptItem->m_vecChildren.size(); I++ ) -				vecRight.push_back( string( ptItem->m_vecChildren[ I ]->m_pszTerm ) ); - -			iHeadchild = pGetHeadWord->fnGetHeadWord( ptItem->m_pszTerm, vecRight ); -		} - -		ptItem->m_iHeadChild = iHeadchild; -		ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord; -	} - - -	static void fnReadSyntactic(const char *pszSyn, vector<string>& vec) { -		char *p; -		int I; - -		int iLeftNum, iRightNum; -		char *pszTmp, *pszTerm; -		pszTmp = new char[strlen(pszSyn)]; -		pszTerm = new char[strlen(pszSyn)]; -		pszTmp[0] = pszTerm[0] = '\0'; - -		vec.clear(); - -		char *pszLine; -		pszLine = new char[strlen(pszSyn) + 1]; -		strcpy( pszLine, pszSyn ); - -		char *pszLine2; - -		while( 1 ) { -			while(( strlen( pszLine ) > 0 ) -					&&( pszLine[ strlen( pszLine ) - 1 ] > 0 ) -					&&( pszLine[ strlen( pszLine ) - 1 ] <= ' ' ) ) -				pszLine[ strlen( pszLine ) - 1 ] = '\0'; - -			if( strlen( pszLine ) == 0 ) -				break; - -			//printf( "%s\n", pszLine ); -			pszLine2 = pszLine; -			while( pszLine2[ 0 ] <= ' ' ) -				pszLine2 ++; -			if( pszLine2[ 0 ] == '<' ) -				continue; - -			sscanf( pszLine2 + 1, "%s", pszTmp ); - -			if ( pszLine2[ 0 ] == '(' ) { -				iLeftNum = 0; -				iRightNum = 0; -			} - -			p = pszLine2; -			while ( 1 ) { -				pszTerm[ 0 ] = '\0'; -				sscanf( p, "%s", pszTerm ); - -				if( strlen( pszTerm ) == 0 ) -					break; -				p = strstr( p, pszTerm ); -				p += strlen( pszTerm ); - -				if( ( pszTerm[ 0 ] == '(' ) -						||( pszTerm[ strlen( pszTerm ) - 1 ] == ')' ) ) { -					if( pszTerm[ 0 ] == '(' ) { -						vec.push_back(string("(")); -						iLeftNum++; - -						I = 1; -						while ( pszTerm[ I ] == '(' && pszTerm[ I ]!= '\0' ) { -							vec.push_back(string("(")); -							iLeftNum++; - -							I++; -						} - -						if( strlen( pszTerm ) > 1 ) -							vec.push_back(string(pszTerm + I)); -					} else { -						char *pTmp; -						pTmp = pszTerm + strlen( pszTerm ) - 1; -						while( ( pTmp[ 0 ] == ')' ) && ( pTmp >= pszTerm ) ) -							pTmp --; -						pTmp[ 1 ] = '\0'; - -						if( strlen( pszTerm ) > 0 ) -							vec.push_back(string(pszTerm)); -						pTmp += 2; - -						for( I = 0; I <= (int)strlen( pTmp ); I ++ ) { -							vec.push_back(string(")")); -							iRightNum++; -						} -					} -				} else { -					char *q; -					q = strchr( pszTerm, ')' ); -					if ( q != NULL ) { -						q[ 0 ] = '\0'; -						if ( pszTerm[ 0 ] != '\0' ) -							vec.push_back(string(pszTerm)); -						vec.push_back(string(")")); -						iRightNum++; - -						q++; -						while ( q[ 0 ] == ')' ) { -							vec.push_back(string(")")); -							q++; -							iRightNum ++; -						} - -						while( q[ 0 ] == '(' ) { -							vec.push_back(string("(")); -							q++; -							iLeftNum++; -						} - -						if ( q[ 0 ] != '\0' ) -							vec.push_back(string(q)); -					} -					else -						vec.push_back(string(pszTerm)); -				} -			} - -			if (iLeftNum != iRightNum) { -				fprintf(stderr, "%s\n", pszSyn); -				assert(iLeftNum == iRightNum); -			} -			/*if ( iLeftNum != iRightNum ) { -				printf( "ERROR: left( and right ) is not matched, %d ( and %d )\n", iLeftNum, iRightNum ); -				return; -			}*/ - - -			if( vec.size() >= 2 -					&& strcmp( vec[1].c_str(), "(" ) == 0 ) { -				//( (IP..) ) -				std::vector<string>::iterator it; -				it = vec.begin(); -				it++; -				vec.insert(it, string("ROOT")); -			} - -			break; -		} - -		delete [] pszLine; -		delete [] pszTmp; -		delete [] pszTerm; -	} - -public: -	STreeItem *m_ptRoot; -	vector<STreeItem*>  m_vecTerminals; //the leaf nodes +struct SParsedTree { +  SParsedTree() { m_ptRoot = NULL; } +  ~SParsedTree() { +    if (m_ptRoot != NULL) delete m_ptRoot; +  } +  static SParsedTree *fnConvertFromString(const char *pszStr) { +    if (strcmp(pszStr, "(())") == 0) return NULL; +    SParsedTree *pTree = new SParsedTree(); + +    vector<string> vecSyn; +    fnReadSyntactic(pszStr, vecSyn); + +    int iLeft = 1, iRight = 1;  //# left/right parenthesis + +    STreeItem *pcurrent; + +    pTree->m_ptRoot = new STreeItem(vecSyn[1].c_str()); + +    pcurrent = pTree->m_ptRoot; + +    for (size_t i = 2; i < vecSyn.size() - 1; i++) { +      if (strcmp(vecSyn[i].c_str(), "(") == 0) +        iLeft++; +      else if (strcmp(vecSyn[i].c_str(), ")") == 0) { +        iRight++; +        if (pcurrent == NULL) { +          // error +          fprintf(stderr, "ERROR in ConvertFromString\n"); +          fprintf(stderr, "%s\n", pszStr); +          return NULL; +        } +        pcurrent = pcurrent->m_ptParent; +      } else { +        STreeItem *ptNewItem = new STreeItem(vecSyn[i].c_str()); +        pcurrent->fnAppend(ptNewItem); +        pcurrent = ptNewItem; + +        if (strcmp(vecSyn[i - 1].c_str(), "(") != 0 && +            strcmp(vecSyn[i - 1].c_str(), ")") != 0) { +          pTree->m_vecTerminals.push_back(ptNewItem); +          pcurrent = pcurrent->m_ptParent; +        } +      } +    } + +    if (iLeft != iRight) { +      // error +      fprintf(stderr, "the left and right parentheses are not matched!"); +      fprintf(stderr, "ERROR in ConvertFromString\n"); +      fprintf(stderr, "%s\n", pszStr); +      return NULL; +    } + +    return pTree; +  } + +  int fnGetNumWord() { return m_vecTerminals.size(); } + +  void fnSetSpanInfo() { +    int iNextNum = 0; +    fnSuffixTraverseSetSpanInfo(m_ptRoot, iNextNum); +  } + +  void fnSetHeadWord() { +    for (size_t i = 0; i < m_vecTerminals.size(); i++) +      m_vecTerminals[i]->m_iHeadWord = i; +    SGetHeadWord *pGetHeadWord = new SGetHeadWord(); +    fnSuffixTraverseSetHeadWord(m_ptRoot, pGetHeadWord); +    delete pGetHeadWord; +  } + +  STreeItem *fnFindNodeForSpan(int iLeft, int iRight, bool bLowest) { +    STreeItem *pTreeItem = m_vecTerminals[iLeft]; + +    while (pTreeItem->m_iEnd < iRight) { +      pTreeItem = pTreeItem->m_ptParent; +      if (pTreeItem == NULL) break; +    } +    if (pTreeItem == NULL) return NULL; +    if (pTreeItem->m_iEnd > iRight) return NULL; + +    assert(pTreeItem->m_iEnd == iRight); +    if (bLowest) return pTreeItem; + +    while (pTreeItem->m_ptParent != NULL && +           pTreeItem->m_ptParent->fnGetChildrenNum() == 1) +      pTreeItem = pTreeItem->m_ptParent; + +    return pTreeItem; +  } + + private: +  void fnSuffixTraverseSetSpanInfo(STreeItem *ptItem, int &iNextNum) { +    int I; +    int iNumChildren = ptItem->fnGetChildrenNum(); +    for (I = 0; I < iNumChildren; I++) +      fnSuffixTraverseSetSpanInfo(ptItem->m_vecChildren[I], iNextNum); + +    if (I == 0) { +      ptItem->m_iBegin = iNextNum; +      ptItem->m_iEnd = iNextNum++; +    } else { +      ptItem->m_iBegin = ptItem->m_vecChildren[0]->m_iBegin; +      ptItem->m_iEnd = ptItem->m_vecChildren[I - 1]->m_iEnd; +    } +  } + +  void fnSuffixTraverseSetHeadWord(STreeItem *ptItem, +                                   SGetHeadWord *pGetHeadWord) { +    int I, iHeadchild; + +    if (ptItem->m_vecChildren.size() == 0) return; + +    for (I = 0; I < ptItem->m_vecChildren.size(); I++) +      fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[I], pGetHeadWord); + +    vector<string> vecRight; + +    if (ptItem->m_vecChildren.size() == 1) +      iHeadchild = 0; +    else { +      for (I = 0; I < ptItem->m_vecChildren.size(); I++) +        vecRight.push_back(string(ptItem->m_vecChildren[I]->m_pszTerm)); + +      iHeadchild = pGetHeadWord->fnGetHeadWord(ptItem->m_pszTerm, vecRight); +    } + +    ptItem->m_iHeadChild = iHeadchild; +    ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord; +  } + +  static void fnReadSyntactic(const char *pszSyn, vector<string> &vec) { +    char *p; +    int I; + +    int iLeftNum, iRightNum; +    char *pszTmp, *pszTerm; +    pszTmp = new char[strlen(pszSyn)]; +    pszTerm = new char[strlen(pszSyn)]; +    pszTmp[0] = pszTerm[0] = '\0'; + +    vec.clear(); + +    char *pszLine; +    pszLine = new char[strlen(pszSyn) + 1]; +    strcpy(pszLine, pszSyn); + +    char *pszLine2; + +    while (1) { +      while ((strlen(pszLine) > 0) && (pszLine[strlen(pszLine) - 1] > 0) && +             (pszLine[strlen(pszLine) - 1] <= ' ')) +        pszLine[strlen(pszLine) - 1] = '\0'; + +      if (strlen(pszLine) == 0) break; + +      // printf( "%s\n", pszLine ); +      pszLine2 = pszLine; +      while (pszLine2[0] <= ' ') pszLine2++; +      if (pszLine2[0] == '<') continue; + +      sscanf(pszLine2 + 1, "%s", pszTmp); + +      if (pszLine2[0] == '(') { +        iLeftNum = 0; +        iRightNum = 0; +      } + +      p = pszLine2; +      while (1) { +        pszTerm[0] = '\0'; +        sscanf(p, "%s", pszTerm); + +        if (strlen(pszTerm) == 0) break; +        p = strstr(p, pszTerm); +        p += strlen(pszTerm); + +        if ((pszTerm[0] == '(') || (pszTerm[strlen(pszTerm) - 1] == ')')) { +          if (pszTerm[0] == '(') { +            vec.push_back(string("(")); +            iLeftNum++; + +            I = 1; +            while (pszTerm[I] == '(' && pszTerm[I] != '\0') { +              vec.push_back(string("(")); +              iLeftNum++; + +              I++; +            } + +            if (strlen(pszTerm) > 1) vec.push_back(string(pszTerm + I)); +          } else { +            char *pTmp; +            pTmp = pszTerm + strlen(pszTerm) - 1; +            while ((pTmp[0] == ')') && (pTmp >= pszTerm)) pTmp--; +            pTmp[1] = '\0'; + +            if (strlen(pszTerm) > 0) vec.push_back(string(pszTerm)); +            pTmp += 2; + +            for (I = 0; I <= (int)strlen(pTmp); I++) { +              vec.push_back(string(")")); +              iRightNum++; +            } +          } +        } else { +          char *q; +          q = strchr(pszTerm, ')'); +          if (q != NULL) { +            q[0] = '\0'; +            if (pszTerm[0] != '\0') vec.push_back(string(pszTerm)); +            vec.push_back(string(")")); +            iRightNum++; + +            q++; +            while (q[0] == ')') { +              vec.push_back(string(")")); +              q++; +              iRightNum++; +            } + +            while (q[0] == '(') { +              vec.push_back(string("(")); +              q++; +              iLeftNum++; +            } + +            if (q[0] != '\0') vec.push_back(string(q)); +          } else +            vec.push_back(string(pszTerm)); +        } +      } + +      if (iLeftNum != iRightNum) { +        fprintf(stderr, "%s\n", pszSyn); +        assert(iLeftNum == iRightNum); +      } +      /*if ( iLeftNum != iRightNum ) { +              printf( "ERROR: left( and right ) is not matched, %d ( and %d +      )\n", iLeftNum, iRightNum ); +              return; +      }*/ + +      if (vec.size() >= 2 && strcmp(vec[1].c_str(), "(") == 0) { +        //( (IP..) ) +        std::vector<string>::iterator it; +        it = vec.begin(); +        it++; +        vec.insert(it, string("ROOT")); +      } + +      break; +    } + +    delete[] pszLine; +    delete[] pszTmp; +    delete[] pszTerm; +  } + + public: +  STreeItem *m_ptRoot; +  vector<STreeItem *> m_vecTerminals;  // the leaf nodes  };  struct SParseReader { -	SParseReader(const char* pszParse_Fname, bool bFlattened = false) : -	m_bFlattened(bFlattened){ -		m_fpIn = fopen( pszParse_Fname, "r" ); -		assert(m_fpIn != NULL); -	} -	~SParseReader() { -		if (m_fpIn != NULL) -			fclose(m_fpIn); -	} - -	SParsedTree* fnReadNextParseTree( ) { -		SParsedTree *pTree = NULL; -		char *pszLine = new char[100001]; -		int iLen; - -		while (fnReadNextSentence(pszLine, &iLen) == true ) { -			if (iLen == 0) -				continue; - -			pTree = SParsedTree::fnConvertFromString(pszLine); -			if (pTree == NULL)  -				break; -			if (m_bFlattened) -				fnPostProcessingFlattenedParse(pTree); -			else { -				pTree->fnSetSpanInfo(); -				pTree->fnSetHeadWord(); -			} -			break; -		} - -		delete [] pszLine; -		return pTree; -	} - -	SParsedTree* fnReadNextParseTreeWithProb(double *pProb) { -		SParsedTree *pTree = NULL; -		char *pszLine = new char[100001]; -		int iLen; - -		while (fnReadNextSentence(pszLine, &iLen) == true ) { -			if (iLen == 0) -				continue; - -			char *p = strchr(pszLine, ' '); -			assert(p != NULL); -			p[0] = '\0'; -			p++; -			if (pProb) -				(*pProb) = atof(pszLine); - -			pTree = SParsedTree::fnConvertFromString(p); -			if (m_bFlattened) -				fnPostProcessingFlattenedParse(pTree); -			else { -				pTree->fnSetSpanInfo(); -				pTree->fnSetHeadWord(); -			} -			break; -		} - -		delete [] pszLine; -		return pTree; -	} -private: -	/* -	 * since to the parse tree is a flattened tree, use the head mark to identify head info. -	 * the head node will be marked as "*XP*" -	 */ -	void fnSetParseTreeHeadInfo(SParsedTree *pTree) { -		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) -			pTree->m_vecTerminals[i]->m_iHeadWord = i; -		fnSuffixTraverseSetHeadWord(pTree->m_ptRoot); -	} - -	void fnSuffixTraverseSetHeadWord(STreeItem *pTreeItem) { -		if (pTreeItem->m_vecChildren.size() == 0) -			return; - -		for (size_t i = 0; i < pTreeItem->m_vecChildren.size(); i++ ) -			fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]); - -		vector<string> vecRight; - -		int iHeadchild; - -		if (pTreeItem->fnIsPreTerminal()) { -			iHeadchild = 0; -		} else { -			size_t i; -			for (i = 0; i < pTreeItem->m_vecChildren.size(); i++) { -				char *p = pTreeItem->m_vecChildren[i]->m_pszTerm; -				if (p[0] == '*' && p[strlen(p) - 1] == '*') { -					iHeadchild = i; -					p[strlen(p) - 1] = '\0'; -					string str = p + 1; -					strcpy(p, str.c_str()); //erase the "*..*" -					break; -				} -			} -			assert(i < pTreeItem->m_vecChildren.size()); -		} - -		pTreeItem->m_iHeadChild = iHeadchild; -		pTreeItem->m_iHeadWord = pTreeItem->m_vecChildren[iHeadchild]->m_iHeadWord; -	} -	void fnPostProcessingFlattenedParse(SParsedTree *pTree) { -		pTree->fnSetSpanInfo(); -		fnSetParseTreeHeadInfo(pTree); -	} -	bool fnReadNextSentence(char *pszLine, int* piLength) { -		if (feof(m_fpIn) == true) -			return false; - -		int iLen; - -		pszLine[ 0 ] = '\0'; - -		fgets(pszLine, 10001, m_fpIn); -		iLen = strlen(pszLine); -		while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { -			pszLine[ iLen - 1 ] = '\0'; -			iLen--; -		} - -		if ( piLength != NULL ) -			(*piLength) = iLen; - -		return true; -	} -private: -	FILE *m_fpIn; -	const bool m_bFlattened; +  SParseReader(const char *pszParse_Fname, bool bFlattened = false) +      : m_bFlattened(bFlattened) { +    m_fpIn = fopen(pszParse_Fname, "r"); +    assert(m_fpIn != NULL); +  } +  ~SParseReader() { +    if (m_fpIn != NULL) fclose(m_fpIn); +  } + +  SParsedTree *fnReadNextParseTree() { +    SParsedTree *pTree = NULL; +    char *pszLine = new char[100001]; +    int iLen; + +    while (fnReadNextSentence(pszLine, &iLen) == true) { +      if (iLen == 0) continue; + +      pTree = SParsedTree::fnConvertFromString(pszLine); +      if (pTree == NULL) break; +      if (m_bFlattened) +        fnPostProcessingFlattenedParse(pTree); +      else { +        pTree->fnSetSpanInfo(); +        pTree->fnSetHeadWord(); +      } +      break; +    } + +    delete[] pszLine; +    return pTree; +  } + +  SParsedTree *fnReadNextParseTreeWithProb(double *pProb) { +    SParsedTree *pTree = NULL; +    char *pszLine = new char[100001]; +    int iLen; + +    while (fnReadNextSentence(pszLine, &iLen) == true) { +      if (iLen == 0) continue; + +      char *p = strchr(pszLine, ' '); +      assert(p != NULL); +      p[0] = '\0'; +      p++; +      if (pProb) (*pProb) = atof(pszLine); + +      pTree = SParsedTree::fnConvertFromString(p); +      if (m_bFlattened) +        fnPostProcessingFlattenedParse(pTree); +      else { +        pTree->fnSetSpanInfo(); +        pTree->fnSetHeadWord(); +      } +      break; +    } + +    delete[] pszLine; +    return pTree; +  } + + private: +  /* +   * since to the parse tree is a flattened tree, use the head mark to identify +   * head info. +   * the head node will be marked as "*XP*" +   */ +  void fnSetParseTreeHeadInfo(SParsedTree *pTree) { +    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) +      pTree->m_vecTerminals[i]->m_iHeadWord = i; +    fnSuffixTraverseSetHeadWord(pTree->m_ptRoot); +  } + +  void fnSuffixTraverseSetHeadWord(STreeItem *pTreeItem) { +    if (pTreeItem->m_vecChildren.size() == 0) return; + +    for (size_t i = 0; i < pTreeItem->m_vecChildren.size(); i++) +      fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]); + +    vector<string> vecRight; + +    int iHeadchild; + +    if (pTreeItem->fnIsPreTerminal()) { +      iHeadchild = 0; +    } else { +      size_t i; +      for (i = 0; i < pTreeItem->m_vecChildren.size(); i++) { +        char *p = pTreeItem->m_vecChildren[i]->m_pszTerm; +        if (p[0] == '*' && p[strlen(p) - 1] == '*') { +          iHeadchild = i; +          p[strlen(p) - 1] = '\0'; +          string str = p + 1; +          strcpy(p, str.c_str());  // erase the "*..*" +          break; +        } +      } +      assert(i < pTreeItem->m_vecChildren.size()); +    } + +    pTreeItem->m_iHeadChild = iHeadchild; +    pTreeItem->m_iHeadWord = pTreeItem->m_vecChildren[iHeadchild]->m_iHeadWord; +  } +  void fnPostProcessingFlattenedParse(SParsedTree *pTree) { +    pTree->fnSetSpanInfo(); +    fnSetParseTreeHeadInfo(pTree); +  } +  bool fnReadNextSentence(char *pszLine, int *piLength) { +    if (feof(m_fpIn) == true) return false; + +    int iLen; + +    pszLine[0] = '\0'; + +    fgets(pszLine, 10001, m_fpIn); +    iLen = strlen(pszLine); +    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { +      pszLine[iLen - 1] = '\0'; +      iLen--; +    } + +    if (piLength != NULL) (*piLength) = iLen; + +    return true; +  } + + private: +  FILE *m_fpIn; +  const bool m_bFlattened;  };  #endif /* TREE_H_ */ diff --git a/utils/synutils/tsuruoka_maxent.h b/utils/synutils/tsuruoka_maxent.h index 790876ab..b5a87404 100644 --- a/utils/synutils/tsuruoka_maxent.h +++ b/utils/synutils/tsuruoka_maxent.h @@ -18,143 +18,138 @@  using namespace std; -  typedef std::tr1::unordered_map<std::string, int> Map;  typedef std::tr1::unordered_map<std::string, int>::iterator Iterator; - - -struct Tsuruoka_Maxent{ -	Tsuruoka_Maxent(const char* pszModelFName) { -		if (pszModelFName != NULL) { -			m_pModel = new ME_Model(); -			m_pModel->load_from_file(pszModelFName); -		} else -			m_pModel = NULL; -	} - -	~Tsuruoka_Maxent() { -		if (m_pModel != NULL) -			delete m_pModel; -	} - -	void fnTrain(const char* pszInstanceFName, const char* pszAlgorithm, const char* pszModelFName, int iNumIteration) { -		assert(strcmp(pszAlgorithm, "l1") == 0 -				|| strcmp(pszAlgorithm, "l2") == 0 -				|| strcmp(pszAlgorithm, "sgd") == 0 -				|| strcmp(pszAlgorithm, "SGD") == 0); -		FILE *fpIn = fopen(pszInstanceFName, "r"); - -		ME_Model *pModel = new ME_Model(); - -		char *pszLine = new char[100001]; -		int iNumInstances = 0; -		int iLen; -		while (!feof(fpIn)) { -			pszLine[0] = '\0'; -			fgets(pszLine, 20000, fpIn); -			if (strlen(pszLine) == 0) { -				continue; -			} - -			iLen = strlen(pszLine); -			while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { -				pszLine[ iLen - 1 ] = '\0'; -				iLen--; -			} - - -			iNumInstances++; - -			ME_Sample *pmes = new ME_Sample(); - -			char *p = strrchr(pszLine, ' '); -			assert(p != NULL); -			p[0] = '\0'; -			p++; -			vector<string> vecContext; -			SplitOnWhitespace(string(pszLine), &vecContext); - -			pmes->label = string(p); -			for (size_t i = 0; i < vecContext.size(); i++) -				pmes->add_feature(vecContext[i]); -			pModel->add_training_sample((*pmes)); -			if (iNumInstances % 100000 == 0) -				fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); -			delete pmes; -		} -		fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); -		fclose(fpIn); - -		if (strcmp(pszAlgorithm, "l1") == 0) -			pModel->use_l1_regularizer(1.0); -		else if (strcmp(pszAlgorithm, "l2") == 0) -			pModel->use_l2_regularizer(1.0); -		else -			pModel->use_SGD(); - -		pModel->train(); -		pModel->save_to_file(pszModelFName); - -		delete pModel; -		fprintf(stdout, "......Finished Training\n"); -		fprintf(stdout, "......Model saved as %s\n", pszModelFName); -		delete [] pszLine; -	} - -	double fnEval(const char* pszContext, const char* pszOutcome) const { -		vector<string> vecContext; -		ME_Sample *pmes = new ME_Sample(); -		SplitOnWhitespace(string(pszContext), &vecContext); - -		for (size_t i = 0; i < vecContext.size(); i++) -			pmes->add_feature(vecContext[i]); -		vector<double> vecProb = m_pModel->classify(*pmes); -		delete pmes; -		int iLableID = m_pModel->get_class_id(pszOutcome); -		return vecProb[iLableID]; -	} -	void fnEval(const char* pszContext, vector<pair<string, double> >& vecOutput) const { -		vector<string> vecContext; -		ME_Sample *pmes = new ME_Sample(); -		SplitOnWhitespace(string(pszContext), &vecContext); - -		vecOutput.clear(); - -		for (size_t i = 0; i < vecContext.size(); i++) -			pmes->add_feature(vecContext[i]); -		vector<double> vecProb = m_pModel->classify(*pmes); - -		for (size_t i = 0; i < vecProb.size(); i++) { -			string label = m_pModel->get_class_label(i); -			vecOutput.push_back(make_pair(label, vecProb[i])); -		} -		delete pmes; -	} -	void fnEval(const char* pszContext, vector<double>& vecOutput) const{ -		vector<string> vecContext; -		ME_Sample *pmes = new ME_Sample(); -		SplitOnWhitespace(string(pszContext), &vecContext); - -		vecOutput.clear(); - -		for (size_t i = 0; i < vecContext.size(); i++) -			pmes->add_feature(vecContext[i]); -		vector<double> vecProb = m_pModel->classify(*pmes); - -		for (size_t i = 0; i < vecProb.size(); i++) { -			string label = m_pModel->get_class_label(i); -			vecOutput.push_back(vecProb[i]); -		} -		delete pmes; -	} -	int fnGetClassId(const string& strLabel) const { -		return m_pModel->get_class_id(strLabel); -	} -private: -	ME_Model *m_pModel; +struct Tsuruoka_Maxent { +  Tsuruoka_Maxent(const char* pszModelFName) { +    if (pszModelFName != NULL) { +      m_pModel = new ME_Model(); +      m_pModel->load_from_file(pszModelFName); +    } else +      m_pModel = NULL; +  } + +  ~Tsuruoka_Maxent() { +    if (m_pModel != NULL) delete m_pModel; +  } + +  void fnTrain(const char* pszInstanceFName, const char* pszAlgorithm, +               const char* pszModelFName, int iNumIteration) { +    assert(strcmp(pszAlgorithm, "l1") == 0 || strcmp(pszAlgorithm, "l2") == 0 || +           strcmp(pszAlgorithm, "sgd") == 0 || +           strcmp(pszAlgorithm, "SGD") == 0); +    FILE* fpIn = fopen(pszInstanceFName, "r"); + +    ME_Model* pModel = new ME_Model(); + +    char* pszLine = new char[100001]; +    int iNumInstances = 0; +    int iLen; +    while (!feof(fpIn)) { +      pszLine[0] = '\0'; +      fgets(pszLine, 20000, fpIn); +      if (strlen(pszLine) == 0) { +        continue; +      } + +      iLen = strlen(pszLine); +      while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { +        pszLine[iLen - 1] = '\0'; +        iLen--; +      } + +      iNumInstances++; + +      ME_Sample* pmes = new ME_Sample(); + +      char* p = strrchr(pszLine, ' '); +      assert(p != NULL); +      p[0] = '\0'; +      p++; +      vector<string> vecContext; +      SplitOnWhitespace(string(pszLine), &vecContext); + +      pmes->label = string(p); +      for (size_t i = 0; i < vecContext.size(); i++) +        pmes->add_feature(vecContext[i]); +      pModel->add_training_sample((*pmes)); +      if (iNumInstances % 100000 == 0) +        fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); +      delete pmes; +    } +    fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); +    fclose(fpIn); + +    if (strcmp(pszAlgorithm, "l1") == 0) +      pModel->use_l1_regularizer(1.0); +    else if (strcmp(pszAlgorithm, "l2") == 0) +      pModel->use_l2_regularizer(1.0); +    else +      pModel->use_SGD(); + +    pModel->train(); +    pModel->save_to_file(pszModelFName); + +    delete pModel; +    fprintf(stdout, "......Finished Training\n"); +    fprintf(stdout, "......Model saved as %s\n", pszModelFName); +    delete[] pszLine; +  } + +  double fnEval(const char* pszContext, const char* pszOutcome) const { +    vector<string> vecContext; +    ME_Sample* pmes = new ME_Sample(); +    SplitOnWhitespace(string(pszContext), &vecContext); + +    for (size_t i = 0; i < vecContext.size(); i++) +      pmes->add_feature(vecContext[i]); +    vector<double> vecProb = m_pModel->classify(*pmes); +    delete pmes; +    int iLableID = m_pModel->get_class_id(pszOutcome); +    return vecProb[iLableID]; +  } +  void fnEval(const char* pszContext, +              vector<pair<string, double> >& vecOutput) const { +    vector<string> vecContext; +    ME_Sample* pmes = new ME_Sample(); +    SplitOnWhitespace(string(pszContext), &vecContext); + +    vecOutput.clear(); + +    for (size_t i = 0; i < vecContext.size(); i++) +      pmes->add_feature(vecContext[i]); +    vector<double> vecProb = m_pModel->classify(*pmes); + +    for (size_t i = 0; i < vecProb.size(); i++) { +      string label = m_pModel->get_class_label(i); +      vecOutput.push_back(make_pair(label, vecProb[i])); +    } +    delete pmes; +  } +  void fnEval(const char* pszContext, vector<double>& vecOutput) const { +    vector<string> vecContext; +    ME_Sample* pmes = new ME_Sample(); +    SplitOnWhitespace(string(pszContext), &vecContext); + +    vecOutput.clear(); + +    for (size_t i = 0; i < vecContext.size(); i++) +      pmes->add_feature(vecContext[i]); +    vector<double> vecProb = m_pModel->classify(*pmes); + +    for (size_t i = 0; i < vecProb.size(); i++) { +      string label = m_pModel->get_class_label(i); +      vecOutput.push_back(vecProb[i]); +    } +    delete pmes; +  } +  int fnGetClassId(const string& strLabel) const { +    return m_pModel->get_class_id(strLabel); +  } + + private: +  ME_Model* m_pModel;  }; - -  #endif /* TSURUOKA_MAXENT_H_ */ diff --git a/utils/synutils/utility.h b/utils/synutils/utility.h index d3f52ee0..791b2286 100644 --- a/utils/synutils/utility.h +++ b/utils/synutils/utility.h @@ -8,7 +8,6 @@  #ifndef UTILITY_H_  #define UTILITY_H_ -  #include <zlib.h>  #include <stdio.h>  #include <assert.h> @@ -20,110 +19,107 @@ using namespace std;  typedef std::unordered_map<std::string, int> MapString2Int;  typedef std::unordered_map<std::string, float> MapString2Float; -typedef std::unordered_map<std::string, float>::iterator MapString2FloatIterator; +typedef std::unordered_map<std::string, float>::iterator +    MapString2FloatIterator;  using namespace std;  struct SFReader { -	SFReader( ){ } -	virtual ~SFReader( ){ } +  SFReader() {} +  virtual ~SFReader() {} -	virtual bool fnReadNextLine( char *pszLine, int* piLength ) = 0; -	virtual bool fnReadNextLine(string& strLine) = 0; +  virtual bool fnReadNextLine(char* pszLine, int* piLength) = 0; +  virtual bool fnReadNextLine(string& strLine) = 0;  }; -struct STxtFileReader: public SFReader { -	STxtFileReader(const char* pszFname) { -		m_fpIn = fopen(pszFname, "r"); -		assert(m_fpIn != NULL); -	} -	~STxtFileReader() { -		if (m_fpIn != NULL) -			fclose(m_fpIn); -	} - -	bool fnReadNextLine(char *pszLine, int* piLength) { -			if (feof(m_fpIn) == true) -				return false; - -			int iLen; - -			pszLine[ 0 ] = '\0'; - -			fgets(pszLine, 10001, m_fpIn); -			iLen = strlen(pszLine); -			if (iLen == 0) return false; -			while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { -				pszLine[ iLen - 1 ] = '\0'; -				iLen--; -			} - -			if ( piLength != NULL ) -				(*piLength) = iLen; - -			return true; -		} - -	bool fnReadNextLine(string& strLine) { -		char *pszLine = new char[10001]; -		bool bOut = fnReadNextLine(pszLine, NULL); -		if (bOut) strLine = string(pszLine); -		else strLine = string(""); -		delete [] pszLine; - -		return bOut; -	} -private: -	FILE *m_fpIn; +struct STxtFileReader : public SFReader { +  STxtFileReader(const char* pszFname) { +    m_fpIn = fopen(pszFname, "r"); +    assert(m_fpIn != NULL); +  } +  ~STxtFileReader() { +    if (m_fpIn != NULL) fclose(m_fpIn); +  } + +  bool fnReadNextLine(char* pszLine, int* piLength) { +    if (feof(m_fpIn) == true) return false; + +    int iLen; + +    pszLine[0] = '\0'; + +    fgets(pszLine, 10001, m_fpIn); +    iLen = strlen(pszLine); +    if (iLen == 0) return false; +    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { +      pszLine[iLen - 1] = '\0'; +      iLen--; +    } + +    if (piLength != NULL) (*piLength) = iLen; + +    return true; +  } + +  bool fnReadNextLine(string& strLine) { +    char* pszLine = new char[10001]; +    bool bOut = fnReadNextLine(pszLine, NULL); +    if (bOut) +      strLine = string(pszLine); +    else +      strLine = string(""); +    delete[] pszLine; + +    return bOut; +  } + + private: +  FILE* m_fpIn;  }; -struct SGZFileReader: public SFReader { -	SGZFileReader(const char* pszFname) { -		m_fpIn = gzopen( pszFname, "r" ); -		assert(m_fpIn != NULL); -	} -	~SGZFileReader() { -		if (m_fpIn != NULL) -			gzclose(m_fpIn); -	} - -	bool fnReadNextLine(char *pszLine, int* piLength) { -		if ( m_fpIn == NULL ) -			exit( 0 ); -		if ( gzeof( m_fpIn ) == true ) -			return false; - -		int iLen; - -		pszLine[ 0 ] = '\0'; - -		gzgets( m_fpIn, pszLine, 10001 ); -		iLen = strlen( pszLine ); -		while ( iLen > 0 && pszLine[ iLen - 1 ] > 0 && pszLine[ iLen -1 ] < 33 ) -		{ -			pszLine[ iLen - 1 ] = '\0'; -			iLen--; -		} - -		if ( piLength != NULL ) -			(*piLength) = iLen; - -		return true; -	} - -	bool fnReadNextLine(string& strLine) { -		char *pszLine = new char[10001]; -		bool bOut = fnReadNextLine(pszLine, NULL); -		if (bOut) strLine = string(pszLine); -		else strLine = string(""); -		delete [] pszLine; - -		return bOut; -	} -private: -	gzFile m_fpIn; +struct SGZFileReader : public SFReader { +  SGZFileReader(const char* pszFname) { +    m_fpIn = gzopen(pszFname, "r"); +    assert(m_fpIn != NULL); +  } +  ~SGZFileReader() { +    if (m_fpIn != NULL) gzclose(m_fpIn); +  } + +  bool fnReadNextLine(char* pszLine, int* piLength) { +    if (m_fpIn == NULL) exit(0); +    if (gzeof(m_fpIn) == true) return false; + +    int iLen; + +    pszLine[0] = '\0'; + +    gzgets(m_fpIn, pszLine, 10001); +    iLen = strlen(pszLine); +    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { +      pszLine[iLen - 1] = '\0'; +      iLen--; +    } + +    if (piLength != NULL) (*piLength) = iLen; + +    return true; +  } + +  bool fnReadNextLine(string& strLine) { +    char* pszLine = new char[10001]; +    bool bOut = fnReadNextLine(pszLine, NULL); +    if (bOut) +      strLine = string(pszLine); +    else +      strLine = string(""); +    delete[] pszLine; + +    return bOut; +  } + + private: +  gzFile m_fpIn;  }; -  #endif /* UTILITY_H_ */ - | 
