Import synutils

author: Wu, Ke <wuke@cs.umd.edu> 2014-10-07 17:12:32 -0400
committer: Wu, Ke <wuke@cs.umd.edu> 2014-10-07 17:12:32 -0400
commit: f762dbbf10a8204d0d0b82e9acb29feacd3b3bb4 (patch)
tree: 5b775aaed0a060ec13626eb90dba583e59feddc6 /utils/synutils/tree.h
parent: d88186af251ecae60974b20395ce75807bfdda35 (diff)
1 files changed, 731 insertions, 0 deletions
diff --git a/utils/synutils/tree.h b/utils/synutils/tree.h
new file mode 100644
index 00000000..658a470d
--- /dev/null
+++ b/utils/synutils/tree.h
@@ -0,0 +1,731 @@
+/*
+ * tree.h
+ *
+ *  Created on: May 23, 2013
+ *      Author: lijunhui
+ */
+
+#ifndef TREE_H_
+#define TREE_H_
+
+#include <string>
+#include <assert.h>
+#include <stdio.h>
+
+using namespace std;
+
+struct STreeItem{
+	STreeItem(const char* pszTerm) {
+		m_pszTerm = new char[ strlen( pszTerm ) + 1 ];
+		strcpy( m_pszTerm, pszTerm );
+
+		m_ptParent = NULL;
+		m_iBegin = -1;
+		m_iEnd = -1;
+		m_iHeadChild = -1;
+		m_iHeadWord = -1;
+		m_iBrotherIndex = -1;
+	}
+	~STreeItem( ) {
+		delete [] m_pszTerm;
+		for (size_t i = 0; i < m_vecChildren.size(); i++)
+			delete m_vecChildren[i];
+	}
+	int fnAppend(STreeItem *ptChild) {
+		m_vecChildren.push_back(ptChild);
+		ptChild->m_iBrotherIndex = m_vecChildren.size() - 1;
+		ptChild->m_ptParent = this;
+		return m_vecChildren.size() - 1;
+	}
+	int fnGetChildrenNum() {
+		return m_vecChildren.size();
+	}
+
+	bool fnIsPreTerminal( void ) {
+		int I;
+		if ( this == NULL || m_vecChildren.size() == 0 )
+			return false;
+
+		for ( I = 0; I < m_vecChildren.size(); I++ )
+			if (m_vecChildren[I]->m_vecChildren.size() > 0 )
+				return false;
+
+		return true;
+	}
+public:
+	char *m_pszTerm;
+
+	vector<STreeItem*> m_vecChildren;//children items
+	STreeItem *m_ptParent;//the parent item
+
+	int m_iBegin;
+	int m_iEnd; //the node span words[m_iBegin, m_iEnd]
+	int m_iHeadChild; //the index of its head child
+	int m_iHeadWord; //the index of its head word
+	int m_iBrotherIndex;//the index in his brothers
+};
+
+
+struct SGetHeadWord{
+	typedef vector<string> CVectorStr;
+	SGetHeadWord() {
+
+	}
+	~SGetHeadWord() {
+
+	}
+	int fnGetHeadWord( char *pszCFGLeft, CVectorStr vectRight ) {
+		//0 indicating from right to left while 1 indicating from left to right
+		char szaHeadLists[ 201 ] = "0";
+
+		/*  //head rules for Egnlish
+		if( strcmp( pszCFGLeft, "ADJP" ) == 0 )
+			strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" );
+		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 )
+			strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1" );
+		else if( strcmp( pszCFGLeft, "CONJP" ) == 0 )
+			strcpy( szaHeadLists, "1CC 1RB 1IN 1" );
+		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 )
+			strcpy( szaHeadLists, "1" );
+		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 )
+			strcpy( szaHeadLists, "0" );
+		else if( strcmp( pszCFGLeft, "LST" ) == 0 )
+			strcpy( szaHeadLists, "1LS 1: 1CLN 1" );
+		else if( strcmp( pszCFGLeft, "NAC" ) == 0 )
+			strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" );
+		else if( strcmp( pszCFGLeft, "PP" ) == 0 )
+			strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" );
+		else if( strcmp( pszCFGLeft, "PRN" ) == 0 )
+			strcpy( szaHeadLists, "1" );
+		else if( strcmp( pszCFGLeft, "PRT" ) == 0 )
+			strcpy( szaHeadLists, "1RP 1" );
+		else if( strcmp( pszCFGLeft, "QP" ) == 0 )
+			strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0" );
+		else if( strcmp( pszCFGLeft, "RRC" ) == 0 )
+			strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" );
+		else if( strcmp( pszCFGLeft, "S" ) == 0 )
+			strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" );
+		else if( strcmp( pszCFGLeft, "SBAR" ) == 0 )
+			strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR 0FRAG 0" );
+		else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 )
+			strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" );
+		else if( strcmp( pszCFGLeft, "SINV" ) == 0 )
+			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0" );
+		else if( strcmp( pszCFGLeft, "SQ" ) == 0 )
+			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" );
+		else if( strcmp( pszCFGLeft, "UCP" ) == 0 )
+			strcpy( szaHeadLists, "1" );
+		else if( strcmp( pszCFGLeft, "VP" ) == 0 )
+			strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS 0NP 0" );
+		else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 )
+			strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" );
+		else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 )
+			strcpy( szaHeadLists, "1CC 1WRB 1" );
+		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 )
+			strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" );
+		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 )
+			strcpy( szaHeadLists, "1IN 1TO FW 1" );
+		else if( strcmp( pszCFGLeft, "NP" ) == 0 )
+			strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB QP 0" );
+		*/
+
+		if( strcmp( pszCFGLeft, "ADJP" ) == 0 )
+			strcpy( szaHeadLists, "0ADJP JJ 0AD NN CS 0" );
+		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 )
+			strcpy( szaHeadLists, "0ADVP AD 0" );
+		else if( strcmp( pszCFGLeft, "CLP" ) == 0 )
+			strcpy( szaHeadLists, "0CLP M 0" );
+		else if( strcmp( pszCFGLeft, "CP" ) == 0 )
+			strcpy( szaHeadLists, "0DEC SP 1ADVP CS 0CP IP 0" );
+		else if( strcmp( pszCFGLeft, "DNP" ) == 0 )
+			strcpy( szaHeadLists, "0DNP DEG 0DEC 0" );
+		else if( strcmp( pszCFGLeft, "DVP" ) == 0 )
+			strcpy( szaHeadLists, "0DVP DEV 0" );
+		else if( strcmp( pszCFGLeft, "DP" ) == 0 )
+			strcpy( szaHeadLists, "1DP DT 1" );
+		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 )
+			strcpy( szaHeadLists, "0VV NR NN 0" );
+		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 )
+			strcpy( szaHeadLists, "0INTJ IJ 0" );
+		else if( strcmp( pszCFGLeft, "LST" ) == 0 )
+			strcpy( szaHeadLists, "1LST CD OD 1" );
+		else if( strcmp( pszCFGLeft, "IP" ) == 0 )
+			strcpy( szaHeadLists, "0IP VP 0VV 0" );
+			//strcpy( szaHeadLists, "0VP 0VV 1IP 0" );
+		else if( strcmp( pszCFGLeft, "LCP" ) == 0 )
+			strcpy( szaHeadLists, "0LCP LC 0" );
+		else if( strcmp( pszCFGLeft, "NP" ) == 0 )
+			strcpy( szaHeadLists, "0NP NN NT NR QP 0" );
+		else if( strcmp( pszCFGLeft, "PP" ) == 0 )
+			strcpy( szaHeadLists, "1PP P 1" );
+		else if( strcmp( pszCFGLeft, "PRN" ) == 0 )
+			strcpy( szaHeadLists, "0 NP IP VP NT NR NN 0" );
+		else if( strcmp( pszCFGLeft, "QP" ) == 0 )
+			strcpy( szaHeadLists, "0QP CLP CD OD 0" );
+		else if( strcmp( pszCFGLeft, "VP" ) == 0 )
+			strcpy( szaHeadLists, "1VP VA VC VE VV BA LB VCD VSB VRD VNV VCP 1" );
+		else if( strcmp( pszCFGLeft, "VCD" ) == 0 )
+			strcpy( szaHeadLists, "0VCD VV VA VC VE 0" );
+		if( strcmp( pszCFGLeft, "VRD" ) == 0 )
+			strcpy( szaHeadLists, "0VRD VV VA VC VE 0" );
+		else if( strcmp( pszCFGLeft, "VSB" ) == 0 )
+			strcpy( szaHeadLists, "0VSB VV VA VC VE 0" );
+		else if( strcmp( pszCFGLeft, "VCP" ) == 0 )
+			strcpy( szaHeadLists, "0VCP VV VA VC VE 0" );
+		else if( strcmp( pszCFGLeft, "VNV" ) == 0 )
+			strcpy( szaHeadLists, "0VNV VV VA VC VE 0" );
+		else if( strcmp( pszCFGLeft, "VPT" ) == 0 )
+			strcpy( szaHeadLists, "0VNV VV VA VC VE 0" );
+		else if( strcmp( pszCFGLeft, "UCP" ) == 0 )
+			strcpy( szaHeadLists, "0" );
+		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 )
+			strcpy( szaHeadLists, "0WHNP NP NN NT NR QP 0" );
+		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 )
+			strcpy( szaHeadLists, "1WHPP PP P 1" );
+
+		/*  //head rules for GENIA corpus
+		if( strcmp( pszCFGLeft, "ADJP" ) == 0 )
+			strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" );
+		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 )
+			strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1" );
+		else if( strcmp( pszCFGLeft, "CONJP" ) == 0 )
+			strcpy( szaHeadLists, "1CC 1RB 1IN 1" );
+		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 )
+			strcpy( szaHeadLists, "1" );
+		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 )
+			strcpy( szaHeadLists, "0" );
+		else if( strcmp( pszCFGLeft, "LST" ) == 0 )
+			strcpy( szaHeadLists, "1LS 1: 1CLN 1" );
+		else if( strcmp( pszCFGLeft, "NAC" ) == 0 )
+			strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" );
+		else if( strcmp( pszCFGLeft, "PP" ) == 0 )
+			strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" );
+		else if( strcmp( pszCFGLeft, "PRN" ) == 0 )
+			strcpy( szaHeadLists, "1" );
+		else if( strcmp( pszCFGLeft, "PRT" ) == 0 )
+			strcpy( szaHeadLists, "1RP 1" );
+		else if( strcmp( pszCFGLeft, "QP" ) == 0 )
+			strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0" );
+		else if( strcmp( pszCFGLeft, "RRC" ) == 0 )
+			strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" );
+		else if( strcmp( pszCFGLeft, "S" ) == 0 )
+			strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" );
+		else if( strcmp( pszCFGLeft, "SBAR" ) == 0 )
+			strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR 0FRAG 0" );
+		else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 )
+			strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" );
+		else if( strcmp( pszCFGLeft, "SINV" ) == 0 )
+			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0" );
+		else if( strcmp( pszCFGLeft, "SQ" ) == 0 )
+			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" );
+		else if( strcmp( pszCFGLeft, "UCP" ) == 0 )
+			strcpy( szaHeadLists, "1" );
+		else if( strcmp( pszCFGLeft, "VP" ) == 0 )
+			strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS 0NP 0" );
+		else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 )
+			strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" );
+		else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 )
+			strcpy( szaHeadLists, "1CC 1WRB 1" );
+		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 )
+			strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" );
+		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 )
+			strcpy( szaHeadLists, "1IN 1TO FW 1" );
+		else if( strcmp( pszCFGLeft, "NP" ) == 0 )
+			strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB QP 0" );
+		*/
+
+		return fnMyOwnHeadWordRule( szaHeadLists, vectRight );
+	}
+
+private:
+	int fnMyOwnHeadWordRule( char *pszaHeadLists, CVectorStr vectRight ) {
+		char szHeadList[ 201 ], *p;
+		char szTerm[ 101 ];
+		int J;
+
+		p = pszaHeadLists;
+
+		int iCountRight;
+
+		iCountRight = vectRight.size( );
+
+		szHeadList[ 0 ] = '\0';
+		while( 1 ){
+			szTerm[ 0 ] = '\0';
+			sscanf( p, "%s", szTerm );
+			if( strlen( szHeadList ) == 0 ){
+				if( strcmp( szTerm, "0" ) == 0 ){
+					return iCountRight - 1;
+				}
+				if( strcmp( szTerm, "1" ) == 0 ){
+					return 0;
+				}
+
+				sprintf( szHeadList, "%c %s ", szTerm[ 0 ], szTerm + 1 );
+				p = strstr( p, szTerm );
+				p += strlen( szTerm );
+			}
+			else{
+				if(   ( szTerm[ 0 ] == '0' )
+					||( szTerm[ 0 ] == '1' ) ){
+					if( szHeadList[ 0 ] == '0' ){
+						for( J = iCountRight - 1; J >= 0; J -- ){
+							sprintf( szTerm, " %s ", vectRight.at( J ).c_str( ) );
+							if( strstr( szHeadList, szTerm ) != NULL )
+								return J;
+						}
+					}
+					else{
+						for( J = 0; J < iCountRight; J ++ ){
+							sprintf( szTerm, " %s ", vectRight.at( J ).c_str( ) );
+							if(	strstr( szHeadList, szTerm ) != NULL )
+								return J;
+						}
+					}
+
+					szHeadList[ 0 ] = '\0';
+				}
+				else{
+					strcat( szHeadList, szTerm );
+					strcat( szHeadList, " " );
+
+					p = strstr( p, szTerm );
+					p += strlen( szTerm );
+				}
+			}
+		}
+
+		return 0;
+	}
+
+};
+
+struct SParsedTree{
+	SParsedTree( ) {
+		m_ptRoot = NULL;
+	}
+	~SParsedTree( ) {
+		if (m_ptRoot != NULL)
+			delete m_ptRoot;
+	}
+	static SParsedTree* fnConvertFromString(const char* pszStr) {
+		if (strcmp(pszStr, "(())") == 0)
+			return NULL;
+		SParsedTree* pTree = new SParsedTree();
+
+		vector<string> vecSyn;
+		fnReadSyntactic(pszStr, vecSyn);
+
+		int iLeft = 1, iRight = 1; //# left/right parenthesis
+
+		STreeItem *pcurrent;
+
+		pTree->m_ptRoot = new STreeItem(vecSyn[1].c_str());
+
+		pcurrent = pTree->m_ptRoot;
+
+		for (size_t i = 2; i < vecSyn.size() - 1; i++) {
+			if ( strcmp(vecSyn[i].c_str(), "(") == 0 )
+					iLeft++;
+			else if (strcmp(vecSyn[i].c_str(), ")") == 0 ) {
+				iRight++;
+				if (pcurrent == NULL) {
+					//error
+					fprintf(stderr, "ERROR in ConvertFromString\n");
+					fprintf(stderr, "%s\n", pszStr);
+					return NULL;
+				}
+				pcurrent = pcurrent->m_ptParent;
+			} else {
+				STreeItem *ptNewItem = new STreeItem(vecSyn[i].c_str());
+				pcurrent->fnAppend( ptNewItem );
+				pcurrent = ptNewItem;
+
+				if (strcmp(vecSyn[i - 1].c_str(), "(" ) != 0
+						&& strcmp(vecSyn[i - 1].c_str(), ")" ) != 0 ) {
+					pTree->m_vecTerminals.push_back(ptNewItem);
+					pcurrent = pcurrent->m_ptParent;
+				}
+			}
+		}
+
+		if ( iLeft != iRight ) {
+			//error
+			fprintf(stderr, "the left and right parentheses are not matched!");
+			fprintf(stderr, "ERROR in ConvertFromString\n");
+			fprintf(stderr, "%s\n", pszStr);
+			return NULL;
+		}
+
+		return pTree;
+	}
+
+	int fnGetNumWord() {
+		return m_vecTerminals.size();
+	}
+
+	void fnSetSpanInfo() {
+		int iNextNum = 0;
+		fnSuffixTraverseSetSpanInfo(m_ptRoot, iNextNum);
+	}
+
+	void fnSetHeadWord() {
+		for (size_t i = 0; i < m_vecTerminals.size(); i++)
+			m_vecTerminals[i]->m_iHeadWord = i;
+		SGetHeadWord *pGetHeadWord = new SGetHeadWord();
+		fnSuffixTraverseSetHeadWord(m_ptRoot, pGetHeadWord);
+		delete pGetHeadWord;
+	}
+
+	STreeItem *fnFindNodeForSpan(int iLeft, int iRight, bool bLowest) {
+		STreeItem *pTreeItem = m_vecTerminals[iLeft];
+
+		while (pTreeItem->m_iEnd < iRight) {
+			pTreeItem = pTreeItem->m_ptParent;
+			if (pTreeItem == NULL) break;
+		}
+		if (pTreeItem == NULL)
+			return NULL;
+		if (pTreeItem->m_iEnd > iRight)
+			return NULL;
+
+		assert(pTreeItem->m_iEnd == iRight);
+		if (bLowest)
+			return pTreeItem;
+
+		while (pTreeItem->m_ptParent != NULL && pTreeItem->m_ptParent->fnGetChildrenNum() == 1)
+			pTreeItem = pTreeItem->m_ptParent;
+
+		return pTreeItem;
+	}
+
+private:
+	void fnSuffixTraverseSetSpanInfo(STreeItem *ptItem, int& iNextNum) {
+		int I;
+		int iNumChildren = ptItem->fnGetChildrenNum();
+		for ( I = 0; I < iNumChildren; I++ )
+			fnSuffixTraverseSetSpanInfo(ptItem->m_vecChildren[ I ], iNextNum);
+
+		if ( I == 0 )
+		{
+			ptItem->m_iBegin = iNextNum;
+			ptItem->m_iEnd = iNextNum++;
+		}
+		else
+		{
+			ptItem->m_iBegin = ptItem->m_vecChildren[0]->m_iBegin;
+			ptItem->m_iEnd = ptItem->m_vecChildren[I - 1]->m_iEnd;
+		}
+	}
+
+
+	void fnSuffixTraverseSetHeadWord(STreeItem *ptItem, SGetHeadWord *pGetHeadWord) {
+		int I, iHeadchild;
+
+		if ( ptItem->m_vecChildren.size() == 0 )
+			return;
+
+		for ( I = 0; I < ptItem->m_vecChildren.size(); I++ )
+			fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[I], pGetHeadWord);
+
+		vector<string> vecRight;
+
+
+		if ( ptItem->m_vecChildren.size() == 1 )
+			iHeadchild = 0;
+		else
+		{
+			for ( I = 0; I < ptItem->m_vecChildren.size(); I++ )
+				vecRight.push_back( string( ptItem->m_vecChildren[ I ]->m_pszTerm ) );
+
+			iHeadchild = pGetHeadWord->fnGetHeadWord( ptItem->m_pszTerm, vecRight );
+		}
+
+		ptItem->m_iHeadChild = iHeadchild;
+		ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord;
+	}
+
+
+	static void fnReadSyntactic(const char *pszSyn, vector<string>& vec) {
+		char *p;
+		int I;
+
+		int iLeftNum, iRightNum;
+		char *pszTmp, *pszTerm;
+		pszTmp = new char[strlen(pszSyn)];
+		pszTerm = new char[strlen(pszSyn)];
+		pszTmp[0] = pszTerm[0] = '\0';
+
+		vec.clear();
+
+		char *pszLine;
+		pszLine = new char[strlen(pszSyn) + 1];
+		strcpy( pszLine, pszSyn );
+
+		char *pszLine2;
+
+		while( 1 ) {
+			while(( strlen( pszLine ) > 0 )
+					&&( pszLine[ strlen( pszLine ) - 1 ] > 0 )
+					&&( pszLine[ strlen( pszLine ) - 1 ] <= ' ' ) )
+				pszLine[ strlen( pszLine ) - 1 ] = '\0';
+
+			if( strlen( pszLine ) == 0 )
+				break;
+
+			//printf( "%s\n", pszLine );
+			pszLine2 = pszLine;
+			while( pszLine2[ 0 ] <= ' ' )
+				pszLine2 ++;
+			if( pszLine2[ 0 ] == '<' )
+				continue;
+
+			sscanf( pszLine2 + 1, "%s", pszTmp );
+
+			if ( pszLine2[ 0 ] == '(' ) {
+				iLeftNum = 0;
+				iRightNum = 0;
+			}
+
+			p = pszLine2;
+			while ( 1 ) {
+				pszTerm[ 0 ] = '\0';
+				sscanf( p, "%s", pszTerm );
+
+				if( strlen( pszTerm ) == 0 )
+					break;
+				p = strstr( p, pszTerm );
+				p += strlen( pszTerm );
+
+				if( ( pszTerm[ 0 ] == '(' )
+						||( pszTerm[ strlen( pszTerm ) - 1 ] == ')' ) ) {
+					if( pszTerm[ 0 ] == '(' ) {
+						vec.push_back(string("("));
+						iLeftNum++;
+
+						I = 1;
+						while ( pszTerm[ I ] == '(' && pszTerm[ I ]!= '\0' ) {
+							vec.push_back(string("("));
+							iLeftNum++;
+
+							I++;
+						}
+
+						if( strlen( pszTerm ) > 1 )
+							vec.push_back(string(pszTerm + I));
+					} else {
+						char *pTmp;
+						pTmp = pszTerm + strlen( pszTerm ) - 1;
+						while( ( pTmp[ 0 ] == ')' ) && ( pTmp >= pszTerm ) )
+							pTmp --;
+						pTmp[ 1 ] = '\0';
+
+						if( strlen( pszTerm ) > 0 )
+							vec.push_back(string(pszTerm));
+						pTmp += 2;
+
+						for( I = 0; I <= (int)strlen( pTmp ); I ++ ) {
+							vec.push_back(string(")"));
+							iRightNum++;
+						}
+					}
+				} else {
+					char *q;
+					q = strchr( pszTerm, ')' );
+					if ( q != NULL ) {
+						q[ 0 ] = '\0';
+						if ( pszTerm[ 0 ] != '\0' )
+							vec.push_back(string(pszTerm));
+						vec.push_back(string(")"));
+						iRightNum++;
+
+						q++;
+						while ( q[ 0 ] == ')' ) {
+							vec.push_back(string(")"));
+							q++;
+							iRightNum ++;
+						}
+
+						while( q[ 0 ] == '(' ) {
+							vec.push_back(string("("));
+							q++;
+							iLeftNum++;
+						}
+
+						if ( q[ 0 ] != '\0' )
+							vec.push_back(string(q));
+					}
+					else
+						vec.push_back(string(pszTerm));
+				}
+			}
+
+			if (iLeftNum != iRightNum) {
+				fprintf(stderr, "%s\n", pszSyn);
+				assert(iLeftNum == iRightNum);
+			}
+			/*if ( iLeftNum != iRightNum ) {
+				printf( "ERROR: left( and right ) is not matched, %d ( and %d )\n", iLeftNum, iRightNum );
+				return;
+			}*/
+
+
+			if( vec.size() >= 2
+					&& strcmp( vec[1].c_str(), "(" ) == 0 ) {
+				//( (IP..) )
+				std::vector<string>::iterator it;
+				it = vec.begin();
+				it++;
+				vec.insert(it, string("ROOT"));
+			}
+
+			break;
+		}
+
+		delete [] pszLine;
+		delete [] pszTmp;
+		delete [] pszTerm;
+	}
+
+public:
+	STreeItem *m_ptRoot;
+	vector<STreeItem*>  m_vecTerminals; //the leaf nodes
+};
+
+struct SParseReader {
+	SParseReader(const char* pszParse_Fname, bool bFlattened = false) :
+	m_bFlattened(bFlattened){
+		m_fpIn = fopen( pszParse_Fname, "r" );
+		assert(m_fpIn != NULL);
+	}
+	~SParseReader() {
+		if (m_fpIn != NULL)
+			fclose(m_fpIn);
+	}
+
+	SParsedTree* fnReadNextParseTree( ) {
+		SParsedTree *pTree = NULL;
+		char *pszLine = new char[100001];
+		int iLen;
+
+		while (fnReadNextSentence(pszLine, &iLen) == true ) {
+			if (iLen == 0)
+				continue;
+
+			pTree = SParsedTree::fnConvertFromString(pszLine);
+			if (pTree == NULL) 
+				break;
+			if (m_bFlattened)
+				fnPostProcessingFlattenedParse(pTree);
+			else {
+				pTree->fnSetSpanInfo();
+				pTree->fnSetHeadWord();
+			}
+			break;
+		}
+
+		delete [] pszLine;
+		return pTree;
+	}
+
+	SParsedTree* fnReadNextParseTreeWithProb(double *pProb) {
+		SParsedTree *pTree = NULL;
+		char *pszLine = new char[100001];
+		int iLen;
+
+		while (fnReadNextSentence(pszLine, &iLen) == true ) {
+			if (iLen == 0)
+				continue;
+
+			char *p = strchr(pszLine, ' ');
+			assert(p != NULL);
+			p[0] = '\0';
+			p++;
+			if (pProb)
+				(*pProb) = atof(pszLine);
+
+			pTree = SParsedTree::fnConvertFromString(p);
+			if (m_bFlattened)
+				fnPostProcessingFlattenedParse(pTree);
+			else {
+				pTree->fnSetSpanInfo();
+				pTree->fnSetHeadWord();
+			}
+			break;
+		}
+
+		delete [] pszLine;
+		return pTree;
+	}
+private:
+	/*
+	 * since to the parse tree is a flattened tree, use the head mark to identify head info.
+	 * the head node will be marked as "*XP*"
+	 */
+	void fnSetParseTreeHeadInfo(SParsedTree *pTree) {
+		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++)
+			pTree->m_vecTerminals[i]->m_iHeadWord = i;
+		fnSuffixTraverseSetHeadWord(pTree->m_ptRoot);
+	}
+
+	void fnSuffixTraverseSetHeadWord(STreeItem *pTreeItem) {
+		if (pTreeItem->m_vecChildren.size() == 0)
+			return;
+
+		for (size_t i = 0; i < pTreeItem->m_vecChildren.size(); i++ )
+			fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]);
+
+		vector<string> vecRight;
+
+		int iHeadchild;
+
+		if (pTreeItem->fnIsPreTerminal()) {
+			iHeadchild = 0;
+		} else {
+			size_t i;
+			for (i = 0; i < pTreeItem->m_vecChildren.size(); i++) {
+				char *p = pTreeItem->m_vecChildren[i]->m_pszTerm;
+				if (p[0] == '*' && p[strlen(p) - 1] == '*') {
+					iHeadchild = i;
+					p[strlen(p) - 1] = '\0';
+					string str = p + 1;
+					strcpy(p, str.c_str()); //erase the "*..*"
+					break;
+				}
+			}
+			assert(i < pTreeItem->m_vecChildren.size());
+		}
+
+		pTreeItem->m_iHeadChild = iHeadchild;
+		pTreeItem->m_iHeadWord = pTreeItem->m_vecChildren[iHeadchild]->m_iHeadWord;
+	}
+	void fnPostProcessingFlattenedParse(SParsedTree *pTree) {
+		pTree->fnSetSpanInfo();
+		fnSetParseTreeHeadInfo(pTree);
+	}
+	bool fnReadNextSentence(char *pszLine, int* piLength) {
+		if (feof(m_fpIn) == true)
+			return false;
+
+		int iLen;
+
+		pszLine[ 0 ] = '\0';
+
+		fgets(pszLine, 10001, m_fpIn);
+		iLen = strlen(pszLine);
+		while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) {
+			pszLine[ iLen - 1 ] = '\0';
+			iLen--;
+		}
+
+		if ( piLength != NULL )
+			(*piLength) = iLen;
+
+		return true;
+	}
+private:
+	FILE *m_fpIn;
+	const bool m_bFlattened;
+};
+
+#endif /* TREE_H_ */
author	Wu, Ke <wuke@cs.umd.edu>	2014-10-07 17:12:32 -0400
committer	Wu, Ke <wuke@cs.umd.edu>	2014-10-07 17:12:32 -0400
commit	f762dbbf10a8204d0d0b82e9acb29feacd3b3bb4 (patch)
tree	5b775aaed0a060ec13626eb90dba583e59feddc6 /utils/synutils/tree.h
parent	d88186af251ecae60974b20395ce75807bfdda35 (diff)