summaryrefslogtreecommitdiff
path: root/utils/tree.h
diff options
context:
space:
mode:
author: Wu, Ke <wuke@cs.umd.edu>, 2014-12-06 12:17:27 -0500
committer: Wu, Ke <wuke@cs.umd.edu>, 2014-12-06 12:17:27 -0500
commit: 16827862bcc4f04ada087abc255c6604d88076c1 (patch)
tree: 394f8585409ca2d132fc1400c906c39df43613e6 /utils/tree.h
parent: a21959213f9b1cc15befae52dbb5091e848de7a1 (diff)
Move non-MaxEnt code out of utils
1. alignment.h, argument_reorder_model.h, src_sentence.h, tree.h, tsuruoka_maxent.h -> decoder/ff_const_reorder_common.h. 2. Trainers source files (argument_reorder_model.cc and constituent_reorder_model.cc) are moved to training/const_reorder.
Diffstat (limited to 'utils/tree.h')
-rw-r--r-- utils/tree.h 699
1 files changed, 0 insertions, 699 deletions
diff --git a/utils/tree.h b/utils/tree.h
deleted file mode 100644
index 6c3406d6..00000000
--- a/utils/tree.h
+++ /dev/null
@@ -1,699 +0,0 @@
-/*
- * tree.h
- *
- * Created on: May 23, 2013
- * Author: lijunhui
- */
-
-#ifndef TREE_H_
-#define TREE_H_
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <string>
-#include <vector>
-
-/*
- * A node of a parse tree.
- *
- * A node owns a heap-allocated copy of its label and owns its children:
- * destroying a node destroys the whole subtree below it. No copy constructor
- * or assignment operator is defined, so a copied node would free the same
- * label and the same children twice; handle nodes through pointers only.
- */
-struct STreeItem {
-  /*
-   * Creates a detached, childless node labelled with a private copy of
-   * pszTerm. A NULL label is stored as the empty string. All index fields
-   * start at -1, meaning "not set".
-   */
-  STreeItem(const char *pszTerm) {
-    if (pszTerm == NULL) pszTerm = "";
-    m_pszTerm = new char[strlen(pszTerm) + 1];
-    strcpy(m_pszTerm, pszTerm);
-
-    m_ptParent = NULL;
-    m_iBegin = -1;
-    m_iEnd = -1;
-    m_iHeadChild = -1;
-    m_iHeadWord = -1;
-    m_iBrotherIndex = -1;
-  }
-
-  /* Frees the label and, recursively, every child. */
-  ~STreeItem() {
-    delete[] m_pszTerm;
-    for (size_t i = 0; i < m_vecChildren.size(); i++) delete m_vecChildren[i];
-  }
-
-  /*
-   * Appends ptChild as the last child of this node and takes ownership of it.
-   * The child's parent pointer and sibling index are updated.
-   * Returns the index of the child among this node's children.
-   */
-  int fnAppend(STreeItem *ptChild) {
-    assert(ptChild != NULL);
-    const int iIndex = static_cast<int>(m_vecChildren.size());
-    m_vecChildren.push_back(ptChild);
-    ptChild->m_iBrotherIndex = iIndex;
-    ptChild->m_ptParent = this;
-    return iIndex;
-  }
-
-  /* Number of direct children. */
-  int fnGetChildrenNum() const { return static_cast<int>(m_vecChildren.size()); }
-
-  /*
-   * True when this node has at least one child and none of its children has
-   * children of its own.
-   *
-   * The former "this == NULL" test was removed: calling a member function
-   * through a null pointer is undefined behaviour, so the test could never be
-   * relied on. Callers must check the pointer before calling.
-   */
-  bool fnIsPreTerminal(void) const {
-    if (m_vecChildren.empty()) return false;
-
-    for (size_t i = 0; i < m_vecChildren.size(); i++)
-      if (!m_vecChildren[i]->m_vecChildren.empty()) return false;
-
-    return true;
-  }
-
- public:
-  char *m_pszTerm;  // owned, NUL-terminated label of the node
-
-  std::vector<STreeItem *> m_vecChildren;  // children items (owned)
-  STreeItem *m_ptParent;                   // the parent item (not owned)
-
-  int m_iBegin;
-  int m_iEnd;           // the node span words[m_iBegin, m_iEnd]
-  int m_iHeadChild;     // the index of its head child
-  int m_iHeadWord;      // the index of its head word
-  int m_iBrotherIndex;  // the index of this node among its siblings
-};
-
-/*
- * Picks the head child of a constituent from a table of head rules.
- *
- * A rule is a blank-separated string made of groups followed by a default:
- *   - a group starts with a token whose first character is the search
- *     direction ('0' = scan the children from right to left, anything else =
- *     from left to right) immediately followed by a label; the following
- *     tokens that do not start with '0' or '1' are further labels of the
- *     same group;
- *   - groups are tried in order; the first child (in the group's direction)
- *     whose label belongs to the group is the head;
- *   - a bare "0" selects the last child and a bare "1" the first child.
- */
-struct SGetHeadWord {
-  typedef std::vector<std::string> CVectorStr;
-  SGetHeadWord() {}
-  ~SGetHeadWord() {}
-
-  /*
-   * Returns the index in vectRight (the labels of the children) of the head
-   * child of a constituent labelled pszCFGLeft.
-   *
-   * Labels without an entry in the table, and a NULL label, use the rule "0"
-   * (last child). With an empty vectRight the "last child" default yields -1.
-   */
-  int fnGetHeadWord(const char *pszCFGLeft, const CVectorStr &vectRight) const {
-    struct SHeadRule {
-      const char *m_pszLabel;
-      const char *m_pszRule;
-    };
-
-    // Active rule table. The labels look like Chinese-Treebank categories
-    // (presumably; confirm against the training data).
-    static const SHeadRule aRules[] = {
-        {"ADJP", "0ADJP JJ 0AD NN CS 0"},
-        {"ADVP", "0ADVP AD 0"},
-        {"CLP", "0CLP M 0"},
-        {"CP", "0DEC SP 1ADVP CS 0CP IP 0"},
-        {"DNP", "0DNP DEG 0DEC 0"},
-        {"DVP", "0DVP DEV 0"},
-        {"DP", "1DP DT 1"},
-        {"FRAG", "0VV NR NN 0"},
-        {"INTJ", "0INTJ IJ 0"},
-        {"LST", "1LST CD OD 1"},
-        // An earlier variant of the IP rule was "0VP 0VV 1IP 0".
-        {"IP", "0IP VP 0VV 0"},
-        {"LCP", "0LCP LC 0"},
-        {"NP", "0NP NN NT NR QP 0"},
-        {"PP", "1PP P 1"},
-        // NOTE(review): the blank after the leading '0' makes this rule start
-        // with the bare default "0", so PRN always selects its last child and
-        // the labels that follow are never consulted. "0NP IP ..." may have
-        // been intended; kept as is to preserve behaviour.
-        {"PRN", "0 NP IP VP NT NR NN 0"},
-        {"QP", "0QP CLP CD OD 0"},
-        {"VP", "1VP VA VC VE VV BA LB VCD VSB VRD VNV VCP 1"},
-        {"VCD", "0VCD VV VA VC VE 0"},
-        {"VRD", "0VRD VV VA VC VE 0"},
-        {"VSB", "0VSB VV VA VC VE 0"},
-        {"VCP", "0VCP VV VA VC VE 0"},
-        {"VNV", "0VNV VV VA VC VE 0"},
-        // NOTE(review): the VPT rule searches for VNV, not VPT; this looks
-        // like a copy of the VNV rule. Kept as is to preserve behaviour.
-        {"VPT", "0VNV VV VA VC VE 0"},
-        {"UCP", "0"},
-        {"WHNP", "0WHNP NP NN NT NR QP 0"},
-        {"WHPP", "1WHPP PP P 1"},
-    };
-    const size_t iNumRules = sizeof(aRules) / sizeof(aRules[0]);
-
-    /*
-     * Alternative rule table kept for reference (not compiled). The original
-     * source carried it twice, commented out, once as "head rules for
-     * English" and once as "head rules for GENIA corpus", with the same
-     * contents:
-     *
-     *   ADJP    "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT
-     *            0FW 0RBR 0RBS 0SBAR 0RB 0"
-     *   ADVP    "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1"
-     *   CONJP   "1CC 1RB 1IN 1"
-     *   FRAG    "1"
-     *   INTJ    "0"
-     *   LST     "1LS 1: 1CLN 1"
-     *   NAC     "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ
-     *            0JJS 0JJR 0ADJP 0FW 0"
-     *   PP      "1IN 1TO 1VBG 1VBN 1RP 1FW 1"
-     *   PRN     "1"
-     *   PRT     "1RP 1"
-     *   QP      "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0"
-     *   RRC     "1VP 1NP 1ADVP 1ADJP 1PP 1"
-     *   S       "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0"
-     *   SBAR    "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR
-     *            0FRAG 0"
-     *   SBARQ   "0SQ 0S 0SINV 0SBARQ 0FRAG 0"
-     *   SINV    "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0"
-     *   SQ      "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0"
-     *   UCP     "1"
-     *   VP      "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS
-     *            0NP 0"
-     *   WHADJP  "0CC 0WRB 0JJ 0ADJP 0"
-     *   WHADVP  "1CC 1WRB 1"
-     *   WHNP    "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0"
-     *   WHPP    "1IN 1TO FW 1"
-     *   NP      "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB
-     *            QP 0"
-     */
-
-    // 0 indicating from right to left while 1 indicating from left to right
-    const char *pszRule = "0";
-    if (pszCFGLeft != NULL) {
-      for (size_t i = 0; i < iNumRules; i++) {
-        if (strcmp(pszCFGLeft, aRules[i].m_pszLabel) == 0) {
-          pszRule = aRules[i].m_pszRule;
-          break;
-        }
-      }
-    }
-
-    return fnMyOwnHeadWordRule(pszRule, vectRight);
-  }
-
- private:
-  /* True for the characters that separate tokens in a rule string. */
-  static bool fnIsBlank(char c) {
-    return c == ' ' || c == '\t' || c == '\n' || c == '\v' || c == '\f' ||
-           c == '\r';
-  }
-
-  /*
-   * Skips blanks at p, then returns the following run of non-blank characters
-   * and leaves p just behind it. Returns "" at the end of the string.
-   */
-  static std::string fnNextToken(const char *&p) {
-    while (fnIsBlank(*p)) p++;
-    const char *pBegin = p;
-    while (*p != '\0' && !fnIsBlank(*p)) p++;
-    return std::string(pBegin, static_cast<size_t>(p - pBegin));
-  }
-
-  /*
-   * Applies one rule string (format described on the struct) to the child
-   * labels in vectRight and returns the index of the head child.
-   *
-   * Labels are held in std::string, so arbitrarily long child labels are
-   * safe. A malformed rule that ends without a bare "0"/"1" default returns 0
-   * rather than looping forever.
-   */
-  static int fnMyOwnHeadWordRule(const char *pszaHeadLists,
-                                 const CVectorStr &vectRight) {
-    const int iCountRight = static_cast<int>(vectRight.size());
-    const char *p = pszaHeadLists;
-
-    for (;;) {
-      const std::string strFirst = fnNextToken(p);
-      if (strFirst.empty()) return 0;  // rule ended without a default
-      if (strFirst == "0") return iCountRight - 1;
-      if (strFirst == "1") return 0;
-
-      // The group is stored as " A B C " so that a label can be matched as
-      // the substring " label " without matching part of a longer label.
-      const bool bRightToLeft = (strFirst[0] == '0');
-      std::string strGroup = " " + strFirst.substr(1) + " ";
-      for (;;) {
-        const char *pPeek = p;
-        const std::string strNext = fnNextToken(pPeek);
-        // A token starting with a direction digit opens the next group (or
-        // is the default); it is left unread for the outer loop.
-        if (strNext.empty() || strNext[0] == '0' || strNext[0] == '1') break;
-        strGroup += strNext;
-        strGroup += " ";
-        p = pPeek;
-      }
-
-      if (bRightToLeft) {
-        for (int J = iCountRight - 1; J >= 0; J--)
-          if (strGroup.find(" " + vectRight[J] + " ") != std::string::npos)
-            return J;
-      } else {
-        for (int J = 0; J < iCountRight; J++)
-          if (strGroup.find(" " + vectRight[J] + " ") != std::string::npos)
-            return J;
-      }
-    }
-  }
-};
-
-/*
- * A parse tree built from a bracketed string such as
- * "(IP (NP (NN dog)) (VP (VV barks)))".
- *
- * The tree owns its root (and through it every node); m_vecTerminals holds
- * non-owning pointers to the leaves in left-to-right order. No copy
- * constructor or assignment operator is defined, so a copied tree would
- * delete the same root twice; handle trees through pointers only.
- */
-struct SParsedTree {
-  SParsedTree() { m_ptRoot = NULL; }
-  ~SParsedTree() { delete m_ptRoot; }
-
-  /*
-   * Builds a tree from a bracketed parse string. The caller owns the result.
-   *
-   * Returns NULL for a NULL string, for the empty parse "(())", and, after
-   * printing a message to stderr, when the string yields fewer than two
-   * tokens, when a label follows the closing of the root, or when the
-   * parentheses do not balance. Span and head fields of the nodes are left
-   * unset; see fnSetSpanInfo() and fnSetHeadWord().
-   */
-  static SParsedTree *fnConvertFromString(const char *pszStr) {
-    if (pszStr == NULL || strcmp(pszStr, "(())") == 0) return NULL;
-
-    std::vector<std::string> vecSyn;
-    fnReadSyntactic(pszStr, vecSyn);
-
-    // vecSyn[0] is expected to be the opening parenthesis and vecSyn[1] the
-    // root label; anything shorter cannot be a tree.
-    if (vecSyn.size() < 2) {
-      fprintf(stderr, "ERROR in ConvertFromString\n");
-      fprintf(stderr, "%s\n", pszStr);
-      return NULL;
-    }
-
-    int iLeft = 1, iRight = 1;  //# left/right parenthesis
-
-    SParsedTree *pTree = new SParsedTree();
-    pTree->m_ptRoot = new STreeItem(vecSyn[1].c_str());
-    STreeItem *pcurrent = pTree->m_ptRoot;
-
-    // The last token is the parenthesis closing the root; it is accounted
-    // for by the initial value of iRight.
-    for (size_t i = 2; i + 1 < vecSyn.size(); i++) {
-      const std::string &strTok = vecSyn[i];
-      if (strTok == "(") {
-        iLeft++;
-        continue;
-      }
-
-      // Both a closing parenthesis and a new label need a current node;
-      // there is none once the root has been closed.
-      if (pcurrent == NULL) {
-        fprintf(stderr, "ERROR in ConvertFromString\n");
-        fprintf(stderr, "%s\n", pszStr);
-        delete pTree;
-        return NULL;
-      }
-
-      if (strTok == ")") {
-        iRight++;
-        pcurrent = pcurrent->m_ptParent;
-      } else {
-        STreeItem *ptNewItem = new STreeItem(strTok.c_str());
-        pcurrent->fnAppend(ptNewItem);
-        pcurrent = ptNewItem;
-
-        // A token that follows another label (not a parenthesis) is a word:
-        // record it as a terminal and step back to its parent.
-        if (vecSyn[i - 1] != "(" && vecSyn[i - 1] != ")") {
-          pTree->m_vecTerminals.push_back(ptNewItem);
-          pcurrent = pcurrent->m_ptParent;
-        }
-      }
-    }
-
-    if (iLeft != iRight) {
-      // error
-      fprintf(stderr, "the left and right parentheses are not matched!");
-      fprintf(stderr, "ERROR in ConvertFromString\n");
-      fprintf(stderr, "%s\n", pszStr);
-      delete pTree;
-      return NULL;
-    }
-
-    return pTree;
-  }
-
-  /* Number of terminals (words) in the tree. */
-  int fnGetNumWord() { return static_cast<int>(m_vecTerminals.size()); }
-
-  /*
-   * Sets m_iBegin/m_iEnd on every node: leaves are numbered 0, 1, 2, ... from
-   * left to right and an inner node spans from its first child's begin to its
-   * last child's end. Does nothing on an empty tree.
-   */
-  void fnSetSpanInfo() {
-    if (m_ptRoot == NULL) return;
-    int iNextNum = 0;
-    fnSuffixTraverseSetSpanInfo(m_ptRoot, iNextNum);
-  }
-
-  /*
-   * Sets m_iHeadWord on every terminal to its own position, then sets
-   * m_iHeadChild/m_iHeadWord on every inner node using the SGetHeadWord
-   * rules. Does nothing more than the terminal numbering on an empty tree.
-   */
-  void fnSetHeadWord() {
-    for (size_t i = 0; i < m_vecTerminals.size(); i++)
-      m_vecTerminals[i]->m_iHeadWord = static_cast<int>(i);
-    if (m_ptRoot == NULL) return;
-
-    SGetHeadWord getHeadWord;
-    fnSuffixTraverseSetHeadWord(m_ptRoot, &getHeadWord);
-  }
-
-  /*
-   * Starting from terminal iLeft, climbs to the first ancestor whose span
-   * ends at or after iRight and returns it if its span ends exactly at
-   * iRight; otherwise returns NULL. With bLowest == false the result is then
-   * moved up through parents that have a single child.
-   *
-   * Relies on the m_iEnd values assigned by fnSetSpanInfo(). Returns NULL
-   * when iLeft is not a valid terminal index.
-   *
-   * NOTE(review): the begin of the returned node's span is not compared with
-   * iLeft - confirm that callers check m_iBegin when they need an exact span.
-   */
-  STreeItem *fnFindNodeForSpan(int iLeft, int iRight, bool bLowest) {
-    if (iLeft < 0 || static_cast<size_t>(iLeft) >= m_vecTerminals.size())
-      return NULL;
-
-    STreeItem *pTreeItem = m_vecTerminals[iLeft];
-    while (pTreeItem != NULL && pTreeItem->m_iEnd < iRight)
-      pTreeItem = pTreeItem->m_ptParent;
-
-    if (pTreeItem == NULL) return NULL;
-    if (pTreeItem->m_iEnd > iRight) return NULL;
-
-    assert(pTreeItem->m_iEnd == iRight);
-    if (bLowest) return pTreeItem;
-
-    while (pTreeItem->m_ptParent != NULL &&
-           pTreeItem->m_ptParent->fnGetChildrenNum() == 1)
-      pTreeItem = pTreeItem->m_ptParent;
-
-    return pTreeItem;
-  }
-
- private:
-  /* Post-order helper of fnSetSpanInfo(); iNextNum is the next leaf number. */
-  void fnSuffixTraverseSetSpanInfo(STreeItem *ptItem, int &iNextNum) {
-    const size_t iNumChildren = ptItem->m_vecChildren.size();
-    for (size_t i = 0; i < iNumChildren; i++)
-      fnSuffixTraverseSetSpanInfo(ptItem->m_vecChildren[i], iNextNum);
-
-    if (iNumChildren == 0) {
-      ptItem->m_iBegin = iNextNum;
-      ptItem->m_iEnd = iNextNum++;
-    } else {
-      ptItem->m_iBegin = ptItem->m_vecChildren[0]->m_iBegin;
-      ptItem->m_iEnd = ptItem->m_vecChildren[iNumChildren - 1]->m_iEnd;
-    }
-  }
-
-  /*
-   * Post-order helper of fnSetHeadWord(): a node with one child takes that
-   * child as head, a node with several children asks pGetHeadWord; the head
-   * word of a node is the head word of its head child.
-   */
-  void fnSuffixTraverseSetHeadWord(STreeItem *ptItem,
-                                   SGetHeadWord *pGetHeadWord) {
-    const size_t iNumChildren = ptItem->m_vecChildren.size();
-    if (iNumChildren == 0) return;
-
-    for (size_t i = 0; i < iNumChildren; i++)
-      fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[i], pGetHeadWord);
-
-    int iHeadchild = 0;
-    if (iNumChildren > 1) {
-      std::vector<std::string> vecRight;
-      for (size_t i = 0; i < iNumChildren; i++)
-        vecRight.push_back(std::string(ptItem->m_vecChildren[i]->m_pszTerm));
-
-      iHeadchild = pGetHeadWord->fnGetHeadWord(ptItem->m_pszTerm, vecRight);
-    }
-
-    ptItem->m_iHeadChild = iHeadchild;
-    ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord;
-  }
-
-  /* True for the control characters and the space stripped from line ends. */
-  static bool fnIsCtrlOrSpace(char c) { return c > 0 && c <= ' '; }
-
-  /*
-   * Splits a bracketed parse string into tokens: every parenthesis becomes a
-   * token of its own ("(" or ")") and every label or word becomes one token.
-   * vec is cleared first. It stays empty for a NULL or blank string and for a
-   * string whose first non-blank character is '<'.
-   *
-   * If the second token is "(", i.e. the string looks like "( (IP ..) )",
-   * the label "ROOT" is inserted after the first parenthesis.
-   *
-   * Unbalanced parentheses are reported on stderr and trip an assert.
-   */
-  static void fnReadSyntactic(const char *pszSyn,
-                              std::vector<std::string> &vec) {
-    static const char szBlank[] = " \t\n\v\f\r";
-
-    vec.clear();
-    if (pszSyn == NULL) return;
-
-    // Strip control characters and spaces from both ends.
-    std::string strLine(pszSyn);
-    size_t iEnd = strLine.size();
-    while (iEnd > 0 && fnIsCtrlOrSpace(strLine[iEnd - 1])) iEnd--;
-    size_t iBegin = 0;
-    while (iBegin < iEnd && fnIsCtrlOrSpace(strLine[iBegin])) iBegin++;
-    if (iBegin == iEnd) return;
-    if (strLine[iBegin] == '<') return;  // markup line, not a parse
-    strLine = strLine.substr(iBegin, iEnd - iBegin);
-
-    int iLeftNum = 0;
-    int iRightNum = 0;
-
-    size_t iPos = 0;
-    for (;;) {
-      // Next blank-separated chunk of the line.
-      const size_t iTokBegin = strLine.find_first_not_of(szBlank, iPos);
-      if (iTokBegin == std::string::npos) break;
-      size_t iTokEnd = strLine.find_first_of(szBlank, iTokBegin);
-      if (iTokEnd == std::string::npos) iTokEnd = strLine.size();
-      const std::string strTerm =
-          strLine.substr(iTokBegin, iTokEnd - iTokBegin);
-      iPos = iTokEnd;
-
-      const size_t iLen = strTerm.size();
-      if (strTerm[0] == '(') {
-        // "((IP": one "(" per leading parenthesis, then the label (if any).
-        size_t i = 0;
-        while (i < iLen && strTerm[i] == '(') {
-          vec.push_back(std::string("("));
-          iLeftNum++;
-          i++;
-        }
-        if (i < iLen) vec.push_back(strTerm.substr(i));
-      } else if (strTerm[iLen - 1] == ')') {
-        // "dog))": the word (if any), then one ")" per trailing parenthesis.
-        const size_t iLast = strTerm.find_last_not_of(')');
-        const size_t iWordLen = (iLast == std::string::npos) ? 0 : iLast + 1;
-        if (iWordLen > 0) vec.push_back(strTerm.substr(0, iWordLen));
-        for (size_t i = iWordLen; i < iLen; i++) {
-          vec.push_back(std::string(")"));
-          iRightNum++;
-        }
-      } else {
-        const size_t iClose = strTerm.find(')');
-        if (iClose == std::string::npos) {
-          vec.push_back(strTerm);
-        } else {
-          // "dog))(NN": word, closing parentheses, opening parentheses, and
-          // whatever is left as one more token.
-          if (iClose > 0) vec.push_back(strTerm.substr(0, iClose));
-          size_t i = iClose;
-          while (i < iLen && strTerm[i] == ')') {
-            vec.push_back(std::string(")"));
-            iRightNum++;
-            i++;
-          }
-          while (i < iLen && strTerm[i] == '(') {
-            vec.push_back(std::string("("));
-            iLeftNum++;
-            i++;
-          }
-          if (i < iLen) vec.push_back(strTerm.substr(i));
-        }
-      }
-    }
-
-    if (iLeftNum != iRightNum) {
-      fprintf(stderr, "%s\n", pszSyn);
-      assert(iLeftNum == iRightNum);
-    }
-
-    if (vec.size() >= 2 && vec[1] == "(") {
-      //( (IP..) )
-      vec.insert(vec.begin() + 1, std::string("ROOT"));
-    }
-  }
-
- public:
-  STreeItem *m_ptRoot;                      // root of the tree (owned)
-  std::vector<STreeItem *> m_vecTerminals;  // the leaf nodes (not owned)
-};
-
-/*
- * Reads parse trees from a text file, one bracketed parse per line.
- *
- * With bFlattened == false heads are assigned by the SGetHeadWord rules;
- * with bFlattened == true the head child of each node is expected to be
- * marked in the file as "*XP*".
- *
- * The reader owns the FILE handle and defines no copy constructor or
- * assignment operator, so a copied reader would close the file twice.
- */
-struct SParseReader {
-  /* Opens pszParse_Fname for reading; a file that cannot be opened trips an
-   * assert. */
-  SParseReader(const char *pszParse_Fname, bool bFlattened = false)
-      : m_bFlattened(bFlattened) {
-    m_fpIn = fopen(pszParse_Fname, "r");
-    assert(m_fpIn != NULL);
-  }
-  ~SParseReader() {
-    if (m_fpIn != NULL) fclose(m_fpIn);
-  }
-
-  /*
-   * Reads the next non-empty line and converts it to a tree with span and
-   * head information set. The caller owns the result. Returns NULL at the end
-   * of the file or when the line cannot be converted.
-   */
-  SParsedTree *fnReadNextParseTree() {
-    std::vector<char> vecLine(kiLineBufSize);
-    int iLen = 0;
-
-    while (fnReadNextSentence(&vecLine[0], &iLen)) {
-      if (iLen == 0) continue;
-
-      SParsedTree *pTree = SParsedTree::fnConvertFromString(&vecLine[0]);
-      if (pTree != NULL) fnSetTreeInfo(pTree);
-      return pTree;
-    }
-
-    return NULL;
-  }
-
-  /*
-   * Like fnReadNextParseTree(), for lines of the form "<prob> <parse>": the
-   * text before the first space is parsed with atof() and stored in *pProb
-   * (when pProb is not NULL), the rest is converted to a tree.
-   *
-   * Returns NULL at the end of the file, when the line contains no space
-   * (this also trips an assert), or when the parse cannot be converted.
-   */
-  SParsedTree *fnReadNextParseTreeWithProb(double *pProb) {
-    std::vector<char> vecLine(kiLineBufSize);
-    int iLen = 0;
-
-    while (fnReadNextSentence(&vecLine[0], &iLen)) {
-      if (iLen == 0) continue;
-
-      char *p = strchr(&vecLine[0], ' ');
-      assert(p != NULL);
-      if (p == NULL) return NULL;  // defined behaviour when asserts are off
-      *p++ = '\0';
-      if (pProb) (*pProb) = atof(&vecLine[0]);
-
-      SParsedTree *pTree = SParsedTree::fnConvertFromString(p);
-      if (pTree != NULL) fnSetTreeInfo(pTree);
-      return pTree;
-    }
-
-    return NULL;
-  }
-
- private:
-  // Size of the line buffer, including the terminating NUL.
-  enum { kiLineBufSize = 100001 };
-
-  /* Fills in span and head information according to m_bFlattened. */
-  void fnSetTreeInfo(SParsedTree *pTree) {
-    if (m_bFlattened)
-      fnPostProcessingFlattenedParse(pTree);
-    else {
-      pTree->fnSetSpanInfo();
-      pTree->fnSetHeadWord();
-    }
-  }
-
-  /*
-   * Since the parse tree is a flattened tree, the head mark is used to
-   * identify head info: the head node is marked as "*XP*".
-   */
-  void fnSetParseTreeHeadInfo(SParsedTree *pTree) {
-    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++)
-      pTree->m_vecTerminals[i]->m_iHeadWord = static_cast<int>(i);
-    if (pTree->m_ptRoot != NULL) fnSuffixTraverseSetHeadWord(pTree->m_ptRoot);
-  }
-
-  /*
-   * Post-order head assignment for flattened trees. A node whose children
-   * are all leaves takes child 0 as head. Any other node takes its first
-   * child whose label starts and ends with '*', and the two stars are
-   * removed from that label. A node without a starred child trips an assert;
-   * when asserts are compiled out child 0 is used.
-   */
-  void fnSuffixTraverseSetHeadWord(STreeItem *pTreeItem) {
-    const size_t iNumChildren = pTreeItem->m_vecChildren.size();
-    if (iNumChildren == 0) return;
-
-    for (size_t i = 0; i < iNumChildren; i++)
-      fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]);
-
-    int iHeadchild = 0;
-
-    if (!pTreeItem->fnIsPreTerminal()) {
-      size_t i;
-      for (i = 0; i < iNumChildren; i++) {
-        char *p = pTreeItem->m_vecChildren[i]->m_pszTerm;
-        if (p[0] == '*' && p[strlen(p) - 1] == '*') {
-          iHeadchild = static_cast<int>(i);
-          p[strlen(p) - 1] = '\0';
-          // Copy through a temporary because source and destination overlap.
-          const std::string str = p + 1;
-          strcpy(p, str.c_str());  // erase the "*..*"
-          break;
-        }
-      }
-      assert(i < iNumChildren);
-    }
-
-    pTreeItem->m_iHeadChild = iHeadchild;
-    pTreeItem->m_iHeadWord = pTreeItem->m_vecChildren[iHeadchild]->m_iHeadWord;
-  }
-
-  /* Span information plus star-marked heads for a flattened parse. */
-  void fnPostProcessingFlattenedParse(SParsedTree *pTree) {
-    pTree->fnSetSpanInfo();
-    fnSetParseTreeHeadInfo(pTree);
-  }
-
-  /*
-   * Reads one line into pszLine, which must have room for kiLineBufSize
-   * characters, strips trailing control characters and spaces, and stores the
-   * remaining length in *piLength (when not NULL).
-   *
-   * Returns false, with pszLine empty, when nothing could be read: no open
-   * file, end of file, or a read error. Testing the result of fgets() rather
-   * than feof() keeps a read error from producing empty lines forever.
-   */
-  bool fnReadNextSentence(char *pszLine, int *piLength) {
-    pszLine[0] = '\0';
-    if (m_fpIn == NULL) return false;
-    if (fgets(pszLine, kiLineBufSize, m_fpIn) == NULL) {
-      pszLine[0] = '\0';
-      return false;
-    }
-
-    int iLen = static_cast<int>(strlen(pszLine));
-    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) {
-      pszLine[iLen - 1] = '\0';
-      iLen--;
-    }
-
-    if (piLength != NULL) (*piLength) = iLen;
-
-    return true;
-  }
-
- private:
-  FILE *m_fpIn;             // input file (owned)
-  const bool m_bFlattened;  // true when heads are star-marked in the file
-};
-
-#endif /* TREE_H_ */