Move synutils under utils

author: Wu, Ke <wuke@cs.umd.edu> 2014-10-07 18:44:05 -0400
committer: Wu, Ke <wuke@cs.umd.edu> 2014-10-07 18:44:05 -0400
commit: 8c26c195213805face566a6407597ba2a871a122 (patch)
tree: 378301ff345bf465f407f1447ad5fe126b3cd47c /utils/synutils/tree.h
parent: 6c7bf8cf49db88ca47e5b08aa449032995736854 (diff)
1 files changed, 0 insertions, 700 deletions
diff --git a/utils/synutils/tree.h b/utils/synutils/tree.h
deleted file mode 100644
index 8070f828..00000000
--- a/utils/synutils/tree.h
+++ /dev/null
@@ -1,700 +0,0 @@
-/*
- * tree.h
- *
- *  Created on: May 23, 2013
- *      Author: lijunhui
- */
-
-#ifndef TREE_H_
-#define TREE_H_
-
-#include <assert.h>
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <string>
-#include <vector>
-
-using namespace std;
-
-// One node of a parse tree.
-//
-// A node owns a heap copy of its label (m_pszTerm) and owns every node stored
-// in m_vecChildren: deleting a node deletes its whole subtree. m_ptParent is
-// a non-owning back pointer that fnAppend() sets on the child.
-//
-// Because of that ownership the type is non-copyable; a member-wise copy
-// would free the label and the children twice.
-struct STreeItem {
-  // Builds a parentless, childless node labelled with a private copy of
-  // pszTerm, which must be a non-NULL, NUL-terminated string. Every
-  // index/span field starts at -1.
-  STreeItem(const char *pszTerm)
-      : m_pszTerm(NULL),
-        m_ptParent(NULL),
-        m_iBegin(-1),
-        m_iEnd(-1),
-        m_iHeadChild(-1),
-        m_iHeadWord(-1),
-        m_iBrotherIndex(-1) {
-    m_pszTerm = new char[strlen(pszTerm) + 1];
-    strcpy(m_pszTerm, pszTerm);
-  }
-
-  // Releases the label and, recursively, all children.
-  ~STreeItem() {
-    delete[] m_pszTerm;
-    for (size_t i = 0; i < m_vecChildren.size(); i++) delete m_vecChildren[i];
-  }
-
-  // Adds ptChild as the last child and takes ownership of it.
-  // Records the child's position in m_iBrotherIndex and its parent pointer.
-  // Returns the index of the child inside m_vecChildren.
-  int fnAppend(STreeItem *ptChild) {
-    const int iIndex = static_cast<int>(m_vecChildren.size());
-    m_vecChildren.push_back(ptChild);
-    ptChild->m_iBrotherIndex = iIndex;
-    ptChild->m_ptParent = this;
-    return iIndex;
-  }
-
-  // Number of direct children.
-  int fnGetChildrenNum() const {
-    return static_cast<int>(m_vecChildren.size());
-  }
-
-  // True when the node has at least one child and none of its children has
-  // children of its own.
-  // (The former `this == NULL` test was dropped: calling a member function
-  // through a null pointer is undefined behaviour, so the test could never
-  // be relied upon.)
-  bool fnIsPreTerminal(void) const {
-    if (m_vecChildren.empty()) return false;
-
-    for (size_t i = 0; i < m_vecChildren.size(); i++)
-      if (!m_vecChildren[i]->m_vecChildren.empty()) return false;
-
-    return true;
-  }
-
-  char *m_pszTerm;  // owned, NUL-terminated label
-
-  vector<STreeItem *> m_vecChildren;  // children items (owned)
-  STreeItem *m_ptParent;  // the parent item (not owned)
-
-  int m_iBegin;
-  int m_iEnd;  // the node span words[m_iBegin, m_iEnd]
-  int m_iHeadChild;  // the index of its head child
-  int m_iHeadWord;  // the index of its head word
-  int m_iBrotherIndex;  // the index in his brothers
-
-private:
-  // Declared but intentionally not defined: copying is forbidden.
-  STreeItem(const STreeItem &);
-  STreeItem &operator=(const STreeItem &);
-};
-
-// Picks the head child of a grammar production "LEFT -> RIGHT_1 ... RIGHT_n"
-// from a table of head rules keyed by the left-hand-side label.
-//
-// Rule string format (whitespace separated tokens):
-//   * A token starting with '0' or '1' opens a new priority group; the rest
-//     of the token and the following plain tokens are the tags of the group.
-//     '0' scans the children from right to left, any other first character
-//     scans them from left to right.
-//   * Groups are tried in order; the first child whose label equals one of
-//     the tags of the group being tried is the head.
-//   * A bare "0" / "1" token ends the rule: when no group matched, "0"
-//     selects the rightmost child and "1" the leftmost child.
-//
-// The active table looks like Penn Chinese Treebank tags (CLP, DNP, LCP,
-// ...) -- presumably Chinese head rules; confirm with the original author.
-struct SGetHeadWord {
-  typedef vector<string> CVectorStr;
-  SGetHeadWord() {}
-  ~SGetHeadWord() {}
-
-  // Returns the index (into vectRight) of the head child of the production
-  // pszCFGLeft -> vectRight.
-  //
-  // pszCFGLeft : NUL-terminated label of the parent; labels without an entry
-  //              in the table use the rule "0" (rightmost child).
-  // vectRight  : labels of the children, left to right.
-  //
-  // Note: for an empty vectRight the rightmost-child default yields -1.
-  int fnGetHeadWord(const char *pszCFGLeft, const CVectorStr &vectRight) {
-    return fnMyOwnHeadWordRule(fnLookupHeadList(pszCFGLeft), vectRight);
-  }
-
-private:
-  struct SHeadRule {
-    const char *m_pszLabel;     // left-hand-side label
-    const char *m_pszHeadList;  // rule string, see the struct comment
-  };
-
-  // Maps a left-hand-side label to its rule string ("0" when unknown).
-  //
-  // The original code was an if / else-if chain that was accidentally split
-  // in two by a bare `if` in front of "VRD"; since all labels are distinct a
-  // single first-match lookup is equivalent.
-  static const char *fnLookupHeadList(const char *pszCFGLeft) {
-    static const SHeadRule s_aRules[] = {
-        {"ADJP", "0ADJP JJ 0AD NN CS 0"},
-        {"ADVP", "0ADVP AD 0"},
-        {"CLP", "0CLP M 0"},
-        {"CP", "0DEC SP 1ADVP CS 0CP IP 0"},
-        {"DNP", "0DNP DEG 0DEC 0"},
-        {"DVP", "0DVP DEV 0"},
-        {"DP", "1DP DT 1"},
-        {"FRAG", "0VV NR NN 0"},
-        {"INTJ", "0INTJ IJ 0"},
-        {"LST", "1LST CD OD 1"},
-        // alternative once tried for IP: "0VP 0VV 1IP 0"
-        {"IP", "0IP VP 0VV 0"},
-        {"LCP", "0LCP LC 0"},
-        {"NP", "0NP NN NT NR QP 0"},
-        {"PP", "1PP P 1"},
-        // NOTE(review): the leading "0 " is a bare terminator, so PRN always
-        // selects the rightmost child and the tags after it are never used.
-        // Possibly a typo for "0NP IP ..."; kept as in the original.
-        {"PRN", "0 NP IP VP NT NR NN 0"},
-        {"QP", "0QP CLP CD OD 0"},
-        {"VP", "1VP VA VC VE VV BA LB VCD VSB VRD VNV VCP 1"},
-        {"VCD", "0VCD VV VA VC VE 0"},
-        {"VRD", "0VRD VV VA VC VE 0"},
-        {"VSB", "0VSB VV VA VC VE 0"},
-        {"VCP", "0VCP VV VA VC VE 0"},
-        {"VNV", "0VNV VV VA VC VE 0"},
-        // NOTE(review): VPT lists VNV, not VPT, as its first tag -- looks
-        // like a copy/paste slip; kept as in the original.
-        {"VPT", "0VNV VV VA VC VE 0"},
-        {"UCP", "0"},
-        {"WHNP", "0WHNP NP NN NT NR QP 0"},
-        {"WHPP", "1WHPP PP P 1"},
-    };
-    const size_t iNumRules = sizeof(s_aRules) / sizeof(s_aRules[0]);
-
-    for (size_t i = 0; i < iNumRules; i++)
-      if (strcmp(pszCFGLeft, s_aRules[i].m_pszLabel) == 0)
-        return s_aRules[i].m_pszHeadList;
-
-    // 0 indicating from right to left while 1 indicating from left to right
-    return "0";
-  }
-
-  // Disabled rule sets, kept for reference only. The original file carried
-  // this table twice as commented-out code, labelled "head rules for
-  // English" and "head rules for GENIA corpus", with the same content:
-  //   ADJP   "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0"
-  //   ADVP   "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1"
-  //   CONJP  "1CC 1RB 1IN 1"
-  //   FRAG   "1"
-  //   INTJ   "0"
-  //   LST    "1LS 1: 1CLN 1"
-  //   NAC    "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0"
-  //   PP     "1IN 1TO 1VBG 1VBN 1RP 1FW 1"
-  //   PRN    "1"
-  //   PRT    "1RP 1"
-  //   QP     "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0"
-  //   RRC    "1VP 1NP 1ADVP 1ADJP 1PP 1"
-  //   S      "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0"
-  //   SBAR   "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR 0FRAG 0"
-  //   SBARQ  "0SQ 0S 0SINV 0SBARQ 0FRAG 0"
-  //   SINV   "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0"
-  //   SQ     "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0"
-  //   UCP    "1"
-  //   VP     "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS 0NP 0"
-  //   WHADJP "0CC 0WRB 0JJ 0ADJP 0"
-  //   WHADVP "1CC 1WRB 1"
-  //   WHNP   "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0"
-  //   WHPP   "1IN 1TO FW 1"
-  //   NP     "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB QP 0"
-
-  // True when strLabel is one of the tags of the group strHeadList, which
-  // has the shape "<dir> TAG TAG ... " (every tag is surrounded by blanks).
-  static bool fnListContains(const string &strHeadList,
-                             const string &strLabel) {
-    const string strNeedle = string(" ") + strLabel.c_str() + " ";
-    return strHeadList.find(strNeedle) != string::npos;
-  }
-
-  // Evaluates the rule string pszaHeadLists against the child labels in
-  // vectRight and returns the index of the head child.
-  //
-  // Uses std::string throughout, so arbitrarily long child labels are safe
-  // (the previous version formatted them into a 101-byte stack buffer).
-  // A rule string that lacks the final "0"/"1" terminator now yields 0
-  // instead of looping forever.
-  static int fnMyOwnHeadWordRule(const char *pszaHeadLists,
-                                 const CVectorStr &vectRight) {
-    static const char kszBlank[] = " \t\n\v\f\r";
-    const string strRules(pszaHeadLists);
-    const int iCountRight = static_cast<int>(vectRight.size());
-
-    string strHeadList;  // group being collected; empty between groups
-    size_t iPos = 0;     // read position inside strRules
-
-    while (true) {
-      const size_t iTokBegin = strRules.find_first_not_of(kszBlank, iPos);
-      if (iTokBegin == string::npos) break;  // malformed: no terminator
-      size_t iTokEnd = strRules.find_first_of(kszBlank, iTokBegin);
-      if (iTokEnd == string::npos) iTokEnd = strRules.size();
-      const string strTerm = strRules.substr(iTokBegin, iTokEnd - iTokBegin);
-
-      if (strHeadList.empty()) {
-        // Start of a group, or the terminator.
-        if (strTerm == "0") return iCountRight - 1;
-        if (strTerm == "1") return 0;
-
-        strHeadList = string(1, strTerm[0]) + " " + strTerm.substr(1) + " ";
-        iPos = iTokEnd;
-      } else if (strTerm[0] == '0' || strTerm[0] == '1') {
-        // The current group is complete: try it against the children.
-        if (strHeadList[0] == '0') {
-          for (int J = iCountRight - 1; J >= 0; J--)
-            if (fnListContains(strHeadList, vectRight[J])) return J;
-        } else {
-          for (int J = 0; J < iCountRight; J++)
-            if (fnListContains(strHeadList, vectRight[J])) return J;
-        }
-
-        // No match. iPos is deliberately left unchanged so that this token
-        // is read again as the start of the next group (or the terminator).
-        strHeadList.clear();
-      } else {
-        // One more tag of the current group.
-        strHeadList += strTerm;
-        strHeadList += " ";
-        iPos = iTokEnd;
-      }
-    }
-
-    return 0;
-  }
-};
-
-// A bracketed parse tree, e.g. "( (IP (NP (NN dog)) (VP (VV barks))) )".
-//
-// The tree owns its root (and, through STreeItem, every node below it).
-// m_vecTerminals lists the word nodes from left to right; those pointers are
-// not owned separately. The type is non-copyable because a member-wise copy
-// would delete the root twice.
-struct SParsedTree {
-  SParsedTree() : m_ptRoot(NULL) {}
-  ~SParsedTree() { delete m_ptRoot; }
-
-  // Parses a bracketed tree and returns a newly allocated SParsedTree that
-  // the caller must delete.
-  //
-  // Returns NULL, without any message, for the empty parse "(())".
-  // Returns NULL after writing a diagnostic to stderr when the string yields
-  // fewer than two tokens, when a token follows after the root has already
-  // been closed, or when the parentheses do not balance. No memory is leaked
-  // on those paths.
-  //
-  // Span and head information is NOT filled in; call fnSetSpanInfo() and
-  // fnSetHeadWord() afterwards.
-  static SParsedTree *fnConvertFromString(const char *pszStr) {
-    if (strcmp(pszStr, "(())") == 0) return NULL;
-
-    vector<string> vecSyn;
-    fnReadSyntactic(pszStr, vecSyn);
-
-    // vecSyn[0] is taken to be the opening "(" and vecSyn[1] the root label.
-    if (vecSyn.size() < 2) return fnReportBadParse(NULL, pszStr);
-
-    SParsedTree *pTree = new SParsedTree();
-    pTree->m_ptRoot = new STreeItem(vecSyn[1].c_str());
-
-    STreeItem *pcurrent = pTree->m_ptRoot;
-    int iLeft = 1, iRight = 1;  //# left/right parenthesis
-
-    // The last token is skipped: it is taken to be the ")" closing the root,
-    // which the initial iRight = 1 already accounts for.
-    for (size_t i = 2; i + 1 < vecSyn.size(); i++) {
-      const string &strTok = vecSyn[i];
-
-      if (strTok == "(") {
-        iLeft++;
-        continue;
-      }
-
-      // Anything but "(" needs an open node to act on.
-      if (pcurrent == NULL) return fnReportBadParse(pTree, pszStr);
-
-      if (strTok == ")") {
-        iRight++;
-        pcurrent = pcurrent->m_ptParent;
-        continue;
-      }
-
-      STreeItem *ptNewItem = new STreeItem(strTok.c_str());
-      pcurrent->fnAppend(ptNewItem);
-
-      const string &strPrev = vecSyn[i - 1];
-      if (strPrev != "(" && strPrev != ")") {
-        // A label that directly follows another label is a word: record it
-        // and stay on its parent.
-        pTree->m_vecTerminals.push_back(ptNewItem);
-      } else {
-        pcurrent = ptNewItem;
-      }
-    }
-
-    if (iLeft != iRight) {
-      fprintf(stderr, "the left and right parentheses are not matched!");
-      return fnReportBadParse(pTree, pszStr);
-    }
-
-    return pTree;
-  }
-
-  // Number of words (terminal nodes).
-  int fnGetNumWord() { return static_cast<int>(m_vecTerminals.size()); }
-
-  // Fills m_iBegin / m_iEnd of every node with the inclusive range of word
-  // indices it covers. Does nothing on an empty tree.
-  void fnSetSpanInfo() {
-    if (m_ptRoot == NULL) return;
-    int iNextNum = 0;
-    fnSuffixTraverseSetSpanInfo(m_ptRoot, iNextNum);
-  }
-
-  // Fills m_iHeadChild / m_iHeadWord of every node using the SGetHeadWord
-  // rules. Does nothing on an empty tree.
-  void fnSetHeadWord() {
-    for (size_t i = 0; i < m_vecTerminals.size(); i++)
-      m_vecTerminals[i]->m_iHeadWord = static_cast<int>(i);
-    if (m_ptRoot == NULL) return;
-
-    SGetHeadWord getHeadWord;
-    fnSuffixTraverseSetHeadWord(m_ptRoot, &getHeadWord);
-  }
-
-  // Climbs from word iLeft towards the root until a node whose span ends at
-  // iRight is found. Requires fnSetSpanInfo() to have been called.
-  //
-  // Returns NULL when iLeft is not a valid word index or when no ancestor
-  // ends exactly at iRight. With bLowest the first such node is returned;
-  // otherwise the climb continues through parents that have a single child.
-  // NOTE(review): only m_iEnd is compared; m_iBegin of the result is never
-  // checked against iLeft -- confirm that callers rely on this.
-  STreeItem *fnFindNodeForSpan(int iLeft, int iRight, bool bLowest) {
-    if (iLeft < 0 || iLeft >= static_cast<int>(m_vecTerminals.size()))
-      return NULL;
-
-    STreeItem *pTreeItem = m_vecTerminals[iLeft];
-    while (pTreeItem != NULL && pTreeItem->m_iEnd < iRight)
-      pTreeItem = pTreeItem->m_ptParent;
-
-    if (pTreeItem == NULL || pTreeItem->m_iEnd > iRight) return NULL;
-
-    assert(pTreeItem->m_iEnd == iRight);
-    if (bLowest) return pTreeItem;
-
-    while (pTreeItem->m_ptParent != NULL &&
-           pTreeItem->m_ptParent->fnGetChildrenNum() == 1)
-      pTreeItem = pTreeItem->m_ptParent;
-
-    return pTreeItem;
-  }
-
-private:
-  // Declared but intentionally not defined: copying is forbidden.
-  SParsedTree(const SParsedTree &);
-  SParsedTree &operator=(const SParsedTree &);
-
-  // Common error exit of fnConvertFromString: logs the offending string,
-  // frees the partially built tree (may be NULL) and returns NULL.
-  static SParsedTree *fnReportBadParse(SParsedTree *pTree,
-                                       const char *pszStr) {
-    fprintf(stderr, "ERROR in ConvertFromString\n");
-    fprintf(stderr, "%s\n", pszStr);
-    delete pTree;
-    return NULL;
-  }
-
-  // Post-order walk: a leaf takes the next word index as its one-word span,
-  // an inner node spans from its first child's begin to its last child's end.
-  void fnSuffixTraverseSetSpanInfo(STreeItem *ptItem, int &iNextNum) {
-    const int iNumChildren = ptItem->fnGetChildrenNum();
-    for (int I = 0; I < iNumChildren; I++)
-      fnSuffixTraverseSetSpanInfo(ptItem->m_vecChildren[I], iNextNum);
-
-    if (iNumChildren == 0) {
-      ptItem->m_iBegin = iNextNum;
-      ptItem->m_iEnd = iNextNum++;
-    } else {
-      ptItem->m_iBegin = ptItem->m_vecChildren[0]->m_iBegin;
-      ptItem->m_iEnd = ptItem->m_vecChildren[iNumChildren - 1]->m_iEnd;
-    }
-  }
-
-  // Post-order walk: an only child is always the head; otherwise the head
-  // rules decide from the labels of the children. The head word of a node
-  // is the head word of its head child.
-  void fnSuffixTraverseSetHeadWord(STreeItem *ptItem,
-                                   SGetHeadWord *pGetHeadWord) {
-    const size_t iNumChildren = ptItem->m_vecChildren.size();
-    if (iNumChildren == 0) return;
-
-    for (size_t I = 0; I < iNumChildren; I++)
-      fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[I], pGetHeadWord);
-
-    int iHeadchild = 0;
-    if (iNumChildren > 1) {
-      vector<string> vecRight;
-      for (size_t I = 0; I < iNumChildren; I++)
-        vecRight.push_back(string(ptItem->m_vecChildren[I]->m_pszTerm));
-
-      iHeadchild = pGetHeadWord->fnGetHeadWord(ptItem->m_pszTerm, vecRight);
-    }
-
-    ptItem->m_iHeadChild = iHeadchild;
-    ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord;
-  }
-
-  // True for the bytes 1..32 (blanks and control characters), which are
-  // stripped from both ends of the input line.
-  static bool fnIsBlankOrCtrl(char c) {
-    const unsigned char uc = static_cast<unsigned char>(c);
-    return uc > 0 && uc <= ' ';
-  }
-
-  // Splits a bracketed parse string into tokens: "(" , ")" and labels.
-  //
-  //   "(NP (NN dog))"  ->  "(" "NP" "(" "NN" "dog" ")" ")"
-  //
-  // When the second token is "(" (form "( (IP ...) )") the label "ROOT" is
-  // inserted after the first "(". A blank line, or a line whose first
-  // visible character is '<', produces no tokens. If the parentheses seen
-  // while tokenising do not balance, the input is printed to stderr and an
-  // assertion fires.
-  //
-  // Differences from the previous buffer-based version:
-  //   * no fixed-size scratch buffers (they were one byte too small),
-  //   * a '<' line no longer spins forever,
-  //   * leading bytes >= 0x80 are no longer skipped as if they were blanks
-  //     (that skip could also run past the end of the string),
-  //   * a token made only of '(' no longer emits an empty label.
-  static void fnReadSyntactic(const char *pszSyn, vector<string> &vec) {
-    static const char kszBlank[] = " \t\n\v\f\r";
-
-    vec.clear();
-
-    string strLine(pszSyn);
-    size_t iEnd = strLine.size();
-    while (iEnd > 0 && fnIsBlankOrCtrl(strLine[iEnd - 1])) iEnd--;
-    strLine.erase(iEnd);
-    if (strLine.empty()) return;
-
-    // The line is non-empty and ends in a visible byte, so this stops.
-    size_t iPos = 0;
-    while (fnIsBlankOrCtrl(strLine[iPos])) iPos++;
-    if (strLine[iPos] == '<') return;
-
-    int iLeftNum = 0;
-    int iRightNum = 0;
-
-    while (true) {
-      const size_t iTokBegin = strLine.find_first_not_of(kszBlank, iPos);
-      if (iTokBegin == string::npos) break;
-      size_t iTokEnd = strLine.find_first_of(kszBlank, iTokBegin);
-      if (iTokEnd == string::npos) iTokEnd = strLine.size();
-      iPos = iTokEnd;
-
-      const string strTerm = strLine.substr(iTokBegin, iTokEnd - iTokBegin);
-      const size_t iLen = strTerm.size();
-
-      if (strTerm[0] == '(') {
-        // "((IP" -> "(" "(" "IP". Anything after the leading '(' run is kept
-        // as one label, even if it contains ')'.
-        size_t i = 0;
-        while (i < iLen && strTerm[i] == '(') {
-          vec.push_back(string("("));
-          iLeftNum++;
-          i++;
-        }
-        if (i < iLen) vec.push_back(strTerm.substr(i));
-      } else if (strTerm[iLen - 1] == ')') {
-        // "dog))" -> "dog" ")" ")". Only the trailing ')' run is split off.
-        size_t iKeep = iLen;
-        while (iKeep > 0 && strTerm[iKeep - 1] == ')') iKeep--;
-        if (iKeep > 0) vec.push_back(strTerm.substr(0, iKeep));
-        for (size_t k = iKeep; k < iLen; k++) {
-          vec.push_back(string(")"));
-          iRightNum++;
-        }
-      } else {
-        const size_t iClose = strTerm.find(')');
-        if (iClose == string::npos) {
-          vec.push_back(strTerm);
-        } else {
-          // "dog))(NP" -> "dog" ")" ")" "(" "NP"
-          if (iClose > 0) vec.push_back(strTerm.substr(0, iClose));
-          size_t i = iClose;
-          while (i < iLen && strTerm[i] == ')') {
-            vec.push_back(string(")"));
-            iRightNum++;
-            i++;
-          }
-          while (i < iLen && strTerm[i] == '(') {
-            vec.push_back(string("("));
-            iLeftNum++;
-            i++;
-          }
-          if (i < iLen) vec.push_back(strTerm.substr(i));
-        }
-      }
-    }
-
-    if (iLeftNum != iRightNum) {
-      fprintf(stderr, "%s\n", pszSyn);
-      assert(iLeftNum == iRightNum);
-    }
-
-    if (vec.size() >= 2 && vec[1] == "(") {
-      //( (IP..) )
-      vec.insert(vec.begin() + 1, string("ROOT"));
-    }
-  }
-
-public:
-  STreeItem *m_ptRoot;
-  vector<STreeItem *> m_vecTerminals;  // the leaf nodes
-};
-
-// Reads parse trees from a text file, one bracketed tree per line.
-//
-// With bFlattened == false head information is computed with the head rules
-// (SParsedTree::fnSetHeadWord). With bFlattened == true the head child of
-// every node is expected to be marked in the file as "*LABEL*".
-//
-// The reader owns the FILE handle and is therefore non-copyable.
-struct SParseReader {
-  // Opens pszParse_Fname for reading. Failure to open trips an assertion;
-  // when assertions are compiled out every read simply reports end of input.
-  SParseReader(const char *pszParse_Fname, bool bFlattened = false)
-      : m_fpIn(fopen(pszParse_Fname, "r")), m_bFlattened(bFlattened) {
-    assert(m_fpIn != NULL);
-  }
-  ~SParseReader() {
-    if (m_fpIn != NULL) fclose(m_fpIn);
-  }
-
-  // Returns the tree on the next non-blank line, with span and head
-  // information filled in; the caller must delete it.
-  // Returns NULL at end of input, and also when the line does not yield a
-  // tree (e.g. the empty parse "(())"), so the two cannot be told apart.
-  SParsedTree *fnReadNextParseTree() {
-    vector<char> vecLine(kLineBufferSize);
-    int iLen = 0;
-
-    while (fnReadNextSentence(&vecLine[0], &iLen)) {
-      if (iLen == 0) continue;
-
-      SParsedTree *pTree = SParsedTree::fnConvertFromString(&vecLine[0]);
-      if (pTree != NULL) fnAnnotate(pTree);
-      return pTree;
-    }
-
-    return NULL;
-  }
-
-  // Like fnReadNextParseTree() for lines of the form "<prob> <tree>".
-  // The text before the first blank is converted with atof() and stored in
-  // *pProb when pProb is not NULL. A line without a blank trips an
-  // assertion (and yields NULL when assertions are compiled out).
-  SParsedTree *fnReadNextParseTreeWithProb(double *pProb) {
-    vector<char> vecLine(kLineBufferSize);
-    int iLen = 0;
-
-    while (fnReadNextSentence(&vecLine[0], &iLen)) {
-      if (iLen == 0) continue;
-
-      char *pszTree = strchr(&vecLine[0], ' ');
-      assert(pszTree != NULL);
-      if (pszTree == NULL) return NULL;
-      *pszTree++ = '\0';  // split "<prob>" from "<tree>"
-      if (pProb != NULL) (*pProb) = atof(&vecLine[0]);
-
-      SParsedTree *pTree = SParsedTree::fnConvertFromString(pszTree);
-      // fnConvertFromString returns NULL for "(())" and malformed input.
-      if (pTree != NULL) fnAnnotate(pTree);
-      return pTree;
-    }
-
-    return NULL;
-  }
-
-private:
-  // Size of the line buffer; also the limit handed to fgets so the two can
-  // never disagree (the read used to be capped at 10001 bytes by mistake).
-  enum { kLineBufferSize = 100001 };
-
-  // Declared but intentionally not defined: copying is forbidden.
-  SParseReader(const SParseReader &);
-  SParseReader &operator=(const SParseReader &);
-
-  // Fills in span and head information according to the reader mode.
-  void fnAnnotate(SParsedTree *pTree) {
-    if (m_bFlattened) {
-      fnPostProcessingFlattenedParse(pTree);
-    } else {
-      pTree->fnSetSpanInfo();
-      pTree->fnSetHeadWord();
-    }
-  }
-
-  /*
-   * since to the parse tree is a flattened tree, use the head mark to identify
-   * head info.
-   * the head node will be marked as "*XP*"
-   */
-  void fnSetParseTreeHeadInfo(SParsedTree *pTree) {
-    for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++)
-      pTree->m_vecTerminals[i]->m_iHeadWord = static_cast<int>(i);
-    fnSuffixTraverseSetHeadWord(pTree->m_ptRoot);
-  }
-
-  // Post-order walk over a flattened tree. For a pre-terminal the first
-  // child is the head. Otherwise the head is the first child whose label
-  // has the form "*...*"; the asterisks are stripped from that label in
-  // place. A node without a marked child trips an assertion; when
-  // assertions are compiled out the first child is used.
-  void fnSuffixTraverseSetHeadWord(STreeItem *pTreeItem) {
-    const size_t iNumChildren = pTreeItem->m_vecChildren.size();
-    if (iNumChildren == 0) return;
-
-    for (size_t i = 0; i < iNumChildren; i++)
-      fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]);
-
-    int iHeadchild = 0;
-
-    if (!pTreeItem->fnIsPreTerminal()) {
-      size_t i;
-      for (i = 0; i < iNumChildren; i++) {
-        char *p = pTreeItem->m_vecChildren[i]->m_pszTerm;
-        const size_t iLabelLen = strlen(p);
-        if (p[0] == '*' && p[iLabelLen - 1] == '*') {
-          iHeadchild = static_cast<int>(i);
-          // erase the "*..*": drop the last '*', then shift left over the
-          // first one (the shift copies the terminating NUL as well)
-          p[iLabelLen - 1] = '\0';
-          memmove(p, p + 1, iLabelLen - 1);
-          break;
-        }
-      }
-      assert(i < iNumChildren);
-    }
-
-    pTreeItem->m_iHeadChild = iHeadchild;
-    pTreeItem->m_iHeadWord = pTreeItem->m_vecChildren[iHeadchild]->m_iHeadWord;
-  }
-
-  void fnPostProcessingFlattenedParse(SParsedTree *pTree) {
-    pTree->fnSetSpanInfo();
-    fnSetParseTreeHeadInfo(pTree);
-  }
-
-  // Reads one line into pszLine (capacity kLineBufferSize) and strips
-  // trailing bytes in the range 1..32. The remaining length is stored in
-  // *piLength when piLength is not NULL.
-  // Returns false when there is no open file, at end of file, or when the
-  // read fails; pszLine is then empty.
-  bool fnReadNextSentence(char *pszLine, int *piLength) {
-    pszLine[0] = '\0';
-    if (piLength != NULL) (*piLength) = 0;
-
-    // feof() only promises "non-zero" at end of file, so never compare it
-    // with true.
-    if (m_fpIn == NULL || feof(m_fpIn) != 0) return false;
-
-    // A NULL result means EOF or a read error; stopping here avoids an
-    // endless loop on a read error (feof would stay zero).
-    if (fgets(pszLine, kLineBufferSize, m_fpIn) == NULL) return false;
-
-    int iLen = static_cast<int>(strlen(pszLine));
-    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33)
-      pszLine[--iLen] = '\0';
-
-    if (piLength != NULL) (*piLength) = iLen;
-
-    return true;
-  }
-
-private:
-  FILE *m_fpIn;
-  const bool m_bFlattened;
-};
-
-#endif /* TREE_H_ */
author	Wu, Ke <wuke@cs.umd.edu>	2014-10-07 18:44:05 -0400
committer	Wu, Ke <wuke@cs.umd.edu>	2014-10-07 18:44:05 -0400
commit	8c26c195213805face566a6407597ba2a871a122 (patch)
tree	378301ff345bf465f407f1447ad5fe126b3cd47c /utils/synutils/tree.h
parent	6c7bf8cf49db88ca47e5b08aa449032995736854 (diff)