diff options
Diffstat (limited to 'utils')
| -rw-r--r-- | utils/argument_reorder_model.cc | 126 | ||||
| -rw-r--r-- | utils/constituent_reorder_model.cc | 138 | ||||
| -rw-r--r-- | utils/synutils.h | 123 | ||||
| -rw-r--r-- | utils/tsuruoka_maxent.h | 1 | 
4 files changed, 112 insertions, 276 deletions
diff --git a/utils/argument_reorder_model.cc b/utils/argument_reorder_model.cc index 5caf318f..c4e90cba 100644 --- a/utils/argument_reorder_model.cc +++ b/utils/argument_reorder_model.cc @@ -12,60 +12,49 @@  #include <string>  #include <vector> +#include "filelib.h" +  #include "argument_reorder_model.h" -#include "synutils.h"  #include "tsuruoka_maxent.h"  using namespace std;  inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,                                      const char* pszNewFName) { -  SFReader* pFReader = new STxtFileReader(pszFName); -  char* pszLine = new char[100001]; -  int iLen;    Map hashPredicate; -  while (pFReader->fnReadNextLine(pszLine, &iLen)) { -    if (iLen == 0) continue; - -    vector<string> vecTerms; -    SplitOnWhitespace(string(pszLine), &vecTerms); - -    for (size_t i = 0; i < vecTerms.size() - 1; i++) { -      Iterator iter = hashPredicate.find(vecTerms[i]); -      if (iter == hashPredicate.end()) { -        hashPredicate[vecTerms[i]] = 1; - -      } else { -        iter->second++; +  { +    ReadFile in(pszFName); +    string line; +    while (getline(*in.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      SplitOnWhitespace(line, &terms); +      for (const auto& i : terms) { +        ++hashPredicate[i];        }      }    } -  delete pFReader; - -  pFReader = new STxtFileReader(pszFName); -  FILE* fpOut = fopen(pszNewFName, "w"); -  while (pFReader->fnReadNextLine(pszLine, &iLen)) { -    if (iLen == 0) continue; - -    vector<string> vecTerms; -    SplitOnWhitespace(string(pszLine), &vecTerms); -    ostringstream ostr; -    for (size_t i = 0; i < vecTerms.size() - 1; i++) { -      Iterator iter = hashPredicate.find(vecTerms[i]); -      assert(iter != hashPredicate.end()); -      if (iter->second >= iCutoff) { -        ostr << vecTerms[i] << " "; + +  { +    ReadFile in(pszFName); +    WriteFile out(pszNewFName); +    string line; +    while (getline(*in.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      SplitOnWhitespace(line, &terms); +      bool written = false; +      for (const auto& i : terms) { +        if (hashPredicate[i] >= iCutoff) { +          (*out.stream()) << i << " "; +          written = true; +        } +      } +      if (written) { +        (*out.stream()) << "\n";        } -    } -    if (ostr.str().length() > 0) { -      ostr << vecTerms[vecTerms.size() - 1]; -      fprintf(fpOut, "%s\n", ostr.str().c_str());      }    } -  fclose(fpOut); -  delete pFReader; - -  delete[] pszLine;  }  struct SArgumentReorderTrainer { @@ -127,8 +116,8 @@ struct SArgumentReorderTrainer {        ) {      SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);      SSrlSentenceReader* pSRLReader = new SSrlSentenceReader(pszSRLFname); -    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); -    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); +    ReadFile source_file(pszSourceFname); +    ReadFile target_file(pszTargetFname);      Map* pMapPredicate;      if (pszTopPredicateFname != NULL) @@ -136,13 +125,10 @@ struct SArgumentReorderTrainer {      else        pMapPredicate = NULL; -    char* pszLine = new char[50001]; +    string line; -    FILE* fpLeftOut, *fpRightOut; -    sprintf(pszLine, "%s.left", pszInstanceFname); -    fpLeftOut = fopen(pszLine, "w"); -    sprintf(pszLine, "%s.right", pszInstanceFname); -    fpRightOut = fopen(pszLine, "w"); +    WriteFile left_file(pszInstanceFname + string(".left")); +    WriteFile right_file(pszInstanceFname + string(".right"));      // read sentence by sentence      SAlignment* pAlign; @@ -153,12 +139,12 @@ struct SArgumentReorderTrainer {        pSRL = pSRLReader->fnReadNextSrlSentence();        assert(pSRL != NULL);        pTree = pSRL->m_pTree; -      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      assert(getline(*source_file.stream(), line));        vector<string> vecSTerms; -      SplitOnWhitespace(string(pszLine), &vecSTerms); -      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      SplitOnWhitespace(line, &vecSTerms); +      assert(getline(*target_file.stream(), line));        vector<string> vecTTerms; -      SplitOnWhitespace(string(pszLine), &vecTTerms); +      SplitOnWhitespace(line, &vecTTerms);        // vecTPOSTerms.size() == 0, given the case when an english sentence fails        // parsing @@ -204,10 +190,10 @@ struct SArgumentReorderTrainer {              // strOutcome.c_str());              // fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(),              // strOutcome.c_str()); -            fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), -                    strLeftOutcome.c_str()); -            fprintf(fpRightOut, "%s %s\n", ostr.str().c_str(), -                    strRightOutcome.c_str()); +            (*left_file.stream()) << ostr.str() << " " << strLeftOutcome +                                  << "\n"; +            (*right_file.stream()) << ostr.str() << " " << strRightOutcome +                                   << "\n";            }          }        } @@ -218,36 +204,28 @@ struct SArgumentReorderTrainer {        if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);      } -    delete[] pszLine; - -    fclose(fpLeftOut); -    fclose(fpRightOut);      delete pAlignReader;      delete pSRLReader; -    delete pTxtSReader; -    delete pTxtTReader;    }    Map* fnLoadTopPredicates(const char* pszTopPredicateFname) {      if (pszTopPredicateFname == NULL) return NULL;      Map* pMapPredicate = new Map(); -    STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname); -    char* pszLine = new char[50001]; +    // STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname); +    ReadFile in(pszTopPredicateFname); +    // char* pszLine = new char[50001]; +    string line;      int iNumCount = 0; -    while (pReader->fnReadNextLine(pszLine, NULL)) { -      if (pszLine[0] == '#') continue; -      char* p = strchr(pszLine, ' '); -      assert(p != NULL); -      p[0] = '\0'; -      p++; -      int iCount = atoi(p); +    while (getline(*in.stream(), line)) { +      if (line.size() && line[0] == '#') continue; +      auto p = line.find(' '); +      assert(p != string::npos); +      int iCount = atoi(line.substr(p + 1).c_str());        if (iCount < 100) break; -      (*pMapPredicate)[string(pszLine)] = iNumCount++; +      (*pMapPredicate)[line] = iNumCount++;      } -    delete pReader; -    delete[] pszLine;      return pMapPredicate;    }  }; diff --git a/utils/constituent_reorder_model.cc b/utils/constituent_reorder_model.cc index df75a1a0..bdb7c5d1 100644 --- a/utils/constituent_reorder_model.cc +++ b/utils/constituent_reorder_model.cc @@ -5,15 +5,17 @@   *      Author: junhuili   */ +#include <string> +#include <unordered_map> +  #include <boost/program_options.hpp> +#include "filelib.h" +  #include "alignment.h"  #include "tree.h" -#include "synutils.h"  #include "tsuruoka_maxent.h" -#include <unordered_map> -  using namespace std;  typedef std::unordered_map<std::string, int> Map; @@ -23,52 +25,40 @@ namespace po = boost::program_options;  inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,                                      const char* pszNewFName) { -  SFReader* pFReader = new STxtFileReader(pszFName); -  char* pszLine = new char[100001]; -  int iLen;    Map hashPredicate; -  while (pFReader->fnReadNextLine(pszLine, &iLen)) { -    if (iLen == 0) continue; - -    vector<string> vecTerms; -    SplitOnWhitespace(string(pszLine), &vecTerms); - -    for (size_t i = 0; i < vecTerms.size() - 1; i++) { -      Iterator iter = hashPredicate.find(vecTerms[i]); -      if (iter == hashPredicate.end()) { -        hashPredicate[vecTerms[i]] = 1; - -      } else { -        iter->second++; +  { +    ReadFile f(pszFName); +    string line; +    while (getline(*f.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      SplitOnWhitespace(line, &terms); +      for (const auto& i : terms) { +        ++hashPredicate[i];        }      }    } -  delete pFReader; - -  pFReader = new STxtFileReader(pszFName); -  FILE* fpOut = fopen(pszNewFName, "w"); -  while (pFReader->fnReadNextLine(pszLine, &iLen)) { -    if (iLen == 0) continue; - -    vector<string> vecTerms; -    SplitOnWhitespace(string(pszLine), &vecTerms); -    ostringstream ostr; -    for (size_t i = 0; i < vecTerms.size() - 1; i++) { -      Iterator iter = hashPredicate.find(vecTerms[i]); -      assert(iter != hashPredicate.end()); -      if (iter->second >= iCutoff) { -        ostr << vecTerms[i] << " "; + +  { +    ReadFile in(pszFName); +    WriteFile out(pszNewFName); +    string line; +    while (getline(*in.stream(), line)) { +      if (!line.size()) continue; +      vector<string> terms; +      SplitOnWhitespace(line, &terms); +      bool written = false; +      for (const auto& i : terms) { +        if (hashPredicate[i] >= iCutoff) { +          (*out.stream()) << i << " "; +          written = true; +        } +      } +      if (written) { +        (*out.stream()) << "\n";        } -    } -    if (ostr.str().length() > 0) { -      ostr << vecTerms[vecTerms.size() - 1]; -      fprintf(fpOut, "%s\n", ostr.str().c_str());      }    } -  fclose(fpOut); -  delete pFReader; - -  delete[] pszLine;  }  struct SConstReorderTrainer { @@ -408,31 +398,29 @@ delete pZhangleMaxent;*/        ) {      SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);      SParseReader* pParseReader = new SParseReader(pszSynFname, false); -    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); -    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); +    ReadFile source_file(pszSourceFname); +    ReadFile target_file(pszTargetFname);      string strInstanceLeftFname = string(pszInstanceFname) + string(".left");      string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - -    FILE* fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); -    assert(fpLeftOut != NULL); - -    FILE* fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); -    assert(fpRightOut != NULL); +    WriteFile left_file(strInstanceLeftFname); +    WriteFile right_file(strInstanceRightFname);      // read sentence by sentence      SAlignment* pAlign;      SParsedTree* pTree; -    char* pszLine = new char[50001]; +    string line;      int iSentNum = 0;      while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {        pTree = pParseReader->fnReadNextParseTree(); -      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + +      assert(getline(*source_file.stream(), line));        vector<string> vecSTerms; -      SplitOnWhitespace(string(pszLine), &vecSTerms); -      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      SplitOnWhitespace(line, &vecSTerms); + +      assert(getline(*target_file.stream(), line));        vector<string> vecTTerms; -      SplitOnWhitespace(string(pszLine), &vecTTerms); +      SplitOnWhitespace(line, &vecTTerms);        if (pTree != NULL) { @@ -475,16 +463,18 @@ delete pZhangleMaxent;*/                                 vecLeftPosition, vecSTerms, vecTTerms,                                 strLeftOutcome, ostr); +            string ostr_str = ostr.str(); +              // fprintf(stderr, "%s %s\n", ostr.str().c_str(),              // strLeftOutcome.c_str()); -            fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), -                    strLeftOutcome.c_str()); +            (*left_file.stream()) << ostr_str << " " << strLeftOutcome << "\n";              string strRightOutcome;              fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j],                           strRightOutcome); -            fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), -                    strLeftOutcome.c_str(), strRightOutcome.c_str()); +            (*right_file.stream()) << ostr_str +                                   << " LeftOrder=" << strLeftOutcome << " " +                                   << strRightOutcome << "\n";            }          }          delete pTree; @@ -496,13 +486,8 @@ delete pZhangleMaxent;*/        if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);      } -    fclose(fpLeftOut); -    fclose(fpRightOut);      delete pAlignReader;      delete pParseReader; -    delete pTxtSReader; -    delete pTxtTReader; -    delete[] pszLine;    }    void fnGenerateInstanceFile2( @@ -514,25 +499,26 @@ delete pZhangleMaxent;*/        ) {      SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);      SParseReader* pParseReader = new SParseReader(pszSynFname, false); -    STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); -    STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); -    FILE* fpOut = fopen(pszInstanceFname, "w"); -    assert(fpOut != NULL); +    ReadFile source_file(pszSourceFname); +    ReadFile target_file(pszTargetFname); + +    WriteFile output_file(pszInstanceFname);      // read sentence by sentence      SAlignment* pAlign;      SParsedTree* pTree; -    char* pszLine = new char[50001]; +    string line;      int iSentNum = 0;      while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {        pTree = pParseReader->fnReadNextParseTree(); -      assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +      assert(getline(*source_file.stream(), line));        vector<string> vecSTerms; -      SplitOnWhitespace(string(pszLine), &vecSTerms); -      assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +      SplitOnWhitespace(line, &vecSTerms); + +      assert(getline(*target_file.stream(), line));        vector<string> vecTTerms; -      SplitOnWhitespace(string(pszLine), &vecTTerms); +      SplitOnWhitespace(line, &vecTTerms);        if (pTree != NULL) { @@ -556,7 +542,7 @@ delete pZhangleMaxent;*/              // fprintf(stderr, "%s %s\n", ostr.str().c_str(),              // strOutcome.c_str()); -            fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +            (*output_file.stream()) << ostr.str() << " " << strOutcome << "\n";            }          }          delete pTree; @@ -568,12 +554,8 @@ delete pZhangleMaxent;*/        if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);      } -    fclose(fpOut);      delete pAlignReader;      delete pParseReader; -    delete pTxtSReader; -    delete pTxtTReader; -    delete[] pszLine;    }  }; diff --git a/utils/synutils.h b/utils/synutils.h deleted file mode 100644 index f611553e..00000000 --- a/utils/synutils.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * utility.h - * - *  Created on: Jun 24, 2013 - *      Author: lijunhui - */ - -#ifndef UTILITY_H_ -#define UTILITY_H_ - -#include <zlib.h> -#include <stdio.h> -#include <assert.h> -#include <stdlib.h> -#include <string.h> - -#include <string> -#include <unordered_map> - -typedef std::unordered_map<std::string, int> MapString2Int; -typedef std::unordered_map<std::string, float> MapString2Float; -typedef std::unordered_map<std::string, float>::iterator -    MapString2FloatIterator; - -struct SFReader { -  SFReader() {} -  virtual ~SFReader() {} - -  virtual bool fnReadNextLine(char* pszLine, int* piLength) = 0; -  virtual bool fnReadNextLine(std::string& strLine) = 0; -}; - -struct STxtFileReader : public SFReader { -  STxtFileReader(const char* pszFname) { -    m_fpIn = fopen(pszFname, "r"); -    assert(m_fpIn != NULL); -  } -  ~STxtFileReader() { -    if (m_fpIn != NULL) fclose(m_fpIn); -  } - -  bool fnReadNextLine(char* pszLine, int* piLength) { -    if (feof(m_fpIn) == true) return false; - -    int iLen; - -    pszLine[0] = '\0'; - -    fgets(pszLine, 10001, m_fpIn); -    iLen = strlen(pszLine); -    if (iLen == 0) return false; -    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { -      pszLine[iLen - 1] = '\0'; -      iLen--; -    } - -    if (piLength != NULL) (*piLength) = iLen; - -    return true; -  } - -  bool fnReadNextLine(std::string& strLine) { -    char* pszLine = new char[10001]; -    bool bOut = fnReadNextLine(pszLine, NULL); -    if (bOut) -      strLine = std::string(pszLine); -    else -      strLine = std::string(""); -    delete[] pszLine; - -    return bOut; -  } - - private: -  FILE* m_fpIn; -}; - -struct SGZFileReader : public SFReader { -  SGZFileReader(const char* pszFname) { -    m_fpIn = gzopen(pszFname, "r"); -    assert(m_fpIn != NULL); -  } -  ~SGZFileReader() { -    if (m_fpIn != NULL) gzclose(m_fpIn); -  } - -  bool fnReadNextLine(char* pszLine, int* piLength) { -    if (m_fpIn == NULL) exit(0); -    if (gzeof(m_fpIn) == true) return false; - -    int iLen; - -    pszLine[0] = '\0'; - -    gzgets(m_fpIn, pszLine, 10001); -    iLen = strlen(pszLine); -    while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { -      pszLine[iLen - 1] = '\0'; -      iLen--; -    } - -    if (piLength != NULL) (*piLength) = iLen; - -    return true; -  } - -  bool fnReadNextLine(std::string& strLine) { -    char* pszLine = new char[10001]; -    bool bOut = fnReadNextLine(pszLine, NULL); -    if (bOut) -      strLine = std::string(pszLine); -    else -      strLine = std::string(""); -    delete[] pszLine; - -    return bOut; -  } - - private: -  gzFile m_fpIn; -}; - -#endif /* UTILITY_H_ */ diff --git a/utils/tsuruoka_maxent.h b/utils/tsuruoka_maxent.h index 550a4b7f..82da44ff 100644 --- a/utils/tsuruoka_maxent.h +++ b/utils/tsuruoka_maxent.h @@ -13,7 +13,6 @@  #include <utility>  #include <vector> -#include "synutils.h"  #include "stringlib.h"  #include "maxent.h"  | 
