diff options
author | Wu, Ke <wuke@cs.umd.edu> | 2014-12-06 10:37:56 -0500 |
---|---|---|
committer | Wu, Ke <wuke@cs.umd.edu> | 2014-12-06 10:37:56 -0500 |
commit | 41bf2308139d08c992a3342154d1c8b96b44f681 (patch) | |
tree | a4e3bd820b4923151299588d74ed256d4e65472c /utils | |
parent | 34b7c1e7c3aa5f9ee780be65effc40726d849303 (diff) | |
parent | a21959213f9b1cc15befae52dbb5091e848de7a1 (diff) |
Merge branch 'const_reorder' into softsyn
Diffstat (limited to 'utils')
-rw-r--r-- | utils/argument_reorder_model.cc | 126 | ||||
-rw-r--r-- | utils/constituent_reorder_model.cc | 138 | ||||
-rw-r--r-- | utils/synutils.h | 123 | ||||
-rw-r--r-- | utils/tsuruoka_maxent.h | 1 |
4 files changed, 112 insertions, 276 deletions
diff --git a/utils/argument_reorder_model.cc b/utils/argument_reorder_model.cc index 5caf318f..c4e90cba 100644 --- a/utils/argument_reorder_model.cc +++ b/utils/argument_reorder_model.cc @@ -12,60 +12,49 @@ #include <string> #include <vector> +#include "filelib.h" + #include "argument_reorder_model.h" -#include "synutils.h" #include "tsuruoka_maxent.h" using namespace std; inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { - SFReader* pFReader = new STxtFileReader(pszFName); - char* pszLine = new char[100001]; - int iLen; Map hashPredicate; - while (pFReader->fnReadNextLine(pszLine, &iLen)) { - if (iLen == 0) continue; - - vector<string> vecTerms; - SplitOnWhitespace(string(pszLine), &vecTerms); - - for (size_t i = 0; i < vecTerms.size() - 1; i++) { - Iterator iter = hashPredicate.find(vecTerms[i]); - if (iter == hashPredicate.end()) { - hashPredicate[vecTerms[i]] = 1; - - } else { - iter->second++; + { + ReadFile in(pszFName); + string line; + while (getline(*in.stream(), line)) { + if (!line.size()) continue; + vector<string> terms; + SplitOnWhitespace(line, &terms); + for (const auto& i : terms) { + ++hashPredicate[i]; } } } - delete pFReader; - - pFReader = new STxtFileReader(pszFName); - FILE* fpOut = fopen(pszNewFName, "w"); - while (pFReader->fnReadNextLine(pszLine, &iLen)) { - if (iLen == 0) continue; - - vector<string> vecTerms; - SplitOnWhitespace(string(pszLine), &vecTerms); - ostringstream ostr; - for (size_t i = 0; i < vecTerms.size() - 1; i++) { - Iterator iter = hashPredicate.find(vecTerms[i]); - assert(iter != hashPredicate.end()); - if (iter->second >= iCutoff) { - ostr << vecTerms[i] << " "; + + { + ReadFile in(pszFName); + WriteFile out(pszNewFName); + string line; + while (getline(*in.stream(), line)) { + if (!line.size()) continue; + vector<string> terms; + SplitOnWhitespace(line, &terms); + bool written = false; + for (const auto& i : terms) { + if (hashPredicate[i] >= iCutoff) { + (*out.stream()) << i << " "; + written = true; + } + } + if (written) { + (*out.stream()) << "\n"; } - } - if (ostr.str().length() > 0) { - ostr << vecTerms[vecTerms.size() - 1]; - fprintf(fpOut, "%s\n", ostr.str().c_str()); } } - fclose(fpOut); - delete pFReader; - - delete[] pszLine; } struct SArgumentReorderTrainer { @@ -127,8 +116,8 @@ struct SArgumentReorderTrainer { ) { SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); SSrlSentenceReader* pSRLReader = new SSrlSentenceReader(pszSRLFname); - STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + ReadFile source_file(pszSourceFname); + ReadFile target_file(pszTargetFname); Map* pMapPredicate; if (pszTopPredicateFname != NULL) @@ -136,13 +125,10 @@ struct SArgumentReorderTrainer { else pMapPredicate = NULL; - char* pszLine = new char[50001]; + string line; - FILE* fpLeftOut, *fpRightOut; - sprintf(pszLine, "%s.left", pszInstanceFname); - fpLeftOut = fopen(pszLine, "w"); - sprintf(pszLine, "%s.right", pszInstanceFname); - fpRightOut = fopen(pszLine, "w"); + WriteFile left_file(pszInstanceFname + string(".left")); + WriteFile right_file(pszInstanceFname + string(".right")); // read sentence by sentence SAlignment* pAlign; @@ -153,12 +139,12 @@ struct SArgumentReorderTrainer { pSRL = pSRLReader->fnReadNextSrlSentence(); assert(pSRL != NULL); pTree = pSRL->m_pTree; - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + assert(getline(*source_file.stream(), line)); vector<string> vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + SplitOnWhitespace(line, &vecSTerms); + assert(getline(*target_file.stream(), line)); vector<string> vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); + SplitOnWhitespace(line, &vecTTerms); // vecTPOSTerms.size() == 0, given the case when an english sentence fails // parsing @@ -204,10 +190,10 @@ struct SArgumentReorderTrainer { // strOutcome.c_str()); // fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(), // strOutcome.c_str()); - fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), - strLeftOutcome.c_str()); - fprintf(fpRightOut, "%s %s\n", ostr.str().c_str(), - strRightOutcome.c_str()); + (*left_file.stream()) << ostr.str() << " " << strLeftOutcome + << "\n"; + (*right_file.stream()) << ostr.str() << " " << strRightOutcome + << "\n"; } } } @@ -218,36 +204,28 @@ struct SArgumentReorderTrainer { if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); } - delete[] pszLine; - - fclose(fpLeftOut); - fclose(fpRightOut); delete pAlignReader; delete pSRLReader; - delete pTxtSReader; - delete pTxtTReader; } Map* fnLoadTopPredicates(const char* pszTopPredicateFname) { if (pszTopPredicateFname == NULL) return NULL; Map* pMapPredicate = new Map(); - STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname); - char* pszLine = new char[50001]; + // STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname); + ReadFile in(pszTopPredicateFname); + // char* pszLine = new char[50001]; + string line; int iNumCount = 0; - while (pReader->fnReadNextLine(pszLine, NULL)) { - if (pszLine[0] == '#') continue; - char* p = strchr(pszLine, ' '); - assert(p != NULL); - p[0] = '\0'; - p++; - int iCount = atoi(p); + while (getline(*in.stream(), line)) { + if (line.size() && line[0] == '#') continue; + auto p = line.find(' '); + assert(p != string::npos); + int iCount = atoi(line.substr(p + 1).c_str()); if (iCount < 100) break; - (*pMapPredicate)[string(pszLine)] = iNumCount++; + (*pMapPredicate)[line] = iNumCount++; } - delete pReader; - delete[] pszLine; return pMapPredicate; } }; diff --git a/utils/constituent_reorder_model.cc b/utils/constituent_reorder_model.cc index df75a1a0..bdb7c5d1 100644 --- a/utils/constituent_reorder_model.cc +++ b/utils/constituent_reorder_model.cc @@ -5,15 +5,17 @@ * Author: junhuili */ +#include <string> +#include <unordered_map> + #include <boost/program_options.hpp> +#include "filelib.h" + #include "alignment.h" #include "tree.h" -#include "synutils.h" #include "tsuruoka_maxent.h" -#include <unordered_map> - using namespace std; typedef std::unordered_map<std::string, int> Map; @@ -23,52 +25,40 @@ namespace po = boost::program_options; inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { - SFReader* pFReader = new STxtFileReader(pszFName); - char* pszLine = new char[100001]; - int iLen; Map hashPredicate; - while (pFReader->fnReadNextLine(pszLine, &iLen)) { - if (iLen == 0) continue; - - vector<string> vecTerms; - SplitOnWhitespace(string(pszLine), &vecTerms); - - for (size_t i = 0; i < vecTerms.size() - 1; i++) { - Iterator iter = hashPredicate.find(vecTerms[i]); - if (iter == hashPredicate.end()) { - hashPredicate[vecTerms[i]] = 1; - - } else { - iter->second++; + { + ReadFile f(pszFName); + string line; + while (getline(*f.stream(), line)) { + if (!line.size()) continue; + vector<string> terms; + SplitOnWhitespace(line, &terms); + for (const auto& i : terms) { + ++hashPredicate[i]; } } } - delete pFReader; - - pFReader = new STxtFileReader(pszFName); - FILE* fpOut = fopen(pszNewFName, "w"); - while (pFReader->fnReadNextLine(pszLine, &iLen)) { - if (iLen == 0) continue; - - vector<string> vecTerms; - SplitOnWhitespace(string(pszLine), &vecTerms); - ostringstream ostr; - for (size_t i = 0; i < vecTerms.size() - 1; i++) { - Iterator iter = hashPredicate.find(vecTerms[i]); - assert(iter != hashPredicate.end()); - if (iter->second >= iCutoff) { - ostr << vecTerms[i] << " "; + + { + ReadFile in(pszFName); + WriteFile out(pszNewFName); + string line; + while (getline(*in.stream(), line)) { + if (!line.size()) continue; + vector<string> terms; + SplitOnWhitespace(line, &terms); + bool written = false; + for (const auto& i : terms) { + if (hashPredicate[i] >= iCutoff) { + (*out.stream()) << i << " "; + written = true; + } + } + if (written) { + (*out.stream()) << "\n"; } - } - if (ostr.str().length() > 0) { - ostr << vecTerms[vecTerms.size() - 1]; - fprintf(fpOut, "%s\n", ostr.str().c_str()); } } - fclose(fpOut); - delete pFReader; - - delete[] pszLine; } struct SConstReorderTrainer { @@ -408,31 +398,29 @@ delete pZhangleMaxent;*/ ) { SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); SParseReader* pParseReader = new SParseReader(pszSynFname, false); - STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); + ReadFile source_file(pszSourceFname); + ReadFile target_file(pszTargetFname); string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); string strInstanceRightFname = string(pszInstanceFname) + string(".right"); - - FILE* fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); - assert(fpLeftOut != NULL); - - FILE* fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); - assert(fpRightOut != NULL); + WriteFile left_file(strInstanceLeftFname); + WriteFile right_file(strInstanceRightFname); // read sentence by sentence SAlignment* pAlign; SParsedTree* pTree; - char* pszLine = new char[50001]; + string line; int iSentNum = 0; while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { pTree = pParseReader->fnReadNextParseTree(); - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + + assert(getline(*source_file.stream(), line)); vector<string> vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + SplitOnWhitespace(line, &vecSTerms); + + assert(getline(*target_file.stream(), line)); vector<string> vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); + SplitOnWhitespace(line, &vecTTerms); if (pTree != NULL) { @@ -475,16 +463,18 @@ delete pZhangleMaxent;*/ vecLeftPosition, vecSTerms, vecTTerms, strLeftOutcome, ostr); + string ostr_str = ostr.str(); + // fprintf(stderr, "%s %s\n", ostr.str().c_str(), // strLeftOutcome.c_str()); - fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), - strLeftOutcome.c_str()); + (*left_file.stream()) << ostr_str << " " << strLeftOutcome << "\n"; string strRightOutcome; fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j], strRightOutcome); - fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), - strLeftOutcome.c_str(), strRightOutcome.c_str()); + (*right_file.stream()) << ostr_str + << " LeftOrder=" << strLeftOutcome << " " + << strRightOutcome << "\n"; } } delete pTree; @@ -496,13 +486,8 @@ delete pZhangleMaxent;*/ if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); } - fclose(fpLeftOut); - fclose(fpRightOut); delete pAlignReader; delete pParseReader; - delete pTxtSReader; - delete pTxtTReader; - delete[] pszLine; } void fnGenerateInstanceFile2( @@ -514,25 +499,26 @@ delete pZhangleMaxent;*/ ) { SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname); SParseReader* pParseReader = new SParseReader(pszSynFname, false); - STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname); - STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname); - FILE* fpOut = fopen(pszInstanceFname, "w"); - assert(fpOut != NULL); + ReadFile source_file(pszSourceFname); + ReadFile target_file(pszTargetFname); + + WriteFile output_file(pszInstanceFname); // read sentence by sentence SAlignment* pAlign; SParsedTree* pTree; - char* pszLine = new char[50001]; + string line; int iSentNum = 0; while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { pTree = pParseReader->fnReadNextParseTree(); - assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); + assert(getline(*source_file.stream(), line)); vector<string> vecSTerms; - SplitOnWhitespace(string(pszLine), &vecSTerms); - assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); + SplitOnWhitespace(line, &vecSTerms); + + assert(getline(*target_file.stream(), line)); vector<string> vecTTerms; - SplitOnWhitespace(string(pszLine), &vecTTerms); + SplitOnWhitespace(line, &vecTTerms); if (pTree != NULL) { @@ -556,7 +542,7 @@ delete pZhangleMaxent;*/ // fprintf(stderr, "%s %s\n", ostr.str().c_str(), // strOutcome.c_str()); - fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); + (*output_file.stream()) << ostr.str() << " " << strOutcome << "\n"; } } delete pTree; @@ -568,12 +554,8 @@ delete pZhangleMaxent;*/ if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum); } - fclose(fpOut); delete pAlignReader; delete pParseReader; - delete pTxtSReader; - delete pTxtTReader; - delete[] pszLine; } }; diff --git a/utils/synutils.h b/utils/synutils.h deleted file mode 100644 index f611553e..00000000 --- a/utils/synutils.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * utility.h - * - * Created on: Jun 24, 2013 - * Author: lijunhui - */ - -#ifndef UTILITY_H_ -#define UTILITY_H_ - -#include <zlib.h> -#include <stdio.h> -#include <assert.h> -#include <stdlib.h> -#include <string.h> - -#include <string> -#include <unordered_map> - -typedef std::unordered_map<std::string, int> MapString2Int; -typedef std::unordered_map<std::string, float> MapString2Float; -typedef std::unordered_map<std::string, float>::iterator - MapString2FloatIterator; - -struct SFReader { - SFReader() {} - virtual ~SFReader() {} - - virtual bool fnReadNextLine(char* pszLine, int* piLength) = 0; - virtual bool fnReadNextLine(std::string& strLine) = 0; -}; - -struct STxtFileReader : public SFReader { - STxtFileReader(const char* pszFname) { - m_fpIn = fopen(pszFname, "r"); - assert(m_fpIn != NULL); - } - ~STxtFileReader() { - if (m_fpIn != NULL) fclose(m_fpIn); - } - - bool fnReadNextLine(char* pszLine, int* piLength) { - if (feof(m_fpIn) == true) return false; - - int iLen; - - pszLine[0] = '\0'; - - fgets(pszLine, 10001, m_fpIn); - iLen = strlen(pszLine); - if (iLen == 0) return false; - while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { - pszLine[iLen - 1] = '\0'; - iLen--; - } - - if (piLength != NULL) (*piLength) = iLen; - - return true; - } - - bool fnReadNextLine(std::string& strLine) { - char* pszLine = new char[10001]; - bool bOut = fnReadNextLine(pszLine, NULL); - if (bOut) - strLine = std::string(pszLine); - else - strLine = std::string(""); - delete[] pszLine; - - return bOut; - } - - private: - FILE* m_fpIn; -}; - -struct SGZFileReader : public SFReader { - SGZFileReader(const char* pszFname) { - m_fpIn = gzopen(pszFname, "r"); - assert(m_fpIn != NULL); - } - ~SGZFileReader() { - if (m_fpIn != NULL) gzclose(m_fpIn); - } - - bool fnReadNextLine(char* pszLine, int* piLength) { - if (m_fpIn == NULL) exit(0); - if (gzeof(m_fpIn) == true) return false; - - int iLen; - - pszLine[0] = '\0'; - - gzgets(m_fpIn, pszLine, 10001); - iLen = strlen(pszLine); - while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) { - pszLine[iLen - 1] = '\0'; - iLen--; - } - - if (piLength != NULL) (*piLength) = iLen; - - return true; - } - - bool fnReadNextLine(std::string& strLine) { - char* pszLine = new char[10001]; - bool bOut = fnReadNextLine(pszLine, NULL); - if (bOut) - strLine = std::string(pszLine); - else - strLine = std::string(""); - delete[] pszLine; - - return bOut; - } - - private: - gzFile m_fpIn; -}; - -#endif /* UTILITY_H_ */ diff --git a/utils/tsuruoka_maxent.h b/utils/tsuruoka_maxent.h index 550a4b7f..82da44ff 100644 --- a/utils/tsuruoka_maxent.h +++ b/utils/tsuruoka_maxent.h @@ -13,7 +13,6 @@ #include <utility> #include <vector> -#include "synutils.h" #include "stringlib.h" #include "maxent.h" |