summary refs log tree commit diff
path: root/utils
diff options
context:
space:
mode:
author Wu, Ke <wuke@cs.umd.edu> 2014-12-06 10:37:56 -0500
committer Wu, Ke <wuke@cs.umd.edu> 2014-12-06 10:37:56 -0500
commit f690860fab19bf7ce48861368f0c7e868e50d3a1 (patch)
tree a23a7791002eee10a4027260af9e6487d7f83700 /utils
parent 4444b842ef12f31e3aede13950b553f9a5bcb2d6 (diff)
parent e39742355186ebf59a369b2e5b369648047ca216 (diff)
Merge branch 'const_reorder' into softsyn
Diffstat (limited to 'utils')
-rw-r--r-- utils/argument_reorder_model.cc 126
-rw-r--r-- utils/constituent_reorder_model.cc 138
-rw-r--r-- utils/synutils.h 123
-rw-r--r-- utils/tsuruoka_maxent.h 1
4 files changed, 112 insertions, 276 deletions
diff --git a/utils/argument_reorder_model.cc b/utils/argument_reorder_model.cc
index 5caf318f..c4e90cba 100644
--- a/utils/argument_reorder_model.cc
+++ b/utils/argument_reorder_model.cc
@@ -12,60 +12,49 @@
#include <string>
#include <vector>
+#include "filelib.h"
+
#include "argument_reorder_model.h"
-#include "synutils.h"
#include "tsuruoka_maxent.h"
using namespace std;
inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,
const char* pszNewFName) {
- SFReader* pFReader = new STxtFileReader(pszFName);
- char* pszLine = new char[100001];
- int iLen;
Map hashPredicate;
- while (pFReader->fnReadNextLine(pszLine, &iLen)) {
- if (iLen == 0) continue;
-
- vector<string> vecTerms;
- SplitOnWhitespace(string(pszLine), &vecTerms);
-
- for (size_t i = 0; i < vecTerms.size() - 1; i++) {
- Iterator iter = hashPredicate.find(vecTerms[i]);
- if (iter == hashPredicate.end()) {
- hashPredicate[vecTerms[i]] = 1;
-
- } else {
- iter->second++;
+ {
+ ReadFile in(pszFName);
+ string line;
+ while (getline(*in.stream(), line)) {
+ if (!line.size()) continue;
+ vector<string> terms;
+ SplitOnWhitespace(line, &terms);
+ for (const auto& i : terms) {
+ ++hashPredicate[i];
}
}
}
- delete pFReader;
-
- pFReader = new STxtFileReader(pszFName);
- FILE* fpOut = fopen(pszNewFName, "w");
- while (pFReader->fnReadNextLine(pszLine, &iLen)) {
- if (iLen == 0) continue;
-
- vector<string> vecTerms;
- SplitOnWhitespace(string(pszLine), &vecTerms);
- ostringstream ostr;
- for (size_t i = 0; i < vecTerms.size() - 1; i++) {
- Iterator iter = hashPredicate.find(vecTerms[i]);
- assert(iter != hashPredicate.end());
- if (iter->second >= iCutoff) {
- ostr << vecTerms[i] << " ";
+
+ {
+ ReadFile in(pszFName);
+ WriteFile out(pszNewFName);
+ string line;
+ while (getline(*in.stream(), line)) {
+ if (!line.size()) continue;
+ vector<string> terms;
+ SplitOnWhitespace(line, &terms);
+ bool written = false;
+ for (const auto& i : terms) {
+ if (hashPredicate[i] >= iCutoff) {
+ (*out.stream()) << i << " ";
+ written = true;
+ }
+ }
+ if (written) {
+ (*out.stream()) << "\n";
}
- }
- if (ostr.str().length() > 0) {
- ostr << vecTerms[vecTerms.size() - 1];
- fprintf(fpOut, "%s\n", ostr.str().c_str());
}
}
- fclose(fpOut);
- delete pFReader;
-
- delete[] pszLine;
}
struct SArgumentReorderTrainer {
@@ -127,8 +116,8 @@ struct SArgumentReorderTrainer {
) {
SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
SSrlSentenceReader* pSRLReader = new SSrlSentenceReader(pszSRLFname);
- STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
+ ReadFile source_file(pszSourceFname);
+ ReadFile target_file(pszTargetFname);
Map* pMapPredicate;
if (pszTopPredicateFname != NULL)
@@ -136,13 +125,10 @@ struct SArgumentReorderTrainer {
else
pMapPredicate = NULL;
- char* pszLine = new char[50001];
+ string line;
- FILE* fpLeftOut, *fpRightOut;
- sprintf(pszLine, "%s.left", pszInstanceFname);
- fpLeftOut = fopen(pszLine, "w");
- sprintf(pszLine, "%s.right", pszInstanceFname);
- fpRightOut = fopen(pszLine, "w");
+ WriteFile left_file(pszInstanceFname + string(".left"));
+ WriteFile right_file(pszInstanceFname + string(".right"));
// read sentence by sentence
SAlignment* pAlign;
@@ -153,12 +139,12 @@ struct SArgumentReorderTrainer {
pSRL = pSRLReader->fnReadNextSrlSentence();
assert(pSRL != NULL);
pTree = pSRL->m_pTree;
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
+ assert(getline(*source_file.stream(), line));
vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
+ SplitOnWhitespace(line, &vecSTerms);
+ assert(getline(*target_file.stream(), line));
vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
+ SplitOnWhitespace(line, &vecTTerms);
// vecTPOSTerms.size() == 0, given the case when an english sentence fails
// parsing
@@ -204,10 +190,10 @@ struct SArgumentReorderTrainer {
// strOutcome.c_str());
// fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(),
// strOutcome.c_str());
- fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(),
- strLeftOutcome.c_str());
- fprintf(fpRightOut, "%s %s\n", ostr.str().c_str(),
- strRightOutcome.c_str());
+ (*left_file.stream()) << ostr.str() << " " << strLeftOutcome
+ << "\n";
+ (*right_file.stream()) << ostr.str() << " " << strRightOutcome
+ << "\n";
}
}
}
@@ -218,36 +204,28 @@ struct SArgumentReorderTrainer {
if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
}
- delete[] pszLine;
-
- fclose(fpLeftOut);
- fclose(fpRightOut);
delete pAlignReader;
delete pSRLReader;
- delete pTxtSReader;
- delete pTxtTReader;
}
Map* fnLoadTopPredicates(const char* pszTopPredicateFname) {
if (pszTopPredicateFname == NULL) return NULL;
Map* pMapPredicate = new Map();
- STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname);
- char* pszLine = new char[50001];
+ // STxtFileReader* pReader = new STxtFileReader(pszTopPredicateFname);
+ ReadFile in(pszTopPredicateFname);
+ // char* pszLine = new char[50001];
+ string line;
int iNumCount = 0;
- while (pReader->fnReadNextLine(pszLine, NULL)) {
- if (pszLine[0] == '#') continue;
- char* p = strchr(pszLine, ' ');
- assert(p != NULL);
- p[0] = '\0';
- p++;
- int iCount = atoi(p);
+ while (getline(*in.stream(), line)) {
+ if (line.size() && line[0] == '#') continue;
+ auto p = line.find(' ');
+ assert(p != string::npos);
+ int iCount = atoi(line.substr(p + 1).c_str());
if (iCount < 100) break;
- (*pMapPredicate)[string(pszLine)] = iNumCount++;
+ (*pMapPredicate)[line] = iNumCount++;
}
- delete pReader;
- delete[] pszLine;
return pMapPredicate;
}
};
diff --git a/utils/constituent_reorder_model.cc b/utils/constituent_reorder_model.cc
index df75a1a0..bdb7c5d1 100644
--- a/utils/constituent_reorder_model.cc
+++ b/utils/constituent_reorder_model.cc
@@ -5,15 +5,17 @@
* Author: junhuili
*/
+#include <string>
+#include <unordered_map>
+
#include <boost/program_options.hpp>
+#include "filelib.h"
+
#include "alignment.h"
#include "tree.h"
-#include "synutils.h"
#include "tsuruoka_maxent.h"
-#include <unordered_map>
-
using namespace std;
typedef std::unordered_map<std::string, int> Map;
@@ -23,52 +25,40 @@ namespace po = boost::program_options;
inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,
const char* pszNewFName) {
- SFReader* pFReader = new STxtFileReader(pszFName);
- char* pszLine = new char[100001];
- int iLen;
Map hashPredicate;
- while (pFReader->fnReadNextLine(pszLine, &iLen)) {
- if (iLen == 0) continue;
-
- vector<string> vecTerms;
- SplitOnWhitespace(string(pszLine), &vecTerms);
-
- for (size_t i = 0; i < vecTerms.size() - 1; i++) {
- Iterator iter = hashPredicate.find(vecTerms[i]);
- if (iter == hashPredicate.end()) {
- hashPredicate[vecTerms[i]] = 1;
-
- } else {
- iter->second++;
+ {
+ ReadFile f(pszFName);
+ string line;
+ while (getline(*f.stream(), line)) {
+ if (!line.size()) continue;
+ vector<string> terms;
+ SplitOnWhitespace(line, &terms);
+ for (const auto& i : terms) {
+ ++hashPredicate[i];
}
}
}
- delete pFReader;
-
- pFReader = new STxtFileReader(pszFName);
- FILE* fpOut = fopen(pszNewFName, "w");
- while (pFReader->fnReadNextLine(pszLine, &iLen)) {
- if (iLen == 0) continue;
-
- vector<string> vecTerms;
- SplitOnWhitespace(string(pszLine), &vecTerms);
- ostringstream ostr;
- for (size_t i = 0; i < vecTerms.size() - 1; i++) {
- Iterator iter = hashPredicate.find(vecTerms[i]);
- assert(iter != hashPredicate.end());
- if (iter->second >= iCutoff) {
- ostr << vecTerms[i] << " ";
+
+ {
+ ReadFile in(pszFName);
+ WriteFile out(pszNewFName);
+ string line;
+ while (getline(*in.stream(), line)) {
+ if (!line.size()) continue;
+ vector<string> terms;
+ SplitOnWhitespace(line, &terms);
+ bool written = false;
+ for (const auto& i : terms) {
+ if (hashPredicate[i] >= iCutoff) {
+ (*out.stream()) << i << " ";
+ written = true;
+ }
+ }
+ if (written) {
+ (*out.stream()) << "\n";
}
- }
- if (ostr.str().length() > 0) {
- ostr << vecTerms[vecTerms.size() - 1];
- fprintf(fpOut, "%s\n", ostr.str().c_str());
}
}
- fclose(fpOut);
- delete pFReader;
-
- delete[] pszLine;
}
struct SConstReorderTrainer {
@@ -408,31 +398,29 @@ delete pZhangleMaxent;*/
) {
SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
SParseReader* pParseReader = new SParseReader(pszSynFname, false);
- STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
+ ReadFile source_file(pszSourceFname);
+ ReadFile target_file(pszTargetFname);
string strInstanceLeftFname = string(pszInstanceFname) + string(".left");
string strInstanceRightFname = string(pszInstanceFname) + string(".right");
-
- FILE* fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w");
- assert(fpLeftOut != NULL);
-
- FILE* fpRightOut = fopen(strInstanceRightFname.c_str(), "w");
- assert(fpRightOut != NULL);
+ WriteFile left_file(strInstanceLeftFname);
+ WriteFile right_file(strInstanceRightFname);
// read sentence by sentence
SAlignment* pAlign;
SParsedTree* pTree;
- char* pszLine = new char[50001];
+ string line;
int iSentNum = 0;
while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
pTree = pParseReader->fnReadNextParseTree();
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
+
+ assert(getline(*source_file.stream(), line));
vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
+ SplitOnWhitespace(line, &vecSTerms);
+
+ assert(getline(*target_file.stream(), line));
vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
+ SplitOnWhitespace(line, &vecTTerms);
if (pTree != NULL) {
@@ -475,16 +463,18 @@ delete pZhangleMaxent;*/
vecLeftPosition, vecSTerms, vecTTerms,
strLeftOutcome, ostr);
+ string ostr_str = ostr.str();
+
// fprintf(stderr, "%s %s\n", ostr.str().c_str(),
// strLeftOutcome.c_str());
- fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(),
- strLeftOutcome.c_str());
+ (*left_file.stream()) << ostr_str << " " << strLeftOutcome << "\n";
string strRightOutcome;
fnGetOutcome(vecRightPosition[j - 1], vecRightPosition[j],
strRightOutcome);
- fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(),
- strLeftOutcome.c_str(), strRightOutcome.c_str());
+ (*right_file.stream()) << ostr_str
+ << " LeftOrder=" << strLeftOutcome << " "
+ << strRightOutcome << "\n";
}
}
delete pTree;
@@ -496,13 +486,8 @@ delete pZhangleMaxent;*/
if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
}
- fclose(fpLeftOut);
- fclose(fpRightOut);
delete pAlignReader;
delete pParseReader;
- delete pTxtSReader;
- delete pTxtTReader;
- delete[] pszLine;
}
void fnGenerateInstanceFile2(
@@ -514,25 +499,26 @@ delete pZhangleMaxent;*/
) {
SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
SParseReader* pParseReader = new SParseReader(pszSynFname, false);
- STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
- FILE* fpOut = fopen(pszInstanceFname, "w");
- assert(fpOut != NULL);
+ ReadFile source_file(pszSourceFname);
+ ReadFile target_file(pszTargetFname);
+
+ WriteFile output_file(pszInstanceFname);
// read sentence by sentence
SAlignment* pAlign;
SParsedTree* pTree;
- char* pszLine = new char[50001];
+ string line;
int iSentNum = 0;
while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
pTree = pParseReader->fnReadNextParseTree();
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
+ assert(getline(*source_file.stream(), line));
vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
+ SplitOnWhitespace(line, &vecSTerms);
+
+ assert(getline(*target_file.stream(), line));
vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
+ SplitOnWhitespace(line, &vecTTerms);
if (pTree != NULL) {
@@ -556,7 +542,7 @@ delete pZhangleMaxent;*/
// fprintf(stderr, "%s %s\n", ostr.str().c_str(),
// strOutcome.c_str());
- fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
+ (*output_file.stream()) << ostr.str() << " " << strOutcome << "\n";
}
}
delete pTree;
@@ -568,12 +554,8 @@ delete pZhangleMaxent;*/
if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
}
- fclose(fpOut);
delete pAlignReader;
delete pParseReader;
- delete pTxtSReader;
- delete pTxtTReader;
- delete[] pszLine;
}
};
diff --git a/utils/synutils.h b/utils/synutils.h
deleted file mode 100644
index f611553e..00000000
--- a/utils/synutils.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * utility.h
- *
- * Created on: Jun 24, 2013
- * Author: lijunhui
- */
-
-#ifndef UTILITY_H_
-#define UTILITY_H_
-
-#include <zlib.h>
-#include <stdio.h>
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <string>
-#include <unordered_map>
-
-typedef std::unordered_map<std::string, int> MapString2Int;
-typedef std::unordered_map<std::string, float> MapString2Float;
-typedef std::unordered_map<std::string, float>::iterator
- MapString2FloatIterator;
-
-struct SFReader {
- SFReader() {}
- virtual ~SFReader() {}
-
- virtual bool fnReadNextLine(char* pszLine, int* piLength) = 0;
- virtual bool fnReadNextLine(std::string& strLine) = 0;
-};
-
-struct STxtFileReader : public SFReader {
- STxtFileReader(const char* pszFname) {
- m_fpIn = fopen(pszFname, "r");
- assert(m_fpIn != NULL);
- }
- ~STxtFileReader() {
- if (m_fpIn != NULL) fclose(m_fpIn);
- }
-
- bool fnReadNextLine(char* pszLine, int* piLength) {
- if (feof(m_fpIn) == true) return false;
-
- int iLen;
-
- pszLine[0] = '\0';
-
- fgets(pszLine, 10001, m_fpIn);
- iLen = strlen(pszLine);
- if (iLen == 0) return false;
- while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) {
- pszLine[iLen - 1] = '\0';
- iLen--;
- }
-
- if (piLength != NULL) (*piLength) = iLen;
-
- return true;
- }
-
- bool fnReadNextLine(std::string& strLine) {
- char* pszLine = new char[10001];
- bool bOut = fnReadNextLine(pszLine, NULL);
- if (bOut)
- strLine = std::string(pszLine);
- else
- strLine = std::string("");
- delete[] pszLine;
-
- return bOut;
- }
-
- private:
- FILE* m_fpIn;
-};
-
-struct SGZFileReader : public SFReader {
- SGZFileReader(const char* pszFname) {
- m_fpIn = gzopen(pszFname, "r");
- assert(m_fpIn != NULL);
- }
- ~SGZFileReader() {
- if (m_fpIn != NULL) gzclose(m_fpIn);
- }
-
- bool fnReadNextLine(char* pszLine, int* piLength) {
- if (m_fpIn == NULL) exit(0);
- if (gzeof(m_fpIn) == true) return false;
-
- int iLen;
-
- pszLine[0] = '\0';
-
- gzgets(m_fpIn, pszLine, 10001);
- iLen = strlen(pszLine);
- while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen - 1] < 33) {
- pszLine[iLen - 1] = '\0';
- iLen--;
- }
-
- if (piLength != NULL) (*piLength) = iLen;
-
- return true;
- }
-
- bool fnReadNextLine(std::string& strLine) {
- char* pszLine = new char[10001];
- bool bOut = fnReadNextLine(pszLine, NULL);
- if (bOut)
- strLine = std::string(pszLine);
- else
- strLine = std::string("");
- delete[] pszLine;
-
- return bOut;
- }
-
- private:
- gzFile m_fpIn;
-};
-
-#endif /* UTILITY_H_ */
diff --git a/utils/tsuruoka_maxent.h b/utils/tsuruoka_maxent.h
index 550a4b7f..82da44ff 100644
--- a/utils/tsuruoka_maxent.h
+++ b/utils/tsuruoka_maxent.h
@@ -13,7 +13,6 @@
#include <utility>
#include <vector>
-#include "synutils.h"
#include "stringlib.h"
#include "maxent.h"