summary | refs | log | tree | commit | diff
path: root/utils
diff options
context:
space:
mode:
Diffstat (limited to 'utils')
-rw-r--r-- utils/Makefile.am | 4
-rw-r--r-- utils/alignment.h | 18
-rw-r--r-- utils/argument_reorder_model.cc | 46
-rw-r--r-- utils/argument_reorder_model.h | 73
-rw-r--r-- utils/constituent_reorder_model.cc | 239
-rw-r--r-- utils/lbfgs.h | 3
-rw-r--r-- utils/maxent.cpp | 2
-rw-r--r-- utils/srl_sentence.h | 31
-rw-r--r-- utils/synutils.h | 18
-rw-r--r-- utils/tree.h | 55
-rw-r--r-- utils/tsuruoka_maxent.h | 49
11 files changed, 176 insertions, 362 deletions
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 0bd21b2b..53967561 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer
+bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer argument_reorder_model_trainer
noinst_PROGRAMS = \
ts \
@@ -108,6 +108,8 @@ atools_LDADD = libutils.a
atools_LDFLAGS = $(STATIC_FLAGS)
const_reorder_model_trainer_SOURCES = constituent_reorder_model.cc
const_reorder_model_trainer_LDADD = libutils.a
+argument_reorder_model_trainer_SOURCES = argument_reorder_model.cc
+argument_reorder_model_trainer_LDADD = libutils.a
phmt_SOURCES = phmt.cc
phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
diff --git a/utils/alignment.h b/utils/alignment.h
index c0648aab..456577ca 100644
--- a/utils/alignment.h
+++ b/utils/alignment.h
@@ -15,8 +15,6 @@
#include "stringlib.h"
-using namespace std;
-
/*
* Note:
* m_vec_s_align.size() may not be equal to the length of source side
@@ -25,12 +23,12 @@ using namespace std;
*
*/
struct SAlignment {
- typedef vector<int> SingleAlign;
+ typedef std::vector<int> SingleAlign;
SAlignment(const char* pszAlign) { fnInitializeAlignment(pszAlign); }
~SAlignment() {}
bool fnIsAligned(int i, bool s) const {
- const vector<SingleAlign>* palign;
+ const std::vector<SingleAlign>* palign;
if (s == true)
palign = &m_vec_s_align;
else
@@ -70,7 +68,7 @@ struct SAlignment {
}
bool fnIsAlignedTightPhrase(int b, int e, bool s, int* pob, int* poe) const {
- const vector<SingleAlign>* palign;
+ const std::vector<SingleAlign>* palign;
if (s == true)
palign = &m_vec_s_align;
else
@@ -97,7 +95,7 @@ struct SAlignment {
* aligned to any word outside source[b, e]
* 3) return "Discon't": otherwise;
*/
- string fnIsContinuous(int b, int e) const {
+ std::string fnIsContinuous(int b, int e) const {
int ob, oe;
fnGetLeftRightMost(b, e, true, ob, oe);
if (ob == -1) return "Unaligned";
@@ -124,7 +122,7 @@ struct SAlignment {
}
private:
- void fnGetLeftRightMost(int b, int e, const vector<SingleAlign>& align,
+ void fnGetLeftRightMost(int b, int e, const std::vector<SingleAlign>& align,
int& ob, int& oe) const {
ob = oe = -1;
for (int i = b; i <= e && i < align.size(); i++) {
@@ -139,7 +137,7 @@ struct SAlignment {
m_vec_s_align.clear();
m_vec_t_align.clear();
- vector<string> terms = SplitOnWhitespace(string(pszAlign));
+ std::vector<std::string> terms = SplitOnWhitespace(std::string(pszAlign));
int si, ti;
for (size_t i = 0; i < terms.size(); i++) {
sscanf(terms[i].c_str(), "%d-%d", &si, &ti);
@@ -167,8 +165,8 @@ struct SAlignment {
}
private:
- vector<SingleAlign> m_vec_s_align; // source side words' alignment
- vector<SingleAlign> m_vec_t_align; // target side words' alignment
+ std::vector<SingleAlign> m_vec_s_align; // source side words' alignment
+ std::vector<SingleAlign> m_vec_t_align; // target side words' alignment
};
struct SAlignmentReader {
diff --git a/utils/argument_reorder_model.cc b/utils/argument_reorder_model.cc
index 58886251..5caf318f 100644
--- a/utils/argument_reorder_model.cc
+++ b/utils/argument_reorder_model.cc
@@ -6,12 +6,18 @@
*/
#include <boost/program_options.hpp>
+#include <iostream>
#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
#include "argument_reorder_model.h"
#include "synutils.h"
#include "tsuruoka_maxent.h"
+using namespace std;
+
inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,
const char* pszNewFName) {
SFReader* pFReader = new STxtFileReader(pszFName);
@@ -64,13 +70,13 @@ inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,
struct SArgumentReorderTrainer {
SArgumentReorderTrainer(
- const char* pszSRLFname, // source-side srl tree file name
- const char* pszAlignFname, // alignment filename
- const char* pszSourceFname, // source file name
- const char* pszTargetFname, // target file name
+ const char* pszSRLFname, // source-side srl tree file name
+ const char* pszAlignFname, // alignment filename
+ const char* pszSourceFname, // source file name
+ const char* pszTargetFname, // target file name
const char* pszTopPredicateFname, // target file name
- const char* pszInstanceFname, // training instance file name
- const char* pszModelFname, // classifier model file name
+ const char* pszInstanceFname, // training instance file name
+ const char* pszModelFname, // classifier model file name
int iCutoff) {
fnGenerateInstanceFiles(pszSRLFname, pszAlignFname, pszSourceFname,
pszTargetFname, pszTopPredicateFname,
@@ -110,14 +116,14 @@ struct SArgumentReorderTrainer {
}
void fnGenerateInstanceFiles(
- const char* pszSRLFname, // source-side flattened parse tree file name
- const char* pszAlignFname, // alignment filename
+ const char* pszSRLFname, // source-side flattened parse tree file name
+ const char* pszAlignFname, // alignment filename
const char* pszSourceFname, // source file name
const char* pszTargetFname, // target file name
const char* pszTopPredicateFname, // top predicate file name (we only
// consider predicates with 100+
// occurrences
- const char* pszInstanceFname // training instance file name
+ const char* pszInstanceFname // training instance file name
) {
SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
SSrlSentenceReader* pSRLReader = new SSrlSentenceReader(pszSRLFname);
@@ -257,24 +263,24 @@ inline void print_options(std::ostream& out,
if (i) out << ' ';
out << "--" << ds[i]->long_name();
}
- out << '"\n';
+ out << '\n';
}
inline string str(char const* name, po::variables_map const& conf) {
return conf[name].as<string>();
}
//--srl_file /scratch0/mt_exp/gale-align/gale-align.nw.srl.cn --align_file
-///scratch0/mt_exp/gale-align/gale-align.nw.al --source_file
-///scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file
-///scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file
-///scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix
-///scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2
+// /scratch0/mt_exp/gale-align/gale-align.nw.al --source_file
+// /scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file
+// /scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file
+// /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix
+// /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2
//--srl_file /scratch0/mt_exp/gale-ctb/gale-ctb.srl.cn --align_file
-///scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file
-///scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file
-///scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file
-///scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix
-///scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2
+// /scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file
+// /scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file
+// /scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file
+// /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix
+// /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2
int main(int argc, char** argv) {
po::options_description opts("Configuration options");
diff --git a/utils/argument_reorder_model.h b/utils/argument_reorder_model.h
index 062b8841..077fa5ba 100644
--- a/utils/argument_reorder_model.h
+++ b/utils/argument_reorder_model.h
@@ -8,17 +8,20 @@
#ifndef ARGUMENT_REORDER_MODEL_H_
#define ARGUMENT_REORDER_MODEL_H_
+#include <string>
+#include <vector>
+
#include "alignment.h"
#include "tree.h"
#include "srl_sentence.h"
// an argument item or a predicate item (the verb itself)
struct SSRLItem {
- SSRLItem(const STreeItem *tree_item, string role)
+ SSRLItem(const STreeItem *tree_item, std::string role)
: tree_item_(tree_item), role_(role) {}
~SSRLItem() {}
const STreeItem *tree_item_;
- const string role_;
+ const std::string role_;
};
struct SPredicateItem {
@@ -26,11 +29,13 @@ struct SPredicateItem {
: pred_(pred) {
vec_items_.reserve(pred->m_vecArgt.size() + 1);
for (int i = 0; i < pred->m_vecArgt.size(); i++) {
- vec_items_.push_back(new SSRLItem(pred->m_vecArgt[i]->m_pTreeItem,
- string(pred->m_vecArgt[i]->m_pszRole)));
+ vec_items_.push_back(
+ new SSRLItem(pred->m_vecArgt[i]->m_pTreeItem,
+ std::string(pred->m_vecArgt[i]->m_pszRole)));
}
- vec_items_.push_back(new SSRLItem(
- tree->m_vecTerminals[pred->m_iPosition]->m_ptParent, string("Pred")));
+ vec_items_.push_back(
+ new SSRLItem(tree->m_vecTerminals[pred->m_iPosition]->m_ptParent,
+ std::string("Pred")));
sort(vec_items_.begin(), vec_items_.end(), SortFunction);
begin_ = vec_items_[0]->tree_item_->m_iBegin;
@@ -43,7 +48,7 @@ struct SPredicateItem {
return (i->tree_item_->m_iBegin < j->tree_item_->m_iBegin);
}
- vector<SSRLItem *> vec_items_;
+ std::vector<SSRLItem *> vec_items_;
int begin_;
int end_;
const SPredicate *pred_;
@@ -51,13 +56,14 @@ struct SPredicateItem {
struct SArgumentReorderModel {
public:
- static string fnGetBlockOutcome(int iBegin, int iEnd, SAlignment *pAlign) {
+ static std::string fnGetBlockOutcome(int iBegin, int iEnd,
+ SAlignment *pAlign) {
return pAlign->fnIsContinuous(iBegin, iEnd);
}
static void fnGetReorderType(SPredicateItem *pPredItem, SAlignment *pAlign,
- vector<string> &vecStrLeftReorder,
- vector<string> &vecStrRightReorder) {
- vector<int> vecLeft, vecRight;
+ std::vector<std::string> &vecStrLeftReorder,
+ std::vector<std::string> &vecStrRightReorder) {
+ std::vector<int> vecLeft, vecRight;
for (int i = 0; i < pPredItem->vec_items_.size(); i++) {
const STreeItem *pCon1 = pPredItem->vec_items_[i]->tree_item_;
int iLeft1, iRight1;
@@ -66,15 +72,15 @@ struct SArgumentReorderModel {
vecLeft.push_back(iLeft1);
vecRight.push_back(iRight1);
}
- vector<int> vecLeftPosition;
+ std::vector<int> vecLeftPosition;
fnGetRelativePosition(vecLeft, vecLeftPosition);
- vector<int> vecRightPosition;
+ std::vector<int> vecRightPosition;
fnGetRelativePosition(vecRight, vecRightPosition);
vecStrLeftReorder.clear();
vecStrRightReorder.clear();
for (int i = 1; i < vecLeftPosition.size(); i++) {
- string strOutcome;
+ std::string strOutcome;
fnGetOutcome(vecLeftPosition[i - 1], vecLeftPosition[i], strOutcome);
vecStrLeftReorder.push_back(strOutcome);
fnGetOutcome(vecRightPosition[i - 1], vecRightPosition[i], strOutcome);
@@ -115,32 +121,33 @@ struct SArgumentReorderModel {
static void fnGenerateFeature(const SParsedTree *pTree,
const SPredicate *pPred,
const SPredicateItem *pPredItem, int iPos,
- const string &strBlock1,
- const string &strBlock2, ostringstream &ostr) {
+ const std::string &strBlock1,
+ const std::string &strBlock2,
+ std::ostringstream &ostr) {
SSRLItem *pSRLItem1 = pPredItem->vec_items_[iPos - 1];
SSRLItem *pSRLItem2 = pPredItem->vec_items_[iPos];
const STreeItem *pCon1 = pSRLItem1->tree_item_;
const STreeItem *pCon2 = pSRLItem2->tree_item_;
- string left_role = pSRLItem1->role_;
- string right_role = pSRLItem2->role_;
+ std::string left_role = pSRLItem1->role_;
+ std::string right_role = pSRLItem2->role_;
- string predicate_term =
+ std::string predicate_term =
pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm;
- vector<string> vec_other_right_sibling;
+ std::vector<std::string> vec_other_right_sibling;
for (int i = iPos + 1; i < pPredItem->vec_items_.size(); i++)
vec_other_right_sibling.push_back(
- string(pPredItem->vec_items_[i]->role_));
+ std::string(pPredItem->vec_items_[i]->role_));
if (vec_other_right_sibling.size() == 0)
- vec_other_right_sibling.push_back(string("NULL"));
+ vec_other_right_sibling.push_back(std::string("NULL"));
- vector<string> vec_other_left_sibling;
+ std::vector<std::string> vec_other_left_sibling;
for (int i = 0; i < iPos - 1; i++)
vec_other_right_sibling.push_back(
- string(pPredItem->vec_items_[i]->role_));
+ std::string(pPredItem->vec_items_[i]->role_));
if (vec_other_left_sibling.size() == 0)
- vec_other_left_sibling.push_back(string("NULL"));
+ vec_other_left_sibling.push_back(std::string("NULL"));
// generate features
// f1
@@ -190,26 +197,26 @@ struct SArgumentReorderModel {
}
private:
- static void fnGetOutcome(int i1, int i2, string &strOutcome) {
+ static void fnGetOutcome(int i1, int i2, std::string &strOutcome) {
assert(i1 != i2);
if (i1 < i2) {
if (i2 > i1 + 1)
- strOutcome = string("DM");
+ strOutcome = std::string("DM");
else
- strOutcome = string("M");
+ strOutcome = std::string("M");
} else {
if (i1 > i2 + 1)
- strOutcome = string("DS");
+ strOutcome = std::string("DS");
else
- strOutcome = string("S");
+ strOutcome = std::string("S");
}
}
- static void fnGetRelativePosition(const vector<int> &vecLeft,
- vector<int> &vecPosition) {
+ static void fnGetRelativePosition(const std::vector<int> &vecLeft,
+ std::vector<int> &vecPosition) {
vecPosition.clear();
- vector<float> vec;
+ std::vector<float> vec;
for (int i = 0; i < vecLeft.size(); i++) {
if (vecLeft[i] == -1) {
if (i == 0)
diff --git a/utils/constituent_reorder_model.cc b/utils/constituent_reorder_model.cc
index 042c751b..df75a1a0 100644
--- a/utils/constituent_reorder_model.cc
+++ b/utils/constituent_reorder_model.cc
@@ -73,15 +73,14 @@ inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,
struct SConstReorderTrainer {
SConstReorderTrainer(
- const char* pszSynFname, // source-side flattened parse tree file name
- const char* pszAlignFname, // alignment filename
+ const char* pszSynFname, // source-side flattened parse tree file name
+ const char* pszAlignFname, // alignment filename
const char* pszSourceFname, // source file name
const char* pszTargetFname, // target file name
const char* pszInstanceFname, // training instance file name
- const char* pszModelPrefix, // classifier model file name prefix
- int iClassifierType, // classifier type
- int iCutoff, // feature count threshold
- const char* pszOption // other classifier parameters (for svmlight)
+ const char* pszModelPrefix, // classifier model file name prefix
+ int iCutoff, // feature count threshold
+ const char* /*pszOption*/ // other classifier parameters (for svmlight)
) {
fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname,
pszTargetFname, pszInstanceFname);
@@ -135,14 +134,14 @@ delete pZhangleMaxent;*/
}
inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2,
- const SAlignment* pAlign, string& strOutcome) {
+ const SAlignment* /*pAlign*/, string& strOutcome) {
if (iL1 == -1 && iL2 == -1)
strOutcome = "BU"; // 1. both are untranslated
else if (iL1 == -1)
strOutcome = "1U"; // 2. XP1 is untranslated
else if (iL2 == -1)
strOutcome = "2U"; // 3. XP2 is untranslated
- else if (iL1 == iL2 && iR2 == iR2)
+ else if (iL1 == iL2 && iR1 == iR2)
strOutcome = "SS"; // 4. Have same scope
else if (iL1 <= iL2 && iR1 >= iR2)
strOutcome = "1C2"; // 5. XP1's translation covers XP2's
@@ -241,7 +240,7 @@ delete pZhangleMaxent;*/
int iPos, const vector<string>& vecChunkStatus,
const vector<int>& vecPosition,
const vector<string>& vecSTerms,
- const vector<string>& vecTTerms, string& strOutcome,
+ const vector<string>& /*vecTTerms*/, string& strOutcome,
ostringstream& ostr) {
STreeItem* pCon1, *pCon2;
pCon1 = pParent->m_vecChildren[iPos - 1];
@@ -314,11 +313,11 @@ delete pZhangleMaxent;*/
* f8: the first and the last word of XP2's translation (f8_f, f8_l)
* f9: the translation of XP1's and XP2's head word (f9_1, f9_2)
*/
- void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pParent,
+ void fnGenerateInstance(const SParsedTree* /*pTree*/, const STreeItem* pParent,
const STreeItem* pCon1, const STreeItem* pCon2,
const SAlignment* pAlign,
const vector<string>& vecSTerms,
- const vector<string>& vecTTerms, string& strOutcome,
+ const vector<string>& /*vecTTerms*/, string& strOutcome,
ostringstream& ostr) {
int iLeft1, iRight1, iLeft2, iRight2;
@@ -401,8 +400,8 @@ delete pZhangleMaxent;*/
}
void fnGenerateInstanceFile(
- const char* pszSynFname, // source-side flattened parse tree file name
- const char* pszAlignFname, // alignment filename
+ const char* pszSynFname, // source-side flattened parse tree file name
+ const char* pszAlignFname, // alignment filename
const char* pszSourceFname, // source file name
const char* pszTargetFname, // target file name
const char* pszInstanceFname // training instance file name
@@ -507,8 +506,8 @@ delete pZhangleMaxent;*/
}
void fnGenerateInstanceFile2(
- const char* pszSynFname, // source-side flattened parse tree file name
- const char* pszAlignFname, // alignment filename
+ const char* pszSynFname, // source-side flattened parse tree file name
+ const char* pszAlignFname, // alignment filename
const char* pszSourceFname, // source file name
const char* pszTargetFname, // target file name
const char* pszInstanceFname // training instance file name
@@ -578,193 +577,6 @@ delete pZhangleMaxent;*/
}
};
-struct SConstContTrainer {
- SConstContTrainer(
- const char* pszFlattenedSynFname, // source-side flattened parse tree
- // file name
- const char* pszAlignFname, // alignment filename
- const char* pszSourceFname, // source file name
- const char* pszTargetFname, // target file name
- const char* pszInstanceFname, // training instance file name
- const char* pszModelPrefix, // classifier model file name prefix
- int iClassifierType, // classifier type
- int iCutoff, // feature count threshold
- const char* pszOption // other classifier parameters (for svmlight)
- ) {
- fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname,
- pszTargetFname, pszInstanceFname);
- // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff,
- // pszOption);
- fnTraining(pszInstanceFname, pszModelPrefix, iCutoff);
- }
- ~SConstContTrainer() {}
-
- private:
- void fnTraining(const char* pszInstanceFname, const char* pszModelFname,
- int iCutoff) {
- char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50];
- if (iCutoff > 0) {
- sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname);
- fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName);
- } else {
- strcpy(pszNewInstanceFName, pszInstanceFname);
- }
-
- /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL);
- pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100,
- 2.0);
- delete pZhangleMaxent;*/
-
- Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL);
- pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300);
- delete pMaxent;
-
- if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) {
- sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname);
- system(pszNewInstanceFName);
- }
- delete[] pszNewInstanceFName;
- }
-
- void fnGetFocusedParentNodes(const SParsedTree* pTree,
- vector<STreeItem*>& vecFocused) {
- for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) {
- STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent;
-
- while (pParent != NULL) {
- // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd -
- // pParent->m_iBegin > 5) {
- if (pParent->m_vecChildren.size() > 1) {
- // do constituent reordering for all children of pParent
- vecFocused.push_back(pParent);
- }
- if (pParent->m_iBrotherIndex != 0) break;
- pParent = pParent->m_ptParent;
- }
- }
- }
-
- inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign,
- string& strOutcome) {
- strOutcome = pAlign->fnIsContinuous(iL1, iR1);
- }
-
- inline string fnGetLengthType(int iLen) {
- if (iLen == 1) return string("1");
- if (iLen == 2) return string("2");
- if (iLen == 3) return string("3");
- if (iLen < 6) return string("4");
- if (iLen < 11) return string("6");
- return string("11");
- }
-
- /*
- * Source side (11 features):
- * f1: the syntactic category
- * f2: the syntactic category of its parent
- * f3: the head word's pos
- * f4: =1 if it's the head of its parent node
- * or
- * the head of its parent node
- * f5: length type
- */
- void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1,
- const SAlignment* pAlign,
- const vector<string>& vecSTerms,
- const vector<string>& vecTTerms, string& strOutcome,
- ostringstream& ostr) {
-
- fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome);
-
- // generate features
- // f1
- ostr << "f1=" << pCon1->m_pszTerm;
- // f2
- ostr << " f2=" << pCon1->m_ptParent->m_pszTerm;
- // f3
- ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]
- ->m_ptParent->m_pszTerm;
- // f4
- if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) {
- ostr << " f4=1";
- } else {
- ostr << " f4="
- << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]
- ->m_pszTerm;
- }
- // f5
- ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1);
- }
-
- void fnGenerateInstanceFile(
- const char* pszFlattenedSynFname, // source-side flattened parse tree
- // file name
- const char* pszAlignFname, // alignment filename
- const char* pszSourceFname, // source file name
- const char* pszTargetFname, // target file name
- const char* pszInstanceFname // training instance file name
- ) {
- SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
- SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true);
- STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
-
- FILE* fpOut = fopen(pszInstanceFname, "w");
- assert(fpOut != NULL);
-
- // read sentence by sentence
- SAlignment* pAlign;
- SParsedTree* pTree;
- char* pszLine = new char[50001];
- int iSentNum = 0;
- while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
- pTree = pParseReader->fnReadNextParseTree();
- assert(pTree != NULL);
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
-
- vector<STreeItem*> vecFocused;
- fnGetFocusedParentNodes(pTree, vecFocused);
-
- for (size_t i = 0;
- i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) {
-
- STreeItem* pParent = vecFocused[i];
-
- for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
- // children[j-1] vs. children[j] reordering
-
- string strOutcome;
- ostringstream ostr;
-
- fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign,
- vecSTerms, vecTTerms, strOutcome, ostr);
-
- // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- }
- }
-
- delete pAlign;
- delete pTree;
- iSentNum++;
-
- if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
- }
-
- fclose(fpOut);
- delete pAlignReader;
- delete pParseReader;
- delete pTxtSReader;
- delete pTxtTReader;
- delete[] pszLine;
- }
-};
-
inline void print_options(std::ostream& out,
po::options_description const& opts) {
typedef std::vector<boost::shared_ptr<po::option_description> > Ds;
@@ -781,12 +593,11 @@ inline string str(char const* name, po::variables_map const& conf) {
}
//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file
-///scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file
-///scratch0/mt_exp/gq-ctb/data/train.cn --target_file
-///scratch0/mt_exp/gq-ctb/data/train.en --instance_file
-///scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix
-///scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10
-//--classifier_type 1
+// /scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file
+// /scratch0/mt_exp/gq-ctb/data/train.cn --target_file
+// /scratch0/mt_exp/gq-ctb/data/train.en --instance_file
+// /scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix
+// /scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10
int main(int argc, char** argv) {
po::options_description opts("Configuration options");
@@ -798,11 +609,9 @@ int main(int argc, char** argv) {
"instance_file", po::value<string>(), "Instance file path (output)")(
"model_prefix", po::value<string>(),
"Model file path prefix (output): three files will be generated")(
- "classifier_type", po::value<int>()->default_value(1),
- "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for "
- "SVMLight")("feature_cutoff", po::value<int>()->default_value(100),
- "Feature cutoff threshold")(
- "svm_option", po::value<string>(), "Parameters for SVMLight classifier")(
+ "feature_cutoff", po::value<int>()->default_value(100),
+ "Feature cutoff threshold")("svm_option", po::value<string>(),
+ "Parameters for SVMLight classifier")(
"help", "produce help message");
po::variables_map vm;
@@ -839,8 +648,8 @@ int main(int argc, char** argv) {
str("parse_file", vm).c_str(), str("align_file", vm).c_str(),
str("source_file", vm).c_str(), str("target_file", vm).c_str(),
str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(),
- vm["classifier_type"].as<int>(), vm["feature_cutoff"].as<int>(), pOption);
+ vm["feature_cutoff"].as<int>(), pOption);
delete pTrainer;
- return 1;
+ return 0;
}
diff --git a/utils/lbfgs.h b/utils/lbfgs.h
index ed5cd944..4d706f7a 100644
--- a/utils/lbfgs.h
+++ b/utils/lbfgs.h
@@ -1,6 +1,8 @@
#ifndef _LBFGS_H_
#define _LBFGS_H_
+#include <vector>
+
// template<class FuncGrad>
// std::vector<double>
// perform_LBFGS(FuncGrad func_grad, const std::vector<double> & x0);
@@ -13,7 +15,6 @@ std::vector<double> perform_OWLQN(
double (*func_grad)(const std::vector<double> &, std::vector<double> &),
const std::vector<double> &x0, const double C);
-// const int LBFGS_M = 7;
const int LBFGS_M = 10;
#endif
diff --git a/utils/maxent.cpp b/utils/maxent.cpp
index 9115f6f2..0f49ee9d 100644
--- a/utils/maxent.cpp
+++ b/utils/maxent.cpp
@@ -142,7 +142,7 @@ int ME_Model::make_feature_bag(const int cutoff) {
// count the occurrences of features
#ifdef USE_HASH_MAP
- typedef __gnu_cxx::hash_map<unsigned int, int> map_type;
+ typedef std::unordered_map<unsigned int, int> map_type;
#else
typedef std::map<unsigned int, int> map_type;
#endif
diff --git a/utils/srl_sentence.h b/utils/srl_sentence.h
index 61532fb2..9d509600 100644
--- a/utils/srl_sentence.h
+++ b/utils/srl_sentence.h
@@ -8,15 +8,12 @@
#ifndef SRL_SENTENCE_H_
#define SRL_SENTENCE_H_
-
#include <sstream>
#include <vector>
#include "tree.h"
#include "stringlib.h"
-using namespace std;
-
struct SArgument {
SArgument(const char* pszRole, int iBegin, int iEnd, float fProb) {
m_pszRole = new char[strlen(pszRole) + 1];
@@ -38,7 +35,7 @@ struct SArgument {
char* m_pszRole; // argument rule, e.g., ARG0, ARGM-TMP
int m_iBegin;
- int m_iEnd; // the span of the argument, [m_iBegin, m_iEnd]
+ int m_iEnd; // the span of the argument, [m_iBegin, m_iEnd]
float m_fProb; // the probability of this role,
STreeItem* m_pTreeItem;
};
@@ -68,8 +65,8 @@ struct SPredicate {
char* m_pszLemma; // lemma of the predicate, for Chinese, it's always as same
// as the predicate itself
- int m_iPosition; // the position in sentence
- vector<SArgument*> m_vecArgt; // arguments associated to the predicate
+ int m_iPosition; // the position in sentence
+ std::vector<SArgument*> m_vecArgt; // arguments associated to the predicate
};
struct SSrlSentence {
@@ -91,7 +88,7 @@ struct SSrlSentence {
int GetPredicateNum() { return m_vecPred.size(); }
SParsedTree* m_pTree;
- vector<SPredicate*> m_vecPred;
+ std::vector<SPredicate*> m_vecPred;
};
struct SSrlSentenceReader {
@@ -116,7 +113,7 @@ struct SSrlSentenceReader {
// TODO: here only considers flat predicate-argument structure
// i.e., no overlap among them
SSrlSentence* fnReadNextSrlSentence() {
- vector<vector<string> > vecContent;
+ std::vector<std::vector<std::string> > vecContent;
if (fnReadNextContent(vecContent) == false) return NULL;
SSrlSentence* pSrlSentence = new SSrlSentence();
@@ -124,18 +121,18 @@ struct SSrlSentenceReader {
// put together syntactic text
std::ostringstream ostr;
for (int i = 0; i < iSize; i++) {
- string strSynSeg =
+ std::string strSynSeg =
vecContent[i][5]; // the 5th column is the syntactic segment
size_t iPosition = strSynSeg.find_first_of('*');
- assert(iPosition != string::npos);
- ostringstream ostrTmp;
+ assert(iPosition != std::string::npos);
+ std::ostringstream ostrTmp;
ostrTmp << "(" << vecContent[i][2] << " " << vecContent[i][0]
<< ")"; // the 2th column is POS-tag, and the 0th column is word
strSynSeg.replace(iPosition, 1, ostrTmp.str());
fnReplaceAll(strSynSeg, "(", " (");
ostr << strSynSeg;
}
- string strSyn = ostr.str();
+ std::string strSyn = ostr.str();
pSrlSentence->m_pTree = SParsedTree::fnConvertFromString(strSyn.c_str());
pSrlSentence->m_pTree->fnSetHeadWord();
pSrlSentence->m_pTree->fnSetSpanInfo();
@@ -143,9 +140,9 @@ struct SSrlSentenceReader {
// read predicate-argument structure
int iNumPred = vecContent[0].size() - 8;
for (int i = 0; i < iNumPred; i++) {
- vector<string> vecRole;
- vector<int> vecBegin;
- vector<int> vecEnd;
+ std::vector<std::string> vecRole;
+ std::vector<int> vecBegin;
+ std::vector<int> vecEnd;
int iPred = -1;
for (int j = 0; j < iSize; j++) {
const char* p = vecContent[j][i + 8].c_str();
@@ -184,7 +181,7 @@ struct SSrlSentenceReader {
}
private:
- bool fnReadNextContent(vector<vector<string> >& vecContent) {
+ bool fnReadNextContent(std::vector<std::vector<std::string> >& vecContent) {
vecContent.clear();
if (feof(m_fpIn) == true) return false;
char* pszLine;
@@ -200,7 +197,7 @@ struct SSrlSentenceReader {
}
if (iLen == 0) break; // end of this sentence
- vector<string> terms = SplitOnWhitespace(string(pszLine));
+ std::vector<std::string> terms = SplitOnWhitespace(std::string(pszLine));
assert(terms.size() > 7);
vecContent.push_back(terms);
}
diff --git a/utils/synutils.h b/utils/synutils.h
index ef7b78b7..f611553e 100644
--- a/utils/synutils.h
+++ b/utils/synutils.h
@@ -17,21 +17,17 @@
#include <string>
#include <unordered_map>
-using namespace std;
-
typedef std::unordered_map<std::string, int> MapString2Int;
typedef std::unordered_map<std::string, float> MapString2Float;
typedef std::unordered_map<std::string, float>::iterator
MapString2FloatIterator;
-using namespace std;
-
struct SFReader {
SFReader() {}
virtual ~SFReader() {}
virtual bool fnReadNextLine(char* pszLine, int* piLength) = 0;
- virtual bool fnReadNextLine(string& strLine) = 0;
+ virtual bool fnReadNextLine(std::string& strLine) = 0;
};
struct STxtFileReader : public SFReader {
@@ -63,13 +59,13 @@ struct STxtFileReader : public SFReader {
return true;
}
- bool fnReadNextLine(string& strLine) {
+ bool fnReadNextLine(std::string& strLine) {
char* pszLine = new char[10001];
bool bOut = fnReadNextLine(pszLine, NULL);
if (bOut)
- strLine = string(pszLine);
+ strLine = std::string(pszLine);
else
- strLine = string("");
+ strLine = std::string("");
delete[] pszLine;
return bOut;
@@ -108,13 +104,13 @@ struct SGZFileReader : public SFReader {
return true;
}
- bool fnReadNextLine(string& strLine) {
+ bool fnReadNextLine(std::string& strLine) {
char* pszLine = new char[10001];
bool bOut = fnReadNextLine(pszLine, NULL);
if (bOut)
- strLine = string(pszLine);
+ strLine = std::string(pszLine);
else
- strLine = string("");
+ strLine = std::string("");
delete[] pszLine;
return bOut;
diff --git a/utils/tree.h b/utils/tree.h
index 8070f828..6c3406d6 100644
--- a/utils/tree.h
+++ b/utils/tree.h
@@ -14,8 +14,6 @@
#include <string>
#include <vector>
-using namespace std;
-
struct STreeItem {
STreeItem(const char *pszTerm) {
m_pszTerm = new char[strlen(pszTerm) + 1];
@@ -53,18 +51,18 @@ struct STreeItem {
public:
char *m_pszTerm;
- vector<STreeItem *> m_vecChildren; // children items
- STreeItem *m_ptParent; // the parent item
+ std::vector<STreeItem *> m_vecChildren; // children items
+ STreeItem *m_ptParent; // the parent item
int m_iBegin;
- int m_iEnd; // the node span words[m_iBegin, m_iEnd]
- int m_iHeadChild; // the index of its head child
- int m_iHeadWord; // the index of its head word
+ int m_iEnd; // the node span words[m_iBegin, m_iEnd]
+ int m_iHeadChild; // the index of its head child
+ int m_iHeadWord; // the index of its head word
int m_iBrotherIndex; // the index in his brothers
};
struct SGetHeadWord {
- typedef vector<string> CVectorStr;
+ typedef std::vector<std::string> CVectorStr;
SGetHeadWord() {}
~SGetHeadWord() {}
int fnGetHeadWord(char *pszCFGLeft, CVectorStr vectRight) {
@@ -311,7 +309,7 @@ struct SParsedTree {
if (strcmp(pszStr, "(())") == 0) return NULL;
SParsedTree *pTree = new SParsedTree();
- vector<string> vecSyn;
+ std::vector<std::string> vecSyn;
fnReadSyntactic(pszStr, vecSyn);
int iLeft = 1, iRight = 1; //# left/right parenthesis
@@ -418,13 +416,13 @@ struct SParsedTree {
for (I = 0; I < ptItem->m_vecChildren.size(); I++)
fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[I], pGetHeadWord);
- vector<string> vecRight;
+ std::vector<std::string> vecRight;
if (ptItem->m_vecChildren.size() == 1)
iHeadchild = 0;
else {
for (I = 0; I < ptItem->m_vecChildren.size(); I++)
- vecRight.push_back(string(ptItem->m_vecChildren[I]->m_pszTerm));
+ vecRight.push_back(std::string(ptItem->m_vecChildren[I]->m_pszTerm));
iHeadchild = pGetHeadWord->fnGetHeadWord(ptItem->m_pszTerm, vecRight);
}
@@ -433,7 +431,8 @@ struct SParsedTree {
ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord;
}
- static void fnReadSyntactic(const char *pszSyn, vector<string> &vec) {
+ static void fnReadSyntactic(const char *pszSyn,
+ std::vector<std::string> &vec) {
char *p;
int I;
@@ -481,29 +480,29 @@ struct SParsedTree {
if ((pszTerm[0] == '(') || (pszTerm[strlen(pszTerm) - 1] == ')')) {
if (pszTerm[0] == '(') {
- vec.push_back(string("("));
+ vec.push_back(std::string("("));
iLeftNum++;
I = 1;
while (pszTerm[I] == '(' && pszTerm[I] != '\0') {
- vec.push_back(string("("));
+ vec.push_back(std::string("("));
iLeftNum++;
I++;
}
- if (strlen(pszTerm) > 1) vec.push_back(string(pszTerm + I));
+ if (strlen(pszTerm) > 1) vec.push_back(std::string(pszTerm + I));
} else {
char *pTmp;
pTmp = pszTerm + strlen(pszTerm) - 1;
while ((pTmp[0] == ')') && (pTmp >= pszTerm)) pTmp--;
pTmp[1] = '\0';
- if (strlen(pszTerm) > 0) vec.push_back(string(pszTerm));
+ if (strlen(pszTerm) > 0) vec.push_back(std::string(pszTerm));
pTmp += 2;
for (I = 0; I <= (int)strlen(pTmp); I++) {
- vec.push_back(string(")"));
+ vec.push_back(std::string(")"));
iRightNum++;
}
}
@@ -512,26 +511,26 @@ struct SParsedTree {
q = strchr(pszTerm, ')');
if (q != NULL) {
q[0] = '\0';
- if (pszTerm[0] != '\0') vec.push_back(string(pszTerm));
- vec.push_back(string(")"));
+ if (pszTerm[0] != '\0') vec.push_back(std::string(pszTerm));
+ vec.push_back(std::string(")"));
iRightNum++;
q++;
while (q[0] == ')') {
- vec.push_back(string(")"));
+ vec.push_back(std::string(")"));
q++;
iRightNum++;
}
while (q[0] == '(') {
- vec.push_back(string("("));
+ vec.push_back(std::string("("));
q++;
iLeftNum++;
}
- if (q[0] != '\0') vec.push_back(string(q));
+ if (q[0] != '\0') vec.push_back(std::string(q));
} else
- vec.push_back(string(pszTerm));
+ vec.push_back(std::string(pszTerm));
}
}
@@ -547,10 +546,10 @@ struct SParsedTree {
if (vec.size() >= 2 && strcmp(vec[1].c_str(), "(") == 0) {
//( (IP..) )
- std::vector<string>::iterator it;
+ std::vector<std::string>::iterator it;
it = vec.begin();
it++;
- vec.insert(it, string("ROOT"));
+ vec.insert(it, std::string("ROOT"));
}
break;
@@ -563,7 +562,7 @@ struct SParsedTree {
public:
STreeItem *m_ptRoot;
- vector<STreeItem *> m_vecTerminals; // the leaf nodes
+ std::vector<STreeItem *> m_vecTerminals; // the leaf nodes
};
struct SParseReader {
@@ -645,7 +644,7 @@ struct SParseReader {
for (size_t i = 0; i < pTreeItem->m_vecChildren.size(); i++)
fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]);
- vector<string> vecRight;
+ std::vector<std::string> vecRight;
int iHeadchild;
@@ -658,7 +657,7 @@ struct SParseReader {
if (p[0] == '*' && p[strlen(p) - 1] == '*') {
iHeadchild = i;
p[strlen(p) - 1] = '\0';
- string str = p + 1;
+ std::string str = p + 1;
strcpy(p, str.c_str()); // erase the "*..*"
break;
}
diff --git a/utils/tsuruoka_maxent.h b/utils/tsuruoka_maxent.h
index e6bef232..550a4b7f 100644
--- a/utils/tsuruoka_maxent.h
+++ b/utils/tsuruoka_maxent.h
@@ -6,17 +6,16 @@
#ifndef TSURUOKA_MAXENT_H_
#define TSURUOKA_MAXENT_H_
-#include "synutils.h"
-#include "stringlib.h"
-#include "maxent.h"
-
#include <assert.h>
-#include <vector>
-#include <string>
#include <string.h>
+#include <string>
#include <unordered_map>
+#include <utility>
+#include <vector>
-using namespace std;
+#include "synutils.h"
+#include "stringlib.h"
+#include "maxent.h"
typedef std::unordered_map<std::string, int> Map;
typedef std::unordered_map<std::string, int>::iterator Iterator;
@@ -35,7 +34,7 @@ struct Tsuruoka_Maxent {
}
void fnTrain(const char* pszInstanceFName, const char* pszAlgorithm,
- const char* pszModelFName, int iNumIteration) {
+ const char* pszModelFName, int /*iNumIteration*/) {
assert(strcmp(pszAlgorithm, "l1") == 0 || strcmp(pszAlgorithm, "l2") == 0 ||
strcmp(pszAlgorithm, "sgd") == 0 ||
strcmp(pszAlgorithm, "SGD") == 0);
@@ -67,10 +66,10 @@ struct Tsuruoka_Maxent {
assert(p != NULL);
p[0] = '\0';
p++;
- vector<string> vecContext;
- SplitOnWhitespace(string(pszLine), &vecContext);
+ std::vector<std::string> vecContext;
+ SplitOnWhitespace(std::string(pszLine), &vecContext);
- pmes->label = string(p);
+ pmes->label = std::string(p);
for (size_t i = 0; i < vecContext.size(); i++)
pmes->add_feature(vecContext[i]);
pModel->add_training_sample((*pmes));
@@ -98,53 +97,53 @@ struct Tsuruoka_Maxent {
}
double fnEval(const char* pszContext, const char* pszOutcome) const {
- vector<string> vecContext;
+ std::vector<std::string> vecContext;
ME_Sample* pmes = new ME_Sample();
- SplitOnWhitespace(string(pszContext), &vecContext);
+ SplitOnWhitespace(std::string(pszContext), &vecContext);
for (size_t i = 0; i < vecContext.size(); i++)
pmes->add_feature(vecContext[i]);
- vector<double> vecProb = m_pModel->classify(*pmes);
+ std::vector<double> vecProb = m_pModel->classify(*pmes);
delete pmes;
int iLableID = m_pModel->get_class_id(pszOutcome);
return vecProb[iLableID];
}
void fnEval(const char* pszContext,
- vector<pair<string, double> >& vecOutput) const {
- vector<string> vecContext;
+ std::vector<std::pair<std::string, double> >& vecOutput) const {
+ std::vector<std::string> vecContext;
ME_Sample* pmes = new ME_Sample();
- SplitOnWhitespace(string(pszContext), &vecContext);
+ SplitOnWhitespace(std::string(pszContext), &vecContext);
vecOutput.clear();
for (size_t i = 0; i < vecContext.size(); i++)
pmes->add_feature(vecContext[i]);
- vector<double> vecProb = m_pModel->classify(*pmes);
+ std::vector<double> vecProb = m_pModel->classify(*pmes);
for (size_t i = 0; i < vecProb.size(); i++) {
- string label = m_pModel->get_class_label(i);
+ std::string label = m_pModel->get_class_label(i);
vecOutput.push_back(make_pair(label, vecProb[i]));
}
delete pmes;
}
- void fnEval(const char* pszContext, vector<double>& vecOutput) const {
- vector<string> vecContext;
+ void fnEval(const char* pszContext, std::vector<double>& vecOutput) const {
+ std::vector<std::string> vecContext;
ME_Sample* pmes = new ME_Sample();
- SplitOnWhitespace(string(pszContext), &vecContext);
+ SplitOnWhitespace(std::string(pszContext), &vecContext);
vecOutput.clear();
for (size_t i = 0; i < vecContext.size(); i++)
pmes->add_feature(vecContext[i]);
- vector<double> vecProb = m_pModel->classify(*pmes);
+ std::vector<double> vecProb = m_pModel->classify(*pmes);
for (size_t i = 0; i < vecProb.size(); i++) {
- string label = m_pModel->get_class_label(i);
+ std::string label = m_pModel->get_class_label(i);
vecOutput.push_back(vecProb[i]);
}
delete pmes;
}
- int fnGetClassId(const string& strLabel) const {
+ int fnGetClassId(const std::string& strLabel) const {
return m_pModel->get_class_id(strLabel);
}