summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorWu, Ke <wuke@cs.umd.edu>2014-10-10 17:12:03 -0400
committerWu, Ke <wuke@cs.umd.edu>2014-10-10 17:12:03 -0400
commitcf7f7fdd156d4f9d1217aee9a4f419bf8e4eb80b (patch)
tree506e433f62ffec3a2129ed661821472990798232
parent6cb042533b49765c06f4fa072947304433182add (diff)
Trainers for reordering constraint models
-rw-r--r--.gitignore1
-rw-r--r--utils/Makefile.am4
-rw-r--r--utils/argument_reorder_model.cc6
-rw-r--r--utils/constituent_reorder_model.cc197
4 files changed, 13 insertions, 195 deletions
diff --git a/.gitignore b/.gitignore
index 4efcdd8d..c75328dc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -181,6 +181,7 @@ training/mr_reduce_to_weights
training/optimize_test
training/plftools
training/test_ngram
+utils/argument_reorder_model_trainer
utils/atools
utils/bin/
utils/const_reorder_model_trainer
diff --git a/utils/Makefile.am b/utils/Makefile.am
index 0bd21b2b..53967561 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -1,4 +1,4 @@
-bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer
+bin_PROGRAMS = reconstruct_weights atools const_reorder_model_trainer argument_reorder_model_trainer
noinst_PROGRAMS = \
ts \
@@ -108,6 +108,8 @@ atools_LDADD = libutils.a
atools_LDFLAGS = $(STATIC_FLAGS)
const_reorder_model_trainer_SOURCES = constituent_reorder_model.cc
const_reorder_model_trainer_LDADD = libutils.a
+argument_reorder_model_trainer_SOURCES = argument_reorder_model.cc
+argument_reorder_model_trainer_LDADD = libutils.a
phmt_SOURCES = phmt.cc
phmt_LDADD = libutils.a $(BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS) $(BOOST_UNIT_TEST_FRAMEWORK_LIBS)
diff --git a/utils/argument_reorder_model.cc b/utils/argument_reorder_model.cc
index 537dfb20..40647f8a 100644
--- a/utils/argument_reorder_model.cc
+++ b/utils/argument_reorder_model.cc
@@ -6,12 +6,18 @@
*/
#include <boost/program_options.hpp>
+#include <iostream>
#include <fstream>
+#include <sstream>
+#include <string>
+#include <vector>
#include "argument_reorder_model.h"
#include "synutils.h"
#include "tsuruoka_maxent.h"
+using namespace std;
+
inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff,
const char* pszNewFName) {
SFReader* pFReader = new STxtFileReader(pszFName);
diff --git a/utils/constituent_reorder_model.cc b/utils/constituent_reorder_model.cc
index d5c1cf5b..5a5c5546 100644
--- a/utils/constituent_reorder_model.cc
+++ b/utils/constituent_reorder_model.cc
@@ -79,7 +79,6 @@ struct SConstReorderTrainer {
const char* pszTargetFname, // target file name
const char* pszInstanceFname, // training instance file name
const char* pszModelPrefix, // classifier model file name prefix
- int iClassifierType, // classifier type
int iCutoff, // feature count threshold
const char* pszOption // other classifier parameters (for svmlight)
) {
@@ -578,193 +577,6 @@ delete pZhangleMaxent;*/
}
};
-struct SConstContTrainer {
- SConstContTrainer(
- const char* pszFlattenedSynFname, // source-side flattened parse tree
- // file name
- const char* pszAlignFname, // alignment filename
- const char* pszSourceFname, // source file name
- const char* pszTargetFname, // target file name
- const char* pszInstanceFname, // training instance file name
- const char* pszModelPrefix, // classifier model file name prefix
- int iClassifierType, // classifier type
- int iCutoff, // feature count threshold
- const char* pszOption // other classifier parameters (for svmlight)
- ) {
- fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname,
- pszTargetFname, pszInstanceFname);
- // fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff,
- // pszOption);
- fnTraining(pszInstanceFname, pszModelPrefix, iCutoff);
- }
- ~SConstContTrainer() {}
-
- private:
- void fnTraining(const char* pszInstanceFname, const char* pszModelFname,
- int iCutoff) {
- char* pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50];
- if (iCutoff > 0) {
- sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname);
- fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName);
- } else {
- strcpy(pszNewInstanceFName, pszInstanceFname);
- }
-
- /*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL);
- pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100,
- 2.0);
- delete pZhangleMaxent;*/
-
- Tsuruoka_Maxent* pMaxent = new Tsuruoka_Maxent(NULL);
- pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300);
- delete pMaxent;
-
- if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) {
- sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname);
- system(pszNewInstanceFName);
- }
- delete[] pszNewInstanceFName;
- }
-
- void fnGetFocusedParentNodes(const SParsedTree* pTree,
- vector<STreeItem*>& vecFocused) {
- for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) {
- STreeItem* pParent = pTree->m_vecTerminals[i]->m_ptParent;
-
- while (pParent != NULL) {
- // if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd -
- // pParent->m_iBegin > 5) {
- if (pParent->m_vecChildren.size() > 1) {
- // do constituent reordering for all children of pParent
- vecFocused.push_back(pParent);
- }
- if (pParent->m_iBrotherIndex != 0) break;
- pParent = pParent->m_ptParent;
- }
- }
- }
-
- inline void fnGetOutcome(int iL1, int iR1, const SAlignment* pAlign,
- string& strOutcome) {
- strOutcome = pAlign->fnIsContinuous(iL1, iR1);
- }
-
- inline string fnGetLengthType(int iLen) {
- if (iLen == 1) return string("1");
- if (iLen == 2) return string("2");
- if (iLen == 3) return string("3");
- if (iLen < 6) return string("4");
- if (iLen < 11) return string("6");
- return string("11");
- }
-
- /*
- * Source side (11 features):
- * f1: the syntactic category
- * f2: the syntactic category of its parent
- * f3: the head word's pos
- * f4: =1 if it's the head of its parent node
- * or
- * the head of its parent node
- * f5: length type
- */
- void fnGenerateInstance(const SParsedTree* pTree, const STreeItem* pCon1,
- const SAlignment* pAlign,
- const vector<string>& vecSTerms,
- const vector<string>& vecTTerms, string& strOutcome,
- ostringstream& ostr) {
-
- fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome);
-
- // generate features
- // f1
- ostr << "f1=" << pCon1->m_pszTerm;
- // f2
- ostr << " f2=" << pCon1->m_ptParent->m_pszTerm;
- // f3
- ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]
- ->m_ptParent->m_pszTerm;
- // f4
- if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) {
- ostr << " f4=1";
- } else {
- ostr << " f4="
- << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]
- ->m_pszTerm;
- }
- // f5
- ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1);
- }
-
- void fnGenerateInstanceFile(
- const char* pszFlattenedSynFname, // source-side flattened parse tree
- // file name
- const char* pszAlignFname, // alignment filename
- const char* pszSourceFname, // source file name
- const char* pszTargetFname, // target file name
- const char* pszInstanceFname // training instance file name
- ) {
- SAlignmentReader* pAlignReader = new SAlignmentReader(pszAlignFname);
- SParseReader* pParseReader = new SParseReader(pszFlattenedSynFname, true);
- STxtFileReader* pTxtSReader = new STxtFileReader(pszSourceFname);
- STxtFileReader* pTxtTReader = new STxtFileReader(pszTargetFname);
-
- FILE* fpOut = fopen(pszInstanceFname, "w");
- assert(fpOut != NULL);
-
- // read sentence by sentence
- SAlignment* pAlign;
- SParsedTree* pTree;
- char* pszLine = new char[50001];
- int iSentNum = 0;
- while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) {
- pTree = pParseReader->fnReadNextParseTree();
- assert(pTree != NULL);
- assert(pTxtSReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecSTerms;
- SplitOnWhitespace(string(pszLine), &vecSTerms);
- assert(pTxtTReader->fnReadNextLine(pszLine, NULL));
- vector<string> vecTTerms;
- SplitOnWhitespace(string(pszLine), &vecTTerms);
-
- vector<STreeItem*> vecFocused;
- fnGetFocusedParentNodes(pTree, vecFocused);
-
- for (size_t i = 0;
- i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) {
-
- STreeItem* pParent = vecFocused[i];
-
- for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) {
- // children[j-1] vs. children[j] reordering
-
- string strOutcome;
- ostringstream ostr;
-
- fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign,
- vecSTerms, vecTTerms, strOutcome, ostr);
-
- // fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str());
- }
- }
-
- delete pAlign;
- delete pTree;
- iSentNum++;
-
- if (iSentNum % 100000 == 0) fprintf(stderr, "#%d\n", iSentNum);
- }
-
- fclose(fpOut);
- delete pAlignReader;
- delete pParseReader;
- delete pTxtSReader;
- delete pTxtTReader;
- delete[] pszLine;
- }
-};
-
inline void print_options(std::ostream& out,
po::options_description const& opts) {
typedef std::vector<boost::shared_ptr<po::option_description> > Ds;
@@ -786,7 +598,6 @@ inline string str(char const* name, po::variables_map const& conf) {
/// scratch0/mt_exp/gq-ctb/data/train.en --instance_file
/// scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix
/// scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10
-//--classifier_type 1
int main(int argc, char** argv) {
po::options_description opts("Configuration options");
@@ -798,9 +609,7 @@ int main(int argc, char** argv) {
"instance_file", po::value<string>(), "Instance file path (output)")(
"model_prefix", po::value<string>(),
"Model file path prefix (output): three files will be generated")(
- "classifier_type", po::value<int>()->default_value(1),
- "Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for "
- "SVMLight")("feature_cutoff", po::value<int>()->default_value(100),
+ "feature_cutoff", po::value<int>()->default_value(100),
"Feature cutoff threshold")(
"svm_option", po::value<string>(), "Parameters for SVMLight classifier")(
"help", "produce help message");
@@ -839,8 +648,8 @@ int main(int argc, char** argv) {
str("parse_file", vm).c_str(), str("align_file", vm).c_str(),
str("source_file", vm).c_str(), str("target_file", vm).c_str(),
str("instance_file", vm).c_str(), str("model_prefix", vm).c_str(),
- vm["classifier_type"].as<int>(), vm["feature_cutoff"].as<int>(), pOption);
+ vm["feature_cutoff"].as<int>(), pOption);
delete pTrainer;
- return 1;
+ return 0;
}