diff options
| author | Wu, Ke <wuke@cs.umd.edu> | 2014-10-07 17:12:32 -0400 | 
|---|---|---|
| committer | Wu, Ke <wuke@cs.umd.edu> | 2014-10-07 17:12:32 -0400 | 
| commit | f762dbbf10a8204d0d0b82e9acb29feacd3b3bb4 (patch) | |
| tree | 5b775aaed0a060ec13626eb90dba583e59feddc6 /utils/synutils | |
| parent | d88186af251ecae60974b20395ce75807bfdda35 (diff) | |
Import synutils
Diffstat (limited to 'utils/synutils')
| -rw-r--r-- | utils/synutils/Makefile.am | 16 | ||||
| -rw-r--r-- | utils/synutils/alignment.h | 208 | ||||
| -rw-r--r-- | utils/synutils/argument_reorder_model.cc | 311 | ||||
| -rw-r--r-- | utils/synutils/argument_reorder_model.h | 222 | ||||
| -rw-r--r-- | utils/synutils/constituent_reorder_model.cc | 796 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/Makefile.am | 10 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/lbfgs.cpp | 110 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/lbfgs.h | 21 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/mathvec.h | 97 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/maxent.cpp | 698 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/maxent.h | 395 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/owlqn.cpp | 137 | ||||
| -rw-r--r-- | utils/synutils/maxent-3.0/sgd.cpp | 180 | ||||
| -rw-r--r-- | utils/synutils/srl_sentence.h | 215 | ||||
| -rw-r--r-- | utils/synutils/tree.h | 731 | ||||
| -rw-r--r-- | utils/synutils/tsuruoka_maxent.h | 160 | ||||
| -rw-r--r-- | utils/synutils/utility.h | 129 | 
17 files changed, 4436 insertions, 0 deletions
| diff --git a/utils/synutils/Makefile.am b/utils/synutils/Makefile.am new file mode 100644 index 00000000..f87f1f8a --- /dev/null +++ b/utils/synutils/Makefile.am @@ -0,0 +1,16 @@ +bin_PROGRAMS = const_reorder_model_trainer + +const_reorder_model_trainer_SOURCES = constituent_reorder_model.cc +const_reorder_model_trainer_LDADD = libfeature.a  maxent-3.0/libtsuruoka_maxent.a -lgfortran + +noinst_LIBRARIES = libfeature.a + +libfeature_a_SOURCES = \ +  alignment.h \ +  srl_sentence.h \ +  tree.h \ +  utility.h \ +  argument_reorder_model.h \ +  tsuruoka_maxent.h  +   +AM_CPPFLAGS = -W -Wall -I$(top_srcdir) -I$(top_srcdir)/utils -I$(top_srcdir)/synutils/maxent-3.0  diff --git a/utils/synutils/alignment.h b/utils/synutils/alignment.h new file mode 100644 index 00000000..f8fe1993 --- /dev/null +++ b/utils/synutils/alignment.h @@ -0,0 +1,208 @@ +/* + * alignment.h + * + *  Created on: May 23, 2013 + *      Author: lijunhui + */ + +#ifndef ALIGNMENT_H_ +#define ALIGNMENT_H_ + +#include <string> +#include <assert.h> +#include <stdio.h> +#include <string.h> + +#include "stringlib.h" + +using namespace std; + +/* + * Note: + *      m_vec_s_align.size() may not be equal to the length of source side sentence + *                           due to the last words may not be aligned + * + */ +struct SAlignment { +	typedef vector<int> SingleAlign; +	SAlignment(const char* pszAlign) { +		fnInitializeAlignment(pszAlign); +	} +	~SAlignment() { + +	} + +	bool fnIsAligned(int i, bool s) const { +		const vector<SingleAlign>* palign; +		if (s == true) +			palign = &m_vec_s_align; +		else +			palign = &m_vec_t_align; +		if ((*palign)[i].size() == 0) +			return false; +		return true; +	} + +	/* +	 * return true if [b, e] is aligned phrases on source side (if s==true) or on the target side (if s==false); +	 * return false, otherwise. +	 */ +	bool fnIsAlignedPhrase(int b, int e, bool s, int *pob, int *poe) const { +		int ob, oe; //[b, e] on the other side +		if (s == true) +			fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); +		else +			fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); + +		if (ob == -1) { +			if (pob != NULL) (*pob) = -1; +			if (poe != NULL) (*poe) = -1; +			return false; //no aligned word among [b, e] +		} +		if (pob != NULL) (*pob) = ob; +		if (poe != NULL) (*poe) = oe; + +		int bb, be; //[b, e] back given [ob, oe] on the other side +		if (s == true) +			fnGetLeftRightMost(ob, oe, m_vec_t_align, bb, be); +		else +			fnGetLeftRightMost(ob, oe, m_vec_s_align, bb, be); + +		if (bb < b || be > e) +			return false; +		return true; +	} + +	bool fnIsAlignedTightPhrase(int b, int e, bool s, int *pob, int *poe) const { +		const vector<SingleAlign>* palign; +		if (s == true) +			palign = &m_vec_s_align; +		else +			palign = &m_vec_t_align; + +		if ((*palign).size() <= e || (*palign)[b].size() == 0 || (*palign)[e].size() == 0) +			return false; + +		return fnIsAlignedPhrase(b, e, s, pob, poe); +	} + +	void fnGetLeftRightMost(int b, int e, bool s, int& ob, int& oe) const { +		if (s == true) +			fnGetLeftRightMost(b, e, m_vec_s_align, ob, oe); +		else +			fnGetLeftRightMost(b, e, m_vec_t_align, ob, oe); +	} + +	/* +	 * look the translation of source[b, e] is continuous or not +	 * 1) return "Unaligned": if the source[b, e] is translated silently; +	 * 2) return "Con't": if none of target words in target[.., ..] is exclusively aligned to any word outside source[b, e] +	 * 3) return "Discon't": otherwise; +	 */ +	string fnIsContinuous(int b, int e) const { +		int ob, oe; +		fnGetLeftRightMost(b, e, true, ob, oe); +		if (ob == -1) return "Unaligned"; + +		for (int i = ob; i <= oe; i++) { +			if (!fnIsAligned(i, false)) continue; +			const SingleAlign& a = m_vec_t_align[i]; +			int j; +			for (j = 0; j < a.size(); j++) +				if (a[j] >= b && a[j] <= e) break; +			if (j == a.size()) return "Discon't"; +		} +		return "Con't"; +	} + +	const SingleAlign* fnGetSingleWordAlign(int i, bool s) const { +		if (s == true) { +			if (i >= m_vec_s_align.size()) +				return NULL; +			return &(m_vec_s_align[i]); +		} else { +			if (i >= m_vec_t_align.size()) +				return NULL; +			return &(m_vec_t_align[i]); +		} +	} +private: +	void fnGetLeftRightMost(int b, int e, const vector<SingleAlign>& align, int& ob, int& oe) const { +		ob = oe = -1; +		for (int i = b; i <= e && i < align.size(); i++) { +			if (align[i].size() > 0) { +				if (align[i][0] < ob || ob == -1) +					ob = align[i][0]; +				if (oe < align[i][align[i].size() - 1]) +					oe = align[i][align[i].size() - 1]; +			} +		} +	} +	void fnInitializeAlignment(const char* pszAlign) { +		m_vec_s_align.clear(); +		m_vec_t_align.clear(); + +		vector<string> terms = SplitOnWhitespace(string(pszAlign)); +		int si, ti; +		for (size_t i = 0; i < terms.size(); i++) { +			sscanf(terms[i].c_str(), "%d-%d", &si, &ti); + +			while (m_vec_s_align.size() <= si) { +				SingleAlign sa; +				m_vec_s_align.push_back(sa); +			} +			while (m_vec_t_align.size() <= ti) { +				SingleAlign sa; +				m_vec_t_align.push_back(sa); +			} + +			m_vec_s_align[si].push_back(ti); +			m_vec_t_align[ti].push_back(si); +		} + +		//sort +		for (size_t i = 0; i < m_vec_s_align.size(); i++) { +			std::sort(m_vec_s_align[i].begin(), m_vec_s_align[i].end()); +		} +		for (size_t i = 0; i < m_vec_t_align.size(); i++) { +			std::sort(m_vec_t_align[i].begin(), m_vec_t_align[i].end()); +		} +	} + +private: +	vector<SingleAlign> m_vec_s_align; //source side words' alignment +	vector<SingleAlign> m_vec_t_align; //target side words' alignment +}; + +struct SAlignmentReader { +	SAlignmentReader(const char *pszFname) { +		m_fpIn = fopen(pszFname, "r"); +		assert(m_fpIn != NULL); +	} +	~SAlignmentReader() { +		if (m_fpIn != NULL) +			fclose(m_fpIn); +	} +	SAlignment* fnReadNextAlignment() { +		if (feof(m_fpIn) == true) +			return NULL; +		char *pszLine = new char[100001]; +		pszLine[0] = '\0'; +		fgets(pszLine, 10001, m_fpIn); +		int iLen = strlen(pszLine); +		if (iLen == 0) +			return NULL; +		while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { +			pszLine[ iLen - 1 ] = '\0'; +			iLen--; +		} +		SAlignment *pAlign = new SAlignment(pszLine); +		delete [] pszLine; +		return pAlign; +	} +private: +	FILE *m_fpIn; +}; + + +#endif /* ALIGNMENT_H_ */ diff --git a/utils/synutils/argument_reorder_model.cc b/utils/synutils/argument_reorder_model.cc new file mode 100644 index 00000000..e30ee971 --- /dev/null +++ b/utils/synutils/argument_reorder_model.cc @@ -0,0 +1,311 @@ +/* + * argument_reorder_model.cc + * + *  Created on: Dec 15, 2013 + *      Author: lijunhui + */ + +#include <boost/program_options.hpp> +#include <fstream> + +#include "argument_reorder_model.h" +#include "utility.h" +#include "tsuruoka_maxent.h" + +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { +	SFReader *pFReader = new STxtFileReader(pszFName); +	char *pszLine = new char[ 100001 ]; +	int iLen; +	Map hashPredicate; +	while (pFReader->fnReadNextLine(pszLine, &iLen)) { +		if (iLen == 0) +			continue; + +		vector<string> vecTerms; +		SplitOnWhitespace(string(pszLine), &vecTerms); + +		for (size_t i = 0; i < vecTerms.size() - 1; i++) { +			Iterator iter = hashPredicate.find(vecTerms[i]); +			if (iter == hashPredicate.end()) { +				hashPredicate[vecTerms[i]] = 1; + +			} else { +				iter->second++; +			} +		} +	} +	delete pFReader; + +	pFReader = new STxtFileReader(pszFName); +	FILE *fpOut = fopen(pszNewFName, "w"); +	while (pFReader->fnReadNextLine(pszLine, &iLen)) { +		if (iLen == 0) +			continue; + +		vector<string> vecTerms; +		SplitOnWhitespace(string(pszLine), &vecTerms); +		ostringstream ostr; +		for (size_t i = 0; i < vecTerms.size() - 1; i++) { +			Iterator iter = hashPredicate.find(vecTerms[i]); +			assert(iter != hashPredicate.end()); +			if (iter->second >= iCutoff) { +				ostr << vecTerms[i] << " "; +			} +		} +		if (ostr.str().length() > 0) { +			ostr << vecTerms[vecTerms.size() - 1]; +			fprintf(fpOut, "%s\n", ostr.str().c_str()); +		} +	} +	fclose(fpOut); +	delete pFReader; + + +	delete [] pszLine; +} + +struct SArgumentReorderTrainer{ +	SArgumentReorderTrainer(const char* pszSRLFname,        //source-side srl tree file name +				const char* pszAlignFname,               //alignment filename +	            const char* pszSourceFname,              //source file name +	            const char* pszTargetFname,              //target file name +	            const char* pszTopPredicateFname,        //target file name +	            const char* pszInstanceFname,            //training instance file name +	            const char* pszModelFname,              //classifier model file name +	            int iCutoff) { +		fnGenerateInstanceFiles(pszSRLFname, pszAlignFname, pszSourceFname, pszTargetFname, pszTopPredicateFname, pszInstanceFname); + +		string strInstanceFname, strModelFname; +		strInstanceFname = string(pszInstanceFname) + string(".left"); +		strModelFname = string(pszModelFname) + string(".left"); +		fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); +		strInstanceFname = string(pszInstanceFname) + string(".right"); +		strModelFname = string(pszModelFname) + string(".right"); +		fnTraining(strInstanceFname.c_str(), strModelFname.c_str(), iCutoff); +	} + +	~SArgumentReorderTrainer() { + +	} + +private: +	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { +		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +		if (iCutoff > 0) { +			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +		} else { +			strcpy(pszNewInstanceFName, pszInstanceFname); +		} + +		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); +		pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); +		delete pMaxent; + +		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +			system(pszNewInstanceFName); +		} +		delete [] pszNewInstanceFName; +	} + +	void fnGenerateInstanceFiles(const char* pszSRLFname,        //source-side flattened parse tree file name +				const char* pszAlignFname,               //alignment filename +				const char* pszSourceFname,              //source file name +				const char* pszTargetFname,              //target file name +				const char* pszTopPredicateFname,        //top predicate file name (we only consider predicates with 100+ occurrences +				const char* pszInstanceFname             //training instance file name +				) { +		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); +		SSrlSentenceReader *pSRLReader = new SSrlSentenceReader(pszSRLFname); +		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); +		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); + +		Map *pMapPredicate; +		if (pszTopPredicateFname != NULL) +			pMapPredicate = fnLoadTopPredicates(pszTopPredicateFname); +		else +			pMapPredicate = NULL; + +		char *pszLine = new char[50001]; + +		FILE *fpLeftOut, *fpRightOut; +		sprintf(pszLine, "%s.left", pszInstanceFname); +		fpLeftOut = fopen(pszLine, "w"); +		sprintf(pszLine, "%s.right", pszInstanceFname); +		fpRightOut = fopen(pszLine, "w"); + +		//read sentence by sentence +		SAlignment *pAlign; +		SSrlSentence *pSRL; +		SParsedTree *pTree; +		int iSentNum = 0; +		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +			pSRL = pSRLReader->fnReadNextSrlSentence(); +			assert(pSRL != NULL); +			pTree = pSRL->m_pTree; +			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecSTerms; +			SplitOnWhitespace(string(pszLine), &vecSTerms); +			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecTTerms; +			SplitOnWhitespace(string(pszLine), &vecTTerms); +			//vecTPOSTerms.size() == 0, given the case when an english sentence fails parsing + +			if (pTree != NULL) { +				for (size_t i = 0; i < pSRL->m_vecPred.size(); i++) { +					SPredicate *pPred = pSRL->m_vecPred[i]; +					if (strcmp(pTree->m_vecTerminals[pPred->m_iPosition]->m_ptParent->m_pszTerm, "VA") == 0) continue; +					string strPred = string(pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm); +					if (pMapPredicate != NULL) { +						Map::iterator iter_map = pMapPredicate->find(strPred); +						if (pMapPredicate != NULL && iter_map == pMapPredicate->end()) +							continue; +					} + +					SPredicateItem *pPredItem = new SPredicateItem(pTree, pPred); + +					vector<string> vecStrBlock; +					for (size_t j = 0; j < pPredItem->vec_items_.size(); j++) { +						SSRLItem *pItem1 = pPredItem->vec_items_[j]; +						vecStrBlock.push_back(SArgumentReorderModel::fnGetBlockOutcome(pItem1->tree_item_->m_iBegin, pItem1->tree_item_->m_iEnd, pAlign)); +					} + +					vector<string> vecStrLeftReorderType; +					vector<string> vecStrRightReorderType; +					SArgumentReorderModel::fnGetReorderType(pPredItem, pAlign, vecStrLeftReorderType, vecStrRightReorderType); +					for (int j = 1; j < pPredItem->vec_items_.size(); j++) { +						string strLeftOutcome, strRightOutcome; +						strLeftOutcome = vecStrLeftReorderType[j - 1]; +						strRightOutcome = vecStrRightReorderType[j - 1]; +						ostringstream ostr; +						SArgumentReorderModel::fnGenerateFeature(pTree, pPred, pPredItem, j, vecStrBlock[j - 1], vecStrBlock[j], ostr); + +						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +						//fprintf(fpOut, "sentid=%d %s %s\n", iSentNum, ostr.str().c_str(), strOutcome.c_str()); +						fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); +						fprintf(fpRightOut, "%s %s\n", ostr.str().c_str(), strRightOutcome.c_str()); +					} +				} +			} +			delete pSRL; + +			delete pAlign; +			iSentNum++; + +			if (iSentNum % 100000 == 0) +				fprintf(stderr, "#%d\n", iSentNum); +		} +		delete [] pszLine; + +		fclose(fpLeftOut); +		fclose(fpRightOut); + + +		delete pAlignReader; +		delete pSRLReader; +		delete pTxtSReader; +		delete pTxtTReader; +	} + + + +	Map* fnLoadTopPredicates(const char* pszTopPredicateFname) { +		if (pszTopPredicateFname == NULL) +			return NULL; + +		Map* pMapPredicate  = new Map(); +		STxtFileReader *pReader = new STxtFileReader(pszTopPredicateFname); +		char *pszLine = new char[50001]; +		int iNumCount = 0; +		while (pReader->fnReadNextLine(pszLine, NULL)) { +			if (pszLine[0] == '#') +				continue; +			char *p = strchr(pszLine, ' '); +			assert(p != NULL); +			p[0] = '\0'; +			p++; +			int iCount = atoi(p); +			if (iCount < 100) +				break; +			(*pMapPredicate)[string(pszLine)] = iNumCount++; +		} +		delete pReader; +		delete [] pszLine; +		return pMapPredicate; +	} +}; + + + +namespace po = boost::program_options; + +inline void print_options(std::ostream &out,po::options_description const& opts) { +  typedef std::vector< boost::shared_ptr<po::option_description> > Ds; +  Ds const& ds=opts.options(); +  out << '"'; +  for (unsigned i=0;i<ds.size();++i) { +    if (i) out<<' '; +    out<<"--"<<ds[i]->long_name(); +  } +  out << '"\n'; +} +inline string str(char const* name,po::variables_map const& conf) { +  return conf[name].as<string>(); +} + +//--srl_file /scratch0/mt_exp/gale-align/gale-align.nw.srl.cn --align_file /scratch0/mt_exp/gale-align/gale-align.nw.al --source_file /scratch0/mt_exp/gale-align/gale-align.nw.cn --target_file /scratch0/mt_exp/gale-align/gale-align.nw.en --instance_file /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.instance --model_prefix /scratch0/mt_exp/gale-align/gale-align.nw.argreorder.model --feature_cutoff 2 +//--srl_file /scratch0/mt_exp/gale-ctb/gale-ctb.srl.cn --align_file /scratch0/mt_exp/gale-ctb/gale-ctb.align --source_file /scratch0/mt_exp/gale-ctb/gale-ctb.cn --target_file /scratch0/mt_exp/gale-ctb/gale-ctb.en0 --instance_file /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.instance --model_prefix /scratch0/mt_exp/gale-ctb/gale-ctb.argreorder.model --feature_cutoff 2 +int main(int argc, char** argv) { + +	po::options_description opts("Configuration options"); +	opts.add_options() +                ("srl_file",po::value<string>(),"srl file path (input)") +                ("align_file",po::value<string>(),"Alignment file path (input)") +                ("source_file",po::value<string>(),"Source text file path (input)") +                ("target_file",po::value<string>(),"Target text file path (input)") +                ("instance_file",po::value<string>(),"Instance file path (output)") +                ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated") +                ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold") +                ("help", "produce help message"); + +	po::variables_map vm; +	if (argc) { +		po::store(po::parse_command_line(argc, argv, opts), vm); +		po::notify(vm); +	} + +	if (vm.count("help")) { +		print_options(cout, opts); +		return 1; +	} + +	if (!vm.count("srl_file") +			|| !vm.count("align_file") +			|| !vm.count("source_file") +			|| !vm.count("target_file") +			|| !vm.count("instance_file") +			|| !vm.count("model_prefix") +			) { +		print_options(cout, opts); +		if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; +		if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; +		if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; +		if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; +		if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; +		if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; +		exit(0); +	} + +	SArgumentReorderTrainer *pTrainer = new SArgumentReorderTrainer(str("srl_file", vm).c_str(), +			                                                  str("align_file", vm).c_str(), +			                                                  str("source_file", vm).c_str(), +			                                                  str("target_file", vm).c_str(), +			                                                  NULL, +			                                                  str("instance_file", vm).c_str(), +			                                                  str("model_prefix", vm).c_str(), +			                                                  vm["feature_cutoff"].as<int>()); +	delete pTrainer; + +	return 1; +} diff --git a/utils/synutils/argument_reorder_model.h b/utils/synutils/argument_reorder_model.h new file mode 100644 index 00000000..b553b34b --- /dev/null +++ b/utils/synutils/argument_reorder_model.h @@ -0,0 +1,222 @@ +/* + * argument_reorder_model.h + * + *  Created on: Dec 15, 2013 + *      Author: lijunhui + */ + +#ifndef ARGUMENT_REORDER_MODEL_H_ +#define ARGUMENT_REORDER_MODEL_H_ + +#include "alignment.h" +#include "tree.h" +#include "srl_sentence.h" + + +//an argument item or a predicate item (the verb itself) +struct SSRLItem{ +        SSRLItem(const STreeItem *tree_item, string role): +        tree_item_(tree_item), +        role_(role) { + +        } +        ~SSRLItem() { + +        } +        const STreeItem *tree_item_; +        const string role_; +}; + +struct SPredicateItem{ +	SPredicateItem(const SParsedTree *tree, const SPredicate *pred): +                pred_(pred) { +                vec_items_.reserve(pred->m_vecArgt.size() + 1); +                for (int i = 0; i < pred->m_vecArgt.size(); i++) { +                        vec_items_.push_back(new SSRLItem(pred->m_vecArgt[i]->m_pTreeItem, string(pred->m_vecArgt[i]->m_pszRole))); +                } +                vec_items_.push_back(new SSRLItem(tree->m_vecTerminals[pred->m_iPosition]->m_ptParent, string("Pred"))); +                sort(vec_items_.begin(), vec_items_.end(), SortFunction); + +                begin_ = vec_items_[0]->tree_item_->m_iBegin; +                end_ = vec_items_[vec_items_.size() - 1]->tree_item_->m_iEnd; +        } + +        ~SPredicateItem() { +                vec_items_.clear(); +        } + +        static bool SortFunction (SSRLItem *i, SSRLItem* j) { return (i->tree_item_->m_iBegin < j->tree_item_->m_iBegin); } + +        vector<SSRLItem*> vec_items_; +        int begin_; +        int end_; +        const SPredicate *pred_; +}; + +struct SArgumentReorderModel { +public: +	static string fnGetBlockOutcome(int iBegin, int iEnd, SAlignment *pAlign) { +		return pAlign->fnIsContinuous(iBegin, iEnd); +	} +	static void fnGetReorderType(SPredicateItem *pPredItem, SAlignment *pAlign, vector<string>& vecStrLeftReorder, vector<string>& vecStrRightReorder) { +    	vector<int> vecLeft, vecRight; +    	for (int i = 0; i < pPredItem->vec_items_.size(); i++) { +            const STreeItem *pCon1 = pPredItem->vec_items_[i]->tree_item_; +    		int iLeft1, iRight1; +            pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); +            vecLeft.push_back(iLeft1); +            vecRight.push_back(iRight1); +    	} +    	vector<int> vecLeftPosition; +    	fnGetRelativePosition(vecLeft, vecLeftPosition); +    	vector<int> vecRightPosition; +    	fnGetRelativePosition(vecRight, vecRightPosition); + +    	vecStrLeftReorder.clear(); +    	vecStrRightReorder.clear(); +    	for (int i = 1; i < vecLeftPosition.size(); i++) { +    		string strOutcome; +    		fnGetOutcome(vecLeftPosition[i - 1], vecLeftPosition[i], strOutcome); +    		vecStrLeftReorder.push_back(strOutcome); +    		fnGetOutcome(vecRightPosition[i - 1], vecRightPosition[i], strOutcome); +    		vecStrRightReorder.push_back(strOutcome); +    	} +	} + +	/* +	 * features: +	 * f1: (left_label, right_label, parent_label) +	 * f2: (left_label, right_label, parent_label, other_right_sibling_label) +	 * f3: (left_label, right_label, parent_label, other_left_sibling_label) +	 * f4: (left_label, right_label, left_head_pos) +	 * f5: (left_label, right_label, left_head_word) +	 * f6: (left_label, right_label, right_head_pos) +	 * f7: (left_label, right_label, right_head_word) +	 * f8: (left_label, right_label, left_chunk_status) +	 * f9: (left_label, right_label, right_chunk_status) +	 * f10: (left_label, parent_label) +	 * f11: (right_label, parent_label) +	 * +	 * f1: (left_role, right_role, predicate_term) +	 * f2: (left_role, right_role, predicate_term, other_right_role) +	 * f3: (left_role, right_role, predicate_term, other_left_role) +	 * f4: (left_role, right_role, left_head_pos) +	 * f5: (left_role, right_role, left_head_word) +	 * f6: (left_role, right_role, left_syntactic_label) +	 * f7: (left_role, right_role, right_head_pos) +	 * f8: (left_role, right_role, right_head_word) +	 * f8: (left_role, right_role, right_syntactic_label) +	 * f8: (left_role, right_role, left_chunk_status) +	 * f9: (left_role, right_role, right_chunk_status) +	 * f10: (left_role, right_role, left_chunk_status) +	 * f11: (left_role, right_role, right_chunk_status) +	 * f12: (left_label, parent_label) +	 * f13: (right_label, parent_label) +	 */ +	static void fnGenerateFeature(const SParsedTree *pTree, const SPredicate *pPred, const SPredicateItem *pPredItem, int iPos, const string& strBlock1, const string& strBlock2, ostringstream& ostr) { +		SSRLItem *pSRLItem1 = pPredItem->vec_items_[iPos - 1]; +		SSRLItem *pSRLItem2 = pPredItem->vec_items_[iPos]; +		const STreeItem *pCon1 = pSRLItem1->tree_item_; +		const STreeItem *pCon2 = pSRLItem2->tree_item_; + +		string left_role = pSRLItem1->role_; +		string right_role = pSRLItem2->role_; + +		string predicate_term = pTree->m_vecTerminals[pPred->m_iPosition]->m_pszTerm; + +		vector<string> vec_other_right_sibling; +		for (int i = iPos + 1; i < pPredItem->vec_items_.size(); i++) +			vec_other_right_sibling.push_back(string(pPredItem->vec_items_[i]->role_)); +		if (vec_other_right_sibling.size() == 0) +			vec_other_right_sibling.push_back(string("NULL")); + +		vector<string> vec_other_left_sibling; +		for (int i = 0; i < iPos - 1; i++) +			vec_other_right_sibling.push_back(string(pPredItem->vec_items_[i]->role_)); +		if (vec_other_left_sibling.size() == 0) +			vec_other_left_sibling.push_back(string("NULL")); + + +		//generate features +		//f1 +		ostr << "f1=" << left_role << "_" << right_role << "_" << predicate_term; +		ostr << "f1=" << left_role << "_" << right_role; + +		//f2 +		for (int i = 0; i < vec_other_right_sibling.size(); i++) { +			ostr << " f2=" << left_role << "_" << right_role << "_" << predicate_term << "_" << vec_other_right_sibling[i]; +			ostr << " f2=" << left_role << "_" << right_role << "_" << vec_other_right_sibling[i]; +		} +		//f3 +		for (int i = 0; i < vec_other_left_sibling.size(); i++) { +			ostr << " f3=" << left_role << "_" << right_role << "_" << predicate_term << "_" << vec_other_left_sibling[i]; +			ostr << " f3=" << left_role << "_" << right_role << "_" << vec_other_left_sibling[i]; +		} +		//f4 +		ostr << " f4=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; +		//f5 +		ostr << " f5=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_pszTerm; +		//f6 +		ostr << " f6=" << left_role << "_" << right_role << "_" << pCon2->m_pszTerm; +		//f7 +		ostr << " f7=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; +		//f8 +		ostr << " f8=" << left_role << "_" << right_role << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_pszTerm; +		//f9 +		ostr << " f9=" << left_role << "_" << right_role << "_" << pCon2->m_pszTerm; +		//f10 +		ostr << " f10=" << left_role << "_" << right_role << "_" << strBlock1; +		//f11 +		ostr << " f11=" << left_role << "_" << right_role << "_" << strBlock2; +		//f12 +		ostr << " f12=" << left_role << "_" << predicate_term; +		ostr << " f12=" << left_role; +		//f13 +		ostr << " f13=" << right_role << "_" << predicate_term; +		ostr << " f13=" << right_role; +	} + +private: +	static void fnGetOutcome(int i1, int i2, string& strOutcome) { +		assert(i1 != i2); +		if (i1 < i2) { +			if (i2 > i1 + 1) strOutcome = string("DM"); +			else strOutcome = string("M"); +		} else { +			if (i1 > i2 + 1) strOutcome = string("DS"); +			else strOutcome = string("S"); +		} +	} + +	static void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) { +		vecPosition.clear(); + +		vector<float> vec; +		for (int i = 0; i < vecLeft.size(); i++) { +			if (vecLeft[i] == -1) { +				if (i == 0) +					vec.push_back(-1); +				else +					vec.push_back(vecLeft[i-1] + 0.1); +			} else +				vec.push_back(vecLeft[i]); +		} + +		for (int i = 0; i < vecLeft.size(); i++) { +			int count = 0; + +			for (int j = 0; j < vecLeft.size(); j++) { +				if ( j == i) continue; +				if (vec[j] < vec[i]) { +					count++; +				} else if (vec[j] == vec[i] && j < i) { +					count++; +				} +			} +			vecPosition.push_back(count); +		} +	} +}; + + +#endif /* ARGUMENT_REORDER_MODEL_H_ */ diff --git a/utils/synutils/constituent_reorder_model.cc b/utils/synutils/constituent_reorder_model.cc new file mode 100644 index 00000000..485c9667 --- /dev/null +++ b/utils/synutils/constituent_reorder_model.cc @@ -0,0 +1,796 @@ +/* + * constituent_reorder_model.cc + * + *  Created on: Jul 10, 2013 + *      Author: junhuili + */ + + +#include <boost/program_options.hpp> + +#include "alignment.h" +#include "tree.h" +#include "utility.h" +#include "tsuruoka_maxent.h" + +#include <tr1/unordered_map> + +using namespace std; + + +typedef std::tr1::unordered_map<std::string, int> Map; +typedef std::tr1::unordered_map<std::string, int>::iterator Iterator; + +namespace po = boost::program_options; + +inline void fnPreparingTrainingdata(const char* pszFName, int iCutoff, const char* pszNewFName) { +	SFReader *pFReader = new STxtFileReader(pszFName); +	char *pszLine = new char[ 100001 ]; +	int iLen; +	Map hashPredicate; +	while (pFReader->fnReadNextLine(pszLine, &iLen)) { +		if (iLen == 0) +			continue; + +		vector<string> vecTerms; +		SplitOnWhitespace(string(pszLine), &vecTerms); + +		for (size_t i = 0; i < vecTerms.size() - 1; i++) { +			Iterator iter = hashPredicate.find(vecTerms[i]); +			if (iter == hashPredicate.end()) { +				hashPredicate[vecTerms[i]] = 1; + +			} else { +				iter->second++; +			} +		} +	} +	delete pFReader; + +	pFReader = new STxtFileReader(pszFName); +	FILE *fpOut = fopen(pszNewFName, "w"); +	while (pFReader->fnReadNextLine(pszLine, &iLen)) { +		if (iLen == 0) +			continue; + +		vector<string> vecTerms; +		SplitOnWhitespace(string(pszLine), &vecTerms); +		ostringstream ostr; +		for (size_t i = 0; i < vecTerms.size() - 1; i++) { +			Iterator iter = hashPredicate.find(vecTerms[i]); +			assert(iter != hashPredicate.end()); +			if (iter->second >= iCutoff) { +				ostr << vecTerms[i] << " "; +			} +		} +		if (ostr.str().length() > 0) { +			ostr << vecTerms[vecTerms.size() - 1]; +			fprintf(fpOut, "%s\n", ostr.str().c_str()); +		} +	} +	fclose(fpOut); +	delete pFReader; + + +	delete [] pszLine; +} + +struct SConstReorderTrainer{ +	SConstReorderTrainer(const char* pszSynFname,        //source-side flattened parse tree file name +                         const char* pszAlignFname,               //alignment filename +                         const char* pszSourceFname,              //source file name +                         const char* pszTargetFname,              //target file name +                         const char* pszInstanceFname,            //training instance file name +                         const char* pszModelPrefix,              //classifier model file name prefix +                         int iClassifierType,                     //classifier type +                         int iCutoff,                             //feature count threshold +                         const char* pszOption                    //other classifier parameters (for svmlight) +                         ) { +		fnGenerateInstanceFile(pszSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); + +		string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +		string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + +		string strModelLeftFname = string(pszModelPrefix) + string(".left"); +		string strModelRightFname = string(pszModelPrefix) + string(".right"); + +		fprintf(stdout, "...Training the left ordering model\n"); +		fnTraining(strInstanceLeftFname.c_str(), strModelLeftFname.c_str(), iCutoff); +		fprintf(stdout, "...Training the right ordering model\n"); +		fnTraining(strInstanceRightFname.c_str(), strModelRightFname.c_str(), iCutoff); +	} +	~SConstReorderTrainer() { + +	} + +private: + +	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { +		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +		if (iCutoff > 0) { +			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +		} else { +			strcpy(pszNewInstanceFName, pszInstanceFname); +		} + +		/*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +	    pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); +	    delete pZhangleMaxent;*/ + +		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); +		pMaxent->fnTrain(pszNewInstanceFName, "l1", pszModelFname, 300); +		delete pMaxent; + +		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +			system(pszNewInstanceFName); +		} +		delete [] pszNewInstanceFName; +	} + +	inline bool fnIsVerbPOS(const char* pszTerm) { +		if (strcmp(pszTerm, "VV") == 0 +			|| strcmp(pszTerm, "VA") == 0 +			|| strcmp(pszTerm, "VC") == 0 +			|| strcmp(pszTerm, "VE") == 0) +			return true; +		return false; +	} + +	inline void fnGetOutcome(int iL1, int iR1, int iL2, int iR2, const SAlignment *pAlign, string& strOutcome) { +		if (iL1 == -1 && iL2 == -1) +			strOutcome = "BU"; //1. both are untranslated +		else if (iL1 == -1) +			strOutcome = "1U"; //2. XP1 is untranslated +		else if (iL2 == -1) +			strOutcome = "2U"; //3. XP2 is untranslated +		else if (iL1 == iL2 && iR2 == iR2) +			strOutcome = "SS"; //4. Have same scope +		else if (iL1 <= iL2 && iR1 >= iR2) +			strOutcome = "1C2"; //5. XP1's translation covers XP2's +		else if (iL1 >= iL2 && iR1 <= iR2) +			strOutcome = "2C1"; //6. XP2's translation covers XP1's +		else if (iR1 < iL2) { +			int i = iR1 + 1; +			/*while (i < iL2) { +				if (pAlign->fnIsAligned(i, false)) +					break; +				i++; +			}*/ +			if (i == iL2) +				strOutcome = "M"; //7. Monotone +			else +				strOutcome = "DM"; //8. Discontinuous monotone +		} else if (iL1 < iL2 && iL2 <= iR1 && iR1 < iR2) +			strOutcome = "OM"; //9. Overlap monotone +		else if (iR2 < iL1) { +			int i = iR2 + 1; +			/*while (i < iL1) { +				if (pAlign->fnIsAligned(i, false)) +					break; +				i++; +			}*/ +			if (i == iL1) +				strOutcome = "S"; //10. Swap +			else +				strOutcome = "DS"; //11. Discontinuous swap +		} else if (iL2 < iL1 && iL1 <= iR2 && iR2 < iR1) +			strOutcome = "OS"; //12. Overlap swap +		else +			assert(false); +	} + +	inline void fnGetOutcome(int i1, int i2, string& strOutcome) { +		assert(i1 != i2); +		if (i1 < i2) { +			if (i2 > i1 + 1) strOutcome = string("DM"); +			else strOutcome = string("M"); +		} else { +			if (i1 > i2 + 1) strOutcome = string("DS"); +			else strOutcome = string("S"); +		} +	} + +	inline void fnGetRelativePosition(const vector<int>& vecLeft, vector<int>& vecPosition) { +		vecPosition.clear(); + +		vector<float> vec; +		for (size_t i = 0; i < vecLeft.size(); i++) { +			if (vecLeft[i] == -1) { +				if (i == 0) +					vec.push_back(-1); +				else +					vec.push_back(vecLeft[i-1] + 0.1); +			} else +				vec.push_back(vecLeft[i]); +		} + +		for (size_t i = 0; i < vecLeft.size(); i++) { +			int count = 0; + +			for (size_t j = 0; j < vecLeft.size(); j++) { +				if ( j == i) continue; +				if (vec[j] < vec[i]) { +					count++; +				} else if (vec[j] == vec[i] && j < i) { +					count++; +				} +			} +			vecPosition.push_back(count); +		} +	} + +	/* +	 * features: +	 * f1: (left_label, right_label, parent_label) +	 * f2: (left_label, right_label, parent_label, other_right_sibling_label) +	 * f3: (left_label, right_label, parent_label, other_left_sibling_label) +	 * f4: (left_label, right_label, left_head_pos) +	 * f5: (left_label, right_label, left_head_word) +	 * f6: (left_label, right_label, right_head_pos) +	 * f7: (left_label, right_label, right_head_word) +	 * f8: (left_label, right_label, left_chunk_status) +	 * f9: (left_label, right_label, right_chunk_status) +	 * f10: (left_label, parent_label) +	 * f11: (right_label, parent_label) +	 */ +	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, int iPos, const vector<string>& vecChunkStatus, const vector<int>& vecPosition, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { +		STreeItem *pCon1, *pCon2; +		pCon1 = pParent->m_vecChildren[iPos - 1]; +		pCon2 = pParent->m_vecChildren[iPos]; + +		fnGetOutcome(vecPosition[iPos-1], vecPosition[iPos], strOutcome); + +		string left_label = string(pCon1->m_pszTerm); +		string right_label = string(pCon2->m_pszTerm); +		string parent_label = string(pParent->m_pszTerm); + +		vector<string> vec_other_right_sibling; +		for (int i = iPos + 1; i < pParent->m_vecChildren.size(); i++) +			vec_other_right_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); +		if (vec_other_right_sibling.size() == 0) +			vec_other_right_sibling.push_back(string("NULL")); +		vector<string> vec_other_left_sibling; +		for (int i = 0; i < iPos - 1; i++) +			vec_other_left_sibling.push_back(string(pParent->m_vecChildren[i]->m_pszTerm)); +		if (vec_other_left_sibling.size() == 0) +			vec_other_left_sibling.push_back(string("NULL")); + +		//generate features +		//f1 +		ostr << "f1=" << left_label << "_" << right_label << "_" << parent_label; +		//f2 +		for (int i = 0; i < vec_other_right_sibling.size(); i++) +			ostr << " f2=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_right_sibling[i]; +		//f3 +		for (int i = 0; i < vec_other_left_sibling.size(); i++) +			ostr << " f3=" << left_label << "_" << right_label << "_" << parent_label << "_" << vec_other_left_sibling[i]; +		//f4 +		ostr << " f4=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; +		//f5 +		ostr << " f5=" << left_label << "_" << right_label << "_" << vecSTerms[pCon1->m_iHeadWord]; +		//f6 +		ostr << " f6=" << left_label << "_" << right_label << "_" << pTree->m_vecTerminals[pCon2->m_iHeadWord]->m_ptParent->m_pszTerm; +		//f7 +		ostr << " f7=" << left_label << "_" << right_label << "_" << vecSTerms[pCon2->m_iHeadWord]; +		//f8 +		ostr << " f8=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos - 1]; +		//f9 +		ostr << " f9=" << left_label << "_" << right_label << "_" << vecChunkStatus[iPos]; +		//f10 +		ostr << " f10=" << left_label << "_" << parent_label; +		//f11 +		ostr << " f11=" << right_label << "_" << parent_label; +	} + +	/* +	 * Source side (11 features): +	 * f1: the categories of XP1 and XP2 (f1_1, f1_2) +	 * f2: the head words of XP1 and XP2 (f2_1, f2_2) +	 * f3: the first and last word of XP1 (f3_f, f3_l) +	 * f4: the first and last word of XP2 (f4_f, f4_l) +	 * f5: is XP1 or XP2 the head node (f5_1, f5_2) +	 * f6: the category of the common parent +	 * Target side (6 features): +	 * f7: the first and the last word of XP1's translation (f7_f, f7_l) +	 * f8: the first and the last word of XP2's translation (f8_f, f8_l) +	 * f9: the translation of XP1's and XP2's head word (f9_1, f9_2) +	 */ +	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pParent, const STreeItem *pCon1, const STreeItem *pCon2, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { + +		int iLeft1, iRight1, iLeft2, iRight2; +		pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); +		pAlign->fnGetLeftRightMost(pCon2->m_iBegin, pCon2->m_iEnd, true, iLeft2, iRight2); + +		fnGetOutcome(iLeft1, iRight1, iLeft2, iRight2, pAlign, strOutcome); + +		//generate features +		//f1 +		ostr << "f1_1=" << pCon1->m_pszTerm << " f1_2=" << pCon2->m_pszTerm; +		//f2 +		ostr << " f2_1=" << vecSTerms[pCon1->m_iHeadWord] << " f2_2" << vecSTerms[pCon2->m_iHeadWord]; +		//f3 +		ostr << " f3_f=" << vecSTerms[pCon1->m_iBegin] << " f3_l=" << vecSTerms[pCon1->m_iEnd]; +		//f4 +		ostr << " f4_f=" << vecSTerms[pCon2->m_iBegin] << " f4_l=" << vecSTerms[pCon2->m_iEnd]; +		//f5 +		if (pParent->m_iHeadChild == pCon1->m_iBrotherIndex) +			ostr << " f5_1=1"; +		else +			ostr << " f5_1=0"; +		if (pParent->m_iHeadChild == pCon2->m_iBrotherIndex) +			ostr << " f5_2=1"; +		else +			ostr << " f5_2=0"; +		//f6 +		ostr << " f6=" << pParent->m_pszTerm; + +		/*//f7 +		if (iLeft1 != -1) { +			ostr << " f7_f=" << vecTTerms[iLeft1] << " f7_l=" << vecTTerms[iRight1]; +		} +		if (iLeft2 != -1) { +			ostr << " f8_f=" << vecTTerms[iLeft2] << " f8_l=" << vecTTerms[iRight2]; +		} + +		const vector<int>* pvecTarget = pAlign->fnGetSingleWordAlign(pCon1->m_iHeadWord, true); +		string str = ""; +		for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +			str += vecTTerms[(*pvecTarget)[i]] + "_"; +		} +		if (str.length() > 0) { +			ostr << " f9_1=" << str.substr(0, str.size()-1); +		} +		pvecTarget = pAlign->fnGetSingleWordAlign(pCon2->m_iHeadWord, true); +		str = ""; +		for (size_t i = 0; pvecTarget != NULL && i < pvecTarget->size(); i++) { +			str += vecTTerms[(*pvecTarget)[i]] + "_"; +		} +		if (str.length() > 0) { +			ostr << " f9_2=" << str.substr(0, str.size()-1); +		} */ + +	} + +	void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ +		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { +			STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; + +			while (pParent != NULL) { +				//if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { +				if (pParent->m_vecChildren.size() > 1) { +					//do constituent reordering for all children of pParent +					vecFocused.push_back(pParent); +				} +				if (pParent->m_iBrotherIndex != 0) break; +				pParent = pParent->m_ptParent; +			} +		} +	} + +	void fnGenerateInstanceFile(const char* pszSynFname,        //source-side flattened parse tree file name +                                const char* pszAlignFname,               //alignment filename +                                const char* pszSourceFname,              //source file name +                                const char* pszTargetFname,              //target file name +                                const char* pszInstanceFname             //training instance file name +                                ) { +		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); +		SParseReader *pParseReader = new SParseReader(pszSynFname, false); +		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); +		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); + +		string strInstanceLeftFname = string(pszInstanceFname) + string(".left"); +		string strInstanceRightFname = string(pszInstanceFname) + string(".right"); + +		FILE *fpLeftOut = fopen(strInstanceLeftFname.c_str(), "w"); +		assert(fpLeftOut != NULL); + +		FILE *fpRightOut = fopen(strInstanceRightFname.c_str(), "w"); +		assert(fpRightOut != NULL); + +		//read sentence by sentence +		SAlignment *pAlign; +		SParsedTree *pTree; +		char *pszLine = new char[50001]; +		int iSentNum = 0; +		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +			pTree = pParseReader->fnReadNextParseTree(); +			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecSTerms; +			SplitOnWhitespace(string(pszLine), &vecSTerms); +			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecTTerms; +			SplitOnWhitespace(string(pszLine), &vecTTerms); + + +			if (pTree != NULL) { + +				vector<STreeItem*> vecFocused; +				fnGetFocusedParentNodes(pTree, vecFocused); + +				for (size_t i = 0; i < vecFocused.size(); i++) { + +					STreeItem *pParent = vecFocused[i]; + +	            	vector<int> vecLeft, vecRight; +	            	for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +	                    STreeItem *pCon1 = pParent->m_vecChildren[j]; +	            		int iLeft1, iRight1; +	                    pAlign->fnGetLeftRightMost(pCon1->m_iBegin, pCon1->m_iEnd, true, iLeft1, iRight1); +	                    vecLeft.push_back(iLeft1); +	                    vecRight.push_back(iRight1); +	            	} +	            	vector<int> vecLeftPosition; +	            	fnGetRelativePosition(vecLeft, vecLeftPosition); +	            	vector<int> vecRightPosition; +	            	fnGetRelativePosition(vecRight, vecRightPosition); + +	            	vector<string> vecChunkStatus; +	            	for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +	            		string strOutcome = pAlign->fnIsContinuous(pParent->m_vecChildren[j]->m_iBegin, pParent->m_vecChildren[j]->m_iEnd); +	            		vecChunkStatus.push_back(strOutcome); +	            	} + +					for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +						//children[j-1] vs. children[j] reordering + +						string strLeftOutcome; +						ostringstream ostr; + +						fnGenerateInstance(pTree, pParent, j, vecChunkStatus, vecLeftPosition, vecSTerms, vecTTerms, strLeftOutcome, ostr); + +						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); +						fprintf(fpLeftOut, "%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str()); + +						string strRightOutcome; +						fnGetOutcome(vecRightPosition[j-1], vecRightPosition[j], strRightOutcome); +						fprintf(fpRightOut, "%s LeftOrder=%s %s\n", ostr.str().c_str(), strLeftOutcome.c_str(), strRightOutcome.c_str()); +					} +				} +				delete pTree; +			} + +			delete pAlign; +			iSentNum++; + +			if (iSentNum % 100000 == 0) +				fprintf(stderr, "#%d\n", iSentNum); +		} + + +		fclose(fpLeftOut); +		fclose(fpRightOut); +		delete pAlignReader; +		delete pParseReader; +		delete pTxtSReader; +		delete pTxtTReader; +		delete [] pszLine; +	} + +	void fnGenerateInstanceFile2(const char* pszSynFname,        //source-side flattened parse tree file name +                                const char* pszAlignFname,               //alignment filename +                                const char* pszSourceFname,              //source file name +                                const char* pszTargetFname,              //target file name +                                const char* pszInstanceFname             //training instance file name +                                ) { +		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); +		SParseReader *pParseReader = new SParseReader(pszSynFname, false); +		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); +		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); + +		FILE *fpOut = fopen(pszInstanceFname, "w"); +		assert(fpOut != NULL); + +		//read sentence by sentence +		SAlignment *pAlign; +		SParsedTree *pTree; +		char *pszLine = new char[50001]; +		int iSentNum = 0; +		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +			pTree = pParseReader->fnReadNextParseTree(); +			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecSTerms; +			SplitOnWhitespace(string(pszLine), &vecSTerms); +			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecTTerms; +			SplitOnWhitespace(string(pszLine), &vecTTerms); + + +			if (pTree != NULL) { + +				vector<STreeItem*> vecFocused; +				fnGetFocusedParentNodes(pTree, vecFocused); + +				for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + +					STreeItem *pParent = vecFocused[i]; + +					for (size_t j = 1; j < pParent->m_vecChildren.size(); j++) { +						//children[j-1] vs. children[j] reordering + +						string strOutcome; +						ostringstream ostr; + +						fnGenerateInstance(pTree, pParent, pParent->m_vecChildren[j-1], pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); + +						//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +						fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +					} +				} +				delete pTree; +			} + +			delete pAlign; +			iSentNum++; + +			if (iSentNum % 100000 == 0) +				fprintf(stderr, "#%d\n", iSentNum); +		} + + +		fclose(fpOut); +		delete pAlignReader; +		delete pParseReader; +		delete pTxtSReader; +		delete pTxtTReader; +		delete [] pszLine; +	} +}; + +struct SConstContTrainer{ +	SConstContTrainer(const char* pszFlattenedSynFname,        //source-side flattened parse tree file name +	                  const char* pszAlignFname,               //alignment filename +	                  const char* pszSourceFname,              //source file name +	                  const char* pszTargetFname,              //target file name +	                  const char* pszInstanceFname,            //training instance file name +	                  const char* pszModelPrefix,              //classifier model file name prefix +	                  int iClassifierType,                     //classifier type +	                  int iCutoff,                             //feature count threshold +	                  const char* pszOption                    //other classifier parameters (for svmlight) +	                 ) { +		fnGenerateInstanceFile(pszFlattenedSynFname, pszAlignFname, pszSourceFname, pszTargetFname, pszInstanceFname); +		//fnTraining(pszInstanceFname, pszModelPrefix, iClassifierType, iCutoff, pszOption); +		fnTraining(pszInstanceFname, pszModelPrefix, iCutoff); +	} +	~SConstContTrainer() { + +	} + +private: + +	void fnTraining(const char* pszInstanceFname, const char* pszModelFname, int iCutoff) { +		char *pszNewInstanceFName = new char[strlen(pszInstanceFname) + 50]; +		if (iCutoff > 0) { +			sprintf(pszNewInstanceFName, "%s.tmp", pszInstanceFname); +			fnPreparingTrainingdata(pszInstanceFname, iCutoff, pszNewInstanceFName); +		} else { +			strcpy(pszNewInstanceFName, pszInstanceFname); +		} + +		/*Zhangle_Maxent *pZhangleMaxent = new Zhangle_Maxent(NULL); +		   pZhangleMaxent->fnTrain(pszInstanceFname, "lbfgs", pszModelFname, 100, 2.0); +		   delete pZhangleMaxent;*/ + +		Tsuruoka_Maxent *pMaxent = new Tsuruoka_Maxent(NULL); +		pMaxent->fnTrain(pszInstanceFname, "l1", pszModelFname, 300); +		delete pMaxent; + +		if (strcmp(pszNewInstanceFName, pszInstanceFname) != 0) { +			sprintf(pszNewInstanceFName, "rm %s.tmp", pszInstanceFname); +			system(pszNewInstanceFName); +		} +		delete [] pszNewInstanceFName; +	} + + +	void fnGetFocusedParentNodes(const SParsedTree* pTree, vector<STreeItem*>& vecFocused){ +		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) { +			STreeItem *pParent = pTree->m_vecTerminals[i]->m_ptParent; + +			while (pParent != NULL) { +				//if (pParent->m_vecChildren.size() > 1 && pParent->m_iEnd - pParent->m_iBegin > 5) { +				if (pParent->m_vecChildren.size() > 1) { +					//do constituent reordering for all children of pParent +					vecFocused.push_back(pParent); +				} +				if (pParent->m_iBrotherIndex != 0) break; +				pParent = pParent->m_ptParent; +			} +		} +	} + +	inline void fnGetOutcome(int iL1, int iR1, const SAlignment *pAlign, string& strOutcome) { +		strOutcome = pAlign->fnIsContinuous(iL1, iR1); +	} + +	inline string fnGetLengthType(int iLen) { +		if (iLen == 1) +			return string("1"); +		if (iLen == 2) +			return string("2"); +		if (iLen == 3) +			return string("3"); +		if (iLen < 6) +			return string("4"); +		if (iLen < 11) +			return string("6"); +		return string("11"); +	} + +	/* +	 * Source side (11 features): +	 * f1: the syntactic category +	 * f2: the syntactic category of its parent +	 * f3: the head word's pos +	 * f4: =1 if it's the head of its parent node +	 *     or +	 *     the head of its parent node +	 * f5: length type +	 */ +	void fnGenerateInstance(const SParsedTree *pTree, const STreeItem *pCon1, const SAlignment *pAlign, const vector<string>& vecSTerms, const vector<string>& vecTTerms, string& strOutcome, ostringstream& ostr) { + +		fnGetOutcome(pCon1->m_iBegin, pCon1->m_iEnd, pAlign, strOutcome); + +		//generate features +		//f1 +		ostr << "f1=" << pCon1->m_pszTerm; +		//f2 +		ostr << " f2=" << pCon1->m_ptParent->m_pszTerm; +		//f3 +		ostr << " f3=" << pTree->m_vecTerminals[pCon1->m_iHeadWord]->m_ptParent->m_pszTerm; +		//f4 +		if (pCon1->m_iBrotherIndex == pCon1->m_ptParent->m_iHeadChild) { +			ostr << " f4=1"; +		} else { +			ostr << " f4=" << pCon1->m_ptParent->m_vecChildren[pCon1->m_ptParent->m_iHeadChild]->m_pszTerm; +		} +		//f5 +		ostr << " f5=" << fnGetLengthType(pCon1->m_iEnd - pCon1->m_iBegin + 1); +	} + +	void fnGenerateInstanceFile(const char* pszFlattenedSynFname,        //source-side flattened parse tree file name +                                const char* pszAlignFname,               //alignment filename +                                const char* pszSourceFname,              //source file name +                                const char* pszTargetFname,              //target file name +                                const char* pszInstanceFname             //training instance file name +                                ) { +		SAlignmentReader *pAlignReader = new SAlignmentReader(pszAlignFname); +		SParseReader *pParseReader = new SParseReader(pszFlattenedSynFname, true); +		STxtFileReader *pTxtSReader = new STxtFileReader(pszSourceFname); +		STxtFileReader *pTxtTReader = new STxtFileReader(pszTargetFname); + +		FILE *fpOut = fopen(pszInstanceFname, "w"); +		assert(fpOut != NULL); + +		//read sentence by sentence +		SAlignment *pAlign; +		SParsedTree *pTree; +		char *pszLine = new char[50001]; +		int iSentNum = 0; +		while ((pAlign = pAlignReader->fnReadNextAlignment()) != NULL) { +			pTree = pParseReader->fnReadNextParseTree(); +			assert(pTree != NULL); +			assert(pTxtSReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecSTerms; +			SplitOnWhitespace(string(pszLine), &vecSTerms); +			assert(pTxtTReader->fnReadNextLine(pszLine, NULL)); +			vector<string> vecTTerms; +			SplitOnWhitespace(string(pszLine), &vecTTerms); + +			vector<STreeItem*> vecFocused; +			fnGetFocusedParentNodes(pTree, vecFocused); + +			for (size_t i = 0; i < vecFocused.size() && pTree->m_vecTerminals.size() > 10; i++) { + +				STreeItem *pParent = vecFocused[i]; + +				for (size_t j = 0; j < pParent->m_vecChildren.size(); j++) { +					//children[j-1] vs. children[j] reordering + +					string strOutcome; +					ostringstream ostr; + +					fnGenerateInstance(pTree, pParent->m_vecChildren[j], pAlign, vecSTerms, vecTTerms, strOutcome, ostr); + +					//fprintf(stderr, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +					fprintf(fpOut, "%s %s\n", ostr.str().c_str(), strOutcome.c_str()); +				} +			} + +			delete pAlign; +			delete pTree; +			iSentNum++; + +			if (iSentNum % 100000 == 0) +				fprintf(stderr, "#%d\n", iSentNum); +		} + + +		fclose(fpOut); +		delete pAlignReader; +		delete pParseReader; +		delete pTxtSReader; +		delete pTxtTReader; +		delete [] pszLine; +	} +}; + +inline void print_options(std::ostream &out,po::options_description const& opts) { +  typedef std::vector< boost::shared_ptr<po::option_description> > Ds; +  Ds const& ds=opts.options(); +  out << '"'; +  for (unsigned i=0;i<ds.size();++i) { +    if (i) out<<' '; +    out<<"--"<<ds[i]->long_name(); +  } +  out << '\n'; +} +inline string str(char const* name,po::variables_map const& conf) { +  return conf[name].as<string>(); +} + +//--parse_file /scratch0/mt_exp/gq-ctb/data/train.srl.cn --align_file /scratch0/mt_exp/gq-ctb/data/aligned.grow-diag-final-and --source_file /scratch0/mt_exp/gq-ctb/data/train.cn --target_file /scratch0/mt_exp/gq-ctb/data/train.en --instance_file /scratch0/mt_exp/gq-ctb/data/srl-instance --model_prefix /scratch0/mt_exp/gq-ctb/data/srl-instance --feature_cutoff 10 --classifier_type 1 +int main(int argc, char** argv) { + +	po::options_description opts("Configuration options"); +	opts.add_options() +                ("parse_file",po::value<string>(),"parse file path (input)") +                ("align_file",po::value<string>(),"Alignment file path (input)") +                ("source_file",po::value<string>(),"Source text file path (input)") +                ("target_file",po::value<string>(),"Target text file path (input)") +                ("instance_file",po::value<string>(),"Instance file path (output)") +                ("model_prefix",po::value<string>(),"Model file path prefix (output): three files will be generated") +                ("classifier_type",po::value<int>()->default_value(1),"Classifier type: 1 for openNLP maxent; 2 for Zhangle maxent; and 3 for SVMLight") +                ("feature_cutoff",po::value<int>()->default_value(100),"Feature cutoff threshold") +                ("svm_option",po::value<string>(),"Parameters for SVMLight classifier") +                ("help", "produce help message"); + +	po::variables_map vm; +	if (argc) { +		po::store(po::parse_command_line(argc, argv, opts), vm); +		po::notify(vm); +	} + +	if (vm.count("help")) { +		print_options(cout, opts); +		return 1; +	} + +	if (!vm.count("parse_file") +			|| !vm.count("align_file") +			|| !vm.count("source_file") +			|| !vm.count("target_file") +			|| !vm.count("instance_file") +			|| !vm.count("model_prefix") +			) { +		print_options(cout, opts); +		if (!vm.count("parse_file")) cout << "--parse_file NOT FOUND\n"; +		if (!vm.count("align_file")) cout << "--align_file NOT FOUND\n"; +		if (!vm.count("source_file")) cout << "--source_file NOT FOUND\n"; +		if (!vm.count("target_file")) cout << "--target_file NOT FOUND\n"; +		if (!vm.count("instance_file")) cout << "--instance_file NOT FOUND\n"; +		if (!vm.count("model_prefix")) cout << "--model_prefix NOT FOUND\n"; +		exit(0); +	} + +	const char *pOption; +	if (vm.count("svm_option")) +		pOption = str("svm_option", vm).c_str(); +	else +		pOption = NULL; + +	SConstReorderTrainer *pTrainer = new SConstReorderTrainer(str("parse_file", vm).c_str(), +			                                                  str("align_file", vm).c_str(), +			                                                  str("source_file", vm).c_str(), +			                                                  str("target_file", vm).c_str(), +			                                                  str("instance_file", vm).c_str(), +			                                                  str("model_prefix", vm).c_str(), +			                                                  vm["classifier_type"].as<int>(), +			                                                  vm["feature_cutoff"].as<int>(), +			                                                  pOption); +	delete pTrainer; + +	return 1; + +} diff --git a/utils/synutils/maxent-3.0/Makefile.am b/utils/synutils/maxent-3.0/Makefile.am new file mode 100644 index 00000000..64bb038c --- /dev/null +++ b/utils/synutils/maxent-3.0/Makefile.am @@ -0,0 +1,10 @@ +noinst_LIBRARIES = libtsuruoka_maxent.a + +libtsuruoka_maxent_a_SOURCES = \ +  lbfgs.cpp \ +  maxent.cpp \ +  owlqn.cpp \ +  sgd.cpp  +   +AM_CPPFLAGS = -W -Wall + diff --git a/utils/synutils/maxent-3.0/lbfgs.cpp b/utils/synutils/maxent-3.0/lbfgs.cpp new file mode 100644 index 00000000..9eb04bef --- /dev/null +++ b/utils/synutils/maxent-3.0/lbfgs.cpp @@ -0,0 +1,110 @@ +#include <vector> +#include <iostream> +#include <cmath> +#include <stdio.h> +#include "mathvec.h" +#include "lbfgs.h" +#include "maxent.h" + +using namespace std; + +const static int    M = LBFGS_M; +const static double LINE_SEARCH_ALPHA = 0.1; +const static double LINE_SEARCH_BETA  = 0.5; + +// stopping criteria +int    LBFGS_MAX_ITER      = 300; +const static double MIN_GRAD_NORM = 0.0001; + + +double  +ME_Model::backtracking_line_search( +			 const Vec & x0, const Vec & grad0, const double f0,  +			 const Vec & dx, Vec & x, Vec & grad1) +{ +  double t = 1.0 / LINE_SEARCH_BETA; + +  double f; +  do { +    t *= LINE_SEARCH_BETA; +    x = x0 + t * dx; +    f = FunctionGradient(x.STLVec(), grad1.STLVec()); +    //        cout << "*"; +  } while (f > f0 + LINE_SEARCH_ALPHA * t * dot_product(dx, grad0)); + +  return f; +} + +// +// Jorge Nocedal, "Updating Quasi-Newton Matrices With Limited Storage", +// Mathematics of Computation, Vol. 35, No. 151, pp. 773-782, 1980. +// +Vec  +approximate_Hg(const int iter, const Vec & grad, +	       const Vec s[],  const Vec y[], const double z[]) +{ +  int offset, bound; +  if (iter <= M) { offset = 0;        bound = iter; } +  else           { offset = iter - M; bound = M;    } + +  Vec q = grad; +  double alpha[M], beta[M]; +  for (int i = bound - 1; i >= 0; i--) { +    const int j = (i + offset) % M; +    alpha[i]    = z[j]   * dot_product(s[j], q); +    q          += -alpha[i] * y[j]; +  } +  if (iter > 0) { +    const int j = (iter - 1) % M; +    const double gamma = ((1.0 / z[j]) / dot_product(y[j], y[j])); +    //    static double gamma; +    //    if (gamma == 0) gamma = ((1.0 / z[j]) / dot_product(y[j], y[j])); +    q *= gamma; +  } +  for (int i = 0; i <= bound - 1; i++) { +    const int j = (i + offset) % M; +    beta[i]     = z[j] * dot_product(y[j], q); +    q          += s[j] * (alpha[i] - beta[i]); +  } + +  return q; +} + +vector<double>  +ME_Model::perform_LBFGS(const vector<double> & x0) +{ +  const size_t dim = x0.size(); +  Vec x = x0; + +  Vec grad(dim), dx(dim); +  double f = FunctionGradient(x.STLVec(), grad.STLVec()); + +  Vec s[M], y[M]; +  double z[M];  // rho + +  for (int iter = 0; iter < LBFGS_MAX_ITER; iter++) { + +    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter+1, -f, _train_error); +    if (_nheldout > 0) { +      const double heldout_logl = heldout_likelihood(); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, _heldout_error); +    } +    fprintf(stderr, "\n"); + +    if (sqrt(dot_product(grad, grad)) < MIN_GRAD_NORM) break; + +    dx = -1 * approximate_Hg(iter, grad, s, y, z); + +    Vec x1(dim), grad1(dim); +    f = backtracking_line_search(x, grad, f, dx, x1, grad1); + +    s[iter % M] = x1 - x; +    y[iter % M] = grad1 - grad; +    z[iter % M] = 1.0 / dot_product(y[iter % M], s[iter % M]); +    x = x1; +    grad = grad1; +  } + +  return x.STLVec(); +} + diff --git a/utils/synutils/maxent-3.0/lbfgs.h b/utils/synutils/maxent-3.0/lbfgs.h new file mode 100644 index 00000000..113919a3 --- /dev/null +++ b/utils/synutils/maxent-3.0/lbfgs.h @@ -0,0 +1,21 @@ +#ifndef _LBFGS_H_ +#define _LBFGS_H_ + +//template<class FuncGrad> +//std::vector<double>  +//perform_LBFGS(FuncGrad func_grad, const std::vector<double> & x0); + +std::vector<double>  +perform_LBFGS(double (*func_grad)(const std::vector<double> &, std::vector<double> &),  +	      const std::vector<double> & x0); + + +std::vector<double>  +perform_OWLQN(double (*func_grad)(const std::vector<double> &, std::vector<double> &),  +	      const std::vector<double> & x0, +	      const double C); + +//const int    LBFGS_M = 7; +const int    LBFGS_M = 10; + +#endif diff --git a/utils/synutils/maxent-3.0/mathvec.h b/utils/synutils/maxent-3.0/mathvec.h new file mode 100644 index 00000000..4ec82797 --- /dev/null +++ b/utils/synutils/maxent-3.0/mathvec.h @@ -0,0 +1,97 @@ +#ifndef _MATH_VECTOR_H_ +#define _MATH_VECTOR_H_ + +#include <vector> +#include <iostream> +#include <cassert> + +class Vec +{ +private: +  std::vector<double> _v; +public: +  Vec(const size_t n = 0, const double val = 0) { _v.resize(n, val); } +  Vec(const std::vector<double> & v) : _v(v)    {} +  const std::vector<double> & STLVec() const { return _v; } +  std::vector<double>       & STLVec()       { return _v; } +  size_t Size() const { return _v.size(); } +  double       & operator[](int i)       { return _v[i]; } +  const double & operator[](int i) const { return _v[i]; } +  Vec & operator+=(const Vec & b) { +    assert(b.Size() == _v.size()); +    for (size_t i = 0; i < _v.size(); i++) { +      _v[i] += b[i]; +    } +    return *this; +  } +  Vec & operator*=(const double c) { +    for (size_t i = 0; i < _v.size(); i++) { +      _v[i] *= c; +    } +    return *this; +  } +  void Project(const Vec & y) { +    for (size_t i = 0; i < _v.size(); i++) { +      //      if (sign(_v[i]) != sign(y[i])) _v[i] = 0; +      if (_v[i] * y[i] <=0) _v[i] = 0; +    } +  } +}; + +inline double dot_product(const Vec & a, const Vec & b) +{ +  double sum = 0; +  for (size_t i = 0; i < a.Size(); i++) { +    sum += a[i] * b[i]; +  } +  return sum; +} + +inline std::ostream & operator<<(std::ostream & s, const Vec & a) +{ +  s << "("; +  for (size_t i = 0; i < a.Size(); i++) { +    if (i != 0) s << ", "; +    s << a[i]; +  } +  s << ")"; +  return s; +} + +inline const Vec operator+(const Vec & a, const Vec & b) +{ +  Vec v(a.Size()); +  assert(a.Size() == b.Size()); +  for (size_t i = 0; i < a.Size(); i++) { +    v[i] = a[i] + b[i]; +  } +  return v; +} + +inline const Vec operator-(const Vec & a, const Vec & b) +{ +  Vec v(a.Size()); +  assert(a.Size() == b.Size()); +  for (size_t i = 0; i < a.Size(); i++) { +    v[i] = a[i] - b[i]; +  } +  return v; +} + +inline const Vec operator*(const Vec & a, const double c) +{ +  Vec v(a.Size()); +  for (size_t i = 0; i < a.Size(); i++) { +    v[i] = a[i] * c; +  } +  return v; +} + +inline const Vec operator*(const double c, const Vec & a) +{ +  return a * c; +} + + + +#endif diff --git a/utils/synutils/maxent-3.0/maxent.cpp b/utils/synutils/maxent-3.0/maxent.cpp new file mode 100644 index 00000000..feb0efdc --- /dev/null +++ b/utils/synutils/maxent-3.0/maxent.cpp @@ -0,0 +1,698 @@ +/* + * $Id: maxent.cpp,v 1.1.1.1 2007/05/15 08:30:35 kyoshida Exp $ + */ + +#include "maxent.h" +#include <cmath> +#include <cstdio> +#include "lbfgs.h" + +using namespace std; + +double +ME_Model::FunctionGradient(const vector<double> & x, vector<double> & grad) +{ +  assert((int)_fb.Size() == x.size()); +  for (size_t i = 0; i < x.size(); i++) { +    _vl[i] = x[i]; +  } +   +  double score = update_model_expectation(); + +  if (_l2reg == 0) { +    for (size_t i = 0; i < x.size(); i++) { +      grad[i] = -(_vee[i] - _vme[i]); +    } +  } else { +    const double c = _l2reg * 2; +    for (size_t i = 0; i < x.size(); i++) { +      grad[i] = -(_vee[i] - _vme[i] - c * _vl[i]); +    } +  } + +  return -score; +} + +int +ME_Model::perform_GIS(int C) +{ +  cerr << "C = " << C << endl; +  C = 1; +  cerr << "performing AGIS" << endl; +  vector<double> pre_v; +  double pre_logl = -999999; +  for (int iter = 0; iter < 200; iter++) { + +    double logl =  update_model_expectation(); +    fprintf(stderr, "iter = %2d  C = %d  f = %10.7f  train_err = %7.5f", iter, C, logl, _train_error); +    if (_heldout.size() > 0) { +      double hlogl = heldout_likelihood(); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", hlogl, _heldout_error); +    } +    cerr << endl; + +    if (logl < pre_logl) { +      C += 1; +      _vl = pre_v; +      iter--; +      continue; +    } +    if (C > 1 && iter % 10 == 0) C--; + +    pre_logl = logl; +    pre_v = _vl; +    for (int i = 0; i < _fb.Size(); i++) { +      double coef = _vee[i] / _vme[i]; +      _vl[i] += log(coef) / C; +    } +  } +  cerr << endl; + +  return 0; +} + +int +ME_Model::perform_QUASI_NEWTON() +{ +  const int dim = _fb.Size(); +  vector<double> x0(dim); + +  for (int i = 0; i < dim; i++) { x0[i] = _vl[i]; } + +  vector<double> x; +  if (_l1reg > 0) { +    cerr << "performing OWLQN" << endl; +    x = perform_OWLQN(x0, _l1reg); +  } else { +    cerr << "performing LBFGS" << endl; +    x = perform_LBFGS(x0); +  } + +  for (int i = 0; i < dim; i++) { _vl[i] = x[i]; } + +  return 0; +} + +int +ME_Model::conditional_probability(const Sample & s, +                                  std::vector<double> & membp) const +{ +  //int num_classes = membp.size(); +  double sum = 0; +  int max_label = -1; +  //  double maxp = 0; + +  vector<double> powv(_num_classes, 0.0); +  for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ +    for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { +      powv[_fb.Feature(*k).label()] += _vl[*k]; +    } +  } +  for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin(); j != s.rvfeatures.end(); j++) { +    for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { +      powv[_fb.Feature(*k).label()] += _vl[*k] * j->second; +    } +  } + +  std::vector<double>::const_iterator pmax = max_element(powv.begin(), powv.end()); +  double offset = max(0.0, *pmax - 700); // to avoid overflow +  for (int label = 0; label < _num_classes; label++) { +    double pow = powv[label] - offset; +    double prod = exp(pow); +    //      cout << pow << " " << prod << ", "; +    //      if (_ref_modelp != NULL) prod *= _train_refpd[n][label]; +    if (_ref_modelp != NULL) prod *= s.ref_pd[label]; +    assert(prod != 0); +    membp[label] = prod; +    sum += prod; +  } +  for (int label = 0; label < _num_classes; label++) { +    membp[label] /= sum; +    if (membp[label] > membp[max_label]) max_label = label; +  } +  assert(max_label >= 0); +  return max_label; +} + +int +ME_Model::make_feature_bag(const int cutoff) +{ +  int max_num_features = 0; + +  // count the occurrences of features +#ifdef USE_HASH_MAP +  typedef __gnu_cxx::hash_map<unsigned int, int> map_type; +#else     +  typedef std::map<unsigned int, int> map_type; +#endif +  map_type count; +  if (cutoff > 0) { +    for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++) { +      for (std::vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++) { +        count[ME_Feature(i->label, *j).body()]++; +      } +      for (std::vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { +        count[ME_Feature(i->label, j->first).body()]++; +      } +    } +  } + +  int n = 0;  +  for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++, n++) { +    max_num_features = max(max_num_features, (int)(i->positive_features.size())); +    for (std::vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++) { +      const ME_Feature feature(i->label, *j); +      //      if (cutoff > 0 && count[feature.body()] < cutoff) continue; +      if (cutoff > 0 && count[feature.body()] <= cutoff) continue; +      _fb.Put(feature); +      //      cout << i->label << "\t" << *j << "\t" << id << endl; +      //      feature2sample[id].push_back(n); +    } +    for (std::vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { +      const ME_Feature feature(i->label, j->first); +      //      if (cutoff > 0 && count[feature.body()] < cutoff) continue; +      if (cutoff > 0 && count[feature.body()] <= cutoff) continue; +      _fb.Put(feature); +    } +  } +  count.clear(); +   +  //  cerr << "num_classes = " << _num_classes << endl; +  //  cerr << "max_num_features = " << max_num_features << endl; + +  init_feature2mef(); +   +  return max_num_features; +} + +double +ME_Model::heldout_likelihood() +{ +  double logl = 0; +  int ncorrect = 0; +  for (std::vector<Sample>::const_iterator i = _heldout.begin(); i != _heldout.end(); i++) { +    vector<double> membp(_num_classes); +    int l = classify(*i, membp); +    logl += log(membp[i->label]); +    if (l == i->label) ncorrect++; +  } +  _heldout_error = 1 - (double)ncorrect / _heldout.size(); +   +  return logl /= _heldout.size(); +} + +double +ME_Model::update_model_expectation() +{ +  double logl = 0; +  int ncorrect = 0; + +  _vme.resize(_fb.Size()); +  for (int i = 0; i < _fb.Size(); i++) _vme[i] = 0; +   +  int n = 0; +  for (vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++, n++) { +    vector<double> membp(_num_classes); +    int max_label = conditional_probability(*i, membp); +     +    logl += log(membp[i->label]); +    //    cout << membp[*i] << " " << logl << " "; +    if (max_label == i->label) ncorrect++; + +    // model_expectation +    for (vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++){ +      for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { +	_vme[*k] += membp[_fb.Feature(*k).label()]; +      } +    } +    for (vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { +      for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { +	_vme[*k] += membp[_fb.Feature(*k).label()] * j->second; +      } +    } +     +  } + +  for (int i = 0; i < _fb.Size(); i++) { +    _vme[i] /= _vs.size(); +  } +   +  _train_error = 1 - (double)ncorrect / _vs.size(); + +  logl /= _vs.size(); +   +  if (_l2reg > 0) { +    const double c = _l2reg; +    for (int i = 0; i < _fb.Size(); i++) { +      logl -= _vl[i] * _vl[i] * c; +    } +  } + +  //logl /= _vs.size(); +   +  //  fprintf(stderr, "iter =%3d  logl = %10.7f  train_acc = %7.5f\n", iter, logl, (double)ncorrect/train.size()); +  //  fprintf(stderr, "logl = %10.7f  train_acc = %7.5f\n", logl, (double)ncorrect/_train.size()); + +  return logl; +} + +int +ME_Model::train(const vector<ME_Sample> & vms) +{ +  _vs.clear(); +  for (vector<ME_Sample>::const_iterator i = vms.begin(); i != vms.end(); i++) { +    add_training_sample(*i); +  } + +  return train(); +} + +void +ME_Model::add_training_sample(const ME_Sample & mes) +{ +  Sample s; +  s.label = _label_bag.Put(mes.label); +  if (s.label > ME_Feature::MAX_LABEL_TYPES) { +    cerr << "error: too many types of labels." << endl; +    exit(1); +  } +  for (vector<string>::const_iterator j = mes.features.begin(); j != mes.features.end(); j++) { +    s.positive_features.push_back(_featurename_bag.Put(*j)); +  } +  for (vector<pair<string, double> >::const_iterator j = mes.rvfeatures.begin(); j != mes.rvfeatures.end(); j++) { +    s.rvfeatures.push_back(pair<int, double>(_featurename_bag.Put(j->first), j->second)); +  } +  if (_ref_modelp != NULL) { +    ME_Sample tmp = mes;; +    s.ref_pd = _ref_modelp->classify(tmp); +  } +  //  cout << s.label << "\t"; +  //  for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ +  //    cout << *j << " "; +  //  } +  //  cout << endl; +   +  _vs.push_back(s); +} + +int +ME_Model::train() +{ +  if (_l1reg > 0 && _l2reg > 0) { +    cerr << "error: L1 and L2 regularizers cannot be used simultaneously." << endl; +    return 0; +  } +  if (_vs.size() == 0) { +    cerr << "error: no training data." << endl; +    return 0; +  } +  if (_nheldout >= (int)_vs.size()) { +    cerr << "error: too much heldout data. no training data is available." << endl; +    return 0; +  } +  //  if (_nheldout > 0) random_shuffle(_vs.begin(), _vs.end()); + +  int max_label = 0; +  for (std::vector<Sample>::const_iterator i = _vs.begin(); i != _vs.end(); i++) { +    max_label = max(max_label, i->label); +  } +  _num_classes = max_label + 1; +  if (_num_classes != _label_bag.Size()) { +    cerr << "warning: _num_class != _label_bag.Size()" << endl; +  } +   +  if (_ref_modelp != NULL) { +    cerr << "setting reference distribution..."; +    for (int i = 0; i < _ref_modelp->num_classes(); i++) { +      _label_bag.Put(_ref_modelp->get_class_label(i)); +    } +    _num_classes = _label_bag.Size(); +    for (vector<Sample>::iterator i = _vs.begin(); i != _vs.end(); i++) { +      set_ref_dist(*i); +    } +    cerr << "done" << endl; +  } +   +  for (int i = 0; i < _nheldout; i++) { +    _heldout.push_back(_vs.back()); +    _vs.pop_back(); +  } + +  sort(_vs.begin(), _vs.end()); + +  int cutoff = 0; +  if (cutoff > 0) cerr << "cutoff threshold = " << cutoff << endl; +  if (_l1reg > 0) cerr << "L1 regularizer = " << _l1reg << endl; +  if (_l2reg > 0) cerr << "L2 regularizer = " << _l2reg << endl; + +  // normalize +  _l1reg /= _vs.size(); +  _l2reg /= _vs.size(); + +  cerr << "preparing for estimation..."; +  make_feature_bag(cutoff); +  //  _vs.clear(); +  cerr << "done" << endl; +  cerr << "number of samples = " << _vs.size() << endl; +  cerr << "number of features = " << _fb.Size() << endl; + +  cerr << "calculating empirical expectation..."; +  _vee.resize(_fb.Size()); +  for (int i = 0; i < _fb.Size(); i++) { +    _vee[i] = 0; +  } +  for (int n = 0; n < (int)_vs.size(); n++) { +    const Sample * i = &_vs[n]; +    for (vector<int>::const_iterator j = i->positive_features.begin(); j != i->positive_features.end(); j++){ +      for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { +	if (_fb.Feature(*k).label() == i->label) _vee[*k] += 1.0; +      } +    } + +    for (vector<pair<int, double> >::const_iterator j = i->rvfeatures.begin(); j != i->rvfeatures.end(); j++) { +      for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { +	if (_fb.Feature(*k).label() == i->label) _vee[*k] += j->second; +      } +    } + +  } +  for (int i = 0; i < _fb.Size(); i++) { +    _vee[i] /= _vs.size(); +  } +  cerr << "done" << endl; +   +  _vl.resize(_fb.Size()); +  for (int i = 0; i < _fb.Size(); i++) _vl[i] = 0.0; + +  if (_optimization_method == SGD) { +    perform_SGD(); +  } else { +    perform_QUASI_NEWTON(); +  } + +  int num_active = 0; +  for (int i = 0; i < _fb.Size(); i++) { +    if (_vl[i] != 0) num_active++; +  } +  cerr << "number of active features = " << num_active << endl; + +  return 0; +} + +void +ME_Model::get_features(list< pair< pair<string, string>, double> > & fl) +{ +  fl.clear(); +  //  for (int i = 0; i < _fb.Size(); i++) { +  //    ME_Feature f = _fb.Feature(i); +  //    fl.push_back( make_pair(make_pair(_label_bag.Str(f.label()), _featurename_bag.Str(f.feature())), _vl[i])); +  //  } +  for (MiniStringBag::map_type::const_iterator i = _featurename_bag.begin(); +       i != _featurename_bag.end(); i++) { +    for (int j = 0; j < _label_bag.Size(); j++) { +      string label = _label_bag.Str(j); +      string history = i->first; +      int id = _fb.Id(ME_Feature(j, i->second)); +      if (id < 0) continue; +      fl.push_back( make_pair(make_pair(label, history), _vl[id]) ); +    } +  } +} + +void +ME_Model::clear() +{ +  _vl.clear(); +  _label_bag.Clear(); +  _featurename_bag.Clear(); +  _fb.Clear(); +  _feature2mef.clear(); +  _vee.clear(); +  _vme.clear(); +  _vs.clear(); +  _heldout.clear(); +} + +bool +ME_Model::load_from_file(const string & filename) +{ +  FILE * fp = fopen(filename.c_str(), "r"); +  if (!fp) { +    cerr << "error: cannot open " << filename << "!" << endl; +    return false; +  } + +  _vl.clear(); +  _label_bag.Clear(); +  _featurename_bag.Clear(); +  _fb.Clear(); +  char buf[1024]; +  while(fgets(buf, 1024, fp)) { +    string line(buf); +    string::size_type t1 = line.find_first_of('\t'); +    string::size_type t2 = line.find_last_of('\t'); +    string classname = line.substr(0, t1); +    string featurename = line.substr(t1 + 1, t2 - (t1 + 1) ); +    float lambda; +    string w = line.substr(t2+1); +    sscanf(w.c_str(), "%f", &lambda); +       +    int label = _label_bag.Put(classname); +    int feature = _featurename_bag.Put(featurename); +    _fb.Put(ME_Feature(label, feature)); +    _vl.push_back(lambda); +  } +     +  _num_classes = _label_bag.Size(); + +  init_feature2mef(); + +  fclose(fp); + +  return true; +} + +void +ME_Model::init_feature2mef() +{ +  _feature2mef.clear(); +  for (int i = 0; i < _featurename_bag.Size(); i++) { +    vector<int> vi; +    for (int k = 0; k < _num_classes; k++) { +      int id = _fb.Id(ME_Feature(k, i)); +      if (id >= 0) vi.push_back(id); +    } +    _feature2mef.push_back(vi); +  } +} + +bool +ME_Model::load_from_array(const ME_Model_Data data[]) +{ +  _vl.clear(); +  for (int i = 0;; i++) { +    if (string(data[i].label) == "///") break; +    int label = _label_bag.Put(data[i].label); +    int feature = _featurename_bag.Put(data[i].feature); +    _fb.Put(ME_Feature(label, feature)); +    _vl.push_back(data[i].weight); +  } +  _num_classes = _label_bag.Size(); + +  init_feature2mef(); +   +  return true; +} + +bool +ME_Model::save_to_file(const string & filename, const double th) const +{ +  FILE * fp = fopen(filename.c_str(), "w"); +  if (!fp) { +    cerr << "error: cannot open " << filename << "!" << endl; +    return false; +  } + +  //  for (int i = 0; i < _fb.Size(); i++) { +  //    if (_vl[i] == 0) continue; // ignore zero-weight features +  //    ME_Feature f = _fb.Feature(i); +  //    fprintf(fp, "%s\t%s\t%f\n", _label_bag.Str(f.label()).c_str(), _featurename_bag.Str(f.feature()).c_str(), _vl[i]); +  //  } +  for (MiniStringBag::map_type::const_iterator i = _featurename_bag.begin(); +       i != _featurename_bag.end(); i++) { +    for (int j = 0; j < _label_bag.Size(); j++) { +      string label = _label_bag.Str(j); +      string history = i->first; +      int id = _fb.Id(ME_Feature(j, i->second)); +      if (id < 0) continue; +      if (_vl[id] == 0) continue; // ignore zero-weight features +      if (fabs(_vl[id]) < th) continue; // cut off low-weight features +      fprintf(fp, "%s\t%s\t%f\n", label.c_str(), history.c_str(), _vl[id]); +    } +  } + +  fclose(fp); + +  return true; +} + +void +ME_Model::set_ref_dist(Sample & s) const +{ +  vector<double> v0 = s.ref_pd; +  vector<double> v(_num_classes); +  for (unsigned int i = 0; i < v.size(); i++) { +    v[i] = 0; +    string label = get_class_label(i); +    int id_ref = _ref_modelp->get_class_id(label); +    if (id_ref != -1) { +      v[i] = v0[id_ref]; +    } +    if (v[i] == 0) v[i] = 0.001; // to avoid -inf logl +  } +  s.ref_pd = v; +} +   +int +ME_Model::classify(const Sample & nbs, vector<double> & membp) const +{ +  //  vector<double> membp(_num_classes); +  assert(_num_classes == (int)membp.size()); +  conditional_probability(nbs, membp); +  int max_label = 0; +  double max = 0.0; +  for (int i = 0; i < (int)membp.size(); i++) { +    //    cout << membp[i] << " "; +    if (membp[i] > max) { max_label = i; max = membp[i]; } +  } +  //  cout << endl; +  return max_label; +} + +vector<double> +ME_Model::classify(ME_Sample & mes) const +{ +  Sample s; +  for (vector<string>::const_iterator j = mes.features.begin(); j != mes.features.end(); j++) { +    int id = _featurename_bag.Id(*j); +    if (id >= 0) +      s.positive_features.push_back(id); +  } +  for (vector<pair<string, double> >::const_iterator j = mes.rvfeatures.begin(); j != mes.rvfeatures.end(); j++) { +    int id = _featurename_bag.Id(j->first); +    if (id >= 0) { +      s.rvfeatures.push_back(pair<int, double>(id, j->second)); +    } +  } +  if (_ref_modelp != NULL) { +    s.ref_pd = _ref_modelp->classify(mes); +    set_ref_dist(s); +  } + +  vector<double> vp(_num_classes); +  int label = classify(s, vp); +  mes.label = get_class_label(label); +  return vp; +} + + +/* + * $Log: maxent.cpp,v $ + * Revision 1.1.1.1  2007/05/15 08:30:35  kyoshida + * stepp tagger, by Okanohara and Tsuruoka + * + * Revision 1.28  2006/08/21 17:30:38  tsuruoka + * use MAX_LABEL_TYPES + * + * Revision 1.27  2006/07/25 13:19:53  tsuruoka + * sort _vs[] + * + * Revision 1.26  2006/07/18 11:13:15  tsuruoka + * modify comments + * + * Revision 1.25  2006/07/18 10:02:15  tsuruoka + * remove sample2feature[] + * speed up conditional_probability() + * + * Revision 1.24  2006/07/18 05:10:51  tsuruoka + * add ref_dist + * + * Revision 1.23  2005/12/24 07:05:32  tsuruoka + * modify conditional_probability() to avoid overflow + * + * Revision 1.22  2005/12/24 07:01:25  tsuruoka + * add cutoff for real-valued features + * + * Revision 1.21  2005/12/23 10:33:02  tsuruoka + * support real-valued features + * + * Revision 1.20  2005/12/23 09:15:29  tsuruoka + * modify _train to reduce memory consumption + * + * Revision 1.19  2005/10/28 13:10:14  tsuruoka + * fix for overflow (thanks to Ming Li) + * + * Revision 1.18  2005/10/28 13:03:07  tsuruoka + * add progress_bar + * + * Revision 1.17  2005/09/12 13:51:16  tsuruoka + * Sample: list -> vector + * + * Revision 1.16  2005/09/12 13:27:10  tsuruoka + * add add_training_sample() + * + * Revision 1.15  2005/04/27 11:22:27  tsuruoka + * bugfix + * ME_Sample: list -> vector + * + * Revision 1.14  2005/04/27 10:00:42  tsuruoka + * remove tmpfb + * + * Revision 1.13  2005/04/26 14:25:53  tsuruoka + * add MiniStringBag, USE_HASH_MAP + * + * Revision 1.12  2005/02/11 10:20:08  tsuruoka + * modify cutoff + * + * Revision 1.11  2004/10/04 05:50:25  tsuruoka + * add Clear() + * + * Revision 1.10  2004/08/26 16:52:26  tsuruoka + * fix load_from_file() + * + * Revision 1.9  2004/08/09 12:27:21  tsuruoka + * change messages + * + * Revision 1.8  2004/08/04 13:55:18  tsuruoka + * modify _sample2feature + * + * Revision 1.7  2004/07/28 13:42:58  tsuruoka + * add AGIS + * + * Revision 1.6  2004/07/28 05:54:13  tsuruoka + * get_class_name() -> get_class_label() + * ME_Feature: bugfix + * + * Revision 1.5  2004/07/27 16:58:47  tsuruoka + * modify the interface of classify() + * + * Revision 1.4  2004/07/26 17:23:46  tsuruoka + * _sample2feature: list -> vector + * + * Revision 1.3  2004/07/26 15:49:23  tsuruoka + * modify ME_Feature + * + * Revision 1.2  2004/07/26 13:52:18  tsuruoka + * modify cutoff + * + * Revision 1.1  2004/07/26 13:10:55  tsuruoka + * add files + * + * Revision 1.20  2004/07/22 08:34:45  tsuruoka + * modify _sample2feature[] + * + * Revision 1.19  2004/07/21 16:33:01  tsuruoka + * remove some comments + * + */ + diff --git a/utils/synutils/maxent-3.0/maxent.h b/utils/synutils/maxent-3.0/maxent.h new file mode 100644 index 00000000..a4391ead --- /dev/null +++ b/utils/synutils/maxent-3.0/maxent.h @@ -0,0 +1,395 @@ +/* + * $Id: maxent.h,v 1.1.1.1 2007/05/15 08:30:35 kyoshida Exp $ + */ + +#ifndef __MAXENT_H_ +#define __MAXENT_H_ + +#include <string> +#include <vector> +#include <list> +#include <map> +#include <algorithm> +#include <iostream> +#include <string> +#include <cassert> +#include "mathvec.h" + +#define USE_HASH_MAP  // if you encounter errors with hash, try commenting out this line. (the program will be a bit slower, though) +#ifdef USE_HASH_MAP +#include <ext/hash_map> +#endif + +// +// data format for each sample for training/testing +// +struct ME_Sample +{ +public: +  ME_Sample() : label("") {}; +  ME_Sample(const std::string & l) : label(l) {}; +  void set_label(const std::string & l) { label = l; } + +  // to add a binary feature +  void add_feature(const std::string & f) { +    features.push_back(f);    +  } + +  // to add a real-valued feature +  void add_feature(const std::string & s, const double d) { +    rvfeatures.push_back(std::pair<std::string, double>(s, d));  +  } + +public: +  std::string label; +  std::vector<std::string> features; +  std::vector<std::pair<std::string, double> > rvfeatures; + +  // obsolete +  void add_feature(const std::pair<std::string, double> & f) {   +    rvfeatures.push_back(f); // real-valued features +  } +}; + + +// +// for those who want to use load_from_array() +// +typedef struct ME_Model_Data +{ +  char * label; +  char * feature; +  double weight; +} ME_Model_Data; + + +class ME_Model +{ +public: + +  void add_training_sample(const ME_Sample & s); +  int train(); +  std::vector<double> classify(ME_Sample & s) const; +  bool load_from_file(const std::string & filename); +  bool save_to_file(const std::string & filename, const double th = 0) const; +  int num_classes() const { return _num_classes; } +  std::string get_class_label(int i) const { return _label_bag.Str(i); } +  int get_class_id(const std::string & s) const { return _label_bag.Id(s); } +  void get_features(std::list< std::pair< std::pair<std::string, std::string>, double> > & fl); +  void set_heldout(const int h, const int n = 0) { _nheldout = h; _early_stopping_n = n; }; +  void use_l1_regularizer(const double v) { _l1reg = v; } +  void use_l2_regularizer(const double v) { _l2reg = v; } +  void use_SGD(int iter = 30, double eta0 = 1, double alpha = 0.85) { +    _optimization_method = SGD; +    SGD_ITER = iter; SGD_ETA0 = eta0; SGD_ALPHA = alpha; +  } +  bool load_from_array(const ME_Model_Data data[]); +  void set_reference_model(const ME_Model & ref_model) { _ref_modelp = &ref_model; }; +  void clear(); + +  ME_Model() { +    _l1reg = _l2reg = 0; +    _nheldout = 0; +    _early_stopping_n = 0; +    _ref_modelp = NULL; +    _optimization_method = LBFGS; +  } + +public: +  // obsolete. just for downward compatibility +  int train(const std::vector<ME_Sample> & train); + +private:   + +  enum OPTIMIZATION_METHOD { LBFGS, OWLQN, SGD } _optimization_method; +  // OWLQN and SGD are available only for L1-regularization + +  int SGD_ITER; +  double SGD_ETA0; +  double SGD_ALPHA; + +  double _l1reg, _l2reg; +   +  struct Sample { +    int label; +    std::vector<int> positive_features; +    std::vector<std::pair<int, double> > rvfeatures; +    std::vector<double> ref_pd; // reference probability distribution +    bool operator<(const Sample & x) const { +      for (unsigned int i = 0; i < positive_features.size(); i++) { +        if (i >= x.positive_features.size()) return false; +        int v0 = positive_features[i]; +        int v1 = x.positive_features[i]; +        if (v0 < v1) return true; +        if (v0 > v1) return false; +      } +      return false; +    } +  }; + +  struct ME_Feature +  { +    enum { MAX_LABEL_TYPES = 255 }; +       +    //    ME_Feature(const int l, const int f) : _body((l << 24) + f) { +    //      assert(l >= 0 && l < 256); +    //      assert(f >= 0 && f <= 0xffffff); +    //    }; +    //    int label() const { return _body >> 24; } +    //    int feature() const { return _body & 0xffffff; } +    ME_Feature(const int l, const int f) : _body((f << 8) + l) { +      assert(l >= 0 && l <= MAX_LABEL_TYPES); +      assert(f >= 0 && f <= 0xffffff); +    }; +    int label() const { return _body & 0xff; } +    int feature() const { return _body >> 8; } +    unsigned int body() const { return _body; } +  private: +    unsigned int _body; +  }; + +  struct ME_FeatureBag +  { +#ifdef USE_HASH_MAP +    typedef __gnu_cxx::hash_map<unsigned int, int> map_type; +#else     +    typedef std::map<unsigned int, int> map_type; +#endif +    map_type mef2id; +    std::vector<ME_Feature> id2mef; +    int Put(const ME_Feature & i) { +      map_type::const_iterator j = mef2id.find(i.body()); +      if (j == mef2id.end()) { +        int id = id2mef.size(); +        id2mef.push_back(i); +        mef2id[i.body()] = id; +        return id; +      } +      return j->second; +    } +    int Id(const ME_Feature & i) const { +      map_type::const_iterator j = mef2id.find(i.body()); +      if (j == mef2id.end()) { +        return -1; +      } +      return j->second; +    } +    ME_Feature Feature(int id) const { +      assert(id >= 0 && id < (int)id2mef.size()); +      return id2mef[id]; +    } +    int Size() const { +      return id2mef.size(); +    } +    void Clear() { +      mef2id.clear(); +      id2mef.clear(); +    } +  }; + +  struct hashfun_str +  { +    size_t operator()(const std::string& s) const { +      assert(sizeof(int) == 4 && sizeof(char) == 1); +      const int* p = reinterpret_cast<const int*>(s.c_str()); +      size_t v = 0; +      int n = s.size() / 4; +      for (int i = 0; i < n; i++, p++) { +        //      v ^= *p; +        v ^= *p << (4 * (i % 2)); // note) 0 <= char < 128 +      } +      int m = s.size() % 4; +      for (int i = 0; i < m; i++) { +        v ^= s[4 * n + i] << (i * 8); +      } +      return v; +    } +  }; + +  struct MiniStringBag +  { +#ifdef USE_HASH_MAP +    typedef __gnu_cxx::hash_map<std::string, int, hashfun_str> map_type; +#else     +    typedef std::map<std::string, int> map_type; +#endif +    int _size; +    map_type str2id; +    MiniStringBag() : _size(0) {} +    int Put(const std::string & i) { +      map_type::const_iterator j = str2id.find(i); +      if (j == str2id.end()) { +        int id = _size; +        _size++; +        str2id[i] = id; +        return id; +      } +      return j->second; +    } +    int Id(const std::string & i) const { +      map_type::const_iterator j = str2id.find(i); +      if (j == str2id.end())  return -1; +      return j->second; +    } +    int Size() const { return _size; } +    void Clear() { str2id.clear(); _size = 0; } +    map_type::const_iterator begin() const { return str2id.begin(); } +    map_type::const_iterator end()   const { return str2id.end(); } +  }; + +  struct StringBag : public MiniStringBag +  { +    std::vector<std::string> id2str; +    int Put(const std::string & i) { +      map_type::const_iterator j = str2id.find(i); +      if (j == str2id.end()) { +        int id = id2str.size(); +        id2str.push_back(i); +        str2id[i] = id; +        return id; +      } +      return j->second; +    } +    std::string Str(const int id) const { +      assert(id >= 0 && id < (int)id2str.size()); +      return id2str[id]; +    } +    int Size() const { return id2str.size(); } +    void Clear() { +      str2id.clear(); +      id2str.clear(); +    } +  }; + +  std::vector<Sample> _vs; // vector of training_samples +  StringBag _label_bag; +  MiniStringBag _featurename_bag; +  std::vector<double> _vl;  // vector of lambda +  ME_FeatureBag _fb; +  int _num_classes; +  std::vector<double> _vee;  // empirical expectation +  std::vector<double> _vme;  // empirical expectation +  std::vector< std::vector< int > > _feature2mef; +  std::vector< Sample > _heldout; +  double _train_error;   // current error rate on the training data +  double _heldout_error; // current error rate on the heldout data +  int _nheldout; +  int _early_stopping_n; +  std::vector<double> _vhlogl; +  const ME_Model * _ref_modelp; + +  double heldout_likelihood(); +  int conditional_probability(const Sample & nbs, std::vector<double> & membp) const; +  int make_feature_bag(const int cutoff); +  int classify(const Sample & nbs, std::vector<double> & membp) const; +  double update_model_expectation(); +  int perform_QUASI_NEWTON(); +  int perform_SGD(); +  int perform_GIS(int C); +  std::vector<double> perform_LBFGS(const std::vector<double> & x0); +  std::vector<double> perform_OWLQN(const std::vector<double> & x0, const double C); +  double backtracking_line_search(const Vec & x0, const Vec & grad0, const double f0, const Vec & dx, Vec & x, Vec & grad1); +  double regularized_func_grad(const double C, const Vec & x, Vec & grad); +  double constrained_line_search(double C, const Vec & x0, const Vec & grad0, const double f0, const Vec & dx, Vec & x, Vec & grad1); + + +  void set_ref_dist(Sample & s) const; +  void init_feature2mef(); + +  double FunctionGradient(const std::vector<double> & x, std::vector<double> & grad); +  static double FunctionGradientWrapper(const std::vector<double> & x, std::vector<double> & grad); +   +}; + + +#endif + + +/* + * $Log: maxent.h,v $ + * Revision 1.1.1.1  2007/05/15 08:30:35  kyoshida + * stepp tagger, by Okanohara and Tsuruoka + * + * Revision 1.24  2006/08/21 17:30:38  tsuruoka + * use MAX_LABEL_TYPES + * + * Revision 1.23  2006/07/25 13:19:53  tsuruoka + * sort _vs[] + * + * Revision 1.22  2006/07/18 11:13:15  tsuruoka + * modify comments + * + * Revision 1.21  2006/07/18 10:02:15  tsuruoka + * remove sample2feature[] + * speed up conditional_probability() + * + * Revision 1.20  2006/07/18 05:10:51  tsuruoka + * add ref_dist + * + * Revision 1.19  2005/12/23 10:33:02  tsuruoka + * support real-valued features + * + * Revision 1.18  2005/12/23 09:15:29  tsuruoka + * modify _train to reduce memory consumption + * + * Revision 1.17  2005/10/28 13:02:34  tsuruoka + * set_heldout(): add default value + * Feature() + * + * Revision 1.16  2005/09/12 13:51:16  tsuruoka + * Sample: list -> vector + * + * Revision 1.15  2005/09/12 13:27:10  tsuruoka + * add add_training_sample() + * + * Revision 1.14  2005/04/27 11:22:27  tsuruoka + * bugfix + * ME_Sample: list -> vector + * + * Revision 1.13  2005/04/27 10:20:19  tsuruoka + * MiniStringBag -> StringBag + * + * Revision 1.12  2005/04/27 10:00:42  tsuruoka + * remove tmpfb + * + * Revision 1.11  2005/04/26 14:25:53  tsuruoka + * add MiniStringBag, USE_HASH_MAP + * + * Revision 1.10  2004/10/04 05:50:25  tsuruoka + * add Clear() + * + * Revision 1.9  2004/08/09 12:27:21  tsuruoka + * change messages + * + * Revision 1.8  2004/08/04 13:55:19  tsuruoka + * modify _sample2feature + * + * Revision 1.7  2004/07/29 05:51:13  tsuruoka + * remove modeldata.h + * + * Revision 1.6  2004/07/28 13:42:58  tsuruoka + * add AGIS + * + * Revision 1.5  2004/07/28 05:54:14  tsuruoka + * get_class_name() -> get_class_label() + * ME_Feature: bugfix + * + * Revision 1.4  2004/07/27 16:58:47  tsuruoka + * modify the interface of classify() + * + * Revision 1.3  2004/07/26 17:23:46  tsuruoka + * _sample2feature: list -> vector + * + * Revision 1.2  2004/07/26 15:49:23  tsuruoka + * modify ME_Feature + * + * Revision 1.1  2004/07/26 13:10:55  tsuruoka + * add files + * + * Revision 1.18  2004/07/22 08:34:45  tsuruoka + * modify _sample2feature[] + * + * Revision 1.17  2004/07/21 16:33:01  tsuruoka + * remove some comments + * + */ diff --git a/utils/synutils/maxent-3.0/owlqn.cpp b/utils/synutils/maxent-3.0/owlqn.cpp new file mode 100644 index 00000000..7b2cea7d --- /dev/null +++ b/utils/synutils/maxent-3.0/owlqn.cpp @@ -0,0 +1,137 @@ +#include <vector> +#include <iostream> +#include <cmath> +#include <stdio.h> +#include "mathvec.h" +#include "lbfgs.h" +#include "maxent.h" + +using namespace std; + +const static int    M = LBFGS_M; +const static double LINE_SEARCH_ALPHA = 0.1; +const static double LINE_SEARCH_BETA  = 0.5; + +// stopping criteria +int    OWLQN_MAX_ITER      = 300; +const static double MIN_GRAD_NORM = 0.0001; + + +Vec approximate_Hg(const int iter, const Vec & grad, +		   const Vec s[],  const Vec y[], const double z[]); + + +inline int sign(double x)  +{ +  if (x > 0) return 1; +  if (x < 0) return -1; +  return 0; +}; + +static Vec +pseudo_gradient(const Vec & x, const Vec & grad0, const double C) +{ +  Vec grad = grad0; +  for (size_t i = 0; i < x.Size(); i++) { +    if (x[i] != 0) { +      grad[i] += C * sign(x[i]); +      continue; +    } +    const double gm = grad0[i] - C; +    if (gm > 0) { +      grad[i] = gm; +      continue; +    } +    const double gp = grad0[i] + C; +    if (gp < 0) { +      grad[i] = gp; +      continue; +    } +    grad[i] = 0; +  } + +  return grad; +} + +double  +ME_Model::regularized_func_grad(const double C, const Vec & x, Vec & grad) +{ +  double f = FunctionGradient(x.STLVec(), grad.STLVec()); +  for (size_t i = 0; i < x.Size(); i++) { +    f += C * fabs(x[i]); +  } + +  return f; +} + +double  +ME_Model::constrained_line_search(double C, +			const Vec & x0, const Vec & grad0, const double f0,  +			const Vec & dx, Vec & x, Vec & grad1) +{ +  // compute the orthant to explore +  Vec orthant = x0; +  for (size_t i = 0; i < orthant.Size(); i++) { +    if (orthant[i] == 0) orthant[i] = -grad0[i]; +  } + +  double t = 1.0 / LINE_SEARCH_BETA; + +  double f; +  do { +    t *= LINE_SEARCH_BETA; +    x = x0 + t * dx; +    x.Project(orthant); +    //    for (size_t i = 0; i < x.Size(); i++) { +    //      if (x0[i] != 0 && sign(x[i]) != sign(x0[i])) x[i] = 0; +    //    } + +    f = regularized_func_grad(C, x, grad1); +    //        cout << "*"; +  } while (f > f0 + LINE_SEARCH_ALPHA * dot_product(x - x0, grad0)); + +  return f; +} + +vector<double>  +ME_Model::perform_OWLQN(const vector<double> & x0, const double C) +{ +  const size_t dim = x0.size(); +  Vec x = x0; + +  Vec grad(dim), dx(dim); +  double f = regularized_func_grad(C, x, grad); + +  Vec s[M], y[M]; +  double z[M];  // rho + +  for (int iter = 0; iter < OWLQN_MAX_ITER; iter++) { +    Vec pg = pseudo_gradient(x, grad, C); + +    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter+1, -f, _train_error); +    if (_nheldout > 0) { +      const double heldout_logl = heldout_likelihood(); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, _heldout_error); +    } +    fprintf(stderr, "\n"); + +    if (sqrt(dot_product(pg, pg)) < MIN_GRAD_NORM) break; + +    dx = -1 * approximate_Hg(iter, pg, s, y, z); +    if (dot_product(dx, pg) >= 0) +      dx.Project(-1 * pg); + +    Vec x1(dim), grad1(dim); +    f = constrained_line_search(C, x, pg, f, dx, x1, grad1); + +    s[iter % M] = x1 - x; +    y[iter % M] = grad1 - grad; +    z[iter % M] = 1.0 / dot_product(y[iter % M], s[iter % M]); + +    x = x1; +    grad = grad1; +  } + +  return x.STLVec(); +} + diff --git a/utils/synutils/maxent-3.0/sgd.cpp b/utils/synutils/maxent-3.0/sgd.cpp new file mode 100644 index 00000000..6d28c23f --- /dev/null +++ b/utils/synutils/maxent-3.0/sgd.cpp @@ -0,0 +1,180 @@ +#include "maxent.h" +#include <cmath> +#include <stdio.h> + +using namespace std; + +//const double SGD_ETA0 = 1; +//const double SGD_ITER = 30; +//const double SGD_ALPHA = 0.85; + + +//#define FOLOS_NAIVE +//#define FOLOS_LAZY +#define SGD_CP + +inline void +apply_l1_penalty(const int i, const double u, +		 vector<double> & _vl, vector<double> & q) +{ +  double & w = _vl[i]; +  const double z = w; +  double & qi = q[i]; +  if (w > 0) {  +    w = max(0.0, w - (u + qi)); +  } else if (w < 0) { +    w = min(0.0, w + (u - qi)); +  } +  qi += w - z; +} + +static double +l1norm(const vector<double>& v) +{ +  double sum = 0; +  for (size_t i = 0; i < v.size(); i++) sum += abs(v[i]); +  return sum; +} + +inline void +update_folos_lazy(const int iter_sample, +		  const int k, vector<double> & _vl, const vector<double> & sum_eta, +		  vector<int> & last_updated) +{ +  const double penalty = sum_eta[iter_sample] - sum_eta[last_updated[k]]; +  double & x = _vl[k]; +  if (x > 0) x = max(0.0, x - penalty); +  else       x = min(0.0, x + penalty); +  last_updated[k] = iter_sample; +} + +int +ME_Model::perform_SGD() +{ +  if (_l2reg > 0) { +    cerr << "error: L2 regularization is currently not supported in SGD mode." << endl; +    exit(1); +  } + +  cerr << "performing SGD" << endl; + +  const double l1param = _l1reg; + +  const int d = _fb.Size(); + +  vector<int> ri(_vs.size()); +  for (size_t i = 0; i < ri.size(); i++) ri[i] = i; + +  vector<double> grad(d); +  int iter_sample = 0; +  const double eta0 = SGD_ETA0; + +  //  cerr << "l1param = " << l1param << endl; +  cerr << "eta0 = " << eta0 << " alpha = " << SGD_ALPHA << endl; + +  double u = 0; +  vector<double> q(d, 0); +  vector<int> last_updated(d, 0); +  vector<double> sum_eta; +  sum_eta.push_back(0); + +  for (int iter = 0; iter < SGD_ITER; iter++) { + +    random_shuffle(ri.begin(), ri.end()); + +    double logl = 0; +    int ncorrect = 0, ntotal = 0; +    for (size_t i = 0; i < _vs.size(); i++, ntotal++, iter_sample++) { +      const Sample & s = _vs[ri[i]]; + +#ifdef FOLOS_LAZY +      for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ +	for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { +	  update_folos_lazy(iter_sample, *k, _vl, sum_eta, last_updated); +	} +      } +#endif + +      vector<double> membp(_num_classes); +      const int max_label = conditional_probability(s, membp); + +      const double eta = eta0 * pow(SGD_ALPHA, (double)iter_sample / _vs.size()); // exponential decay +      //      const double eta = eta0 / (1.0 + (double)iter_sample / _vs.size()); + +      //      if (iter_sample % _vs.size() == 0) cerr << "eta = " << eta << endl; +      u += eta * l1param; + +      sum_eta.push_back(sum_eta.back() + eta * l1param); +     +      logl += log(membp[s.label]); +      if (max_label == s.label) ncorrect++; + +      // binary features +      for (vector<int>::const_iterator j = s.positive_features.begin(); j != s.positive_features.end(); j++){ +	for (vector<int>::const_iterator k = _feature2mef[*j].begin(); k != _feature2mef[*j].end(); k++) { +	  const double me = membp[_fb.Feature(*k).label()]; +	  const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0); +	  const double grad = (me - ee); +	  _vl[*k] -= eta * grad; +#ifdef SGD_CP +	  apply_l1_penalty(*k, u, _vl, q); +#endif +	} +      } +      // real-valued features +      for (vector<pair<int, double> >::const_iterator j = s.rvfeatures.begin(); j != s.rvfeatures.end(); j++) { +	for (vector<int>::const_iterator k = _feature2mef[j->first].begin(); k != _feature2mef[j->first].end(); k++) { +	  const double me = membp[_fb.Feature(*k).label()]; +	  const double ee = (_fb.Feature(*k).label() == s.label ? 1.0 : 0); +	  const double grad = (me - ee) * j->second; +	  _vl[*k] -= eta * grad; +#ifdef SGD_CP +	  apply_l1_penalty(*k, u, _vl, q); +#endif +	} +      } + +#ifdef FOLOS_NAIVE +      for (size_t j = 0; j < d; j++) { +	double & x = _vl[j]; +	if (x > 0) x = max(0.0, x - eta * l1param); +	else       x = min(0.0, x + eta * l1param); +      } +#endif + +    } +    logl /= _vs.size(); +    //    fprintf(stderr, "%4d logl = %8.3f acc = %6.4f ", iter, logl, (double)ncorrect / ntotal); + +#ifdef FOLOS_LAZY +    if (l1param > 0) { +      for (size_t j = 0; j < d; j++) +	update_folos_lazy(iter_sample, j, _vl, sum_eta, last_updated); +    } +#endif + +    double f = logl; +    if (l1param > 0) { +      const double l1 = l1norm(_vl); // this is not accurate when lazy update is used +      //      cerr << "f0 = " <<  update_model_expectation() - l1param * l1 << " "; +      f -= l1param * l1; +      int nonzero = 0; +      for (int j = 0; j < d; j++) if (_vl[j] != 0) nonzero++; +      //      cerr << " f = " << f << " l1 = " << l1 << " nonzero_features = " << nonzero << endl; +    } +    //    fprintf(stderr, "%4d  obj = %7.3f acc = %6.4f", iter+1, f, (double)ncorrect/ntotal); +    //    fprintf(stderr, "%4d  obj = %f", iter+1, f); +    fprintf(stderr, "%3d  obj(err) = %f (%6.4f)", iter+1, f, 1 - (double)ncorrect/ntotal); + +    if (_nheldout > 0) { +      double heldout_logl = heldout_likelihood(); +      //      fprintf(stderr, "  heldout_logl = %f  acc = %6.4f\n", heldout_logl, 1 - _heldout_error); +      fprintf(stderr, "  heldout_logl(err) = %f (%6.4f)", heldout_logl, _heldout_error); +    } +    fprintf(stderr, "\n"); + +     +  } + +  return 0; +} diff --git a/utils/synutils/srl_sentence.h b/utils/synutils/srl_sentence.h new file mode 100644 index 00000000..6f00267a --- /dev/null +++ b/utils/synutils/srl_sentence.h @@ -0,0 +1,215 @@ +/* + * srl_sentence.h + * + *  Created on: May 26, 2013 + *      Author: junhuili + */ + +#ifndef SRL_SENTENCE_H_ +#define SRL_SENTENCE_H_ + +#include "tree.h" + +#include <vector> + +using namespace std; + +struct SArgument{ +	SArgument(const char* pszRole, int iBegin, int iEnd, float fProb) { +		m_pszRole = new char[strlen(pszRole) + 1]; +		strcpy(m_pszRole, pszRole); +		m_iBegin = iBegin; +		m_iEnd = iEnd; +		m_fProb = fProb; +		m_pTreeItem = NULL; +	} +	~SArgument() { +		delete [] m_pszRole; +	} + +	void fnSetTreeItem(STreeItem *pTreeItem) { +		m_pTreeItem = pTreeItem; +		if (m_pTreeItem != NULL && m_pTreeItem->m_iBegin != -1) { +			assert(m_pTreeItem->m_iBegin == m_iBegin); +			assert(m_pTreeItem->m_iEnd == m_iEnd); +		} +	} + +	char *m_pszRole; //argument rule, e.g., ARG0, ARGM-TMP +	int m_iBegin; +	int m_iEnd;  //the span of the argument, [m_iBegin, m_iEnd] +	float m_fProb; //the probability of this role, +	STreeItem *m_pTreeItem; +}; + +struct SPredicate{ +	SPredicate(const char* pszLemma, int iPosition) { +		if (pszLemma != NULL) { +			m_pszLemma = new char[strlen(pszLemma) + 1]; +			strcpy(m_pszLemma, pszLemma); +		} else +			m_pszLemma = NULL; +		m_iPosition = iPosition; +	} +	~SPredicate() { +		if (m_pszLemma != NULL) +			delete [] m_pszLemma; +		for (size_t i = 0; i < m_vecArgt.size(); i++) +			delete m_vecArgt[i]; +	} +	int fnAppend(const char* pszRole, int iBegin, int iEnd) { +		SArgument *pArgt = new SArgument(pszRole, iBegin, iEnd, 1.0); +		return fnAppend(pArgt); +	} +	int fnAppend(SArgument *pArgt) { +		m_vecArgt.push_back(pArgt); +		int iPosition = m_vecArgt.size() - 1; +		return iPosition; +	} + +	char *m_pszLemma; //lemma of the predicate, for Chinese, it's always as same as the predicate itself +	int m_iPosition; //the position in sentence +	vector<SArgument*> m_vecArgt; //arguments associated to the predicate +}; + +struct SSrlSentence{ +	SSrlSentence() { +		m_pTree = NULL; +	} +	~SSrlSentence() { +		if (m_pTree != NULL) +			delete m_pTree; + +		for (size_t i = 0; i < m_vecPred.size(); i++) +			delete m_vecPred[i]; +	} +	int fnAppend(const char* pszLemma, int iPosition) { +		SPredicate *pPred = new SPredicate(pszLemma, iPosition); +		return fnAppend(pPred); +	} +	int fnAppend(SPredicate* pPred) { +		m_vecPred.push_back(pPred); +		int iPosition = m_vecPred.size() - 1; +		return iPosition; +	} +	int GetPredicateNum() { +		return m_vecPred.size(); +	} + +	SParsedTree *m_pTree; +	vector<SPredicate*> m_vecPred; +}; + +struct SSrlSentenceReader { +	SSrlSentenceReader(const char* pszSrlFname) { +		m_fpIn = fopen(pszSrlFname, "r"); +		assert(m_fpIn != NULL); +	} +	~SSrlSentenceReader() { +		if (m_fpIn != NULL) +			fclose(m_fpIn); +	} + +	inline void fnReplaceAll(std::string& str, const std::string& from, const std::string& to) { +	    size_t start_pos = 0; +	    while((start_pos = str.find(from, start_pos)) != std::string::npos) { +	             str.replace(start_pos, from.length(), to); +	             start_pos += to.length(); // In case 'to' contains 'from', like replacing 'x' with 'yx' +	    } +	} + +	//TODO: here only considers flat predicate-argument structure +	//      i.e., no overlap among them +	SSrlSentence* fnReadNextSrlSentence() { +		vector<vector<string> > vecContent; +		if (fnReadNextContent(vecContent) == false) +			return NULL; + +		SSrlSentence *pSrlSentence = new SSrlSentence(); +		int iSize = vecContent.size(); +		//put together syntactic text +		std::ostringstream ostr; +		for (int i = 0; i < iSize; i++) { +			string strSynSeg = vecContent[i][5]; //the 5th column is the syntactic segment +			size_t iPosition = strSynSeg.find_first_of('*'); +			assert(iPosition != string::npos); +			ostringstream ostrTmp; +			ostrTmp << "(" << vecContent[i][2] << " " << vecContent[i][0] << ")"; //the 2th column is POS-tag, and the 0th column is word +			strSynSeg.replace(iPosition, 1, ostrTmp.str()); +			fnReplaceAll(strSynSeg, "(", " ("); +			ostr << strSynSeg; +		} +		string strSyn = ostr.str(); +		pSrlSentence->m_pTree = SParsedTree::fnConvertFromString(strSyn.c_str()); +		pSrlSentence->m_pTree->fnSetHeadWord(); +		pSrlSentence->m_pTree->fnSetSpanInfo(); + +		//read predicate-argument structure +		int iNumPred = vecContent[0].size() - 8; +		for (int i = 0; i < iNumPred; i++) { +			vector<string> vecRole; +			vector<int> vecBegin; +			vector<int> vecEnd; +			int iPred = -1; +			for (int j = 0; j < iSize; j++) { +				const char* p = vecContent[j][i + 8].c_str(); +				const char* q; +				if (p[0] == '(') { +					//starting position of an argument(or predicate) +					vecBegin.push_back(j); +					q = strchr(p, '*'); +					assert(q != NULL); +					vecRole.push_back(vecContent[j][i + 8].substr(1, q - p - 1)); +					if (vecRole.back().compare("V") == 0) { +						assert(iPred == -1); +						iPred = vecRole.size() - 1; +					} +				} +				if (p[strlen(p) - 1] == ')') { +					//end position of an argument(or predicate) +					vecEnd.push_back(j); +					assert(vecBegin.size() == vecEnd.size()); +				} +			} +			assert(iPred != -1); +			SPredicate *pPred =  new SPredicate(pSrlSentence->m_pTree->m_vecTerminals[vecBegin[iPred]]->m_pszTerm, vecBegin[iPred]); +			pSrlSentence->fnAppend(pPred); +			for (size_t j = 0; j < vecBegin.size(); j++) { +				if (j == iPred) +					continue; +				pPred->fnAppend(vecRole[j].c_str(), vecBegin[j], vecEnd[j]); +				pPred->m_vecArgt.back()->fnSetTreeItem(pSrlSentence->m_pTree->fnFindNodeForSpan(vecBegin[j], vecEnd[j], false)); +			} +		} +		return pSrlSentence; +	} +private: +	bool fnReadNextContent(vector<vector<string> >& vecContent) { +		vecContent.clear(); +		if (feof(m_fpIn) == true) +			return false; +		char *pszLine; +		pszLine = new char[100001]; +		pszLine[0] = '\0'; +		int iLen; +		while (!feof(m_fpIn)) { +			fgets(pszLine, 10001, m_fpIn); +			iLen = strlen(pszLine); +			while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { +				pszLine[ iLen - 1 ] = '\0'; +				iLen--; +			} +			if (iLen == 0) +				break; //end of this sentence + +			vector<string> terms = SplitOnWhitespace(string(pszLine)); +			assert(terms.size() > 7); +			vecContent.push_back(terms); +		} +		delete [] pszLine; +		return true; +	} +private: +	FILE *m_fpIn; +}; +#endif /* SRL_SENTENCE_H_ */ diff --git a/utils/synutils/tree.h b/utils/synutils/tree.h new file mode 100644 index 00000000..658a470d --- /dev/null +++ b/utils/synutils/tree.h @@ -0,0 +1,731 @@ +/* + * tree.h + * + *  Created on: May 23, 2013 + *      Author: lijunhui + */ + +#ifndef TREE_H_ +#define TREE_H_ + +#include <string> +#include <assert.h> +#include <stdio.h> + +using namespace std; + +struct STreeItem{ +	STreeItem(const char* pszTerm) { +		m_pszTerm = new char[ strlen( pszTerm ) + 1 ]; +		strcpy( m_pszTerm, pszTerm ); + +		m_ptParent = NULL; +		m_iBegin = -1; +		m_iEnd = -1; +		m_iHeadChild = -1; +		m_iHeadWord = -1; +		m_iBrotherIndex = -1; +	} +	~STreeItem( ) { +		delete [] m_pszTerm; +		for (size_t i = 0; i < m_vecChildren.size(); i++) +			delete m_vecChildren[i]; +	} +	int fnAppend(STreeItem *ptChild) { +		m_vecChildren.push_back(ptChild); +		ptChild->m_iBrotherIndex = m_vecChildren.size() - 1; +		ptChild->m_ptParent = this; +		return m_vecChildren.size() - 1; +	} +	int fnGetChildrenNum() { +		return m_vecChildren.size(); +	} + +	bool fnIsPreTerminal( void ) { +		int I; +		if ( this == NULL || m_vecChildren.size() == 0 ) +			return false; + +		for ( I = 0; I < m_vecChildren.size(); I++ ) +			if (m_vecChildren[I]->m_vecChildren.size() > 0 ) +				return false; + +		return true; +	} +public: +	char *m_pszTerm; + +	vector<STreeItem*> m_vecChildren;//children items +	STreeItem *m_ptParent;//the parent item + +	int m_iBegin; +	int m_iEnd; //the node span words[m_iBegin, m_iEnd] +	int m_iHeadChild; //the index of its head child +	int m_iHeadWord; //the index of its head word +	int m_iBrotherIndex;//the index in his brothers +}; + + +struct SGetHeadWord{ +	typedef vector<string> CVectorStr; +	SGetHeadWord() { + +	} +	~SGetHeadWord() { + +	} +	int fnGetHeadWord( char *pszCFGLeft, CVectorStr vectRight ) { +		//0 indicating from right to left while 1 indicating from left to right +		char szaHeadLists[ 201 ] = "0"; + +		/*  //head rules for Egnlish +		if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) +			strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" ); +		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) +			strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1" ); +		else if( strcmp( pszCFGLeft, "CONJP" ) == 0 ) +			strcpy( szaHeadLists, "1CC 1RB 1IN 1" ); +		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) +			strcpy( szaHeadLists, "1" ); +		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) +			strcpy( szaHeadLists, "0" ); +		else if( strcmp( pszCFGLeft, "LST" ) == 0 ) +			strcpy( szaHeadLists, "1LS 1: 1CLN 1" ); +		else if( strcmp( pszCFGLeft, "NAC" ) == 0 ) +			strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" ); +		else if( strcmp( pszCFGLeft, "PP" ) == 0 ) +			strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" ); +		else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) +			strcpy( szaHeadLists, "1" ); +		else if( strcmp( pszCFGLeft, "PRT" ) == 0 ) +			strcpy( szaHeadLists, "1RP 1" ); +		else if( strcmp( pszCFGLeft, "QP" ) == 0 ) +			strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0" ); +		else if( strcmp( pszCFGLeft, "RRC" ) == 0 ) +			strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" ); +		else if( strcmp( pszCFGLeft, "S" ) == 0 ) +			strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" ); +		else if( strcmp( pszCFGLeft, "SBAR" ) == 0 ) +			strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR 0FRAG 0" ); +		else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 ) +			strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" ); +		else if( strcmp( pszCFGLeft, "SINV" ) == 0 ) +			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0" ); +		else if( strcmp( pszCFGLeft, "SQ" ) == 0 ) +			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" ); +		else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) +			strcpy( szaHeadLists, "1" ); +		else if( strcmp( pszCFGLeft, "VP" ) == 0 ) +			strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS 0NP 0" ); +		else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 ) +			strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" ); +		else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 ) +			strcpy( szaHeadLists, "1CC 1WRB 1" ); +		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) +			strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" ); +		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) +			strcpy( szaHeadLists, "1IN 1TO FW 1" ); +		else if( strcmp( pszCFGLeft, "NP" ) == 0 ) +			strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB QP 0" ); +		*/ + +		if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) +			strcpy( szaHeadLists, "0ADJP JJ 0AD NN CS 0" ); +		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) +			strcpy( szaHeadLists, "0ADVP AD 0" ); +		else if( strcmp( pszCFGLeft, "CLP" ) == 0 ) +			strcpy( szaHeadLists, "0CLP M 0" ); +		else if( strcmp( pszCFGLeft, "CP" ) == 0 ) +			strcpy( szaHeadLists, "0DEC SP 1ADVP CS 0CP IP 0" ); +		else if( strcmp( pszCFGLeft, "DNP" ) == 0 ) +			strcpy( szaHeadLists, "0DNP DEG 0DEC 0" ); +		else if( strcmp( pszCFGLeft, "DVP" ) == 0 ) +			strcpy( szaHeadLists, "0DVP DEV 0" ); +		else if( strcmp( pszCFGLeft, "DP" ) == 0 ) +			strcpy( szaHeadLists, "1DP DT 1" ); +		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) +			strcpy( szaHeadLists, "0VV NR NN 0" ); +		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) +			strcpy( szaHeadLists, "0INTJ IJ 0" ); +		else if( strcmp( pszCFGLeft, "LST" ) == 0 ) +			strcpy( szaHeadLists, "1LST CD OD 1" ); +		else if( strcmp( pszCFGLeft, "IP" ) == 0 ) +			strcpy( szaHeadLists, "0IP VP 0VV 0" ); +			//strcpy( szaHeadLists, "0VP 0VV 1IP 0" ); +		else if( strcmp( pszCFGLeft, "LCP" ) == 0 ) +			strcpy( szaHeadLists, "0LCP LC 0" ); +		else if( strcmp( pszCFGLeft, "NP" ) == 0 ) +			strcpy( szaHeadLists, "0NP NN NT NR QP 0" ); +		else if( strcmp( pszCFGLeft, "PP" ) == 0 ) +			strcpy( szaHeadLists, "1PP P 1" ); +		else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) +			strcpy( szaHeadLists, "0 NP IP VP NT NR NN 0" ); +		else if( strcmp( pszCFGLeft, "QP" ) == 0 ) +			strcpy( szaHeadLists, "0QP CLP CD OD 0" ); +		else if( strcmp( pszCFGLeft, "VP" ) == 0 ) +			strcpy( szaHeadLists, "1VP VA VC VE VV BA LB VCD VSB VRD VNV VCP 1" ); +		else if( strcmp( pszCFGLeft, "VCD" ) == 0 ) +			strcpy( szaHeadLists, "0VCD VV VA VC VE 0" ); +		if( strcmp( pszCFGLeft, "VRD" ) == 0 ) +			strcpy( szaHeadLists, "0VRD VV VA VC VE 0" ); +		else if( strcmp( pszCFGLeft, "VSB" ) == 0 ) +			strcpy( szaHeadLists, "0VSB VV VA VC VE 0" ); +		else if( strcmp( pszCFGLeft, "VCP" ) == 0 ) +			strcpy( szaHeadLists, "0VCP VV VA VC VE 0" ); +		else if( strcmp( pszCFGLeft, "VNV" ) == 0 ) +			strcpy( szaHeadLists, "0VNV VV VA VC VE 0" ); +		else if( strcmp( pszCFGLeft, "VPT" ) == 0 ) +			strcpy( szaHeadLists, "0VNV VV VA VC VE 0" ); +		else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) +			strcpy( szaHeadLists, "0" ); +		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) +			strcpy( szaHeadLists, "0WHNP NP NN NT NR QP 0" ); +		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) +			strcpy( szaHeadLists, "1WHPP PP P 1" ); + +		/*  //head rules for GENIA corpus +		if( strcmp( pszCFGLeft, "ADJP" ) == 0 ) +			strcpy( szaHeadLists, "0NNS 0QP 0NN 0$ 0ADVP 0JJ 0VBN 0VBG 0ADJP 0JJR 0NP 0JJS 0DT 0FW 0RBR 0RBS 0SBAR 0RB 0" ); +		else if( strcmp( pszCFGLeft, "ADVP" ) == 0 ) +			strcpy( szaHeadLists, "1RB 1RBR 1RBS 1FW 1ADVP 1TO 1CD 1JJR 1JJ 1IN 1NP 1JJS 1NN 1" ); +		else if( strcmp( pszCFGLeft, "CONJP" ) == 0 ) +			strcpy( szaHeadLists, "1CC 1RB 1IN 1" ); +		else if( strcmp( pszCFGLeft, "FRAG" ) == 0 ) +			strcpy( szaHeadLists, "1" ); +		else if( strcmp( pszCFGLeft, "INTJ" ) == 0 ) +			strcpy( szaHeadLists, "0" ); +		else if( strcmp( pszCFGLeft, "LST" ) == 0 ) +			strcpy( szaHeadLists, "1LS 1: 1CLN 1" ); +		else if( strcmp( pszCFGLeft, "NAC" ) == 0 ) +			strcpy( szaHeadLists, "0NN 0NNS 0NNP 0NNPS 0NP 0NAC 0EX 0$ 0CD 0QP 0PRP 0VBG 0JJ 0JJS 0JJR 0ADJP 0FW 0" ); +		else if( strcmp( pszCFGLeft, "PP" ) == 0 ) +			strcpy( szaHeadLists, "1IN 1TO 1VBG 1VBN 1RP 1FW 1" ); +		else if( strcmp( pszCFGLeft, "PRN" ) == 0 ) +			strcpy( szaHeadLists, "1" ); +		else if( strcmp( pszCFGLeft, "PRT" ) == 0 ) +			strcpy( szaHeadLists, "1RP 1" ); +		else if( strcmp( pszCFGLeft, "QP" ) == 0 ) +			strcpy( szaHeadLists, "0$ 0IN 0NNS 0NN 0JJ 0RB 0DT 0CD 0NCD 0QP 0JJR 0JJS 0" ); +		else if( strcmp( pszCFGLeft, "RRC" ) == 0 ) +			strcpy( szaHeadLists, "1VP 1NP 1ADVP 1ADJP 1PP 1" ); +		else if( strcmp( pszCFGLeft, "S" ) == 0 ) +			strcpy( szaHeadLists, "0TO 0IN 0VP 0S 0SBAR 0ADJP 0UCP 0NP 0" ); +		else if( strcmp( pszCFGLeft, "SBAR" ) == 0 ) +			strcpy( szaHeadLists, "0WHNP 0WHPP 0WHADVP 0WHADJP 0IN 0DT 0S 0SQ 0SINV 0SBAR 0FRAG 0" ); +		else if( strcmp( pszCFGLeft, "SBARQ" ) == 0 ) +			strcpy( szaHeadLists, "0SQ 0S 0SINV 0SBARQ 0FRAG 0" ); +		else if( strcmp( pszCFGLeft, "SINV" ) == 0 ) +			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0S 0SINV 0ADJP 0NP 0" ); +		else if( strcmp( pszCFGLeft, "SQ" ) == 0 ) +			strcpy( szaHeadLists, "0VBZ 0VBD 0VBP 0VB 0MD 0VP 0SQ 0" ); +		else if( strcmp( pszCFGLeft, "UCP" ) == 0 ) +			strcpy( szaHeadLists, "1" ); +		else if( strcmp( pszCFGLeft, "VP" ) == 0 ) +			strcpy( szaHeadLists, "0TO 0VBD 0VBN 0MD 0VBZ 0VB 0VBG 0VBP 0VP 0ADJP 0NN 0NNS 0NP 0" ); +		else if( strcmp( pszCFGLeft, "WHADJP" ) == 0 ) +			strcpy( szaHeadLists, "0CC 0WRB 0JJ 0ADJP 0" ); +		else if( strcmp( pszCFGLeft, "WHADVP" ) == 0 ) +			strcpy( szaHeadLists, "1CC 1WRB 1" ); +		else if( strcmp( pszCFGLeft, "WHNP" ) == 0 ) +			strcpy( szaHeadLists, "0WDT 0WP 0WP$ 0WHADJP 0WHPP 0WHNP 0" ); +		else if( strcmp( pszCFGLeft, "WHPP" ) == 0 ) +			strcpy( szaHeadLists, "1IN 1TO FW 1" ); +		else if( strcmp( pszCFGLeft, "NP" ) == 0 ) +			strcpy( szaHeadLists, "0NN NNP NNS NNPS NX POS JJR 0NP 0$ ADJP PRN 0CD 0JJ JJS RB QP 0" ); +		*/ + +		return fnMyOwnHeadWordRule( szaHeadLists, vectRight ); +	} + +private: +	int fnMyOwnHeadWordRule( char *pszaHeadLists, CVectorStr vectRight ) { +		char szHeadList[ 201 ], *p; +		char szTerm[ 101 ]; +		int J; + +		p = pszaHeadLists; + +		int iCountRight; + +		iCountRight = vectRight.size( ); + +		szHeadList[ 0 ] = '\0'; +		while( 1 ){ +			szTerm[ 0 ] = '\0'; +			sscanf( p, "%s", szTerm ); +			if( strlen( szHeadList ) == 0 ){ +				if( strcmp( szTerm, "0" ) == 0 ){ +					return iCountRight - 1; +				} +				if( strcmp( szTerm, "1" ) == 0 ){ +					return 0; +				} + +				sprintf( szHeadList, "%c %s ", szTerm[ 0 ], szTerm + 1 ); +				p = strstr( p, szTerm ); +				p += strlen( szTerm ); +			} +			else{ +				if(   ( szTerm[ 0 ] == '0' ) +					||( szTerm[ 0 ] == '1' ) ){ +					if( szHeadList[ 0 ] == '0' ){ +						for( J = iCountRight - 1; J >= 0; J -- ){ +							sprintf( szTerm, " %s ", vectRight.at( J ).c_str( ) ); +							if( strstr( szHeadList, szTerm ) != NULL ) +								return J; +						} +					} +					else{ +						for( J = 0; J < iCountRight; J ++ ){ +							sprintf( szTerm, " %s ", vectRight.at( J ).c_str( ) ); +							if(	strstr( szHeadList, szTerm ) != NULL ) +								return J; +						} +					} + +					szHeadList[ 0 ] = '\0'; +				} +				else{ +					strcat( szHeadList, szTerm ); +					strcat( szHeadList, " " ); + +					p = strstr( p, szTerm ); +					p += strlen( szTerm ); +				} +			} +		} + +		return 0; +	} + +}; + +struct SParsedTree{ +	SParsedTree( ) { +		m_ptRoot = NULL; +	} +	~SParsedTree( ) { +		if (m_ptRoot != NULL) +			delete m_ptRoot; +	} +	static SParsedTree* fnConvertFromString(const char* pszStr) { +		if (strcmp(pszStr, "(())") == 0) +			return NULL; +		SParsedTree* pTree = new SParsedTree(); + +		vector<string> vecSyn; +		fnReadSyntactic(pszStr, vecSyn); + +		int iLeft = 1, iRight = 1; //# left/right parenthesis + +		STreeItem *pcurrent; + +		pTree->m_ptRoot = new STreeItem(vecSyn[1].c_str()); + +		pcurrent = pTree->m_ptRoot; + +		for (size_t i = 2; i < vecSyn.size() - 1; i++) { +			if ( strcmp(vecSyn[i].c_str(), "(") == 0 ) +					iLeft++; +			else if (strcmp(vecSyn[i].c_str(), ")") == 0 ) { +				iRight++; +				if (pcurrent == NULL) { +					//error +					fprintf(stderr, "ERROR in ConvertFromString\n"); +					fprintf(stderr, "%s\n", pszStr); +					return NULL; +				} +				pcurrent = pcurrent->m_ptParent; +			} else { +				STreeItem *ptNewItem = new STreeItem(vecSyn[i].c_str()); +				pcurrent->fnAppend( ptNewItem ); +				pcurrent = ptNewItem; + +				if (strcmp(vecSyn[i - 1].c_str(), "(" ) != 0 +						&& strcmp(vecSyn[i - 1].c_str(), ")" ) != 0 ) { +					pTree->m_vecTerminals.push_back(ptNewItem); +					pcurrent = pcurrent->m_ptParent; +				} +			} +		} + +		if ( iLeft != iRight ) { +			//error +			fprintf(stderr, "the left and right parentheses are not matched!"); +			fprintf(stderr, "ERROR in ConvertFromString\n"); +			fprintf(stderr, "%s\n", pszStr); +			return NULL; +		} + +		return pTree; +	} + +	int fnGetNumWord() { +		return m_vecTerminals.size(); +	} + +	void fnSetSpanInfo() { +		int iNextNum = 0; +		fnSuffixTraverseSetSpanInfo(m_ptRoot, iNextNum); +	} + +	void fnSetHeadWord() { +		for (size_t i = 0; i < m_vecTerminals.size(); i++) +			m_vecTerminals[i]->m_iHeadWord = i; +		SGetHeadWord *pGetHeadWord = new SGetHeadWord(); +		fnSuffixTraverseSetHeadWord(m_ptRoot, pGetHeadWord); +		delete pGetHeadWord; +	} + +	STreeItem *fnFindNodeForSpan(int iLeft, int iRight, bool bLowest) { +		STreeItem *pTreeItem = m_vecTerminals[iLeft]; + +		while (pTreeItem->m_iEnd < iRight) { +			pTreeItem = pTreeItem->m_ptParent; +			if (pTreeItem == NULL) break; +		} +		if (pTreeItem == NULL) +			return NULL; +		if (pTreeItem->m_iEnd > iRight) +			return NULL; + +		assert(pTreeItem->m_iEnd == iRight); +		if (bLowest) +			return pTreeItem; + +		while (pTreeItem->m_ptParent != NULL && pTreeItem->m_ptParent->fnGetChildrenNum() == 1) +			pTreeItem = pTreeItem->m_ptParent; + +		return pTreeItem; +	} + +private: +	void fnSuffixTraverseSetSpanInfo(STreeItem *ptItem, int& iNextNum) { +		int I; +		int iNumChildren = ptItem->fnGetChildrenNum(); +		for ( I = 0; I < iNumChildren; I++ ) +			fnSuffixTraverseSetSpanInfo(ptItem->m_vecChildren[ I ], iNextNum); + +		if ( I == 0 ) +		{ +			ptItem->m_iBegin = iNextNum; +			ptItem->m_iEnd = iNextNum++; +		} +		else +		{ +			ptItem->m_iBegin = ptItem->m_vecChildren[0]->m_iBegin; +			ptItem->m_iEnd = ptItem->m_vecChildren[I - 1]->m_iEnd; +		} +	} + + +	void fnSuffixTraverseSetHeadWord(STreeItem *ptItem, SGetHeadWord *pGetHeadWord) { +		int I, iHeadchild; + +		if ( ptItem->m_vecChildren.size() == 0 ) +			return; + +		for ( I = 0; I < ptItem->m_vecChildren.size(); I++ ) +			fnSuffixTraverseSetHeadWord(ptItem->m_vecChildren[I], pGetHeadWord); + +		vector<string> vecRight; + + +		if ( ptItem->m_vecChildren.size() == 1 ) +			iHeadchild = 0; +		else +		{ +			for ( I = 0; I < ptItem->m_vecChildren.size(); I++ ) +				vecRight.push_back( string( ptItem->m_vecChildren[ I ]->m_pszTerm ) ); + +			iHeadchild = pGetHeadWord->fnGetHeadWord( ptItem->m_pszTerm, vecRight ); +		} + +		ptItem->m_iHeadChild = iHeadchild; +		ptItem->m_iHeadWord = ptItem->m_vecChildren[iHeadchild]->m_iHeadWord; +	} + + +	static void fnReadSyntactic(const char *pszSyn, vector<string>& vec) { +		char *p; +		int I; + +		int iLeftNum, iRightNum; +		char *pszTmp, *pszTerm; +		pszTmp = new char[strlen(pszSyn)]; +		pszTerm = new char[strlen(pszSyn)]; +		pszTmp[0] = pszTerm[0] = '\0'; + +		vec.clear(); + +		char *pszLine; +		pszLine = new char[strlen(pszSyn) + 1]; +		strcpy( pszLine, pszSyn ); + +		char *pszLine2; + +		while( 1 ) { +			while(( strlen( pszLine ) > 0 ) +					&&( pszLine[ strlen( pszLine ) - 1 ] > 0 ) +					&&( pszLine[ strlen( pszLine ) - 1 ] <= ' ' ) ) +				pszLine[ strlen( pszLine ) - 1 ] = '\0'; + +			if( strlen( pszLine ) == 0 ) +				break; + +			//printf( "%s\n", pszLine ); +			pszLine2 = pszLine; +			while( pszLine2[ 0 ] <= ' ' ) +				pszLine2 ++; +			if( pszLine2[ 0 ] == '<' ) +				continue; + +			sscanf( pszLine2 + 1, "%s", pszTmp ); + +			if ( pszLine2[ 0 ] == '(' ) { +				iLeftNum = 0; +				iRightNum = 0; +			} + +			p = pszLine2; +			while ( 1 ) { +				pszTerm[ 0 ] = '\0'; +				sscanf( p, "%s", pszTerm ); + +				if( strlen( pszTerm ) == 0 ) +					break; +				p = strstr( p, pszTerm ); +				p += strlen( pszTerm ); + +				if( ( pszTerm[ 0 ] == '(' ) +						||( pszTerm[ strlen( pszTerm ) - 1 ] == ')' ) ) { +					if( pszTerm[ 0 ] == '(' ) { +						vec.push_back(string("(")); +						iLeftNum++; + +						I = 1; +						while ( pszTerm[ I ] == '(' && pszTerm[ I ]!= '\0' ) { +							vec.push_back(string("(")); +							iLeftNum++; + +							I++; +						} + +						if( strlen( pszTerm ) > 1 ) +							vec.push_back(string(pszTerm + I)); +					} else { +						char *pTmp; +						pTmp = pszTerm + strlen( pszTerm ) - 1; +						while( ( pTmp[ 0 ] == ')' ) && ( pTmp >= pszTerm ) ) +							pTmp --; +						pTmp[ 1 ] = '\0'; + +						if( strlen( pszTerm ) > 0 ) +							vec.push_back(string(pszTerm)); +						pTmp += 2; + +						for( I = 0; I <= (int)strlen( pTmp ); I ++ ) { +							vec.push_back(string(")")); +							iRightNum++; +						} +					} +				} else { +					char *q; +					q = strchr( pszTerm, ')' ); +					if ( q != NULL ) { +						q[ 0 ] = '\0'; +						if ( pszTerm[ 0 ] != '\0' ) +							vec.push_back(string(pszTerm)); +						vec.push_back(string(")")); +						iRightNum++; + +						q++; +						while ( q[ 0 ] == ')' ) { +							vec.push_back(string(")")); +							q++; +							iRightNum ++; +						} + +						while( q[ 0 ] == '(' ) { +							vec.push_back(string("(")); +							q++; +							iLeftNum++; +						} + +						if ( q[ 0 ] != '\0' ) +							vec.push_back(string(q)); +					} +					else +						vec.push_back(string(pszTerm)); +				} +			} + +			if (iLeftNum != iRightNum) { +				fprintf(stderr, "%s\n", pszSyn); +				assert(iLeftNum == iRightNum); +			} +			/*if ( iLeftNum != iRightNum ) { +				printf( "ERROR: left( and right ) is not matched, %d ( and %d )\n", iLeftNum, iRightNum ); +				return; +			}*/ + + +			if( vec.size() >= 2 +					&& strcmp( vec[1].c_str(), "(" ) == 0 ) { +				//( (IP..) ) +				std::vector<string>::iterator it; +				it = vec.begin(); +				it++; +				vec.insert(it, string("ROOT")); +			} + +			break; +		} + +		delete [] pszLine; +		delete [] pszTmp; +		delete [] pszTerm; +	} + +public: +	STreeItem *m_ptRoot; +	vector<STreeItem*>  m_vecTerminals; //the leaf nodes +}; + +struct SParseReader { +	SParseReader(const char* pszParse_Fname, bool bFlattened = false) : +	m_bFlattened(bFlattened){ +		m_fpIn = fopen( pszParse_Fname, "r" ); +		assert(m_fpIn != NULL); +	} +	~SParseReader() { +		if (m_fpIn != NULL) +			fclose(m_fpIn); +	} + +	SParsedTree* fnReadNextParseTree( ) { +		SParsedTree *pTree = NULL; +		char *pszLine = new char[100001]; +		int iLen; + +		while (fnReadNextSentence(pszLine, &iLen) == true ) { +			if (iLen == 0) +				continue; + +			pTree = SParsedTree::fnConvertFromString(pszLine); +			if (pTree == NULL)  +				break; +			if (m_bFlattened) +				fnPostProcessingFlattenedParse(pTree); +			else { +				pTree->fnSetSpanInfo(); +				pTree->fnSetHeadWord(); +			} +			break; +		} + +		delete [] pszLine; +		return pTree; +	} + +	SParsedTree* fnReadNextParseTreeWithProb(double *pProb) { +		SParsedTree *pTree = NULL; +		char *pszLine = new char[100001]; +		int iLen; + +		while (fnReadNextSentence(pszLine, &iLen) == true ) { +			if (iLen == 0) +				continue; + +			char *p = strchr(pszLine, ' '); +			assert(p != NULL); +			p[0] = '\0'; +			p++; +			if (pProb) +				(*pProb) = atof(pszLine); + +			pTree = SParsedTree::fnConvertFromString(p); +			if (m_bFlattened) +				fnPostProcessingFlattenedParse(pTree); +			else { +				pTree->fnSetSpanInfo(); +				pTree->fnSetHeadWord(); +			} +			break; +		} + +		delete [] pszLine; +		return pTree; +	} +private: +	/* +	 * since to the parse tree is a flattened tree, use the head mark to identify head info. +	 * the head node will be marked as "*XP*" +	 */ +	void fnSetParseTreeHeadInfo(SParsedTree *pTree) { +		for (size_t i = 0; i < pTree->m_vecTerminals.size(); i++) +			pTree->m_vecTerminals[i]->m_iHeadWord = i; +		fnSuffixTraverseSetHeadWord(pTree->m_ptRoot); +	} + +	void fnSuffixTraverseSetHeadWord(STreeItem *pTreeItem) { +		if (pTreeItem->m_vecChildren.size() == 0) +			return; + +		for (size_t i = 0; i < pTreeItem->m_vecChildren.size(); i++ ) +			fnSuffixTraverseSetHeadWord(pTreeItem->m_vecChildren[i]); + +		vector<string> vecRight; + +		int iHeadchild; + +		if (pTreeItem->fnIsPreTerminal()) { +			iHeadchild = 0; +		} else { +			size_t i; +			for (i = 0; i < pTreeItem->m_vecChildren.size(); i++) { +				char *p = pTreeItem->m_vecChildren[i]->m_pszTerm; +				if (p[0] == '*' && p[strlen(p) - 1] == '*') { +					iHeadchild = i; +					p[strlen(p) - 1] = '\0'; +					string str = p + 1; +					strcpy(p, str.c_str()); //erase the "*..*" +					break; +				} +			} +			assert(i < pTreeItem->m_vecChildren.size()); +		} + +		pTreeItem->m_iHeadChild = iHeadchild; +		pTreeItem->m_iHeadWord = pTreeItem->m_vecChildren[iHeadchild]->m_iHeadWord; +	} +	void fnPostProcessingFlattenedParse(SParsedTree *pTree) { +		pTree->fnSetSpanInfo(); +		fnSetParseTreeHeadInfo(pTree); +	} +	bool fnReadNextSentence(char *pszLine, int* piLength) { +		if (feof(m_fpIn) == true) +			return false; + +		int iLen; + +		pszLine[ 0 ] = '\0'; + +		fgets(pszLine, 10001, m_fpIn); +		iLen = strlen(pszLine); +		while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { +			pszLine[ iLen - 1 ] = '\0'; +			iLen--; +		} + +		if ( piLength != NULL ) +			(*piLength) = iLen; + +		return true; +	} +private: +	FILE *m_fpIn; +	const bool m_bFlattened; +}; + +#endif /* TREE_H_ */ diff --git a/utils/synutils/tsuruoka_maxent.h b/utils/synutils/tsuruoka_maxent.h new file mode 100644 index 00000000..790876ab --- /dev/null +++ b/utils/synutils/tsuruoka_maxent.h @@ -0,0 +1,160 @@ +/* + * tsuruoka_maxent.h + * + */ + +#ifndef TSURUOKA_MAXENT_H_ +#define TSURUOKA_MAXENT_H_ + +#include "utility.h" +#include "stringlib.h" +#include "maxent-3.0/maxent.h" + +#include <assert.h> +#include <vector> +#include <string> +#include <string.h> +#include <tr1/unordered_map> + +using namespace std; + + +typedef std::tr1::unordered_map<std::string, int> Map; +typedef std::tr1::unordered_map<std::string, int>::iterator Iterator; + + + +struct Tsuruoka_Maxent{ +	Tsuruoka_Maxent(const char* pszModelFName) { +		if (pszModelFName != NULL) { +			m_pModel = new ME_Model(); +			m_pModel->load_from_file(pszModelFName); +		} else +			m_pModel = NULL; +	} + +	~Tsuruoka_Maxent() { +		if (m_pModel != NULL) +			delete m_pModel; +	} + +	void fnTrain(const char* pszInstanceFName, const char* pszAlgorithm, const char* pszModelFName, int iNumIteration) { +		assert(strcmp(pszAlgorithm, "l1") == 0 +				|| strcmp(pszAlgorithm, "l2") == 0 +				|| strcmp(pszAlgorithm, "sgd") == 0 +				|| strcmp(pszAlgorithm, "SGD") == 0); +		FILE *fpIn = fopen(pszInstanceFName, "r"); + +		ME_Model *pModel = new ME_Model(); + +		char *pszLine = new char[100001]; +		int iNumInstances = 0; +		int iLen; +		while (!feof(fpIn)) { +			pszLine[0] = '\0'; +			fgets(pszLine, 20000, fpIn); +			if (strlen(pszLine) == 0) { +				continue; +			} + +			iLen = strlen(pszLine); +			while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { +				pszLine[ iLen - 1 ] = '\0'; +				iLen--; +			} + + +			iNumInstances++; + +			ME_Sample *pmes = new ME_Sample(); + +			char *p = strrchr(pszLine, ' '); +			assert(p != NULL); +			p[0] = '\0'; +			p++; +			vector<string> vecContext; +			SplitOnWhitespace(string(pszLine), &vecContext); + +			pmes->label = string(p); +			for (size_t i = 0; i < vecContext.size(); i++) +				pmes->add_feature(vecContext[i]); +			pModel->add_training_sample((*pmes)); +			if (iNumInstances % 100000 == 0) +				fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); +			delete pmes; +		} +		fprintf(stdout, "......Reading #Instances: %1d\n", iNumInstances); +		fclose(fpIn); + +		if (strcmp(pszAlgorithm, "l1") == 0) +			pModel->use_l1_regularizer(1.0); +		else if (strcmp(pszAlgorithm, "l2") == 0) +			pModel->use_l2_regularizer(1.0); +		else +			pModel->use_SGD(); + +		pModel->train(); +		pModel->save_to_file(pszModelFName); + +		delete pModel; +		fprintf(stdout, "......Finished Training\n"); +		fprintf(stdout, "......Model saved as %s\n", pszModelFName); +		delete [] pszLine; +	} + +	double fnEval(const char* pszContext, const char* pszOutcome) const { +		vector<string> vecContext; +		ME_Sample *pmes = new ME_Sample(); +		SplitOnWhitespace(string(pszContext), &vecContext); + +		for (size_t i = 0; i < vecContext.size(); i++) +			pmes->add_feature(vecContext[i]); +		vector<double> vecProb = m_pModel->classify(*pmes); +		delete pmes; +		int iLableID = m_pModel->get_class_id(pszOutcome); +		return vecProb[iLableID]; +	} +	void fnEval(const char* pszContext, vector<pair<string, double> >& vecOutput) const { +		vector<string> vecContext; +		ME_Sample *pmes = new ME_Sample(); +		SplitOnWhitespace(string(pszContext), &vecContext); + +		vecOutput.clear(); + +		for (size_t i = 0; i < vecContext.size(); i++) +			pmes->add_feature(vecContext[i]); +		vector<double> vecProb = m_pModel->classify(*pmes); + +		for (size_t i = 0; i < vecProb.size(); i++) { +			string label = m_pModel->get_class_label(i); +			vecOutput.push_back(make_pair(label, vecProb[i])); +		} +		delete pmes; +	} +	void fnEval(const char* pszContext, vector<double>& vecOutput) const{ +		vector<string> vecContext; +		ME_Sample *pmes = new ME_Sample(); +		SplitOnWhitespace(string(pszContext), &vecContext); + +		vecOutput.clear(); + +		for (size_t i = 0; i < vecContext.size(); i++) +			pmes->add_feature(vecContext[i]); +		vector<double> vecProb = m_pModel->classify(*pmes); + +		for (size_t i = 0; i < vecProb.size(); i++) { +			string label = m_pModel->get_class_label(i); +			vecOutput.push_back(vecProb[i]); +		} +		delete pmes; +	} +	int fnGetClassId(const string& strLabel) const { +		return m_pModel->get_class_id(strLabel); +	} +private: +	ME_Model *m_pModel; +}; + + + +#endif /* TSURUOKA_MAXENT_H_ */ diff --git a/utils/synutils/utility.h b/utils/synutils/utility.h new file mode 100644 index 00000000..d3f52ee0 --- /dev/null +++ b/utils/synutils/utility.h @@ -0,0 +1,129 @@ +/* + * utility.h + * + *  Created on: Jun 24, 2013 + *      Author: lijunhui + */ + +#ifndef UTILITY_H_ +#define UTILITY_H_ + + +#include <zlib.h> +#include <stdio.h> +#include <assert.h> +#include <stdlib.h> +#include <string.h> +#include <unordered_map> + +using namespace std; + +typedef std::unordered_map<std::string, int> MapString2Int; +typedef std::unordered_map<std::string, float> MapString2Float; +typedef std::unordered_map<std::string, float>::iterator MapString2FloatIterator; + +using namespace std; + +struct SFReader { +	SFReader( ){ } +	virtual ~SFReader( ){ } + +	virtual bool fnReadNextLine( char *pszLine, int* piLength ) = 0; +	virtual bool fnReadNextLine(string& strLine) = 0; +}; + +struct STxtFileReader: public SFReader { +	STxtFileReader(const char* pszFname) { +		m_fpIn = fopen(pszFname, "r"); +		assert(m_fpIn != NULL); +	} +	~STxtFileReader() { +		if (m_fpIn != NULL) +			fclose(m_fpIn); +	} + +	bool fnReadNextLine(char *pszLine, int* piLength) { +			if (feof(m_fpIn) == true) +				return false; + +			int iLen; + +			pszLine[ 0 ] = '\0'; + +			fgets(pszLine, 10001, m_fpIn); +			iLen = strlen(pszLine); +			if (iLen == 0) return false; +			while (iLen > 0 && pszLine[iLen - 1] > 0 && pszLine[iLen -1] < 33) { +				pszLine[ iLen - 1 ] = '\0'; +				iLen--; +			} + +			if ( piLength != NULL ) +				(*piLength) = iLen; + +			return true; +		} + +	bool fnReadNextLine(string& strLine) { +		char *pszLine = new char[10001]; +		bool bOut = fnReadNextLine(pszLine, NULL); +		if (bOut) strLine = string(pszLine); +		else strLine = string(""); +		delete [] pszLine; + +		return bOut; +	} +private: +	FILE *m_fpIn; +}; + +struct SGZFileReader: public SFReader { +	SGZFileReader(const char* pszFname) { +		m_fpIn = gzopen( pszFname, "r" ); +		assert(m_fpIn != NULL); +	} +	~SGZFileReader() { +		if (m_fpIn != NULL) +			gzclose(m_fpIn); +	} + +	bool fnReadNextLine(char *pszLine, int* piLength) { +		if ( m_fpIn == NULL ) +			exit( 0 ); +		if ( gzeof( m_fpIn ) == true ) +			return false; + +		int iLen; + +		pszLine[ 0 ] = '\0'; + +		gzgets( m_fpIn, pszLine, 10001 ); +		iLen = strlen( pszLine ); +		while ( iLen > 0 && pszLine[ iLen - 1 ] > 0 && pszLine[ iLen -1 ] < 33 ) +		{ +			pszLine[ iLen - 1 ] = '\0'; +			iLen--; +		} + +		if ( piLength != NULL ) +			(*piLength) = iLen; + +		return true; +	} + +	bool fnReadNextLine(string& strLine) { +		char *pszLine = new char[10001]; +		bool bOut = fnReadNextLine(pszLine, NULL); +		if (bOut) strLine = string(pszLine); +		else strLine = string(""); +		delete [] pszLine; + +		return bOut; +	} +private: +	gzFile m_fpIn; +}; + + +#endif /* UTILITY_H_ */ + | 
