diff options
96 files changed, 1139 insertions, 6054 deletions
@@ -103,7 +103,7 @@ jam-files/bjam jam-files/engine/bin.* jam-files/engine/bootstrap/ klm/lm/bin/ -klm/lm/builder/builder +klm/lm/builder/lmplz klm/lm/build_binary klm/lm/ngram_query klm/lm/query diff --git a/Makefile.am b/Makefile.am index 1c30a6ff..008dc704 100644 --- a/Makefile.am +++ b/Makefile.am @@ -14,12 +14,11 @@ SUBDIRS = \ training \ training/liblbfgs \ word-aligner \ - example_extff \ - extractor + example_extff -#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava +# extractor -EXTRA_DIST = corpus tests python/pkg python/src python/tests python/examples compound-split environment +EXTRA_DIST = corpus tests python/cdec python/tests python/examples compound-split environment AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 AM_CPPFLAGS = -D_GLIBCXX_PARALLEL -march=native -mtune=native -O2 -pipe -fomit-frame-pointer -Wall diff --git a/configure.ac b/configure.ac index 37d8cced..03bae25c 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[2013-03-08]) +AC_INIT([cdec],[2013-11-10]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) @@ -7,7 +7,6 @@ AC_PROG_LEX case $LEX in :) AC_MSG_ERROR([No lex (Flex, lex, etc.) 
program found]);; esac -# CPPFLAGS="$CPPFLAGS -std=c++0x" AC_PROG_CC AC_PROG_CXX AX_CXX_COMPILE_STDCXX_11 @@ -187,6 +186,11 @@ AC_CHECK_HEADER(google/dense_hash_map, AC_PROG_INSTALL CPPFLAGS="-DPIC -fPIC $CPPFLAGS -DHAVE_CONFIG_H -DKENLM_MAX_ORDER=6" +CXXFLAGS="$CXX11_SWITCH $CXXFLAGS" + +if test "x$HAVE_CXX11" = "x0"; then + CPPFLAGS="$CPPFLAGS -DHAVE_OLD_CPP" +fi # core cdec stuff AC_CONFIG_FILES([Makefile]) @@ -195,7 +199,7 @@ AC_CONFIG_FILES([mteval/Makefile]) AC_CONFIG_FILES([mteval/meteor_jar.cc]) AC_CONFIG_FILES([decoder/Makefile]) AC_CONFIG_FILES([python/setup.py]) -AC_CONFIG_FILES([extractor/Makefile]) +#AC_CONFIG_FILES([extractor/Makefile]) AC_CONFIG_FILES([word-aligner/Makefile]) # KenLM stuff diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh index fb038717..c7adfa61 100755 --- a/corpus/tokenize-anything.sh +++ b/corpus/tokenize-anything.sh @@ -12,8 +12,8 @@ fi $SUPPORT/utf8-normalize.sh $NORMARGS | $SUPPORT/quote-norm.pl | $SUPPORT/tokenizer.pl | - sed -u -e 's/ al - / al-/g' | + sed -e 's/ al - / al-/g' | $SUPPORT/fix-contract.pl | - sed -u -e 's/^ //' | sed -u -e 's/ $//' | + sed -e 's/^ //' | sed -e 's/ $//' | perl -e '$|++; while(<>){s/(\d+)(\.+)$/$1 ./; s/(\d+)(\.+) \|\|\|/$1 . 
|||/; print;}' diff --git a/decoder/Makefile.am b/decoder/Makefile.am index 914faaea..8280b22c 100644 --- a/decoder/Makefile.am +++ b/decoder/Makefile.am @@ -41,7 +41,6 @@ libcdec_a_SOURCES = \ cfg_options.h \ csplit.h \ decoder.h \ - dwarf.h \ earley_composer.h \ exp_semiring.h \ factored_lexicon_helper.h \ @@ -51,18 +50,21 @@ libcdec_a_SOURCES = \ ff_charset.h \ ff_context.h \ ff_csplit.h \ - ff_dwarf.h \ ff_external.h \ ff_factory.h \ ff_klm.h \ ff_lm.h \ ff_ngrams.h \ + ff_parse_match.h \ ff_register.h \ ff_rules.h \ ff_ruleshape.h \ ff_sample_fsa.h \ + ff_soft_syntax.h \ + ff_soft_syntax_mindist.h \ ff_source_path.h \ ff_source_syntax.h \ + ff_source_syntax2.h \ ff_spans.h \ ff_tagger.h \ ff_wordalign.h \ @@ -96,68 +98,64 @@ libcdec_a_SOURCES = \ sentences.h \ tagger.h \ translator.h \ - tromble_loss.h \ trule.h \ viterbi.h \ - forest_writer.cc \ - maxtrans_blunsom.cc \ + aligner.cc \ + apply_models.cc \ + bottom_up_parser.cc \ + cdec.cc \ cdec_ff.cc \ cfg.cc \ - dwarf.cc \ - ff_dwarf.cc \ - ff_external.cc \ - rule_lexer.cc \ - fst_translator.cc \ csplit.cc \ - translator.cc \ - rescore_translator.cc \ - scfg_translator.cc \ - hg.cc \ - hg_io.cc \ - hg_remove_eps.cc \ decoder.cc \ - hg_intersect.cc \ - hg_union.cc \ - hg_sampler.cc \ - factored_lexicon_helper.cc \ - viterbi.cc \ - lattice.cc \ - aligner.cc \ - apply_models.cc \ earley_composer.cc \ - phrasetable_fst.cc \ - trule.cc \ + factored_lexicon_helper.cc \ ff.cc \ - ffset.cc \ ff_basic.cc \ - ff_rules.cc \ - ff_wordset.cc \ - ff_context.cc \ + ff_bleu.cc \ ff_charset.cc \ - ff_lm.cc \ + ff_context.cc \ + ff_csplit.cc \ + ff_external.cc \ + ff_factory.cc \ ff_klm.cc \ + ff_lm.cc \ ff_ngrams.cc \ - ff_spans.cc \ + ff_parse_match.cc \ + ff_rules.cc \ ff_ruleshape.cc \ - ff_wordalign.cc \ - ff_csplit.cc \ - ff_tagger.cc \ + ff_soft_syntax.cc \ + ff_soft_syntax_mindist.cc \ ff_source_path.cc \ - ff_parse_match.cc \ - ff_soft_syntax.cc \ - ff_soft_syntax2.cc \ ff_source_syntax.cc \ - ff_source_syntax_p.cc \ 
ff_source_syntax2.cc \ - ff_source_syntax2_p.cc \ - ff_bleu.cc \ - ff_factory.cc \ + ff_spans.cc \ + ff_tagger.cc \ + ff_wordalign.cc \ + ff_wordset.cc \ + ffset.cc \ + forest_writer.cc \ + fst_translator.cc \ + grammar.cc \ + hg.cc \ + hg_intersect.cc \ + hg_io.cc \ + hg_remove_eps.cc \ + hg_sampler.cc \ + hg_union.cc \ incremental.cc \ + json_parse.cc \ + lattice.cc \ lexalign.cc \ lextrans.cc \ - tagger.cc \ - bottom_up_parser.cc \ + maxtrans_blunsom.cc \ phrasebased_translator.cc \ - JSON_parser.c \ - json_parse.cc \ - grammar.cc + phrasetable_fst.cc \ + rescore_translator.cc \ + rule_lexer.cc \ + scfg_translator.cc \ + tagger.cc \ + translator.cc \ + trule.cc \ + viterbi.cc \ + JSON_parser.c diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc index 330de9e2..4cd8b36f 100644 --- a/decoder/apply_models.cc +++ b/decoder/apply_models.cc @@ -8,8 +8,14 @@ #include <vector> #include <algorithm> -#include <tr1/unordered_map> -#include <tr1/unordered_set> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +# include <unordered_set> +#else +# include <tr1/unordered_map> +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } +#endif #include <boost/functional/hash.hpp> @@ -23,7 +29,6 @@ #define FAST_CP_2 3 using namespace std; -using namespace std::tr1; struct Candidate; typedef SmallVectorInt JVector; diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc index e7b31f50..d586c1d1 100644 --- a/decoder/cdec_ff.cc +++ b/decoder/cdec_ff.cc @@ -15,26 +15,16 @@ #include "ff_ruleshape.h" #include "ff_bleu.h" #include "ff_soft_syntax.h" -#include "ff_soft_syntax2.h" +#include "ff_soft_syntax_mindist.h" #include "ff_source_path.h" - - #include "ff_parse_match.h" #include "ff_source_syntax.h" -#include "ff_source_syntax_p.h" #include "ff_source_syntax2.h" -#include "ff_source_syntax2_p.h" - - #include "ff_register.h" #include "ff_charset.h" #include "ff_wordset.h" -#include "ff_dwarf.h" #include "ff_external.h" 
-#ifdef HAVE_GLC -#include <cdec/ff_glc.h> -#endif void register_feature_functions() { static bool registered = false; @@ -51,30 +41,16 @@ void register_feature_functions() { RegisterFF<BLEUModel>(); //TODO: use for all features the new Register which requires static FF::usage(false,false) give name -#ifdef HAVE_RANDLM - ff_registry.Register("RandLM", new FFFactory<LanguageModelRandLM>); -#endif ff_registry.Register("SpanFeatures", new FFFactory<SpanFeatures>()); ff_registry.Register("NgramFeatures", new FFFactory<NgramDetector>()); ff_registry.Register("RuleContextFeatures", new FFFactory<RuleContextFeatures>()); ff_registry.Register("RuleIdentityFeatures", new FFFactory<RuleIdentityFeatures>()); - - ff_registry.Register("ParseMatchFeatures", new FFFactory<ParseMatchFeatures>); - - ff_registry.Register("SoftSyntacticFeatures", new FFFactory<SoftSyntacticFeatures>); - ff_registry.Register("SoftSyntacticFeatures2", new FFFactory<SoftSyntacticFeatures2>); - + ff_registry.Register("SoftSyntaxFeatures", new FFFactory<SoftSyntaxFeatures>); + ff_registry.Register("SoftSyntaxFeaturesMindist", new FFFactory<SoftSyntaxFeaturesMindist>); ff_registry.Register("SourceSyntaxFeatures", new FFFactory<SourceSyntaxFeatures>); - ff_registry.Register("SourceSyntaxFeatures2", new FFFactory<SourceSyntaxFeatures2>); - ff_registry.Register("SourceSpanSizeFeatures", new FFFactory<SourceSpanSizeFeatures>); - - //ff_registry.Register("PSourceSyntaxFeatures", new FFFactory<PSourceSyntaxFeatures>); - //ff_registry.Register("PSourceSpanSizeFeatures", new FFFactory<PSourceSpanSizeFeatures>); - //ff_registry.Register("PSourceSyntaxFeatures2", new FFFactory<PSourceSyntaxFeatures2>); - - + ff_registry.Register("SourceSyntaxFeatures2", new FFFactory<SourceSyntaxFeatures2>); ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory<CMR2008ReorderingFeatures>()); ff_registry.Register("RuleSourceBigramFeatures", new FFFactory<RuleSourceBigramFeatures>()); 
ff_registry.Register("RuleTargetBigramFeatures", new FFFactory<RuleTargetBigramFeatures>()); @@ -98,10 +74,6 @@ void register_feature_functions() { ff_registry.Register("WordPairFeatures", new FFFactory<WordPairFeatures>); ff_registry.Register("SourcePathFeatures", new FFFactory<SourcePathFeatures>); ff_registry.Register("WordSet", new FFFactory<WordSet>); - ff_registry.Register("Dwarf", new FFFactory<Dwarf>); ff_registry.Register("External", new FFFactory<ExternalFeature>); -#ifdef HAVE_GLC - ff_registry.Register("ContextCRF", new FFFactory<Model1Features>); -#endif } diff --git a/decoder/decoder.cc b/decoder/decoder.cc index 31e6dc46..da65713a 100644 --- a/decoder/decoder.cc +++ b/decoder/decoder.cc @@ -1,6 +1,11 @@ #include "decoder.h" -#include <tr1/unordered_map> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> #include <boost/make_shared.hpp> @@ -61,7 +66,6 @@ static const double kMINUS_EPSILON = -1e-6; // don't be too strict using namespace std; -using namespace std::tr1; namespace po = boost::program_options; static bool verbose_feature_functions=true; @@ -90,7 +94,7 @@ struct ELengthWeightFunction { } }; inline void ShowBanner() { - cerr << "cdec v1.0 (c) 2009-2011 by Chris Dyer\n"; + cerr << "cdec (c) 2009--2013 by Chris Dyer\n"; } inline string str(char const* name,po::variables_map const& conf) { diff --git a/decoder/dwarf.cc b/decoder/dwarf.cc deleted file mode 100644 index fb0404a6..00000000 --- a/decoder/dwarf.cc +++ /dev/null @@ -1,3209 +0,0 @@ -#include "dwarf.h" -#include "tdict.h" -#include "wordid.h" -#include "lattice.h" -#include "ff_dwarf.h" -#include <assert.h> -#include <algorithm> -#include <ostream> -#include <sstream> -#include <iostream> -#include <fstream> -#include <vector> -#include <map> -#include <set> -#include <boost/functional/hash.hpp> -#include 
<tr1/unordered_map> -#include <boost/tuple/tuple.hpp> - -using namespace std; -using namespace std::tr1; -using namespace boost::tuples; -using namespace boost; - -Alignment::Alignment() { - //unordered_map<std::vector<WordID>,int> XX; - _I=0; - _J=0; - kSOS = TD::Convert("<s>"); - kEOS = TD::Convert("</s>"); - kUNK = TD::Convert("**UNKNOWN**"); - SourceFWAntsIdxs = new int*[MAX_ARITY]; - SourceFWAntsAbsIdxs = new int*[MAX_ARITY]; - TargetFWAntsIdxs = new int*[MAX_ARITY]; - SourceAntsIdxs = new int*[MAX_ARITY]; - TargetAntsIdxs = new int*[MAX_ARITY]; - AntsAl = new int*[MAX_ARITY]; - for (int idx=0; idx<MAX_ARITY; idx++) { - SourceAntsIdxs[idx] = new int[40]; - SourceFWAntsIdxs[idx] = new int[40]; - SourceFWAntsAbsIdxs[idx] = new int[40]; - TargetAntsIdxs[idx] = new int[40]; - TargetFWAntsIdxs[idx] = new int[40]; - AntsAl[idx] = new int[40]; - } - for (int j=0; j<MAX_WORDS; j++) - for (int i=0; i<MAX_WORDS; i++) _matrix[j][i]=false; - for (int j=0; j<MAX_WORDS; j++) { - _tSpan[j][0]=MINIMUM_INIT; - _sSpan[j][1]=MAXIMUM_INIT; - } - for (int i=0; i<MAX_WORDS; i++) { - _sSpan[i][0]=MINIMUM_INIT; - _sSpan[i][1]=MAXIMUM_INIT; - } - alpha_oris=0.1; - alpha_orit=0.1; - alpha_doms=0.1; - alpha_domt=0.1; - beta_oris=0.1; - beta_orit=0.1; - beta_doms=0.1; - beta_domt=0.1; -} - -void Alignment::set(int j,int i) { -// create a link between j and i, update their corresponding span accordingly - if (DEBUG) cerr << "set(" << j << "," << i << ")" << endl; - assert(0<=j && j<MAX_WORDS); - assert(0<=i && i<MAX_WORDS); - if (0<=j && j<MAX_WORDS && 0<=i && i<MAX_WORDS) { - _matrix[j][i] = true; - _tSpan[j][0]=least(i,_tSpan[j][0]); - _tSpan[j][1]=most(i,_tSpan[j][1]); - _sSpan[i][0]=least(j,_sSpan[i][0]); - _sSpan[i][1]=most(j,_sSpan[i][1]); - } - _J=most(j+1,_J); - _I=most(i+1,_I); -} - -void Alignment::reset(int j,int i) { //probably won't be used, since the alignment is not dynamic -// remove the link between j and i, update their corresponding span accordingly - if (DEBUG) cerr << 
"reset(" << j << "," << i << ")" << endl; - assert(0<=j && j<MAX_WORDS); - assert(0<=i && i<MAX_WORDS); - _matrix[j][i] = false; - if (j==_sSpan[i][0] || j==_sSpan[i][1]) { - int min=MINIMUM_INIT; - int max=MAXIMUM_INIT; - for (int idx=_sSpan[i][0]; idx<=_sSpan[i][1]; idx++) { - if (_matrix[idx][i]) { - min=least(min,idx); - max=most(max,idx); - } - } - _sSpan[i][0]=min; - _sSpan[i][1]=max; - } - if (i==_tSpan[j][0] || i==_tSpan[j][1]) { - int min=MINIMUM_INIT; - int max=MAXIMUM_INIT; - for (int idx=_tSpan[j][0]; idx<=_tSpan[j][1]; idx++) { - if (_matrix[j][idx]) { - min=least(min,idx); - max=most(max,idx); - } - } - _tSpan[j][0]=min; - _tSpan[j][1]=max; - } -} - -int Alignment::targetOf(int j, int start) { - assert(j>=0); - if (start==-1) start = _tSpan[j][0]; - if (_tSpan[j][0]==MINIMUM_INIT) return -1; - for (int idx=start; idx<=_tSpan[j][1]; idx++) { - if (_matrix[j][idx]) return idx; - } - return -1; -} - -int Alignment::sourceOf(int i, int start) { - assert(i>=0); - if (start==-1) start = _sSpan[i][0]; - if (_sSpan[i][0]==MINIMUM_INIT) return -1; - for (int idx=start; idx<=_sSpan[i][1]; idx++) { - if (_matrix[idx][i]) return idx; - } - return -1; -} - -void Alignment::clearAls(int prevJ, int prevI) { - for (int j=0; j<=prevJ; j++) { - for (int i=0; i<prevI; i++) { - _matrix[j][i]=false; - } - } - for (int j=0; j<=prevJ; j++) { - _tSpan[j][0] = MINIMUM_INIT; - _tSpan[j][1] = MAXIMUM_INIT; - } - for (int i=0; i<=prevI; i++) { - _sSpan[i][0] = MINIMUM_INIT; - _sSpan[i][1] = MAXIMUM_INIT; - } - _J=0; - _I=0; -} - -int Alignment::DominanceSource(int fw1, int fw2) { - // Dominance of fw1 and fw2 - // 0 -> neither, 1 -> leftFirst, 2 -> rightFirst, 3 -> dontCare - if (DEBUG) cerr << "DominanceSource(" << fw1 << "," << fw2 << ")" << endl; - //cerr << TD::Convert(_f[fw1]) << "," << TD::Convert(_f[fw2]) << endl; - //cerr << AsString() << endl; - int dom = 0; - curr_al.push_back(fw1); curr_al.push_back(fw2); - if (doms_hash.find(curr_al)==doms_hash.end()) { - int* block 
= blockSource(fw1,fw2); - //cerr << "block = " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[0]==fw1) { - int tfw10 = _tSpan[fw1][0]; - int tfw11 = _tSpan[fw1][1]; - //cerr << "tfw = " << tfw10 << "," << tfw11 << endl; - if (tfw11<0) { - dom+=1; - } else { - if ((block[2]==tfw10 || block[3]==tfw11)) dom+=1; - } - } - if (block[1]==fw2) { - int tfw20 = _tSpan[fw2][0]; - int tfw21 = _tSpan[fw2][1]; - //cerr << "tfw = " << tfw20 << "," << tfw21 << endl; - if (tfw21<0) { - dom+=2; - } else { - if ((block[2]==tfw20 || block[3]==tfw21)) dom+=2; - } - } - delete block; - doms_hash.insert(pair<vector<int>,int>(curr_al,dom)); - } else { - dom = doms_hash[curr_al]; - } - if (DEBUG) cerr << " dom = " << dom << endl; - curr_al.pop_back(); curr_al.pop_back(); - return dom; -} - -vector<int> Alignment::DominanceSource4Sampler(int fw1, int fw2) { - if (DEBUG) cerr << "DominanceSource4Sampler(" << fw1 << "," << fw2 << ")" << endl; - int dom = 0; - int* block = blockSource(fw1,fw2); - //cerr << "block = " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[0]==fw1) { - int tfw10 = _tSpan[fw1][0]; - int tfw11 = _tSpan[fw1][1]; - //cerr << "tfw = " << tfw10 << "," << tfw11 << endl; - if (tfw11<0) { - dom+=1; - } else { - if ((block[2]==tfw10 || block[3]==tfw11)) dom+=1; - } - } - if (block[1]==fw2) { - int tfw20 = _tSpan[fw2][0]; - int tfw21 = _tSpan[fw2][1]; - //cerr << "tfw = " << tfw20 << "," << tfw21 << endl; - if (tfw21<0) { - dom+=2; - } else { - if ((block[2]==tfw20 || block[3]==tfw21)) dom+=2; - } - } - if (DEBUG) cerr << "doms = " << dom << endl; - vector<int> ret; - ret.push_back(dom); ret.push_back(block[0]); ret.push_back(block[1]); - ret.push_back(block[2]); ret.push_back(block[3]); - delete block; - return ret; -} - -int Alignment::DominanceTarget(int fw1, int fw2) { - int dom = 0; - curr_al.push_back(fw1); curr_al.push_back(fw2); - if (domt_hash.find(curr_al)==domt_hash.end()) { - int* 
block = blockTarget(fw1,fw2); - if (block[2]==fw1) { - int sfw10 = _sSpan[fw1][0]; - int sfw11 = _sSpan[fw1][1]; - if (sfw11<0) { - dom+=1; - } else { - if (block[0]==sfw10 || block[1]==sfw11) dom+=1; - } - } - if (block[3]==fw2) { - int sfw20 = _sSpan[fw2][0]; - int sfw21 = _sSpan[fw2][0]; - if (sfw21<0) { - dom+=2; - } else { - if (block[0]==sfw20 || block[1]==sfw21) dom+=2; - } - } - delete block; - domt_hash.insert(pair<vector<int>,int>(curr_al,dom)); - } else { - dom = domt_hash[curr_al]; - } - curr_al.pop_back(); curr_al.pop_back(); - return dom; -} - -vector<int> Alignment::DominanceTarget4Sampler(int fw1, int fw2) { - int dom = 0; - int* block = blockTarget(fw1,fw2); - if (block[2]==fw1) { - int sfw10 = _sSpan[fw1][0]; - int sfw11 = _sSpan[fw1][1]; - if (sfw11<0) { - dom+=1; - } else { - if (block[0]==sfw10 || block[1]==sfw11) dom+=1; - } - } - if (block[3]==fw2) { - int sfw20 = _sSpan[fw2][0]; - int sfw21 = _sSpan[fw2][0]; - if (sfw21<0) { - dom+=2; - } else { - if (block[0]==sfw20 || block[1]==sfw21) dom+=2; - } - } - vector<int> ret; - ret.push_back(dom); ret.push_back(block[0]); ret.push_back(block[1]); - ret.push_back(block[2]); ret.push_back(block[3]); - delete block; - return ret; -} - -void Alignment::OrientationSource(int fw, int* oril, int* orir, bool Lcompute, bool Rcompute) { - OrientationSource(fw,fw,oril,orir,Lcompute,Rcompute); -} - -vector<int> Alignment::OrientationSourceLeft4Sampler(int fw) { - return OrientationSourceLeft4Sampler(fw,fw); -} - -vector<int> Alignment::OrientationSourceLeft4Sampler(int fw0, int fw1) { - if (DEBUG) cerr << "OrientationSourceLeft4Sampler(" << fw0 << "," << fw1 << ")" << endl; - int oril = 0; - int N0=fw0-1; - while (N0>=0) { - if (minTSpan(N0)!=MINIMUM_INIT) break; - N0--; - } - int N1=fw1+1; - while (N1<_J) { - if (minTSpan(N1)!=MINIMUM_INIT) break; - N1++; - } - if (minTSpan(fw0)==MINIMUM_INIT && minTSpan(fw1)==MINIMUM_INIT) { - fw0 = N1; fw1 = N0; - } - if (DEBUG) cerr << "fw0=" << fw0 << ", fw1=" << fw1 << 
", N0=" << N0 << ", N1=" << N1 << endl; - if (maxTSpan(N0)<minTSpan(fw0) || maxTSpan(fw0)<minTSpan(N0)) { - if (DEBUG) cerr << "N0=" << minTSpan(N0) << "-" << maxTSpan(N0); - if (DEBUG) cerr << "fw=" << minTSpan(fw0) << "-" << maxTSpan(fw0) << endl; - int *block = blockTarget(minTSpan(N0),maxTSpan(N0)); - if (block[0]<=fw0 && fw0<=block[1]) oril=5; - delete block; - if (oril==0) { - block = blockTarget(minTSpan(fw0),maxTSpan(fw0)); - if (block[0]<=N0 && N0<=block[1]) oril=5; - delete block; - } - if (oril==0) { - if (maxTSpan(N0)<minTSpan(fw0)) {// if N0 is monotone - oril=1; - block = blockTarget(maxTSpan(N0),minTSpan(fw0)-1); - if (block[0] <= fw0 && fw0 <= block[1]) oril+=2; - delete block; - } else { //if (maxTSpan(fw0)<minTSpan(N0)) { // if NO is non-monotone - oril=2; - block = blockTarget(maxTSpan(fw0)+1,minTSpan(N0)); - if (block[0] <= fw0 && fw0 <= block[1]) oril+=2; - delete block; - } - } - } else { - oril=5; - } - if (DEBUG) cerr << "oril = " << oril << endl; - int* block = blockSource(N0,fw0); - if (DEBUG) { - for (int i=0; i<4; i++) cerr << "block[" << i << "]=" << block[i] << endl; - } - vector<int> ret; - ret.push_back(oril); ret.push_back(block[0]); ret.push_back(block[1]); - ret.push_back(block[2]); ret.push_back(block[3]); - delete block; - return ret; -} - -vector<int> Alignment::OrientationSourceRight4Sampler(int fw) { - return OrientationSourceRight4Sampler(fw,fw); -} - -vector<int> Alignment::OrientationSourceRight4Sampler(int fw0, int fw1) { - if (DEBUG) cerr << "OrientationSourceLeft4Sampler(" << fw0 << "," << fw1 << ")" << endl; - int orir = 0; - int N0=fw0-1; - while (N0>=0) { - if (minTSpan(N0)!=MINIMUM_INIT) break; - N0--; - } - int N1=fw1+1; - while (N1<_J) { - if (minTSpan(N1)!=MINIMUM_INIT) break; - N1++; - } - if (minTSpan(fw0)==MINIMUM_INIT && minTSpan(fw1)==MINIMUM_INIT) { - fw0 = N1; fw1 = N0; - } - if (DEBUG) cerr << "fw0=" << fw0 << ", fw1=" << fw1 << ", N0=" << N0 << ", N1=" << N1 << endl; - if (maxTSpan(N1)<minTSpan(fw1) || 
maxTSpan(fw1)<minTSpan(N1)) { - int* block = blockTarget(minTSpan(N1),maxTSpan(N1)); - if (block[0]<=fw1 && fw1<=block[2]) orir=5; - delete block; - if (orir==0) { - block = blockTarget(minTSpan(fw1),maxTSpan(fw1)); - if (block[0]<=N1 && N1 <=block[1]) orir=5; - delete block; - } - if (DEBUG) cerr << "N1=" << minTSpan(N1) << "-" << maxTSpan(N1); - if (DEBUG) cerr << "fw1=" << minTSpan(fw1) << "-" << maxTSpan(fw1) << endl; - if (orir==0) { - if (maxTSpan(fw1)<minTSpan(N1)) { // if N1 is monotone - orir = 1; - block = blockTarget(maxTSpan(fw1)+1,minTSpan(N1)); - if (block[0] <= fw1 && fw1 <= block[1]) orir+=2; - delete block; - } else {// if (maxTSpan(N1)<minTSpan(fw1)) { // if N1 is non-monotone - orir = 2; - block = blockTarget(maxTSpan(N1),minTSpan(fw1)-1); - if (block[0] <= fw1 && fw1 <= block[1]) orir+=2; - delete block; - } - } - } else { - orir = 5; - } - if (DEBUG) cerr << "orir = " << orir << endl; - int* block = blockSource(fw1,N1); - vector<int> ret; - ret.push_back(orir); ret.push_back(block[0]); ret.push_back(block[1]); - ret.push_back(block[2]); ret.push_back(block[3]); - delete block; - return ret; -} - -void Alignment::OrientationSource(int fw0, int fw1, int* oril, int* orir, bool Lcompute, bool Rcompute) { - // Orientation - // A bit tricky since fw can be 1) unaligned 2) aligned to many - // 1 -> MA, 2 -> RA, 3 -> MG, 4 -> RG, 5 -> Other - if (DEBUG) cerr << "OrientationSource(" << fw0 << "," << fw1 << ")" << endl; - if (!Lcompute && !Rcompute) return; - curr_al.push_back(fw0); - curr_al.push_back(fw1); - *oril=0; - *orir=0; - int lr=0; - if (oris_hash.find(curr_al)==oris_hash.end()) { - // Find first aligned word N0 to the left of fw - int N0=fw0-1; - while (N0>=0) { - if (minTSpan(N0)!=MINIMUM_INIT) break; - N0--; - } - int N1=fw1+1; - while (N1<_J) { - if (minTSpan(N1)!=MINIMUM_INIT) break; - N1++; - } - if (minTSpan(fw0)==MINIMUM_INIT && minTSpan(fw1)==MINIMUM_INIT) { - fw0 = N1; fw1 = N0; - //cerr << "minTSpan(fw)==MINIMUM_INIT, thus fw0=" << 
fw0 << ", fw1=" << fw1 << endl; - } - if (DEBUG) cerr << "fw0=" << fw0 << ", fw1=" << fw1 << ", N0=" << N0 << ", N1=" << N1 << endl; - if (maxTSpan(N0)<minTSpan(fw0) || maxTSpan(fw0)<minTSpan(N0)) { - if (DEBUG) cerr << "N0=" << minTSpan(N0) << "-" << maxTSpan(N0); - if (DEBUG) cerr << "fw=" << minTSpan(fw0) << "-" << maxTSpan(fw0) << endl; - int *block = blockTarget(minTSpan(N0),maxTSpan(N0)); - if (block[0]<=fw0 && fw0<=block[1]) *oril=5; - delete block; - if (*oril==0) { - block = blockTarget(minTSpan(fw0),maxTSpan(fw0)); - if (block[0]<=N0 && N0<=block[1]) *oril=5; - delete block; - } - if (*oril==0) { - if (maxTSpan(N0)<minTSpan(fw0)) {// if N0 is monotone - *oril=1; - block = blockTarget(maxTSpan(N0),minTSpan(fw0)-1); - if (block[0] <= fw0 && fw0 <= block[1]) *oril+=2; - delete block; - } else { //if (maxTSpan(fw0)<minTSpan(N0)) { // if NO is non-monotone - *oril=2; - block = blockTarget(maxTSpan(fw0)+1,minTSpan(N0)); - if (block[0] <= fw0 && fw0 <= block[1]) *oril+=2; - delete block; - } - } - } else { - *oril=5; - } - if (DEBUG) cerr << "oril =" << *oril << endl; - // Right neighbor - if (maxTSpan(N1)<minTSpan(fw1) || maxTSpan(fw1)<minTSpan(N1)) { - int* block = blockTarget(minTSpan(N1),maxTSpan(N1)); - if (block[0]<=fw1 && fw1<=block[2]) *orir=5; - delete block; - if (*orir==0) { - block = blockTarget(minTSpan(fw1),maxTSpan(fw1)); - if (block[0]<=N1 && N1 <=block[1]) *orir=5; - delete block; - } - if (DEBUG) cerr << "N1=" << minTSpan(N1) << "-" << maxTSpan(N1); - if (DEBUG) cerr << "fw1=" << minTSpan(fw1) << "-" << maxTSpan(fw1) << endl; - if (*orir==0) { - if (maxTSpan(fw1)<minTSpan(N1)) { // if N1 is monotone - *orir = 1; - block = blockTarget(maxTSpan(fw1)+1,minTSpan(N1)); - if (block[0] <= fw1 && fw1 <= block[1]) *orir+=2; - delete block; - } else {// if (maxTSpan(N1)<minTSpan(fw1)) { // if N1 is non-monotone - *orir = 2; - block = blockTarget(maxTSpan(N1),minTSpan(fw1)-1); - if (block[0] <= fw1 && fw1 <= block[1]) *orir+=2; - delete block; - } - } - } 
else { - *orir = 5; - } - if (DEBUG) cerr << "orir =" << *orir << endl; - lr = link(*oril,*orir); - oris_hash.insert(pair<vector<int>,int>(curr_al,lr)); - } else { - lr = oris_hash[curr_al]; - } - if (DEBUG) cerr << "Lcompute=" << Lcompute << ", Rcompute=" << Rcompute << endl; - if (Lcompute) *oril = source(lr); - if (Rcompute) *orir = target(lr); - curr_al.pop_back(); - curr_al.pop_back(); -} - -int Alignment::OrientationSource(int* left, int* right) { - if (DEBUG) { - cerr << " OrientationSource("; - cerr << "left="<<left[0]<<","<<left[1]<<","<<left[2]<<","<<left[3]; - cerr << " right="<<right[0]<<","<<right[1]<<","<<right[2]<<","<<right[3]; - cerr << ")" << endl; - } - //if ((right[1]<=left[0]) return 5; - if (!(left[1]<right[0])) return 5; - int ori = 1; - if (right[3]<left[2]) ori=2; - int gapstart = left[3]+1; int gapend = right[2]-1; - if (ori==2) { gapstart = right[3]+1; gapend = left[2]-1; } - for (int j=gapstart; j<=gapend; j++) { - if (sourceOf(j)!=-1) { - ori+=2; break; - } - } - return ori; -} - -void Alignment::OrientationTarget(int fw, int *oril, int *orir, bool Lcompute, bool Rcompute) { - OrientationTarget(fw,fw,oril,orir,Lcompute,Rcompute); -} - -vector<int> Alignment::OrientationTargetLeft4Sampler(int fw) { - return OrientationTargetLeft4Sampler(fw,fw); -} - -vector<int> Alignment::OrientationTargetLeft4Sampler(int fw0, int fw1) { - if (DEBUG) cerr << "OrientationTargetLeft4Sampler " << fw0 << "," << fw1 << endl; - int oril=0; - int N0=fw0-1; - while (N0>=0) { - if (minSSpan(N0)!=MINIMUM_INIT) break; - N0--; - } - int N1=fw1+1; - while (N1<_I) { - if (minSSpan(N1)!=MINIMUM_INIT) break; - N1++; - } - if (minSSpan(fw0)==MINIMUM_INIT && minSSpan(fw1)==MINIMUM_INIT) { - fw0=N1; fw1=N0; - } - if (maxSSpan(N0)<minSSpan(fw0) || maxSSpan(fw0)<minSSpan(N0)) { - int *block = blockSource(minSSpan(N0),maxSSpan(N0)); - if (DEBUG) cerr << "block1[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2]<=fw0 && 
fw0<=block[3]) //source span of fw0 subsumes NO's or the other way around - oril=5; - delete block; - if (oril==0) { - block = blockSource(minSSpan(fw0), maxSSpan(fw0)); - if (DEBUG) cerr << "block2[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2] <= N0 && N0 <= block[3]) oril=5; - delete block; - } - if (oril==0) { - if (maxSSpan(N0)<minSSpan(fw0)) {// if N0 is monotone - oril=1; - block = blockSource(maxSSpan(N0),minSSpan(fw0)-1); - if (DEBUG) cerr << "block3[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2] <= fw0 && fw0 <= block[3]) oril+=2; - delete block; - } else { // (maxSSpan(fw0)<minSSpan(N0)) // if NO is non-monotone - oril=2; - block = blockSource(maxSSpan(fw0)+1,minSSpan(N0)); - if (DEBUG) cerr << "block4[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2] <= fw0 && fw0 <= block[3]) oril+=2; - delete block; - } - } - } else { //source span of fw0 subsumes NO's or the other way around - oril=5; - } - if (DEBUG) cerr << "oril = " << oril << endl; - int* block = blockSource(N0,fw0); - vector<int> ret; - ret.push_back(oril); ret.push_back(block[0]); ret.push_back(block[1]); - ret.push_back(block[2]); ret.push_back(block[3]); - delete block; - return ret; -} - -vector<int> Alignment::OrientationTargetRight4Sampler(int fw) { - return OrientationTargetRight4Sampler(fw,fw); -} - -vector<int> Alignment::OrientationTargetRight4Sampler(int fw0, int fw1) { - if (DEBUG) cerr << "OrientationTargetRight4Sampler " << fw0 << "," << fw1 << endl; - int orir=0; - int N0=fw0-1; - while (N0>=0) { - if (minSSpan(N0)!=MINIMUM_INIT) break; - N0--; - } - int N1=fw1+1; - while (N1<_I) { - if (minSSpan(N1)!=MINIMUM_INIT) break; - N1++; - } - if (minSSpan(fw0)==MINIMUM_INIT && minSSpan(fw1)==MINIMUM_INIT) { - fw0=N1; fw1=N0; - } - if (maxSSpan(N1)<minSSpan(fw1) || maxSSpan(fw1)<minSSpan(N1)) { - int *block = blockSource(minSSpan(N1),maxSSpan(N1)); - 
if (block[2]<=fw1 && fw1<=block[3]) orir=5; - delete block; - if (orir==0) { - block = blockSource(minSSpan(fw1),maxSSpan(fw1)); - if (block[2] <= N1 && N1 <= block[3]) orir=5; - delete block; - } - if (orir==0) { - if (maxSSpan(fw1)<minSSpan(N1)) { // if N1 is monotone - orir=1; - block = blockSource(maxSSpan(fw1)+1,minSSpan(N1)); - if (block[2] <= fw1 && fw1 <= block[3]) orir+=2; - delete block; - } else { //if (maxSSpan(N1)<minSSpan(fw1)) { // if N1 is non-monotone - orir=2; - block = blockSource(maxSSpan(N1),minSSpan(fw1)-1); - if (block[2] <= fw1 && fw1 <= block[3]) orir+=2; - delete block; - } - } - } else { - orir=5; - } - if (DEBUG) cerr << "orir = " << orir << endl; - int* block = blockSource(fw1,N1); - vector<int> ret; - ret.push_back(orir); ret.push_back(block[0]); ret.push_back(block[1]); - ret.push_back(block[2]); ret.push_back(block[3]); - delete block; - return ret; - -} - -void Alignment::OrientationTarget(int fw0, int fw1, int*oril, int*orir, bool Lcompute, bool Rcompute) { - if (DEBUG) cerr << "OrientationTarget " << fw0 << "," << fw1 << endl; - // Left Neighbor - if (!Lcompute && !Rcompute) return; - *oril=0; - *orir=0; - curr_al.push_back(fw0); - curr_al.push_back(fw1); - int lr = 0; - if (orit_hash.find(curr_al)==orit_hash.end()) { - // Find first aligned word N0 to the left of fw - //int fw0 = fw; int fw1 = fw; - int N0=fw0-1; - while (N0>=0) { - if (minSSpan(N0)!=MINIMUM_INIT) break; - N0--; - } - int N1=fw1+1; - while (N1<_I) { - if (minSSpan(N1)!=MINIMUM_INIT) break; - N1++; - } - if (minSSpan(fw0)==MINIMUM_INIT && minSSpan(fw1)==MINIMUM_INIT) { - fw0=N1; fw1=N0; - } - if (DEBUG) { - cerr << "fw0:" << fw0 << ", fw1:" << fw1 << ", N0:" << N0 << ", N1:" << N1 << endl ; - cerr << "minSSpan(N0)=" << minSSpan(N0) << " maxSSpan(N0)=" << maxSSpan(N0); - cerr << " minSSpan(fw0)="<< minSSpan(fw0) << " maxSSpan(fw0)=" << maxSSpan(fw0) << endl; - cerr << "minSSpan(fw1)=" << minSSpan(fw1) << " maxSSpan(fw1)=" << maxSSpan(fw1); - cerr << " 
minSSpan(N1)="<< minSSpan(N1) << " maxSSpan(N1)=" << maxSSpan(N1) << endl; - } - if (maxSSpan(N0)<minSSpan(fw0) || maxSSpan(fw0)<minSSpan(N0)) { - int *block = blockSource(minSSpan(N0),maxSSpan(N0)); - if (DEBUG) cerr << "block1[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2]<=fw0 && fw0<=block[3]) //source span of fw0 subsumes NO's or the other way around - *oril=5; - delete block; - if (*oril==0) { - block = blockSource(minSSpan(fw0), maxSSpan(fw0)); - if (DEBUG) cerr << "block2[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2] <= N0 && N0 <= block[3]) *oril=5; - delete block; - } - if (*oril==0) { - if (maxSSpan(N0)<minSSpan(fw0)) {// if N0 is monotone - *oril=1; - block = blockSource(maxSSpan(N0),minSSpan(fw0)-1); - if (DEBUG) cerr << "block3[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2] <= fw0 && fw0 <= block[3]) *oril+=2; - delete block; - } else { // (maxSSpan(fw0)<minSSpan(N0)) // if NO is non-monotone - *oril=2; - block = blockSource(maxSSpan(fw0)+1,minSSpan(N0)); - if (DEBUG) cerr << "block4[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (block[2] <= fw0 && fw0 <= block[3]) *oril+=2; - delete block; - } - } - } else { //source span of fw0 subsumes NO's or the other way around - *oril=5; - } - if (DEBUG) cerr << "oril = " << *oril << endl; - // Right Neighbor - if (maxSSpan(N1)<minSSpan(fw1) || maxSSpan(fw1)<minSSpan(N1)) { - int *block = blockSource(minSSpan(N1),maxSSpan(N1)); - if (block[2]<=fw1 && fw1<=block[3]) *orir=5; - delete block; - if (*orir==0) { - block = blockSource(minSSpan(fw1),maxSSpan(fw1)); - if (block[2] <= N1 && N1 <= block[3]) *orir=5; - delete block; - } - if (*orir==0) { - if (maxSSpan(fw1)<minSSpan(N1)) { // if N1 is monotone - *orir=1; - block = blockSource(maxSSpan(fw1)+1,minSSpan(N1)); - if (block[2] <= fw1 && fw1 <= block[3]) *orir+=2; - delete block; - 
} else { //if (maxSSpan(N1)<minSSpan(fw1)) { // if N1 is non-monotone - *orir=2; - block = blockSource(maxSSpan(N1),minSSpan(fw1)-1); - if (block[2] <= fw1 && fw1 <= block[3]) *orir+=2; - delete block; - } - } - } else { - *orir=5; - } - if (DEBUG) cerr << "orir = " << *orir << endl; - lr = link(*oril,*orir); - orit_hash.insert(pair<vector<int>,int>(curr_al,lr)); - } else { - lr = orit_hash[curr_al]; - } - if (DEBUG) cerr << "Lcompute=" << Lcompute << ", Rcompute=" << Rcompute << endl; - if (DEBUG) cerr << "lr=" << lr << ", l=" << source(lr) << ", r=" << target(lr) << endl; - if (Lcompute>0) *oril=source(lr); - if (Rcompute>0) *orir=target(lr); - curr_al.pop_back(); - curr_al.pop_back(); -} - -int* Alignment::blockSource(int idx1, int idx2) { -// outputs a minimal block [s1,s2,t1,t2] that contains idx1 and idx2, where idx1 <= idx2 - if (DEBUG) cerr << "blockSource[" << idx1 << "," << idx2 << "]" << endl; - int *curr = new int[4]; - curr[0]=idx1; curr[1]=idx2; curr[2]=MINIMUM_INIT; curr[3]=MAXIMUM_INIT; - for (int j=curr[0]; j<=curr[1]; j++) { - curr[2] = least(curr[2],_tSpan[j][0]); - curr[3] = most(curr[3],_tSpan[j][1]); - } - int next[4]; - next[0]=curr[0]; next[1]=curr[1]; - for (int i=curr[2]; i<=curr[3]; i++) { - next[0] = least(next[0],_sSpan[i][0]); - next[1] = most(next[1],_sSpan[i][1]); - } - next[2] = curr[2]; next[3]= curr[3]; - int idx=1; - do { - // update the current - for (int j=next[0]; j<curr[0]; j++) { - curr[2] = least(curr[2],_tSpan[j][0]); - curr[3] = most(curr[3],_tSpan[j][1]); - } - for (int j=curr[1]+1; j<=next[1]; j++) { - curr[2] = least(curr[2],_tSpan[j][0]); - curr[3] = most(curr[3],_tSpan[j][1]); - } - curr[0] = next[0]; curr[1] = next[1]; - if (curr[2]==next[2] && curr[3]==next[3]) break; - // prepare for the next - for (int i=curr[2]; i<next[2]; i++) { - next[0]= least(next[0],_sSpan[i][0]); - next[1]= most(next[1],_sSpan[i][1]); - } - for (int i=next[3]+1; i<=curr[3]; i++) { - next[0] = least(next[0],_sSpan[i][0]); - next[1] = 
most(next[1],_sSpan[i][1]); - } - next[2] = curr[2]; next[3]= curr[3]; - idx++; - } while(1); - return curr; -} - -int* Alignment::blockTarget(int idx1, int idx2) { -// outputs a minimal [s1,s2,t1,t2] that contains idx1 and idx2, where idx1<=idx2 - int *curr = new int[4]; - curr[0]=MINIMUM_INIT; curr[1]=MAXIMUM_INIT; curr[2]=idx1; curr[3]=idx2; - for (int i=curr[2]; i<=curr[3]; i++) { - curr[0] = least(curr[0],_sSpan[i][0]); - curr[1] = most(curr[1],_sSpan[i][1]); - } - int next[4]; - next[2]=curr[2]; next[3]=curr[3]; - for (int j=curr[0]; j<=curr[1]; j++) { - next[2] = least(next[2],_tSpan[j][0]); - next[3] = most(next[3],_tSpan[j][1]); - } - next[0] = curr[0]; next[1]= curr[1]; - int idx=1; - do { - // update the current - for (int i=next[2]; i<curr[2]; i++) { - curr[0] = least(curr[0],_sSpan[i][0]); - curr[1] = most(curr[1],_sSpan[i][1]); - } - for (int i=curr[3]+1; i<=next[3]; i++) { - curr[0] = least(curr[0],_sSpan[i][0]); - curr[1] = most(curr[1],_sSpan[i][1]); - } - curr[2] = next[2]; curr[3] = next[3]; - if (curr[0]==next[0] && curr[1]==next[1]) break; - // prepare for the next - for (int j=curr[0]; j<next[0]; j++) { - next[2]= least(next[2],_tSpan[j][0]); - next[3]= most(next[3],_tSpan[j][1]); - } - for (int j=next[1]+1; j<=curr[1]; j++) { - next[2] = least(next[2],_tSpan[j][0]); - next[3] = most(next[3],_tSpan[j][1]); - } - next[0] = curr[0]; next[1]= curr[1]; - idx++; - } while(1); - return curr; -} - -int Alignment::firstSourceAligned(int start) { - for (int j=start; j<_J; j++) - if (_tSpan[j][0]!=MINIMUM_INIT) return j; - return -1; -} - -int Alignment::lastSourceAligned(int end) { - for (int j=end; j>=0; j--) - if (_tSpan[j][0]!=MINIMUM_INIT) return j; - return -1; -} - -int Alignment::firstTargetAligned(int start) { - for (int i=start; i<_I; i++) - if (_sSpan[i][0]!=MINIMUM_INIT) return i; - return -1; -} - -int Alignment::lastTargetAligned(int end) { - for (int i=end; i>=0; i--) - if (_sSpan[i][0]!=MINIMUM_INIT) return i; - return -1; -} - -void 
Alignment::BorderingSFWsOnly() { -// removes the record of all function word alignments, except those at the borders -// the number of alignments kept may be more than two -// i.e. where the leftmost / the rightmost alignments are unaligned. -// In such cases, this function continues keeping function word alignments until the -// first (or last) alignment words. - if (SourceFWIdxs[0]>2) { - int firstCut = 1; - for (int j=2; j<=SourceFWIdxs[0]; j++) { - if (SourceFWIdxs[3*j-2]>fas) break; - firstCut=j; - } - int lastCut = SourceFWIdxs[0]; - for (int j=SourceFWIdxs[0]-1; j>=0; j--) { - if (SourceFWIdxs[3*j-2]<las) break; - lastCut=j; - } - if (firstCut>=lastCut) return; - int delta = 0; - for (int j=lastCut; j<=SourceFWIdxs[0]; j++) { - delta++; - SourceFWIdxs[3*(firstCut+delta)-2]=SourceFWIdxs[3*j-2]; - SourceFWIdxs[3*(firstCut+delta)-1]=SourceFWIdxs[3*j-1]; - SourceFWIdxs[3*(firstCut+delta)] =SourceFWIdxs[3*j]; - } - SourceFWIdxs[0]=firstCut+delta; - } -} - -void Alignment::BorderingTFWsOnly() { -// similar to BorderingSFWsOnly() except this looks at the source side. 
- if (TargetFWIdxs[0]>2) { - int firstCut = 1; - for (int j=2; j<=TargetFWIdxs[0]; j++) { - if (TargetFWIdxs[3*j-2]>fat) break; - firstCut=j; - } - int lastCut = TargetFWIdxs[0]; - for (int j=TargetFWIdxs[0]-1; j>=0; j--) { - if (TargetFWIdxs[3*j-2]<lat) break; - lastCut=j; - } - if (firstCut>=lastCut) return; - int delta = 0; - for (int j=lastCut; j<=TargetFWIdxs[0]; j++) { - delta++; - TargetFWIdxs[3*(firstCut+delta)-2]=TargetFWIdxs[3*j-2]; - TargetFWIdxs[3*(firstCut+delta)-1]=TargetFWIdxs[3*j-1]; - TargetFWIdxs[3*(firstCut+delta)] =TargetFWIdxs[3*j]; - } - TargetFWIdxs[0]=firstCut+delta; - } -} - -void Alignment::FillFWIdxsState(int* state, int fas, int las, int fat, int lat) { - if (DEBUG) cerr << "FillFWIdxsState ("<< fas <<","<< las<<"," << fat <<"," << lat << ")" << endl; - if (fas==las) las+=1; - if (fat==lat) lat+=1; - for (int idx=0; idx<12; idx++) state[idx]=-1; - if (SourceFWIdxs[0]<=2) { - if (SourceFWIdxs[0]>=1) {state[0]=SourceFWIdxs[1]; state[1]=SourceFWIdxs[2]; state[2]=SourceFWIdxs[3];} - if (SourceFWIdxs[0]==2) {state[3]=SourceFWIdxs[4]; state[4]=SourceFWIdxs[5]; state[5]=SourceFWIdxs[6];} - } else { - if (SourceFWIdxs[1]>fas) { - state[0]=SourceFWIdxs[1]; state[1]=SourceFWIdxs[2]; state[2]=SourceFWIdxs[3]; - } else { - ostringstream issf; ostringstream isse; - for (int idx=1; idx<=SourceFWIdxs[0]; idx++) { - if (SourceFWIdxs[3*idx-2]>las) break; - if (idx>1) { issf << " "; isse << " ";}; - issf << TD::Convert(SourceFWIdxs[3*idx-1]); - isse << TD::Convert(SourceFWIdxs[3*idx]); - state[0]=SourceFWIdxs[3*idx-2]; - if (state[0]>=fas) break; - } - if (state[0]>=0) { - state[1]=TD::Convert(issf.str())*-1; state[2]=TD::Convert(isse.str()); //multiplying source with -1 as marker - } - } - if (SourceFWIdxs[SourceFWIdxs[0]*3-2]==las) { - state[3]=SourceFWIdxs[SourceFWIdxs[0]*3-2]; - state[4]=SourceFWIdxs[SourceFWIdxs[0]*3-1]; - state[5]=SourceFWIdxs[SourceFWIdxs[0]*3]; - } else { - int lastCut = SourceFWIdxs[0]; - for (int j=lastCut-1; j>=state[0]+1; j--) 
{ - if (SourceFWIdxs[3*j-2]==state[0]) break; - if (SourceFWIdxs[3*j-2]<las) break; - lastCut=j; - } - state[3]=SourceFWIdxs[3*lastCut-2]; - ostringstream issf; ostringstream isse; - for (int idx=lastCut; idx<=SourceFWIdxs[0]; idx++) { - if (idx>lastCut) { issf << " "; isse << " ";}; - issf << TD::Convert(SourceFWIdxs[3*idx-1]); - isse << TD::Convert(SourceFWIdxs[3*idx]); - } - if (state[3]>=0) { - //multiplying source with -1 as compound marker - state[4]=TD::Convert(issf.str())*-1; state[5]=TD::Convert(isse.str()); - } - } - } - if (TargetFWIdxs[0]<=2) { - if (TargetFWIdxs[0]>=1) {state[6]=TargetFWIdxs[1]; state[7]=TargetFWIdxs[2]; state[8]=TargetFWIdxs[3];} - if (TargetFWIdxs[0]==2) {state[9]=TargetFWIdxs[4]; state[10]=TargetFWIdxs[5]; state[11]=TargetFWIdxs[6];} - } else { - if (TargetFWIdxs[1]>fat) { //shouldn't come here if SetTargetBorderingFW is invoked - state[6]=TargetFWIdxs[1]; state[7]=TargetFWIdxs[2]; state[8]=TargetFWIdxs[3]; - } else { - ostringstream issf; ostringstream isse; - for (int idx=1; idx<=TargetFWIdxs[0]; idx++) { - if (TargetFWIdxs[3*idx-2]>fat) break; - if (idx>1) { issf << " "; isse << " ";}; - issf << TD::Convert(TargetFWIdxs[3*idx-1]); - isse << TD::Convert(TargetFWIdxs[3*idx]); - state[6]=TargetFWIdxs[3*idx-2]; - } - state[7]=TD::Convert(issf.str()); state[8]=TD::Convert(isse.str())*-1; - //multiplying target with -1 as compound marker - } - if (TargetFWIdxs[TargetFWIdxs[0]*3-2]==lat) { - state[9]=TargetFWIdxs[TargetFWIdxs[0]*3-2]; - state[10]=TargetFWIdxs[TargetFWIdxs[0]*3-1]; - state[11]=TargetFWIdxs[TargetFWIdxs[0]*3]; - } else { - int lastCut = TargetFWIdxs[0]; - for (int j=lastCut-1; j>=1; j--) { - if (TargetFWIdxs[3*j-2]<=state[9]) break; - if (TargetFWIdxs[3*j-2]<lat) break; - lastCut=j; - } - state[9]=TargetFWIdxs[3*lastCut-2]; - ostringstream issf; ostringstream isse; - for (int idx=lastCut; idx<=TargetFWIdxs[0]; idx++) { - if (idx>lastCut) issf << " "; isse << " ";; - issf << TD::Convert(TargetFWIdxs[3*idx-1]); - isse << 
TD::Convert(TargetFWIdxs[3*idx]); - } - state[10]=TD::Convert(issf.str()); state[11]=TD::Convert(isse.str())*-1; - } - } -} - -void Alignment::simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings) { -// given a *block*, see whether its target span contains any index inside *danglings*. -// if yes, break it; otherwise, keep it. put the result(s) to *blocks* - if (DEBUG) cerr << "simplifyBackward[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << "]" << endl; - if (DEBUG) for (int i=0; i<danglings.size(); i++) cerr << "danglings[" << i << "] = " << danglings[i] << endl; - if (danglings.size()==0) { - blocks->push_back(block); - if (DEBUG) cerr << "pushing(0) " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - return; - } - int currIdx = block[2]; - int i_dangling = 0; - while (block[2]>danglings[i_dangling]) { - if (i_dangling+1 >= danglings.size()) break; - i_dangling++; - } - while (danglings[i_dangling]==currIdx) { - i_dangling++; - currIdx++; - } - /*if (i_dangling>=danglings.size() && currIdx) { - blocks->push_back(block); - if (DEBUG) cerr << "pushing(1) " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - return; - } - if (block[3]<danglings[i_dangling]) { - blocks->push_back(block); - if (DEBUG) cerr << "pushing(2) " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - return; - }*/ - if (DEBUG) cerr << "i_dangling = " << i_dangling << endl; - int anchorIdx = danglings[i_dangling]; - if (i_dangling+1>=danglings.size() || anchorIdx>block[3]+1) anchorIdx=block[3]+1; - if (DEBUG) cerr << "anchorIdx = " << anchorIdx << ", currIdx = " << currIdx << endl; - do { - while(currIdx<anchorIdx) { - if (DEBUG) cerr << "currIdx = " << currIdx << ", anchorIdx = " << anchorIdx << endl; - bool isMoved = false; - for (int idx=anchorIdx-1; idx>=currIdx; idx--) { - int *nublock = blockTarget(currIdx,idx); - if (nublock[2]==currIdx && 
nublock[3]==idx) { - if (nublock[0]!=MINIMUM_INIT) { - blocks->push_back(nublock); - if (DEBUG) cerr << "pushing(3) " << nublock[0] << "," << nublock[1] << "," << nublock[2] << "," << nublock[3] << endl; - } else { - delete nublock; - } - isMoved = true; - currIdx=idx+1; break; - } else { - delete nublock; - } - } - if (DEBUG) cerr << "isMoved=" << isMoved << ", currIdx=" << currIdx << endl; - if (!isMoved) { - int source = sourceOf(currIdx); - while (source>=0) { - if (source >= block[0]) { - int* nublock = new int[4]; - nublock[0]=source; nublock[1]=source; nublock[2]=currIdx; nublock[3]=currIdx; - blocks->push_back(nublock); - if (DEBUG) cerr << "pushing(4) " << nublock[0] << "," << nublock[1] << "," << nublock[2] << "," << nublock[3] << endl; - } - source = sourceOf(currIdx,source+1); - } - currIdx++; - } - } - currIdx=anchorIdx+1; - anchorIdx=block[3]+1; - if (i_dangling+1<danglings.size()) anchorIdx=danglings[++i_dangling]; - } while(currIdx<=block[3]); -} - -void Alignment::simplify(int* ret) { - // the idea is to create blocks of maximal consistent alignment in between a pair of function words - // exceptional cases include: one to non-contiguous many (or vice versa) -> treat this as one alignment each - // record all function word alignments first, important because it may be unaligned - // return true if it's truly simple (no function word alignment involves); false, otherwise - if (DEBUG) cerr << "begin simplify" << endl; - reset(0,0); reset(_J-1,_I-1); // remove the phrase boundary alignments, NEED TO CHECK AGAIN !!! 
- if (SourceFWIdxs[0]+TargetFWIdxs[0]==0) { // return singleton - if (DEBUG) cerr << "no function words" << endl; - for (int idx=0; idx<12; idx++) ret[idx]=-1; - ret[12]=1; ret[13]=0; ret[14]=0; // 0-0 - FillFWIdxsState(ret,0,0,0,0); - return; - } - curr_al.insert(curr_al.begin(),curr_al.size()); - curr_al.push_back(SourceFWIdxs[0]); - for (int i=1; i<=SourceFWIdxs[0]; i++) curr_al.push_back(SourceFWIdxs[3*i-2]); - curr_al.push_back(TargetFWIdxs[0]); - for (int i=1; i<=TargetFWIdxs[0]; i++) curr_al.push_back(TargetFWIdxs[3*i-2]); - vector<int> el; - if (simplify_hash.find(curr_al)==simplify_hash.end()) { - if (DEBUG) { - cerr << "SourceFWIdxs:" << SourceFWIdxs[0] << endl; - for (int i=1; i<=SourceFWIdxs[0]; i++) - cerr << SourceFWIdxs[3*i-2] << "," << SourceFWIdxs[3*i-1] << "," << SourceFWIdxs[3*i] << endl; - cerr << "TargetFWIdxs:" << TargetFWIdxs[0] << endl; - for (int i=1; i<=TargetFWIdxs[0]; i++) { - cerr << TargetFWIdxs[3*i-2] << "," << TargetFWIdxs[3*i-1] << "," << TargetFWIdxs[3*i] << endl; - } - } - - vector< int* > blocks; // each element contains s1,s2,t1,t2 - int currIdx = 1; // start from 1 to avoid considering phrase start - std::set<int> FWIdxs; - std::vector<int> DanglingTargetFWIdxs; - for (int i=1; i<= SourceFWIdxs[0]; i++) FWIdxs.insert(SourceFWIdxs[3*i-2]); - for (int i=1; i<= TargetFWIdxs[0]; i++) { - int source = sourceOf(TargetFWIdxs[3*i-2]); - if (source>=0) { - do { - FWIdxs.insert(source); - source = sourceOf(TargetFWIdxs[3*i-2],source+1); - } while(source >=0); - } else { - int *block = new int[4]; - block[0]=-1; block[1]=-1; block[2]=TargetFWIdxs[3*i-2]; block[3]=TargetFWIdxs[3*i-2]; - blocks.push_back(block); - if (DEBUG) cerr << "pushing[1] " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - DanglingTargetFWIdxs.push_back(TargetFWIdxs[3*i-2]); - } - } - if (DEBUG) - for (std::set<int>::const_iterator iter=FWIdxs.begin(); iter!=FWIdxs.end(); iter++) { - cerr << "FWIdxs=" << *iter << endl; - } - 
std::set<int>::const_iterator currFWIdx = FWIdxs.begin(); - if (currFWIdx == FWIdxs.end()) { - int* block = new int[4]; - block[0]=1; block[1]=_J-2; block[2]=1; block[3]=_I-2; // no need to consider phrase boundaries - simplifyBackward(&blocks,block,DanglingTargetFWIdxs); - } else { - int anchorIdx = *currFWIdx; // also used to denote _J+1 - do { - // add alignments whose source from currIdx to currFWIdx-1 - while (currIdx<anchorIdx) { - bool isMoved = false; - //cerr << "anchorIdx = " << anchorIdx << ", currIdx = " << currIdx << endl; - for (int idx=anchorIdx-1; idx>=currIdx; idx--) { - int* block = blockSource(currIdx,idx); - if (block[0]==currIdx&&block[1]==idx) { - if (block[2]!=MINIMUM_INIT) { // must be aligned - simplifyBackward(&blocks,block,DanglingTargetFWIdxs); - } else { - delete block; - } - currIdx = idx+1; isMoved = true; - break; - } else { - delete block; - } - } - if (!isMoved) { - int target = targetOf(currIdx); - while (target>=0) { - int* block = new int[4]; - block[0]=currIdx; block[1]=currIdx; block[2]=target; block[3]=target; - blocks.push_back(block); - if (DEBUG) cerr << "pushing[2] " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - target = targetOf(currIdx,target+1); - } - currIdx++; - } - } - // add function word alignments (anchorIdx) - if (anchorIdx==getJ()) break; - int target = targetOf(anchorIdx); - do { - int* block = new int[4]; - block[0]=anchorIdx; block[1]=anchorIdx; block[2]=target; block[3]=target; - blocks.push_back(block); - if (DEBUG) cerr << "pushing[3] " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl; - if (target>=0) target = targetOf(anchorIdx,target+1); - } while (target>=0); - // advance indexes - currIdx = anchorIdx+1; - anchorIdx = getJ()-1; // was minus 2 - if (++currFWIdx!=FWIdxs.end()) anchorIdx = *currFWIdx; - } while (currIdx<=getJ()-2); - } - - - vector<int> source_block_mapper(getJ(),-1); - vector<int> target_block_mapper(getI(),-1); - for (int i 
= 0; i<blocks.size(); i++) { - if (DEBUG) cerr << "blocks[" << i << "]=" << blocks[i][0] << "," << blocks[i][1] << "," << blocks[i][2] << "," << blocks[i][3] << endl; - if (blocks[i][0]>=0) source_block_mapper[blocks[i][0]]=1; - if (blocks[i][2]>=0) target_block_mapper[blocks[i][2]]=1; - } - int curr = 1; - int prev = -1; - for (int idx=0; idx<source_block_mapper.size(); idx++) { - if (source_block_mapper[idx]>0) { - source_block_mapper[idx]=curr++; - prev = curr; - } else { - source_block_mapper[idx]=prev; - } - } - curr = 1; - for (int idx=0; idx<target_block_mapper.size(); idx++) { - if (target_block_mapper[idx]>0) { - target_block_mapper[idx]=curr++; - prev = curr; - } else { - target_block_mapper[idx]=prev; - } - } - - //assert(blocks.size()<=50); - if (DEBUG) cerr << "resulting alignment:" << endl; - for (int i = 0; i<blocks.size(); i++) { - if (blocks[i][2]<0 || blocks[i][0]<0) continue; - int source = source_block_mapper[blocks[i][0]]-1; - int target = target_block_mapper[blocks[i][2]]-1; - el.push_back(link(source,target)); - if (DEBUG) cerr << source << "-" << target << " "; - } - el.insert(el.begin(),el.size()); - if (DEBUG) cerr << endl; - el.push_back(SourceFWIdxs[0]); - for (int idx=1; idx<=SourceFWIdxs[0]; idx++) { - if (DEBUG) cerr << "SourceFWIdxs[" << (3*idx-2) << "] from " << SourceFWIdxs[3*idx-2] << endl; - el.push_back(source_block_mapper[SourceFWIdxs[3*idx-2]]-1); - } - el.push_back(TargetFWIdxs[0]); - for (int idx=1; idx<=TargetFWIdxs[0]; idx++) { - if (DEBUG) cerr << "TargetFWIdxs[" << (3*idx-2) << "] from " << TargetFWIdxs[3*idx-2] << endl; - el.push_back(target_block_mapper[TargetFWIdxs[3*idx-2]]-1); - } - el.push_back(source_block_mapper[fas]-1); - el.push_back(source_block_mapper[las]-1); - el.push_back(target_block_mapper[fat]-1); - el.push_back(target_block_mapper[lat]-1); - if (DEBUG) { - cerr << "insert key:el = "; - for (int ii=0; ii<el.size(); ii++) - cerr << ii << "." 
<< el[ii] << " "; - cerr << " || " << endl; - } - if (DEBUG) cerr << "trying to insertL " << endl; - if (DEBUG) { - cerr << "size=" << curr_al.size() << " "; - for (int ii=0; ii<curr_al.size(); ii++) cerr << "curr_al[" << ii << "]=" << curr_al[ii] << " "; - cerr << endl; - } - simplify_hash.insert(pair<vector<int>, vector<int> > (curr_al,el)); - if (DEBUG) cerr << "inserted" << endl; - } else { - el = simplify_hash[curr_al]; - } - if (DEBUG) { - cerr << "pull key:el = "; - for (int ii=0; ii<el.size(); ii++) - cerr << ii << "." << el[ii] << " "; - cerr << endl; - } - ret[12] = el[0]; - for (int i=1; i<=el[0]; i++) ret[12+i] = el[i]; - int istart = el[0]+1; - assert(el[istart]==SourceFWIdxs[0]); - for (int i=1; i<=el[istart]; i++) SourceFWIdxs[3*i-2]=el[istart+i]; - istart += el[istart]+1; - assert(el[istart]==TargetFWIdxs[0]); - for (int i=1; i<=el[istart]; i++) TargetFWIdxs[3*i-2]=el[istart+i]; - istart += el[istart]+1; - FillFWIdxsState(ret,el[istart],el[istart+1],el[istart+2],el[istart+3]); -} - -void Alignment::simplify_nofw(int* ret) { - for (int i=0; i<12; i++) ret[i]=-1; - ret[12]=1; ret[13]=0; -} - -void Alignment::sort(int* num) { - if (num[0]>1) quickSort(num,1,num[0]); -} - -void Alignment::quickSort(int arr[], int left, int right) { - int i = left, j = right; - int tmp1,tmp2,tmp3; - int mid = (left + right) / 2; - int pivot = arr[3*mid-2]; - - /* partition */ - while (i <= j) { - while (arr[3*i-2] < pivot) i++; - while (arr[3*j-2] > pivot) j--; - if (i <= j) { - tmp1 = arr[3*i-2]; tmp2 = arr[3*i-1]; tmp3 = arr[3*i]; - arr[3*i-2] = arr[3*j-2]; arr[3*i-1] = arr[3*j-1]; arr[3*i] = arr[3*j]; - arr[3*j-2] = tmp1; arr[3*j-1] = tmp2; arr[3*j] = tmp3; - i++; - j--; - } - }; - - /* recursion */ - if (left < j) quickSort(arr, left, j); - if (i < right) quickSort(arr, i, right); -} - -double Alignment::ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2) { - string source = TD::Convert(cond1); - string sourceidx; - if 
(table.mode == 1) { - sourceidx = source; - int slashidx = sourceidx.find_last_of("/"); - source = sourceidx.substr(0,slashidx); - string idx = sourceidx.substr(slashidx+1); - if (DEBUG) cerr << " sourceidx = " << sourceidx << ", idx = " << idx << endl; - if (idx == "X") { - if (DEBUG) cerr << " idx == X, returning 0" << endl; - return 0; - } - } - string target = TD::Convert(cond2); - if (DEBUG) cerr << "sourceidx='" << sourceidx << "', source='" << source << "', target='" << target << "'" << endl; - double count = table.ultimate[offset+ori-1]; - double total = table.ultimate[offset+5]; - double alpha = 0.1; - double prob = count/total; - if (DEBUG) cerr << "level0 " << count << "/" << total << "=" << prob << endl; - - WordID key_id = (table.mode!=1) ? cond1 : TD::Convert(source); - map<WordID,int*>::const_iterator it = table.model.find(key_id); - bool stop = (it==table.model.end()); - if (!stop) { - stop=true; - if (it->second[offset+5]>=0) { - count = it->second[offset+ori-1] + alpha * prob; - total = it->second[offset+5] + alpha; - prob = count/total; - stop = false; - if (DEBUG) cerr << "level1 " << count << "/" << total << "=" << prob << endl; - } - } - if (stop) return prob; - - string key = source + " " + target; - it = table.model.find(TD::Convert(key)); - stop = (it==table.model.end()); - if (!stop) { - stop = true; - if (it->second[offset+5]>=0) { - count = it->second[offset+ori-1] + alpha * prob; - total = it->second[offset+5] + alpha; - prob = count/total; - stop = false; - if (DEBUG) cerr << "level2 " << count << "/" << total << "=" << prob << endl; - } - } - - if (stop || table.mode!=1) return prob; - - key = sourceidx + " " + target; - it = table.model.find(TD::Convert(key)); - if (it!=table.model.end()) { - if (it->second[offset+5]>=0) { - count = it->second[offset+ori-1] + alpha * prob; - total = it->second[offset+5] + alpha; - prob = count/total; - if (DEBUG) cerr << "level3 " << count << "/" << total << "=" << prob << endl; - } - } - - return 
prob; -} - -void Alignment::ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, - double alpha1, double beta1) { - if (DEBUG) cerr << "ScoreOrientation:" << TD::Convert(cond1) << "," << TD::Convert(cond2) << ", alpha1 = " << alpha1 << ", beta1 = " << beta1 << endl; - double ret = ScoreOrientation(table,offset,ori,cond1,cond2); - if (isBonus) { - if (table.mode == 0) *bonus += log(ret); else *bonus += ret; - } else { - if (table.mode == 0) *cost += log(ret); else *cost += ret; - } -} - -double Alignment::ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond2) { - double ret = ScoreOrientation(table,0,ori,cond1,cond2); - if (table.mode == 0) return log(ret); - return ret; -} - -double Alignment::ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond2) { - double ret = ScoreOrientation(table,12,ori,cond1,cond2); - if (table.mode == 0) return log(ret); - return ret; -} - -double Alignment::ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2) { - double ret = ScoreOrientation(table,6,ori,cond1,cond2); - if (table.mode == 0) return log(ret); - return ret; -} - -double Alignment::ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2) { - double ret = ScoreOrientation(table,18,ori,cond1,cond2); - if (table.mode == 0) return log(ret); - return ret; -} - -void Alignment::ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) { - if (DEBUG) cerr << "ScoreOrientationLeft(" << isBonus << ")" << endl; - ScoreOrientation(table,0,ori,cond1,cond2,isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha1,beta1); -} - -void 
Alignment::ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) { - if (DEBUG) cerr << "ScoreOrientationLeftBackward" << endl; - ScoreOrientation(table,12,ori,cond1,cond2,isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha1,beta1); -} - -void Alignment::ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) { - if (DEBUG) cerr << "ScoreOrientationRight(" << isBonus << ")" << endl; - ScoreOrientation(table,6,ori,cond1,cond2,isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha1,beta1); -} - -void Alignment::ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) { - if (DEBUG) cerr << "ScoreOrientationRightBackward" << endl; - ScoreOrientation(table,18,ori,cond1,cond2,isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha1,beta1); -} - -void Alignment::computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2) { - if (DEBUG) cerr << "computeOrientationSourceBackward" << endl; - int oril, orir; - for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl; - if (!(SourceFWRuleAbsIdxs[idx]<=maxdepth1 || maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2)) continue; - int* fwblock = blockSource(SourceFWRuleIdxs[3*idx-2],SourceFWRuleIdxs[3*idx-2]); - bool aligned = (fwblock[2]!=MINIMUM_INIT); - if (aligned) { - 
OrientationTarget(fwblock[2],fwblock[3],&oril,&orir); - } else { - OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir); - } - if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl; - bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word - if ((aligned && fwblock[2]<=fat)|| - (!aligned && SourceFWRuleIdxs[3*idx-2]<=fas)) isBonus=true; - if (SourceFWRuleAbsIdxs[idx]<=maxdepth1) { - ostringstream nusource; - nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << SourceFWRuleAbsIdxs[idx]; - ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2) { - ostringstream nusource; - nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1); - ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - isBonus = false; - if ((aligned && lat<=fwblock[3])|| - (!aligned && las<=SourceFWRuleIdxs[3*idx-2])) isBonus=true; - if (SourceFWRuleAbsIdxs[idx]<=maxdepth1) { - ostringstream nusource; - nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << SourceFWRuleAbsIdxs[idx]; - ScoreOrientationRightBackward(table,orir,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2) { - ostringstream nusource; - nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1); - ScoreOrientationRightBackward(table,orir,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - delete fwblock; - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - // antfas -> first aligned source word 
antecedent-wise - // antlas -> last aligned source word antecedent-wise - int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]); - int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]); - int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]); - int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]); - assert(antfat <= antlat); - assert(antfas <= antlas); - for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) - cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl; - if (!(SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1 || maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2)) continue; - int* fwblock = blockSource(SourceFWAntsIdxs[i_ant][3*idx-2],SourceFWAntsIdxs[i_ant][3*idx-2]); - //bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT); - bool aligned = (fwblock[2]!=MINIMUM_INIT); - bool Lcompute = true; bool Rcompute = true; - if (DEBUG) { - cerr << " aligned = " << aligned << endl; - cerr << " fwblock = " << fwblock[0] << "," << fwblock[1] << "," << fwblock[2] << "," << fwblock[3] << endl; - cerr << " antfas=" << antfas << ", antlas=" << antlas << ", antfat=" << antfat << ", antlat=" << antlat << endl; - } - if (aligned) { - if (DEBUG) cerr << "laligned" << endl; - if (antfat<fwblock[2]) { - if (DEBUG) cerr << antfat << "<" << fwblock[2] << endl; - Lcompute=false; - } - } else { - if (DEBUG) cerr << "!laligned" << endl; - if (antfas<fwblock[0] && fwblock[1] < antlas) Lcompute=false; - } - if (aligned) { - if (DEBUG) cerr << "raligned" << endl; - if (fwblock[3]<antlat) { - if (DEBUG) cerr << fwblock[3] << "<" << antlat << endl; - Rcompute=false; - } - } else { - if (DEBUG) cerr << "!raligned" << endl; - if (fwblock[1]<antlas && fwblock[1] < antlas) Rcompute=false; - } - if (!Lcompute && !Rcompute) continue; - if (!aligned) { - OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute,Rcompute); 
- } else { - OrientationTarget(fwblock[2],fwblock[3],&oril,&orir,Lcompute,Rcompute); - } - if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl; - bool isBonus = false; - if (Lcompute) { - if ((aligned && fwblock[3]<=fat) || - (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas)) isBonus = true; - if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1) { - ostringstream nusource; - nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << SourceFWAntsAbsIdxs[i_ant][idx]; - ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2) { - ostringstream nusource; - nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << (-1*(maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1)); - ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - } - isBonus = false; - if (Rcompute) { - if ((aligned && lat<=fwblock[2]) || - (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]))isBonus = true; - if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1) { - ostringstream nusource; - nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << SourceFWAntsAbsIdxs[i_ant][idx]; - ScoreOrientationRightBackward(table,orir,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2) { - ostringstream nusource; - nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << (-1*(maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1)); - ScoreOrientationRightBackward(table,orir,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - } - delete fwblock; - } - } -} - - -void 
Alignment::computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) { - if (DEBUG) cerr << "computeOrientationSourceBackward" << endl; - int oril, orir; - for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl; - int* fwblock = blockSource(SourceFWRuleIdxs[3*idx-2],SourceFWRuleIdxs[3*idx-2]); - bool aligned = (fwblock[2]!=MINIMUM_INIT); - if (aligned) { - OrientationTarget(fwblock[2],fwblock[3],&oril,&orir); - } else { - OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir); - } - if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl; - bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word - if ((aligned && fwblock[2]<=fat)|| - (!aligned && SourceFWRuleIdxs[3*idx-2]<=fas)) isBonus=true; - ScoreOrientationLeftBackward(table,oril,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - isBonus = false; - if ((aligned && lat<=fwblock[3])|| - (!aligned && las<=SourceFWRuleIdxs[3*idx-2])) isBonus=true; - ScoreOrientationRightBackward(table,orir,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - delete fwblock; - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - // antfas -> first aligned source word antecedent-wise - // antlas -> last aligned source word antecedent-wise - int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]); - int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]); - int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]); - int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]); - assert(antfat <= antlat); - assert(antfas <= antlas); - for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) - 
cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl; - int* fwblock = blockSource(SourceFWAntsIdxs[i_ant][3*idx-2],SourceFWAntsIdxs[i_ant][3*idx-2]); - //bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT); - bool aligned = (fwblock[2]!=MINIMUM_INIT); - bool Lcompute = true; bool Rcompute = true; - if (DEBUG) { - cerr << " aligned = " << aligned << endl; - cerr << " fwblock = " << fwblock[0] << "," << fwblock[1] << "," << fwblock[2] << "," << fwblock[3] << endl; - cerr << " antfas=" << antfas << ", antlas=" << antlas << ", antfat=" << antfat << ", antlat=" << antlat << endl; - } - if (aligned) { - if (DEBUG) cerr << "laligned" << endl; - if (antfat<fwblock[2]) { - if (DEBUG) cerr << antfat << "<" << fwblock[2] << endl; - Lcompute=false; - } - } else { - if (DEBUG) cerr << "!laligned" << endl; - if (antfas<fwblock[0] && fwblock[1] < antlas) Lcompute=false; - } - if (aligned) { - if (DEBUG) cerr << "raligned" << endl; - if (fwblock[3]<antlat) { - if (DEBUG) cerr << fwblock[3] << "<" << antlat << endl; - Rcompute=false; - } - } else { - if (DEBUG) cerr << "!raligned" << endl; - if (fwblock[1]<antlas && fwblock[1] < antlas) Rcompute=false; - } - if (!Lcompute && !Rcompute) continue; - if (!aligned) { - OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute,Rcompute); - } else { - OrientationTarget(fwblock[2],fwblock[3],&oril,&orir,Lcompute,Rcompute); - } - if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl; - bool isBonus = false; - if (Lcompute) { - if ((aligned && fwblock[3]<=fat) || - (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas)) isBonus = true; - ScoreOrientationLeftBackward(table,oril,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - isBonus = false; - if (Rcompute) { - if ((aligned && lat<=fwblock[2]) || - (!aligned && 
las<=SourceFWAntsIdxs[i_ant][3*idx-2]))isBonus = true; - ScoreOrientationRightBackward(table,orir,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - delete fwblock; - } - } -} - -void Alignment::computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2) { - // This implementation is actually really bad, not reusing codes at all - if (DEBUG) cerr << "computeOrientationSourcePos(maxfwidx=" << maxfwidx << ",maxdepth=" << maxdepth1 << "," << maxdepth2 << ")" << endl; - if (maxdepth1+maxdepth2==0) return; - int oril, orir; - ostringstream oss; - WordID sourceID; - for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl; - //if (!((SourceFWRuleAbsIdxs[idx]<=maxdepth1) || (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2))) continue; - string source = TD::Convert(SourceFWRuleIdxs[3*idx-1]); - OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir); - bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word - if (SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true; - if (!isBonus) // this is unnecessary because fas <= las assertion - if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true; - if (maxdepth1>0) { - oss << source << "/"; - if (SourceFWRuleAbsIdxs[idx]<=maxdepth1) - oss << SourceFWRuleAbsIdxs[idx]; - else - oss << "X"; - sourceID = TD::Convert(oss.str()); - oss.str(""); - ScoreOrientationLeft(table,oril,sourceID,SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxdepth2>0) { - oss << source << "/"; - if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1); - else - 
oss << "X"; - sourceID = TD::Convert(oss.str()); - oss.str(""); - ScoreOrientationLeft(table,oril,sourceID,SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - isBonus = false; - if (las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true; - if (!isBonus) // this is unnecessary becuase fas <= las assertion - if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true; - if (maxdepth1>0) { - oss << source << "/"; - if (SourceFWRuleAbsIdxs[idx]<=maxdepth1) - oss << SourceFWRuleAbsIdxs[idx]; - else - oss << "X"; - sourceID = TD::Convert(oss.str()); - oss.str(""); - ScoreOrientationRight(table,orir,sourceID,SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxdepth2>0) { - oss << source << "/"; - if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1); - else - oss << "X"; - sourceID = TD::Convert(oss.str()); - oss.str(""); - ScoreOrientationRight(table,orir,sourceID,SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) - cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl; - //if (!((SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1)||(maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2))) continue; - // antfas -> first aligned source word antecedent-wise - // antlas -> last aligned source word antecedent-wise - int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]); - int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]); - if (DEBUG) cerr << " SourceFWAntsAbsIdxs[i_ant][3*idx-1]=" << SourceFWAntsAbsIdxs[i_ant][3*idx-1] << endl; - string source = TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]); - assert(antfas <= antlas); - bool 
aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT); - bool Lcompute = true;bool Rcompute = true; - if ((aligned && antfas<SourceFWAntsIdxs[i_ant][3*idx-2]) || - (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas)) - Lcompute=false; - if ((aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<antlas) || - (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas)) - Rcompute=false; - if (!Lcompute && !Rcompute) continue; - OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute, Rcompute); - bool isBonus = false; - if (Lcompute) { - if (SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus = true; - //if (!isBonus) // this is unnecessary - // if (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus=true; - if (maxdepth1>0) { - oss << source << "/"; - if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1) - oss << SourceFWAntsAbsIdxs[i_ant][idx]; - else - oss << "X"; - sourceID = TD::Convert(oss.str()); - oss.str(""); - ScoreOrientationLeft(table,oril,sourceID,SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxdepth2>0) { - oss << source << "/"; - if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1)*-1); - else - oss << "X"; - sourceID = TD::Convert(oss.str()); - oss.str(""); - ScoreOrientationLeft(table,oril,sourceID,SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - } - isBonus = false; - if (Rcompute) { - if (las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus = true; - //if (!isBonus) // this is unnecessary - // if (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus=true; - if (maxdepth1>0) { - oss << source << "/"; - if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1) - oss << SourceFWAntsAbsIdxs[i_ant][idx]; - else - oss << "X"; - sourceID = TD::Convert(oss.str()); - 
oss.str(""); - ScoreOrientationRight(table,orir,sourceID,SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - if (maxdepth2>0) { - oss << source << "/"; - if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1)*-1); - else - oss << "X"; - sourceID = TD::Convert(oss.str()); - oss.str(""); - ScoreOrientationRight(table,orir,sourceID,SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - } - } - } -} - -void Alignment::computeOrientationSource(const CountTable& table, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) { -// a bit complex due to imperfect state (TO DO!!!) -// 1. there are cases where function word alignments come from antecedents, which orientation -// (either its left or its right) has been computed earlier. -// 2. some orientation will go as bonus - if (DEBUG) cerr << "computeOrientationSource" << endl; - int oril, orir; - for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl; - OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir); - bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word - if (SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true; - if (!isBonus) // this is unnecessary because fas <= las assertion - if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true; - ScoreOrientationLeft(table,oril,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - isBonus = false; - if (las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true; - if (!isBonus) // this is unnecessary becuase fas <= las assertion - if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && SourceFWRuleIdxs[3*idx-2]<=fas) 
isBonus=true; - ScoreOrientationRight(table,orir,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) - cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl; - // antfas -> first aligned source word antecedent-wise - // antlas -> last aligned source word antecedent-wise - int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]); - int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]); - assert(antfas <= antlas); - bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT); - bool Lcompute = true;bool Rcompute = true; - if ((aligned && antfas<SourceFWAntsIdxs[i_ant][3*idx-2]) || - (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas)) - Lcompute=false; - if ((aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<antlas) || - (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas)) - Rcompute=false; - if (!Lcompute && !Rcompute) continue; - OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute, Rcompute); - bool isBonus = false; - if (Lcompute) { - if (SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus = true; - //if (!isBonus) // this is unnecessary - // if (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus=true; - ScoreOrientationLeft(table,oril,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - isBonus = false; - if (Rcompute) { - if (las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus = true; - //if (!isBonus) // this is unnecessary - // if (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus=true; - 
ScoreOrientationRight(table,orir,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - } - } -} - -void Alignment::computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags) { - if (DEBUG) cerr << "computeOrientationSourceGen" << endl; - int oril, orir; - for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl; - OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir); - bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word - if (SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true; - if (!isBonus) // this is unnecessary because fas <= las assertion - if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true; - ScoreOrientationLeft(table,oril,generalize(SourceFWRuleIdxs[3*idx-1],tags),SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - isBonus = false; - if (las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true; - if (!isBonus) // this is unnecessary becuase fas <= las assertion - if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true; - ScoreOrientationRight(table,orir,generalize(SourceFWRuleIdxs[3*idx-1],tags),SourceFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) - cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl; - // antfas -> first aligned source word antecedent-wise - // antlas -> last aligned source word antecedent-wise - int antfas = 
firstSourceAligned(SourceAntsIdxs[i_ant][1]); - int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]); - assert(antfas <= antlas); - bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT); - bool Lcompute = true;bool Rcompute = true; - if ((aligned && antfas<SourceFWAntsIdxs[i_ant][3*idx-2]) || - (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas)) - Lcompute=false; - if ((aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<antlas) || - (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas)) - Rcompute=false; - if (!Lcompute && !Rcompute) continue; - OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute, Rcompute); - bool isBonus = false; - if (Lcompute) { - if (SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus = true; - //if (!isBonus) // this is unnecessary - // if (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus=true; - ScoreOrientationLeft(table,oril,generalize(SourceFWAntsIdxs[i_ant][3*idx-1],tags),SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - isBonus = false; - if (Rcompute) { - if (las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus = true; - //if (!isBonus) // this is unnecessary - // if (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus=true; - ScoreOrientationRight(table,orir,generalize(SourceFWAntsIdxs[i_ant][3*idx-1],tags),SourceFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris); - } - } - } -} -void Alignment::computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) { - if (DEBUG) cerr << "computeOrientationTarget" << endl; - int oril, orir; - for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << "considering TargetFWRuleIdxs[" << idx << "]: " << TargetFWRuleIdxs[3*idx-2] << endl; - 
OrientationTarget(TargetFWRuleIdxs[3*idx-2],&oril,&orir); - // the second and the third parameters of ScoreOrientationLeft must be e and f (not f and then e) - bool isBonus = false; - if (TargetFWRuleIdxs[3*idx-2]<=fat) isBonus = true; - if (!isBonus) - if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true; - ScoreOrientationLeft(table,oril,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - isBonus = false; - if (lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true; - if (!isBonus) - if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && TargetFWRuleIdxs[3*idx-2]<=fat) isBonus=true; - ScoreOrientationRight(table,orir,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - } - - for (int i_ant=0; i_ant<_Arity; i_ant++) { - for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) cerr << "considering TargetFWAntsIdxs[" << i_ant << "][" << idx << "]: " << TargetFWAntsIdxs[i_ant][3*idx-2] << endl; - int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]); - int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]); - int aligned = (minSSpan( TargetFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT); - bool Lcompute = true; bool Rcompute = true; - if ((aligned && antfat<TargetFWAntsIdxs[i_ant][3*idx-2]) || - (!aligned && antfat < TargetFWAntsIdxs[i_ant][3*idx-2] && TargetFWAntsIdxs[i_ant][3*idx-2] < antlat)) - Lcompute=false; - if ((aligned && TargetFWAntsIdxs[i_ant][3*idx-2]<antlat) || - (!aligned && antfat < TargetFWAntsIdxs[i_ant][3*idx-2] && TargetFWAntsIdxs[i_ant][3*idx-2] < antlat)) - Rcompute=false; - if (!Lcompute && !Rcompute) continue; - bool isBonus = false; - OrientationTarget(TargetFWAntsIdxs[i_ant][3*idx-2],&oril,&orir, Lcompute, Rcompute); - if (Lcompute) { - if (TargetFWAntsIdxs[i_ant][3*idx-2]<=fat) isBonus=true; - //if (!isBonus) - // if 
(!aligned && lat<=TargetFWAntsIdxs[i_ant][3*idx-2]) isBonus=true; - ScoreOrientationLeft(table,oril,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - } - isBonus = false; - if (Rcompute) { - if (lat<=TargetFWAntsIdxs[i_ant][3*idx-2]) isBonus=true; - if (!isBonus) - //if (!aligned && TargetFWAntsIdxs[i_ant][3*idx-2]<=fat) isBonus=true; - ScoreOrientationRight(table,orir,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - } - } - } -} - -void Alignment::computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) { - if (DEBUG) cerr << "computeOrientationTargetBackward" << endl; - int oril, orir; - for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << "considering TargetFWRuleIdxs[" << idx << "]: " << TargetFWRuleIdxs[3*idx-2] << endl; - int* fwblock = blockSource(TargetFWRuleIdxs[3*idx-2],TargetFWRuleIdxs[3*idx-2]); - bool aligned = (fwblock[0] == MINIMUM_INIT); - if (aligned) { - OrientationSource(fwblock[0],fwblock[1],&oril,&orir); - } else { - OrientationTarget(TargetFWRuleIdxs[3*idx-2],&oril,&orir); - } - delete fwblock; - // the second and the third parameters of ScoreOrientationLeft must be e and f (not f and then e) - bool isBonus = false; - if (TargetFWRuleIdxs[3*idx-2]<=fat) isBonus = true; - //if (!isBonus) // unnecessary - //if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true; - ScoreOrientationLeftBackward(table,oril,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - isBonus = false; - if (lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true; - //if (!isBonus) // unnecessary - //if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && 
TargetFWRuleIdxs[3*idx-2]<=fat) isBonus=true; - ScoreOrientationRightBackward(table,orir,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - } - - for (int i_ant=0; i_ant<_Arity; i_ant++) { - int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]); - int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]); - int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]); - int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]); - for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) cerr << "considering TargetFWAntsIdxs[" << i_ant << "][" << idx << "]: " << TargetFWAntsIdxs[i_ant][3*idx-2] << endl; - int* fwblock = blockTarget(TargetFWAntsIdxs[i_ant][3*idx-2],TargetFWAntsIdxs[i_ant][3*idx-2]); - bool aligned = (fwblock[0]!=MINIMUM_INIT); - //bool aligned = (minSSpan( TargetFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT); - bool Lcompute = true; bool Rcompute = true; - if ((aligned && antfas<fwblock[0]) || - (!aligned && antfat < fwblock[2])) - Lcompute=false; - if ((aligned && fwblock[0]<antlas) || - (!aligned && fwblock[3] < antlat)) - Rcompute=false; - if (!Lcompute && !Rcompute) continue; - bool isBonus = false; - if (aligned) { - OrientationSource(fwblock[0],fwblock[1],&oril,&orir,Lcompute,Rcompute); - } else { - OrientationTarget(TargetFWAntsIdxs[i_ant][3*idx-2],&oril,&orir, Lcompute, Rcompute); - } - if (Lcompute) { - if ((aligned && fwblock[1]<=fas) || - (!aligned && fwblock[3]<=fat)) - isBonus=true; - //if (!isBonus) - // if (!aligned && lat<=TargetFWAntsIdxs[i_ant][3*idx-2]) isBonus=true; - ScoreOrientationLeftBackward(table,oril,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - } - isBonus = false; - if (Rcompute) { - if ((aligned && las<=fwblock[0]) || - (!aligned && lat<=fwblock[2])) - isBonus=true; - if (!isBonus) - //if (!aligned && 
TargetFWAntsIdxs[i_ant][3*idx-2]<=fat) isBonus=true; - ScoreOrientationRightBackward(table,orir,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx], - isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit); - } - delete fwblock; - } - } -} - -bool Alignment::MemberOf(int* FWIdxs, int pos1, int pos2) { - for (int idx=2; idx<=FWIdxs[0]; idx++) { - if (FWIdxs[3*(idx-1)-2]==pos1 && FWIdxs[3*idx-2]==pos2) return true; - } - return false; -} - -void Alignment::computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, - double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) { - // no bonus yet - if (DEBUG) cerr << "computeDominanceSource" << endl; - if (DEBUG) cerr << " initial cost=" << *cost << ", initial bonus=" << *bonus << endl; - for (int idx=2; idx<=SourceFWIdxs[0]; idx++) { - if (DEBUG) { - cerr << "PrevSourceFWIdxs :" << SourceFWIdxs[3*(idx-1)-2] << "," << SourceFWIdxs[3*(idx-1)-1] - << "," << SourceFWIdxs[3*(idx-1)] << endl; - cerr << "CurrSourceFWIdxs :" << SourceFWIdxs[3*(idx)-2] << "," << SourceFWIdxs[3*(idx)-1] - << "," << SourceFWIdxs[3*(idx)] << endl; - } - bool compute = true; - for (int i_ant=0; i_ant<_Arity && compute; i_ant++) { - if (MemberOf(SourceFWAntsIdxs[i_ant],SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*(idx)-2])) { - //cerr << "Skipping, they have been calculated in the " << (i_ant+1) << "-th branch" << endl; - compute=false; - } - } - if (compute) { - int dom = DominanceSource(SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*idx-2]); - if (DEBUG) cerr << "dom = " << dom << endl; - ScoreDominance(table,dom,SourceFWIdxs[3*(idx-1)-1],SourceFWIdxs[3*idx-1],SourceFWIdxs[3*(idx-1)],SourceFWIdxs[3*idx], - cost,bo1,bo2,false,alpha_doms,beta_doms); - if (DEBUG) cerr << "cost now is " << *cost << endl; - } - } - if (SourceFWIdxs[0]>0) { - if (lfw>=0) { - int dom = DominanceSource(0,SourceFWIdxs[1]); - if (DEBUG) cerr << " --> lfw = " << lfw << "-" << TD::Convert(lfw) << endl; - if 
(DEBUG) cerr << " --> rfw = " << rfw << "-" << TD::Convert(rfw) << endl; - ScoreDominance(table,dom,lfw,SourceFWIdxs[2],lfw,SourceFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms); - } - if (rfw>=0) { - int dom = DominanceSource(SourceFWIdxs[3*SourceFWIdxs[0]-2],_J-1); - ScoreDominance(table,dom,SourceFWIdxs[3*SourceFWIdxs[0]-1],rfw,SourceFWIdxs[3*SourceFWIdxs[0]], - rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms); - } - } -} - -void Alignment::computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, - double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2) { - if (DEBUG) cerr << "computeDominanceSourcePos" << endl; - if (DEBUG) cerr << " initial cost=" << *cost << ", initial bonus=" << *bonus << endl; - ostringstream oss; - for (int idx=2; idx<=SourceFWIdxs[0]; idx++) { - if (DEBUG) { - cerr << "PrevSourceFWIdxs :" << SourceFWIdxs[3*(idx-1)-2] << "," << SourceFWIdxs[3*(idx-1)-1] - << "," << SourceFWIdxs[3*(idx-1)] << endl; - cerr << "CurrSourceFWIdxs :" << SourceFWIdxs[3*(idx)-2] << "," << SourceFWIdxs[3*(idx)-1] - << "," << SourceFWIdxs[3*(idx)] << endl; - } - //if (!((SourceFWAbsIdxs[3*(idx-1)-2]<=maxdepth1 && SourceFWAbsIdxs[3*idx-2]<=maxdepth1) || - // (maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1<=maxdepth2 && maxfwidx-SourceFWAbsIdxs[3*idx-2]+1<=maxdepth2))) continue; - bool compute = true; - for (int i_ant=0; i_ant<_Arity && compute; i_ant++) { - if (MemberOf(SourceFWAntsIdxs[i_ant],SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*(idx)-2])) { - //cerr << "Skipping, they have been calculated in the " << (i_ant+1) << "-th branch" << endl; - compute=false; - } - } - if (compute) { - int dom = DominanceSource(SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*idx-2]); - if (DEBUG) cerr << "dom = " << dom << endl; - if (maxdepth1+maxdepth2>0) { - string source1 = TD::Convert(SourceFWIdxs[3*(idx-1)-1]); - string source2 = TD::Convert(SourceFWIdxs[3*(idx)-1]); - if 
(maxdepth1>0) { - oss << source1 << "/"; - if (SourceFWAbsIdxs[3*(idx-1)-2]<=maxdepth1) - oss << SourceFWAbsIdxs[3*(idx-1)-2]; - else - oss << "X"; - WordID source1id = TD::Convert(oss.str()); - oss.str(""); - oss << source2 << "/"; - if (SourceFWAbsIdxs[3*idx-2]<=maxdepth1) - oss << SourceFWAbsIdxs[3*idx-2]; - else - oss << "X"; - WordID source2id = TD::Convert(oss.str()); - oss.str(""); - ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*(idx-1)],SourceFWIdxs[3*idx], - cost,bo1,bo2,false,alpha_doms,beta_doms); - } - if (maxdepth2>0) { - oss << source1 << "/"; - if (maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1)*-1); - else - oss << "X"; - WordID source1id = TD::Convert(oss.str()); - oss.str(""); - oss << source2 << "/"; - if (maxfwidx-SourceFWAbsIdxs[3*idx-2]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1)*-1); - else - oss << "X"; - WordID source2id = TD::Convert(oss.str()); - oss.str(""); - ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*(idx-1)],SourceFWIdxs[3*idx], - cost,bo1,bo2,false,alpha_doms,beta_doms); - } - } - } - } - if (SourceFWIdxs[0]>0) { - if (lfw>=0) { - int dom = DominanceSource(0,SourceFWIdxs[1]); - string source1 = TD::Convert(lfw); - string source2 = TD::Convert(SourceFWIdxs[2]); - if (maxdepth1>0) { - oss << source1 << "/"; - if (SourceFWAbsIdxs[1]-1<=maxdepth1) - oss << (SourceFWAbsIdxs[1]-1); - else - oss << "X"; - WordID source1id = TD::Convert(oss.str()); - oss.str(""); - oss << source2 << "/"; - if (SourceFWAbsIdxs[1]<=maxdepth1) - oss << SourceFWAbsIdxs[1]; - else - oss << "X"; - WordID source2id = TD::Convert(oss.str()); - oss.str(""); - ScoreDominance(table,dom,source1id,source2id,lfw,SourceFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms); - } - if (maxdepth2>0) { - oss << source1 << "/"; - if (maxfwidx-(SourceFWAbsIdxs[1]-1)+1<=maxdepth2) - oss << ((maxfwidx-(SourceFWAbsIdxs[1]-1)+1)*-1); - else - oss << "X"; - WordID 
source1id = TD::Convert(oss.str()); - oss.str(""); - oss << source2 << "/"; - if (maxfwidx-SourceFWAbsIdxs[1]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWAbsIdxs[1]+1)*-1); - else - oss << "X"; - WordID source2id = TD::Convert(oss.str()); - oss.str(""); - ScoreDominance(table,dom,source1id,source2id,lfw,SourceFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms); - } - } - if (rfw>=0) { - int dom = DominanceSource(SourceFWIdxs[3*SourceFWIdxs[0]-2],_J-1); - string source1 = TD::Convert(SourceFWIdxs[3*SourceFWIdxs[0]-1]); - string source2 = TD::Convert(rfw); - if (maxdepth1>0) { - oss << source1 << "/"; - if (SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]<=maxdepth1) - oss << SourceFWAbsIdxs[3*SourceFWIdxs[0]-2]; - else - oss << "X"; - WordID source1id = TD::Convert(oss.str()); - oss.str(""); - oss << source2 << "/"; - if (SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1<=maxdepth1) - oss << (SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1); - else - oss << "X"; - WordID source2id = TD::Convert(oss.str()); - ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*SourceFWIdxs[0]], - rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms); - } - if (maxdepth2>0) { - oss << source1 << "/"; - if (maxfwidx-SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1<=maxdepth2) - oss << ((maxfwidx-SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1)*-1); - else - oss << "X"; - WordID source1id = TD::Convert(oss.str()); - oss.str(""); - oss << source2 << "/"; - if (maxfwidx-(SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1)+1<=maxdepth2) - oss << ((maxfwidx-(SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1)+1)*-1); - else - oss << "X"; - WordID source2id = TD::Convert(oss.str()); - oss.str(""); - ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*SourceFWIdxs[0]], - rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms); - } - } - } -} - - -void Alignment::computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, - double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double 
*bo2_bonus) { - if (DEBUG) cerr << "computeDominanceTarget" << endl; - for (int idx=2; idx<=TargetFWIdxs[0]; idx++) { - if (DEBUG) cerr << "PrevTargetFWIdxs :" << TargetFWIdxs[3*(idx-1)-2] << "," << TargetFWIdxs[3*(idx-1)-1] << "," <<TargetFWIdxs[3*(idx-1)] << endl; - if (DEBUG) cerr << "CurrTargetFWIdxs :" << TargetFWIdxs[3*(idx)-2] << "," << TargetFWIdxs[3*(idx)-1] << "," <<TargetFWIdxs[3*(idx)] << endl; - bool compute = true; - for (int i_ant=0; i_ant <_Arity && compute; i_ant++) { - if (MemberOf(TargetFWAntsIdxs[i_ant],TargetFWIdxs[3*(idx-1)-2],TargetFWIdxs[3*idx-2])) { - if (DEBUG) cerr << "Skipping, they have been calculated in the " << (i_ant+1) << "-th branch" << endl; - compute = false; - } - } - if (compute) { - int dom = DominanceTarget(TargetFWIdxs[3*(idx-1)-2],TargetFWIdxs[3*idx-2]); - //cerr << (3*(idx-1)) << "," << (3*idx) << "," << (3*(idx-1)-1) << "," << (3*idx-1) << endl; - if (DEBUG) cerr << "dom target = " << dom << endl; - ScoreDominance(table,dom,TargetFWIdxs[3*(idx-1)],TargetFWIdxs[3*idx],TargetFWIdxs[3*(idx-1)-1],TargetFWIdxs[3*idx-1], - cost,bo1,bo2,false,alpha_domt,beta_domt); - } - } - if (TargetFWIdxs[0]>0) { - if (DEBUG) cerr << "backoff dominance " << endl; - if (lfw>=0) { - int dom = DominanceTarget(0,TargetFWIdxs[1]); - if (DEBUG) cerr << "dom target (with left) = " << dom << endl; - ScoreDominance(table,dom,lfw,lfw,TargetFWIdxs[2],TargetFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_domt,beta_domt); - } - if (rfw>=0) { - int dom = DominanceTarget(TargetFWIdxs[3*TargetFWIdxs[0]-2],_I-1); - if (DEBUG) cerr << "dom target (with right) = " << dom << endl; - ScoreDominance(table,dom,TargetFWIdxs[3*TargetFWIdxs[0]-1],TargetFWIdxs[3*TargetFWIdxs[0]], - rfw,rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_domt,beta_domt); - } - } - - //cerr << "END of computeDominanceTarget" << endl; -} - -double Alignment::ScoreDominance(const CountTable& table, int dom, WordID source1, WordID source2, WordID target1, WordID target2) { - if (DEBUG) { - cerr << 
"ScoreDominance(source1=" << TD::Convert(source1) << ",source2=" << TD::Convert(source2) - << ",target1=" << TD::Convert(target1) << ",target2=" << TD::Convert(target2) << ", dom=" << dom << endl; - } - string _source1 = TD::Convert(source1); - string _source2 = TD::Convert(source2); - string _source1idx; string _source2idx; - if (table.mode==1) { - _source1idx = _source1; _source2idx = _source2; - _source1 = _source1idx.substr(0,_source1idx.find_last_of("/")); - _source2 = _source2idx.substr(0,_source2idx.find_last_of("/")); - } - string _target1 = TD::Convert(target1); - string _target2 = TD::Convert(target2); - - double count = table.ultimate[dom]; - double total = table.ultimate[4]; - double prob = count/total; - if (DEBUG) cerr << "level0 " << count << "/" << total << "=" << prob << endl; - double alpha = 0.1; - - string key = _source1 + " " + _source2; - WordID key_id = TD::Convert(key); - map<WordID,int*>::const_iterator it = table.model.find(key_id); - bool stop = (it==table.model.end()); - if (!stop) { - stop = true; - if (it->second[4]>=0) { - count = it->second[dom] + alpha*prob; - total = it->second[4] + alpha; - prob = count/total; - if (DEBUG) cerr << "level1 " << count << "/" << total << "=" << prob << endl; - stop = false; - } - } - if (stop) return prob; - - key = _source1 + " " + _source2 + " " + _target1 + " " + _target2; - key_id = TD::Convert(key); - it = table.model.find(key_id); - stop = (it==table.model.end()); - if (!stop) { - stop = true; - if (it->second[4]>=0) { - count = it->second[dom] + alpha*prob; - total = it->second[4] + alpha; - prob = count/total; - if (DEBUG) cerr << "level2 " << count << "/" << total << "=" << prob << endl; - stop = false; - } - } - - if (table.mode!=1 || stop) return prob; - key = _source1 + " " + _source2 + " " + _target1 + " " + _target2; - key_id = TD::Convert(key); - it = table.model.find(key_id); - if (it!=table.model.end()) { - if (it->second[4]>=0) { - count = it->second[dom] + alpha*prob; - total = 
it->second[4] + alpha; - if (DEBUG) cerr << "level3 " << count << "/" << total << "=" << prob << endl; - prob = count/total; - } - } - - return prob; -} - -void Alignment::ScoreDominance(const CountTable& table, int dom, WordID source1, WordID source2, WordID target1, WordID target2, double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2) { - if (DEBUG) - cerr << "ScoreDominance(source1=" << TD::Convert(source1) << ",source2=" << TD::Convert(source2) - << ",target1=" << TD::Convert(target1) << ",target2=" << TD::Convert(target2) << ",isBonus=" << isBonus << ", alpha2 = " << alpha2 << ", beta2 = " << beta2 << endl; - if (DEBUG) cerr << " BEFORE=" << *cost << endl; - *cost += ScoreDominance(table,dom,source1,source2,target1,target2); - if (DEBUG) cerr << " AFTER=" << *cost << endl; -} - -WordID Alignment::F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter) { - if (DEBUG) { - cerr << "F2EProjectionFromExternal=" << idx << endl; - for (int i=0; i< als.size(); i++) cerr << "als[" << i << "]=" << als[i] << " "; - cerr << endl; - } - vector<int> alignedTo; - for (int i=0; i<als.size(); i++) { - if (DEBUG) cerr << als[i] << " "; - if (als[i].s_==idx) - alignedTo.push_back(als[i].t_); - } - if (DEBUG) { - cerr << endl; - cerr << "alignedTo = "; - for (int i=0; i<alignedTo.size(); i++) cerr << alignedTo[i] << " "; - cerr << endl; - } - if (alignedTo.size()==0) { - if (DEBUG) cerr << "returns [NULL] : " << TD::Convert("NULL") << endl; - return TD::Convert("NULL"); - } else if (alignedTo.size()==1) { - if (DEBUG) cerr << "returns [" << TD::Convert(_e[alignedTo[0]]) << "] : " << _e[alignedTo[0]] << endl; - return _e[alignedTo[0]]; // if not aligned to many, why bother continuing - } else { - ostringstream projection; - for (int i=0; i<alignedTo.size(); i++) { - if (i>0) projection << delimiter; - projection << TD::Convert(_e[alignedTo[i]]); - } - if (DEBUG) { - cerr << "projection = " << projection.str() << 
endl; - cerr << "returns = " << TD::Convert(projection.str()) << endl; - } - return TD::Convert(projection.str()); - } -} - -WordID Alignment::E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter) { - vector<int> alignedTo; - for (int i=0; i<als.size(); i++) - if (als[i].t_==idx) alignedTo.push_back(als[i].s_); - if (alignedTo.size()==0) { - return TD::Convert("NULL"); - } else if (alignedTo.size()==1) { - return _f[alignedTo[0]]; // if not aligned to many, why bother continuing - } else { - ostringstream projection; - for (int i=0; i<alignedTo.size(); i++) { - if (i>0) projection << delimiter; - projection << TD::Convert(_f[alignedTo[i]]); - } - return TD::Convert(projection.str()); - } -} - - -WordID Alignment::F2EProjection(int idx, const string& delimiter) { - if (DEBUG) cerr << "F2EProjection(" << idx << ")" << endl; - int e = targetOf(idx); - if (e<0) { - if (DEBUG) cerr << "projection = NULL" << endl; - return TD::Convert("NULL"); - } else { - if (targetOf(idx,e+1)<0) { - if (DEBUG) cerr << "e-1=" << (e-1) << ", size=" << _e.size() << endl; - return getE(e-1); // if not aligned to many, why bother continuing - } - ostringstream projection; - bool firstTime = true; - do { - if (!firstTime) projection << delimiter; - projection << TD::Convert(_e[e-1]); // transform space - firstTime = false; - e = targetOf(idx,e+1); - //if (DEBUG) cerr << "projection = " << projection.str() << endl; - } while(e>=0); - return TD::Convert(projection.str()); - } -} - -WordID Alignment::E2FProjection(int idx, const string& delimiter) { - //cerr << "E2FProjection(" << idx << ")" << endl; - //cerr << "i" << endl; - int f = sourceOf(idx); - //cerr << "j, f=" << f << endl; - if (f<0) { - //cerr << "projection = NULL" << endl; - return TD::Convert("NULL"); - } else { - if (sourceOf(idx,f+1)<0) return getF(f-1); - bool firstTime = true; - ostringstream projection(ostringstream::out); - do { - if (!firstTime) projection << delimiter; - projection 
<< TD::Convert(_f[f-1]); //transform space - firstTime = false; - f = sourceOf(idx,f+1); - //cerr << "projection = " << projection.str() << endl; - } while(f>=0); - return TD::Convert(projection.str()); - } -} -void Alignment::computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus, double *state_mono, - double *state_nonmono, TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw) { - // HACK: GOAL is assumed to always be "S" - if (DEBUG) cerr << "computeBorderDominanceSource" << endl; - std::vector<WordID> f = rule.f(); - std::vector<WordID> e = rule.e(); - int nt_index[f.size()]; - int nt_count=0; - for (int i=0; i<f.size(); i++) nt_index[i] = (f[i]<0)? ++nt_count : 0; - if (DEBUG) { - cerr << "f = "; - for (int i=0; i<f.size(); i++) cerr << i << "." << "[" << f[i] << "] "; - cerr << endl; - cerr << "e = "; - for (int i=0; i<e.size(); i++) cerr << i << "." << "[" << e[i] << "] "; - cerr << endl; - } - bool flag[f.size()]; - for (int idx=0; idx<f.size(); idx++) flag[idx]=false; - //collect alignments - vector<int> als; - for (std::vector<AlignmentPoint>::const_iterator i = rule.als().begin(); i != rule.als().end(); ++i) { - int s = i->s_; int t = i->t_; - als.push_back(link(t,s)); - } - if (DEBUG) cerr << "rule.Arity=" << rule.Arity() << endl; - if (rule.Arity()>0) { - int ntc=0; - for (int s=0; s<f.size(); s++) { - if (f[s]<=0) { - if (DEBUG) cerr << "f[s]=" << f[s] << "+" << s << " - "; - for (int t=0; t<e.size(); t++) { - if (e[t]==ntc) { - if (DEBUG) cerr << "e[t]=" << e[t] << "+" << t <<endl; - als.push_back(link(t,s)); - ntc--; break; - } - } - } - } - } - if (DEBUG) { - cerr << "unsorted alignments (nonterminals and terminals)" << endl; - for (int i=0; i<als.size(); i++) - cerr << source(als[i]) << "-" << target(als[i]) << " "; - cerr << endl; - } - // sort alignments according to target - std::sort(als.begin(),als.end()); - if (DEBUG) { - cerr << "sorted alignments (nonterminals and terminals)" << 
endl; - for (int i=0; i<als.size(); i++) - cerr << source(als[i]) << "-" << target(als[i]) << " "; - cerr << endl; - } - // 0 -> neither, 1 -> leftFirst, 2 -> rightFirst, 3 -> dontCare - // ScoreDominance(const CountTable& table, int dom, WordID source1, WordID source2, WordID target1, WordID target2) - int prevs = 0; - for (int i=0; i<als.size(); i++) { - int currs = target(als[i]); //int currt = source(als[i]); - if (DEBUG) cerr << "prevs=" << prevs << ", currs=" << currs << endl << endl; - if (currs<prevs) { - if (DEBUG) cerr << "currs<prevs" << endl; - for (int s = currs; s <= prevs; s++) { - if (sfw.find(f[s])!=sfw.end()) { - WordID target = F2EProjectionFromExternal(s,rule.a_,"_SEP_"); - if (DEBUG) cerr<<" f[s]="<<TD::Convert(f[s])<<" is a function word, target="<<TD::Convert(target)<<endl; - //*cost += ScoreDominance(table,1,kSOS,f[s],kSOS,target) + ScoreDominance(table,2,f[s],kEOS,target,kEOS); - *cost += ScoreDominance(table,1,kSOS,f[s],kUNK,kUNK) + ScoreDominance(table,2,f[s],kEOS,kUNK,kUNK); - if (DEBUG) cerr << " resulting cost="<< *cost << endl; - } else if (f[s]<=0) { - if (DEBUG) cerr << " f[s]= is a nonterminal" << endl; - const int* ants = reinterpret_cast<const int *>(ant_contexts[nt_index[s]-1]); - *cost += Dwarf::IntegerToDouble(ants[51]); // 50->mono, 51->non-mono - if (DEBUG) cerr << " adding "<< Dwarf::IntegerToDouble(ants[51]) << " into cost, resulting = " << *cost << endl; - } - flag[s] = true; - } - } - prevs = currs; - } - if (DEBUG) cerr << "bonus and state matter" << endl; - for (int s=0; s<rule.f().size(); s++) { - if (!flag[s]) { - if (sfw.find(f[s])!=sfw.end()) { - WordID target = F2EProjectionFromExternal(s,rule.a_,"_SEP_"); - if (DEBUG) cerr<<" f[s]="<<TD::Convert(f[s])<<" is a function word, target="<<TD::Convert(target)<<endl; - //double indbonus = ScoreDominance(table,3,kSOS,f[s],kSOS,target) + ScoreDominance(table,3,f[s],kEOS,target,kEOS); - double indbonus = ScoreDominance(table,3,kSOS,f[s],kUNK,kUNK) + 
ScoreDominance(table,3,f[s],kEOS,kUNK,kUNK); - *bonus += indbonus; - *state_mono += indbonus; - //*state_nonmono += ScoreDominance(table,1,kSOS,f[s],kSOS,target) + ScoreDominance(table,2,f[s],kEOS,target,kEOS); - *state_nonmono += ScoreDominance(table,1,kSOS,f[s],kUNK,kUNK) + ScoreDominance(table,2,f[s],kEOS,kUNK,kUNK); - if (DEBUG) cerr<<" new bonus="<<*bonus<<", new state="<<*state_mono<<","<<*state_nonmono<<endl; - } else if (f[s]<=0) { - if (DEBUG) cerr << " f[s]="<< f[s] <<" is a nonterminal" << endl; - const int* ants = reinterpret_cast<const int *>(ant_contexts[nt_index[s]-1]); - double indbonus = Dwarf::IntegerToDouble(ants[50]); - *bonus += indbonus; - *state_mono += indbonus; - *state_nonmono += Dwarf::IntegerToDouble(ants[51]); - if (DEBUG) cerr << " propagating state=" << *state_mono <<","<< *state_nonmono<< endl; - } - } - } - if (DEBUG) cerr << "LHS:" << rule.GetLHS() << ":" << TD::Convert(rule.GetLHS()*-1) <<endl; - if (rule.GetLHS()*-1==TD::Convert("S")) { - *state_mono = 0; - *state_nonmono = 0; - for (int i=0; i<rule.Arity(); i++) { - const int* ants = reinterpret_cast<const int *>(ant_contexts[i]); - *cost += Dwarf::IntegerToDouble(ants[50]); - } - *bonus = 0; - } - if (DEBUG) cerr << "-->>>> cost="<<*cost<<", bonus="<<*bonus<<", state_mono="<<*state_mono<<", state_nonmono="<<*state_nonmono<<endl; -} - -bool Alignment::prepare(TRule& rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw, const map<WordID,int>& tfw,const Lattice& sourcelattice, int spanstart, int spanend) { - if (DEBUG) cerr << "===Rule===" << rule.AsString() << endl; - _f = rule.f(); - _e = rule.e(); - _Arity = rule.Arity(); - if (DEBUG) { - cerr << "F: "; - for (int idx=0; idx<_f.size(); idx++) cerr << _f[idx] << " "; - cerr << endl; - cerr << "F': "; - for (int idx=0; idx<_f.size(); idx++) - if (_f[idx]>=0) { - cerr << TD::Convert(_f[idx]) << " "; - } else { - cerr << TD::Convert(_f[idx]*-1); - } - cerr << endl; - cerr << "E: "; - for (int idx=0; 
idx<_e.size(); idx++) - cerr << _e[idx] << " "; - cerr << endl; - cerr << "E': "; - for (int idx=0; idx<_e.size(); idx++) - if (_e[idx]>0) { - cerr << TD::Convert(_e[idx]) << " "; - } else { - cerr << "[NT]" << " "; - } - cerr << endl; - } - - SourceFWRuleIdxs[0]=0; - SourceFWRuleAbsIdxs[0]=0; - for (int idx=1; idx<=_f.size(); idx++) { // in transformed space - if (sfw.find(_f[idx-1])!=sfw.end()) { - SourceFWRuleIdxs[0]++; - SourceFWRuleAbsIdxs[++SourceFWRuleAbsIdxs[0]]=GetFWGlobalIdx(idx,sourcelattice,_f,spanstart,spanend,ant_contexts,sfw); - SourceFWRuleIdxs[3*SourceFWRuleIdxs[0]-2]=idx; - SourceFWRuleIdxs[3*SourceFWRuleIdxs[0]-1]=_f[idx-1]; - SourceFWRuleIdxs[3*SourceFWRuleIdxs[0]] =F2EProjectionFromExternal(idx-1,rule.a_,"_SEP_"); - } - } - TargetFWRuleIdxs[0]=0; - for (int idx=1; idx<=_e.size(); idx++) { // in transformed space - if (tfw.find(_e[idx-1])!=tfw.end()) { - TargetFWRuleIdxs[0]++; - TargetFWRuleIdxs[3*TargetFWRuleIdxs[0]-2]=idx; - TargetFWRuleIdxs[3*TargetFWRuleIdxs[0]-1]=E2FProjectionFromExternal(idx-1,rule.a_,"_SEP_"); - TargetFWRuleIdxs[3*TargetFWRuleIdxs[0]] =_e[idx-1]; - } - } - - if (DEBUG) { - cerr << "SourceFWRuleIdxs[" << SourceFWRuleIdxs[0] << "]:"; - for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) { - cerr << " idx:" << SourceFWRuleIdxs[3*idx-2]; - cerr << " absidx:" << SourceFWRuleAbsIdxs[idx]; - cerr << " F:" << SourceFWRuleIdxs[3*idx-1]; - cerr << " E:" << SourceFWRuleIdxs[3*idx]; - cerr << "; "; - } - cerr << endl; - cerr << "TargetFWRuleIdxs[" << TargetFWRuleIdxs[0] << "]:"; - for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) { - cerr << " idx:" << TargetFWRuleIdxs[3*idx-2]; - cerr << " F:" << TargetFWRuleIdxs[3*idx-1]; - cerr << " E:" << TargetFWRuleIdxs[3*idx]; - } - cerr << endl; - } - if (SourceFWRuleIdxs[0]+TargetFWRuleIdxs[0]==0) { - bool nofw = true; - for (int i_ant=0; i_ant<_Arity && nofw; i_ant++) { - const int* ants = reinterpret_cast<const int *>(ant_contexts[i_ant]); - if (ants[0]>=0||ants[3]>=0||ants[6]>=0||ants[9]>=0) 
nofw=false; - } - if (nofw) return true; - } - //cerr << "clearing als first" << endl; - clearAls(_J,_I); - - if (DEBUG) cerr << "A["<< rule.a_.size() << "]: " ; - RuleAl[0]=0; - // add phrase start boundary - RuleAl[0]++; RuleAl[RuleAl[0]*2-1]=0; RuleAl[RuleAl[0]*2]=0; - if (DEBUG) cerr << RuleAl[RuleAl[0]*2-1] << "-" << RuleAl[RuleAl[0]*2] << " "; - for (int idx=0; idx<rule.a_.size(); idx++) { - RuleAl[0]++; - RuleAl[RuleAl[0]*2-1]=rule.a_[idx].s_+1; - RuleAl[RuleAl[0]*2] =rule.a_[idx].t_+1; - if (DEBUG) cerr << RuleAl[RuleAl[0]*2-1] << "-" << RuleAl[RuleAl[0]*2] << " "; - } - // add phrase end boundary - RuleAl[0]++; RuleAl[RuleAl[0]*2-1]=_f.size()+1; RuleAl[RuleAl[0]*2]=_e.size()+1; - if (DEBUG) cerr << RuleAl[RuleAl[0]*2-1] << "-" << RuleAl[RuleAl[0]*2] << " "; - if (DEBUG) cerr << endl; - - SourceRuleIdxs[0] = _f.size()+2; // +2 (phrase boundaries) - TargetRuleIdxs[0] = _e.size()+2; - int ntidx=-1; - for (int idx=0; idx<_f.size()+2; idx++) { // idx in transformed space - SourceRuleIdxs[idx+1]=idx; - if (0<idx && idx<=_f.size()) if (_f[idx-1]<0) SourceRuleIdxs[idx+1]=ntidx--; - } - for (int idx=0; idx<_e.size()+2; idx++) { - TargetRuleIdxs[idx+1]=idx; - if (0<idx && idx<=_e.size()) { - //cerr << "_e[" <<(idx-1)<< "]=" << _e[idx-1] << endl; - if (_e[idx-1]<=0) TargetRuleIdxs[idx+1]=_e[idx-1]-1; - } - } - if (DEBUG) { - cerr << "SourceRuleIdxs:"; - for (int idx=0; idx<SourceRuleIdxs[0]+1; idx++) - cerr << " " << SourceRuleIdxs[idx]; - cerr << endl; - cerr << "TargetRuleIdxs:"; - for (int idx=0; idx<TargetRuleIdxs[0]+1; idx++) - cerr << " " << TargetRuleIdxs[idx]; - cerr << endl; - } - - // sloppy, the integrity of anstates is assumed - // total = 50 bytes - // first 3 ints for leftmost source function words (1 for index, 4 for source WordID and 4 for target WordI - // second 3 for rightmost source function words - // third 3 for leftmost target function words - // fourth 3 for rightmost target function words - // the next 1 int for the number of alignments - // 
the remaining 37 ints for alignments (source then target) - for (int i_ant=0; i_ant<_Arity; i_ant++) { - const int* ants = reinterpret_cast<const int *>(ant_contexts[i_ant]); - int span = ants[Dwarf::STATE_SIZE-1]; - if (DEBUG) { - cerr << "antcontexts[" << i_ant << "] "; - for (int idx=0; idx<Dwarf::STATE_SIZE; idx++) cerr << idx << "." << ants[idx] << " "; - cerr << endl; - cerr << "i,j = " << source(ants[Dwarf::STATE_SIZE-1]) << "," << target(ants[Dwarf::STATE_SIZE-1]) << endl; - } - SourceFWAntsIdxs[i_ant][0]=0; - SourceFWAntsAbsIdxs[i_ant][0]=0; - if (ants[0]>=0) { - // Given a span, give the index of the first function word - int firstfwidx = GetFirstFWIdx(source(span),target(span),sourcelattice,sfw); - if (DEBUG) cerr << " firstfwidx = " << firstfwidx << endl; - int fwcount = 0; - if (ants[1]>=0) { // one function word - SourceFWAntsIdxs[i_ant][0]++; SourceFWAntsIdxs[i_ant][1]=ants[0]; - SourceFWAntsIdxs[i_ant][2]=ants[1]; SourceFWAntsIdxs[i_ant][3]=ants[2]; - fwcount++; - } else { // if ants[1] < 0 then compound fws - //cerr << "ants[1]<0" << endl; - istringstream ossf(TD::Convert(ants[1]*-1)); string ffw; - istringstream osse(TD::Convert(ants[2])); string efw; //projection would be mostly NULL - int delta=ants[0]; - while (osse >> efw && ossf >> ffw) { - SourceFWAntsIdxs[i_ant][0]++; - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-2]=ants[0]-(delta--); - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw); - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw); - fwcount++; - } - } - if (DEBUG) cerr << " fwcount=" << fwcount << endl; - SourceFWAntsAbsIdxs[i_ant][0]=fwcount; - for (int i=1; i<=fwcount; i++) SourceFWAntsAbsIdxs[i_ant][i]=firstfwidx++; - } - if (ants[3]>=0) { - int lastfwidx = GetLastFWIdx(source(span),target(span),sourcelattice,sfw); - if (DEBUG) cerr << " lastfwidx = " << lastfwidx << endl; - int fwcount=0; - if (ants[4]>=0) { - fwcount++; - SourceFWAntsIdxs[i_ant][0]++; - 
SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-2]=ants[3]; - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-1]=ants[4]; - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3] =ants[5]; - } else { // if ants[4] < 0 then compound fws - //cerr << "ants[4]<0" << endl; - istringstream ossf(TD::Convert(ants[4]*-1)); string ffw; - istringstream osse(TD::Convert(ants[5])); string efw; - int delta=0; - while (osse >> efw && ossf >> ffw) { - fwcount++; - SourceFWAntsIdxs[i_ant][0]++; - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-2]=ants[3]+(delta++); - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw); - SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw); - } - } - if (DEBUG) cerr << " fwcount=" << fwcount << endl; - for (int i=1; i<=fwcount; i++) SourceFWAntsAbsIdxs[i_ant][SourceFWAntsAbsIdxs[i_ant][0]+i]=lastfwidx-fwcount+i; - SourceFWAntsAbsIdxs[i_ant][0]+=fwcount; - } - TargetFWAntsIdxs[i_ant][0]=0; - if (ants[6]>=0) { - if (ants[8]>=0) { // check the e part - TargetFWAntsIdxs[i_ant][0]++; - TargetFWAntsIdxs[i_ant][1]=ants[6]; - TargetFWAntsIdxs[i_ant][2]=ants[7]; - TargetFWAntsIdxs[i_ant][3]=ants[8]; - } else { // if ants[8] < 0 then compound fws - //cerr << "ants[8]<0" << endl; - //cerr << "ants[7]=" << TD::Convert(ants[7]) << endl; - //cerr << "ants[8]=" << TD::Convert(ants[8]*-1) << endl; - istringstream ossf(TD::Convert(ants[7])); string ffw; - istringstream osse(TD::Convert(ants[8]*-1)); string efw; - int delta=ants[6]; - while (osse >> efw && ossf >> ffw) { - //cerr << "efw="<< efw << ",ffw=" << ffw << endl; - TargetFWAntsIdxs[i_ant][0]++; - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-2]=ants[6]-(delta--); - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw); - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw); - } - } - } - if (ants[9]>=0) { - if (ants[11]>=0) { - TargetFWAntsIdxs[i_ant][0]++; - 
TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-2]=ants[9]; - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-1]=ants[10]; - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3] =ants[11]; - } else { - //cerr << "ants[11]<0" << endl; - //cerr << "ants[10]=" << TD::Convert(ants[10]) << endl; - //cerr << "ants[11]=" << TD::Convert(ants[11]*-1) << endl; - istringstream ossf(TD::Convert(ants[10])); string ffw; - istringstream osse(TD::Convert(ants[11]*-1)); string efw; - int delta = 0; - while (osse >> efw && ossf >> ffw) { - //cerr << "efw="<< efw << ",ffw=" << ffw << endl; - TargetFWAntsIdxs[i_ant][0]++; - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-2]=ants[9]+(delta++); - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw); - TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw); - } - } - } - AntsAl[i_ant][0]=ants[12];//number of alignments - for (int idx=1; idx<=AntsAl[i_ant][0]; idx++) { - AntsAl[i_ant][idx*2-1] = source(ants[12+idx]); - AntsAl[i_ant][idx*2] = target(ants[12+idx]); - } - } - - for (int i_ant=0; i_ant<_Arity; i_ant++) { - int length = AntsAl[i_ant][0]; - int maxs = -1000; - int maxt = -1000; - for (int idx=0; idx<length; idx++) { - if (maxs<AntsAl[i_ant][2*idx+1]) maxs = AntsAl[i_ant][2*idx+1]; - if (maxt<AntsAl[i_ant][2*idx+2]) maxt = AntsAl[i_ant][2*idx+2]; - } - if (DEBUG) cerr << "SourceFWAntsIdxs[" <<i_ant<<"][0]=" << SourceFWAntsIdxs[i_ant][0] << endl; - for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) { - cerr << "SourceFWAntsIdxs["<<i_ant<<"]["<<(3*idx-2)<<"]="<<SourceFWAntsIdxs[i_ant][3*idx-2]; - cerr << ","<<SourceFWAntsIdxs[i_ant][3*idx-1]<<","<<SourceFWAntsIdxs[i_ant][3*idx]<<endl; - cerr << "SourceFWAntsAbsIdxs["<<i_ant<<"]["<<idx<<"]="<<SourceFWAntsAbsIdxs[i_ant][idx] << endl; - } - if (maxs<SourceFWAntsIdxs[i_ant][3*idx-2]) maxs=SourceFWAntsIdxs[i_ant][3*idx-2]; - } - if (DEBUG) cerr << "TargetFWAntsIdxs[" <<i_ant<<"][0]=" << TargetFWAntsIdxs[i_ant][0] 
<< endl; - for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) { - cerr << "TargetFWAntsIdxs["<<i_ant<<"]["<<(3*idx-2)<<"]="<<TargetFWAntsIdxs[i_ant][3*idx-2]; - cerr << ","<<TargetFWAntsIdxs[i_ant][3*idx-1]<<","<<TargetFWAntsIdxs[i_ant][3*idx]<<endl; - } - if (maxt<TargetFWAntsIdxs[i_ant][3*idx-2]) maxt=TargetFWAntsIdxs[i_ant][3*idx-2]; - } - SourceAntsIdxs[i_ant][0] = maxs+1; - if (DEBUG) cerr << "SourceAntsIdxs[" << i_ant << "][0]=" <<SourceAntsIdxs[i_ant][0] << endl; - for (int idx=0; idx<SourceAntsIdxs[i_ant][0]; idx++) SourceAntsIdxs[i_ant][idx+1]=idx; - TargetAntsIdxs[i_ant][0] = maxt+1; - if (DEBUG) cerr << "TargetAntsIdxs[" << i_ant << "][0]=" <<TargetAntsIdxs[i_ant][0] << endl; - for (int idx=0; idx<TargetAntsIdxs[i_ant][0]; idx++) TargetAntsIdxs[i_ant][idx+1]=idx; - } - int TotalSource = SourceRuleIdxs[0] - _Arity; - for (int idx=0; idx<_Arity; idx++) TotalSource += SourceAntsIdxs[idx][0]; - int TotalTarget = TargetRuleIdxs[0] - _Arity; - for (int idx=0; idx<_Arity; idx++) TotalTarget += TargetAntsIdxs[idx][0]; - if (DEBUG) cerr << "TotalSource = "<< TotalSource << ", TotalTarget = "<< TotalTarget << endl; - int curr = 0; - for (int idx=1; idx<=SourceRuleIdxs[0]; idx++) { - if (SourceRuleIdxs[idx]>=0) { - SourceRuleIdxs[idx]=curr++; - } else { - int i_ant = SourceRuleIdxs[idx]*-1-1; - if (DEBUG) cerr << "SourceAntsIdxs[" << i_ant << "]" << endl; - for (int idx2=1; idx2<=SourceAntsIdxs[i_ant][0]; idx2++) { - SourceAntsIdxs[i_ant][idx2]=curr++; - if (DEBUG) cerr << SourceAntsIdxs[i_ant][idx2] << " "; - } - if (DEBUG) cerr << endl; - } - } - if (DEBUG) { - cerr << "SourceRuleIdxs" << endl; - for (int idx=1; idx<=SourceRuleIdxs[0]; idx++) cerr << SourceRuleIdxs[idx] << " "; - cerr << endl; - } - curr = 0; - for (int idx=1; idx<=TargetRuleIdxs[0]; idx++) { - if (TargetRuleIdxs[idx]>=0) { - TargetRuleIdxs[idx]=curr++; - } else { - int i_ant = TargetRuleIdxs[idx]*-1-1; - if (DEBUG) cerr << "TargetRuleIdxs[" << i_ant << "]" << endl; - for (int 
idx2=1; idx2<=TargetAntsIdxs[i_ant][0]; idx2++) { - TargetAntsIdxs[i_ant][idx2]=curr++; - if (DEBUG) cerr << TargetAntsIdxs[i_ant][idx2] << " "; - } - if (DEBUG) cerr << endl; - } - } - if (DEBUG) { - cerr << "TargetRuleIdxs" << endl; - for (int idx=1; idx<=TargetRuleIdxs[0]; idx++) cerr << TargetRuleIdxs[idx] << " "; - cerr << endl; - } - for (int idx=1; idx<=RuleAl[0]; idx++) { - if (DEBUG) { - cerr << RuleAl[idx*2-1] << " - " << RuleAl[idx*2] << " to "; - cerr << SourceRuleIdxs[RuleAl[idx*2-1]+1] << " - " << TargetRuleIdxs[RuleAl[idx*2]+1] << endl; - } - set(SourceRuleIdxs[RuleAl[idx*2-1]+1],TargetRuleIdxs[RuleAl[idx*2]+1]); - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - for (int idx=1; idx<=AntsAl[i_ant][0]; idx++) { - if (DEBUG) { - cerr << AntsAl[i_ant][2*idx-1] << " - " << AntsAl[i_ant][2*idx] << " to "; - cerr << SourceAntsIdxs[i_ant][AntsAl[i_ant][2*idx-1]+1] << " - "; - cerr << TargetAntsIdxs[i_ant][AntsAl[i_ant][2*idx]+1] << endl; - } - set(SourceAntsIdxs[i_ant][AntsAl[i_ant][2*idx-1]+1],TargetAntsIdxs[i_ant][AntsAl[i_ant][2*idx]+1]); - } - } - SourceFWIdxs[0]=0; - SourceFWAbsIdxs[0]=0; - if (DEBUG) cerr << "SourceFWRuleIdxs:" << endl; - for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << SourceFWRuleIdxs[3*idx-2] << " to " << SourceRuleIdxs[SourceFWRuleIdxs[3*idx-2]+1] << endl; - SourceFWRuleIdxs[3*idx-2] = SourceRuleIdxs[SourceFWRuleIdxs[3*idx-2]+1]; - SourceFWAbsIdxs[0]++; - SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]=SourceFWRuleAbsIdxs[idx]; - SourceFWIdxs[0]++; - SourceFWIdxs[3*SourceFWIdxs[0]-2]=SourceFWRuleIdxs[3*idx-2]; - SourceFWIdxs[3*SourceFWIdxs[0]-1]=SourceFWRuleIdxs[3*idx-1]; - SourceFWIdxs[3*SourceFWIdxs[0]] =SourceFWRuleIdxs[3*idx]; - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - if (DEBUG) cerr << "SourceFWAntsIdxs[" << i_ant << "]" << endl; - for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) - cerr << SourceFWAntsIdxs[i_ant][3*idx-2] << " to " << 
SourceAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][3*idx-2]+1] << endl; - SourceFWAntsIdxs[i_ant][3*idx-2] = SourceAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][3*idx-2]+1]; - SourceFWAbsIdxs[0]++; - SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]=SourceFWAntsAbsIdxs[i_ant][idx]; - SourceFWIdxs[0]++; - SourceFWIdxs[3*SourceFWIdxs[0]-2]=SourceFWAntsIdxs[i_ant][3*idx-2]; - SourceFWIdxs[3*SourceFWIdxs[0]-1]=SourceFWAntsIdxs[i_ant][3*idx-1]; - SourceFWIdxs[3*SourceFWIdxs[0]] =SourceFWAntsIdxs[i_ant][3*idx]; - } - } - sort(SourceFWIdxs); - sort(SourceFWAbsIdxs); - if (DEBUG) { - cerr << "SourceFWIdxs : "; - for (int idx=1; idx<=SourceFWIdxs[0]; idx++) { - cerr << "idx:" << SourceFWIdxs[3*idx-2] << ","; - cerr << "F:" << SourceFWIdxs[3*idx-1] << ","; - cerr << "E:" << SourceFWIdxs[3*idx] << " "; - } - cerr << endl; - } - TargetFWIdxs[0]=0; - if (DEBUG) cerr << "TargetFWRuleIdxs:" << endl; - for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) { - if (DEBUG) cerr << TargetFWRuleIdxs[3*idx-2] << " to " << TargetRuleIdxs[TargetFWRuleIdxs[3*idx-2]+1] << endl; - TargetFWRuleIdxs[3*idx-2] = TargetRuleIdxs[TargetFWRuleIdxs[3*idx-2]+1]; - TargetFWIdxs[0]++; - TargetFWIdxs[3*TargetFWIdxs[0]-2]=TargetFWRuleIdxs[3*idx-2]; - TargetFWIdxs[3*TargetFWIdxs[0]-1]=TargetFWRuleIdxs[3*idx-1]; - TargetFWIdxs[3*TargetFWIdxs[0]] =TargetFWRuleIdxs[3*idx]; - } - for (int i_ant=0; i_ant<_Arity; i_ant++) { - if (DEBUG) cerr << "TargetFWAntsIdxs[" << i_ant << "]" << endl; - for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) { - if (DEBUG) cerr << TargetFWAntsIdxs[i_ant][3*idx-2] << " to " << TargetAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][3*idx-2]+1] << endl; - TargetFWAntsIdxs[i_ant][3*idx-2] = TargetAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][3*idx-2]+1]; - TargetFWIdxs[0]++; - TargetFWIdxs[3*TargetFWIdxs[0]-2]=TargetFWAntsIdxs[i_ant][3*idx-2]; - TargetFWIdxs[3*TargetFWIdxs[0]-1]=TargetFWAntsIdxs[i_ant][3*idx-1]; - TargetFWIdxs[3*TargetFWIdxs[0]] =TargetFWAntsIdxs[i_ant][3*idx]; - } - } - sort(TargetFWIdxs); - if 
(DEBUG) { - cerr << "TargetFWIdxs : "; - for (int idx=1; idx<=TargetFWIdxs[0]; idx++) { - cerr << "idx:" << TargetFWIdxs[3*idx-2]<< ","; - cerr << "E:" << TargetFWIdxs[3*idx-1]<< ","; - cerr << "F:" << TargetFWIdxs[3*idx]<< " "; - } - cerr << endl; - cerr << AsString() << endl; - } - fas = firstSourceAligned(1); las = lastSourceAligned(_J-2); - fat = firstTargetAligned(1); lat = lastTargetAligned(_I-2); - if (DEBUG) cerr << "fas=" << fas << ", las=" << las << ", fat=" << fat << ", lat=" << lat << endl; - assert(fas<=las); - assert(fat<=lat); - SetCurrAlVector(); - if (DEBUG) cerr << "end prepare" << endl; - return false; -} - -string Alignment::AsStringSimple() { - ostringstream stream; - for (int j=0; j<getJ(); j++) { - int t = targetOf(j,minTSpan(j)); - while (t>=0) { - stream << " " << j << "-" << t; - t = targetOf(j,t+1); - } - } - return stream.str(); -}; - - -string Alignment::AsString() { - ostringstream stream; - stream << "J:" << getJ() << " I:" << getI(); - for (int j=0; j<getJ(); j++) { - int t = targetOf(j,minTSpan(j)); - while (t>=0) { - stream << " " << j << "-" << t; - t = targetOf(j,t+1); - } - } - stream << " TargetSpan:"; - for (int j=0; j<getJ(); j++) - if (minTSpan(j)!=MINIMUM_INIT) - stream << " " << j << "[" << minTSpan(j) << "," << maxTSpan(j) << "]"; - else - stream << " " << j << "[-,-]"; - stream << " SourceSpan:"; - for (int i=0; i<getI(); i++) - if (minSSpan(i)!=MINIMUM_INIT) - stream << " " << i << "[" << minSSpan(i) << "," << maxSSpan(i) << "]"; - else - stream << " " << i << "[-,-]"; - return stream.str(); -}; - -void Alignment::SetCurrAlVector() { - curr_al.clear(); - for (int j=0; j<_J; j++) { - int i = targetOf(j); - while (i>=0) { - curr_al.push_back(link(j,i)); - i = targetOf(j,i+1); - } - } -} - -void CountTable::print() const { - cerr << "+++ Model +++" << endl; - for (map<WordID,int*>::const_iterator iter=model.begin(); iter!=model.end(); iter++) { - cerr << TD::Convert(iter->first) << " "; - for (int i=0; i<numColumn; i++) 
cerr << iter->second[i] << " "; - cerr << endl; - } - cerr << "+++ Ultimate +++" << endl; - for (int i=0; i<numColumn; i++) cerr << ultimate[i] << " "; - cerr << endl; -} - -void Alignment::ToArrayInt(vector<int>* ret) { - ret->clear(); - for (int i=0; i<_J; i++) { - int t = targetOf(i); - while (t>=0) { - ret->push_back(link(i,t)); - t = targetOf(i,t+1); - } - } -} - -int Alignment::GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw) { - // get the index of the function word in the lattice - if (DEBUG) cerr << " GetFWGlobalIdx(" << idx << "," << spanstart << "," << spanend << ")" << endl; - int curr = spanstart; int i_ant = 0; - for (int i=1; i<sources.size() && i<idx; i++) { // sources contain <s> and </s> - if (sources[i]<0) { - const int* ants = reinterpret_cast<const int *>(ant_contexts[i_ant++]); - int antstate = ants[Dwarf::STATE_SIZE-1]; - if (DEBUG) cerr << " found NT[" << target(antstate) << "," << source(antstate) << "]" << endl; - curr += target(antstate)-source(antstate); - } else { - curr++; - } - } - if (DEBUG) cerr << " curr = " << curr << endl; - //compute the fw index - int ret = 1; - for (int i=0; i<curr; i++) { - if (sfw.find(sourcelattice[i][0].label)!=sfw.end()) ret++; - } - if (DEBUG) cerr << " ret = " << ret << endl; - return ret; -} - -int Alignment::GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw) { - if (DEBUG) cerr << " GetFirstFWIdx(" << spanstart << "," << spanend << ")" << endl; - int curr=0; - for (int i=0; i<spanend; i++) { - if (sfw.find(sourcelattice[i][0].label)!=sfw.end()) { - curr++; - if (i>=spanstart) return curr; - } - } -// assert(0); - return curr; -} - -int Alignment::GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw) { - if (DEBUG) cerr << " GetLastFWIdx(" << spanstart << "," << spanend << ")" << 
endl; - int curr=0; - for (int i=0; i<spanend; i++) { - if (sfw.find(sourcelattice[i][0].label)!=sfw.end()) { - curr++; - } - } - return curr; -} - -WordID Alignment::generalize(WordID original, const map<WordID,WordID>& tags, bool pos) { - if (!pos) { - map<WordID,WordID>::const_iterator it = tags.find(original); - if (it!=tags.end()) { - return it->second; - } - } else { - string key,idx; - Dwarf::stripIndex(TD::Convert(original),&key,&idx); - map<WordID,WordID>::const_iterator it = tags.find(TD::Convert(key)); - if (it!=tags.end()) { - ostringstream oss; - oss << TD::Convert(it->second) << "/" << idx; - return TD::Convert(oss.str()); - } - } - return original; -} - -int* Alignment::SOS() { - int* neighbor = new int[4]; - neighbor[0]=0; neighbor[1]=0; - neighbor[2]=0; neighbor[3]=0; - return neighbor; -} - -int* Alignment::EOS() { - int* neighbor = new int[4]; - neighbor[0]=getJ()-1; neighbor[1]=neighbor[0]; - neighbor[2]=getI()-1; neighbor[3]=neighbor[2]; - return neighbor; -} - -int* Alignment::neighborLeft(int startidx, int endidx, bool* getit) { - if (DEBUG) cerr << " neighborLeft("<<startidx<<","<<endidx<<")"<<endl; - int lborder = startidx; - int* ret; - while(lborder<=endidx) { - ret = blockSource(lborder,endidx); - if (ret[0]==lborder && ret[1]==endidx && ret[2]!=MINIMUM_INIT) { - *getit = true; - return ret; - } else { - delete[] ret; - lborder++; - } - } - ret = new int[4]; - ret[0]=-1; ret[1]=-1; ret[2]=-1; ret[3]=-1; - *getit = false; - return ret; -} - -int* Alignment:: neighborRight(int startidx, int endidx, bool* getit) { - if (DEBUG) cerr << " neighborRight("<<startidx<<","<<endidx<<")"<<endl; - int rborder = endidx; - int* ret; - while(startidx<=rborder) { - ret = blockSource(startidx,rborder); - if (ret[0]==startidx && ret[1]==rborder && ret[2]!=MINIMUM_INIT) { - *getit = true; - return ret; - } else { - delete[] ret; - rborder--; - } - } - ret = new int[4]; - ret[0]=-1; ret[1]=-1; ret[2]=-1; ret[3]=-1; - *getit = false; - return ret; -} diff 
--git a/decoder/dwarf.h b/decoder/dwarf.h deleted file mode 100644 index 49d2a3b7..00000000 --- a/decoder/dwarf.h +++ /dev/null @@ -1,286 +0,0 @@ -#ifndef DWARF_H -#define DWARF_H - -#include <cstdlib> -#include <vector> -#include <map> -#include <string> -#include <ostream> -#include "wordid.h" -#include "lattice.h" -#include "trule.h" -#include "tdict.h" -#include <boost/functional/hash.hpp> -#include <tr1/unordered_map> -#include <boost/tuple/tuple.hpp> - -using namespace std; -using namespace std::tr1; -using namespace boost::tuples; -using namespace boost; - -const static bool DEBUG = false; - -class CountTable { -public: - int* ultimate; - map<WordID,int*> model; - int mode; - int numColumn; - void print() const; - void setup(int _numcolumn, int _mode) { - mode = _mode; numColumn = _numcolumn; - } -}; - -class Alignment { -/* Alignment represents an alignment object in a 2D format to support function word-based models calculation - - A note about model's parameter estimation: - ========================================== - The model is estimated as a two-level Dirichlet process. - For orientation model, the first tier estimation is: - P(o|f,e) where *o* is the orientation value to estimate, *f* is the source function word aligned to *e* - its second tier is: P(o|f), while its third tier is P(o) - For dominance model, the first tier estimation is: - P(d|f1,f2,e1,e2) where *d* is a dominance value to estimate, *f1,f2* are the neighboring function words on the source - aligned to *e1,e2* on the target side - its second tier is: P(d|f1,f2) while its third tier is P(d) - - Taking orientation model as a case in point, a two level estimation proceeds as follow: - P(o|f,e) = c(o,f,e) + alpha { c(o,f) + beta [ c (o) / c(.) 
] } - ------------------------------ - c(f) + beta - ------------------------------------------------- - c(f,e) + alpha - where c() is a count function, alpha and beta are the concentration parameter - of the first and second Dirichlet process respectively - To encourage or penalize the use of second and third tier statistics, bo1 and bo2 binary features are introduced -*/ -public: - const static int MAX_WORDS = 200; - const static int MINIMUM_INIT = 1000; - const static int MAXIMUM_INIT = -1000; - const static int MAX_ARITY = 2; - WordID kSOS; - WordID kEOS; - WordID kUNK; - double alpha_oris; // 1st concentration parameter for orientation model - double beta_oris; // 2nd concentration parameter for orientation model - double alpha_orit; // 1st concentration parameter for orientation model - double beta_orit; // 2nd concentration parameter for orientation model - double alpha_doms; // idem as above but for dominance model - double beta_doms; - double alpha_domt; // idem as above but for dominance model - double beta_domt; - - // ACCESS to alignment - void set(int j,int i); // j is the source index, while i is the target index - void reset(int j,int i); // idem as above - inline bool at(int j, int i) { return _matrix[j][i]; }; - inline int getJ() {return _J;}; // max source of the current alignment - inline int getI() {return _I;}; // max target of the current alignment - inline void setI(int I) { _I = I; }; - inline void setJ(int J) { _J = J; }; - inline void setF(vector<WordID> f) { _f=f;}; - inline void setE(vector<WordID> e) { _e=e;}; - inline WordID getF(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_f.size()) return TD::Convert("</s>"); return _f[id];}; - inline WordID getE(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_e.size()) return TD::Convert("</s>"); return _e[id];}; - void clearAls(int prevJ=200, int prevI=200); - int sourceOf(int i, int start = -1); - int targetOf(int j, int start = -1); - inline int minSSpan(int i) { return 
_sSpan[i][0];} - inline int maxSSpan(int i) { return _sSpan[i][1];} - inline int minTSpan(int j) { return _tSpan[j][0];} - inline int maxTSpan(int j) { return _tSpan[j][1];} - static inline int link(int s, int t) { return (s << 16) | t; } - static inline int source(int st) {return st >> 16; } - static inline int target(int st) {return st & 0xffff; } - inline void setAlphaOris(double val) { alpha_oris=val; } - inline void setAlphaOrit(double val) { alpha_orit=val; } - inline void setAlphaDoms(double val) { alpha_doms=val; } - inline void setAlphaDomt(double val) { alpha_domt=val; } - inline void setBetaOris(double val) { beta_oris=val; } - inline void setBetaOrit(double val) { beta_orit=val; } - inline void setBetaDoms(double val) { beta_doms=val; } - inline void setBetaDomt(double val) { beta_domt=val; } - inline void setFreqCutoff(int val) { cout << _freq_cutoff << " to " << val << endl; _freq_cutoff=val; } - string AsString(); - string AsStringSimple(); - int* SOS(); - int* EOS(); - - // Model related function - Alignment(); - // Given the current *rule* and its antecedents, construct an alignment space and mark the function word alignments - // according *sfw* and *tfw* - bool prepare(TRule& rule, const std::vector<const void*>& ant_contexts, - const map<WordID,int>& sfw, const map<WordID,int>& tfw, const Lattice& sourcelattice, int spanstart, int spanend); - - // Compute orientation model score which parameters are stored in *table* and pass the values accordingly - // will call Orientation(Source|Target) and ScoreOrientation(Source|Target) - void computeOrientationSource(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); - void computeOrientationSourceGen(const CountTable& table, double *cost, 
double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags); - void computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); - void computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1, - double *bo1_bonus, double *bo2, double *bo2_bonus); - // Get the orientation value of a function word at a particular index *fw* - // assign the value to either *oril* or *orir* accoring to *Lcompute* and *Rcompute* - void OrientationSource(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - void OrientationSource(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - int OrientationSource(int* left, int* right); - void OrientationTarget(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - void OrientationTarget(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true); - - vector<int> OrientationSourceLeft4Sampler(int fw0, int fw1); - vector<int> OrientationSourceLeft4Sampler(int fw); - vector<int> OrientationSourceRight4Sampler(int fw0, int fw1); - vector<int> OrientationSourceRight4Sampler(int fw); - vector<int> OrientationTargetLeft4Sampler(int fw0, int fw1); - vector<int> OrientationTargetLeft4Sampler(int fw); - vector<int> OrientationTargetRight4Sampler(int fw0, int fw1); - vector<int> OrientationTargetRight4Sampler(int fw); - - // Given an orientation value *ori*, estimate the score accoding to *cond1*, *cond2* - // and assign the value 
accordingly according to *isBonus* and whether the first or the second tier estimation - // is used or not - void ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - void ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - double ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2); - double ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond); - void ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - void ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - double ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2); - double ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond); - void ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2, - bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, - double *bo2, double *bo2_bonus, double alpha1, double beta1); - double ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2); - - // idem as above except these are for dominance model - void computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double 
*bo2_bonus); - void computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2); - void computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus, - double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus); - void computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus, - double *state_mono, double *state_nonmono, - TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw); - int DominanceSource(int fw1, int fw2); - int DominanceTarget(int fw1, int fw2); - vector<int> DominanceSource4Sampler(int fw1, int fw2); - vector<int> DominanceTarget4Sampler(int fw1, int fw2); - void ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2, - double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2); - double ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2); - - // Remove all function word alignments except those at the borders - // May result in more than two function word alignments at each side, because this function - // will continue keeping function word alignments until the first aligned word at each side - void BorderingSFWsOnly(); - void BorderingTFWsOnly(); - void simplify(int *ret); // preparing the next state - void simplify_nofw(int *ret); // preparing the next state when no function word appears - // set the first part of the next state, which concerns with function word - // fas, las, fat, lat is the (f)irst or (l)ast function word alignments either on the (s)ource or (t)arget - // these parameters to anticipate cases where there are more than two function word alignments - void FillFWIdxsState(int *state, int fas, int las, int fat, int lat); - - // Helper function to obtain the aligned words on the other 
side - // WARNING!!! Only to be used if the als are in sync with either source or target sentences - WordID F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" "); - WordID E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" "); - // WARNING!!! Only to be used in dwarf_main.cc - // These two function words assume that the alignment contains phrase boundary - // but the source and target sentences do not - WordID F2EProjection(int idx, const string& delimiter=" "); - WordID E2FProjection(int idx, const string& delimiter=" "); - void SetCurrAlVector(); - int* blockSource(int fw1, int fw2); - int* blockTarget(int fw1, int fw2); - void ToArrayInt(vector<int>* arr); - int* neighborLeft(int startidx, int endidx, bool* found); - int* neighborRight(int startidx, int endidx, bool* found); -private: - // Hash to avoid redundancy - unordered_map<vector<int>, int, boost::hash<vector<int> > > oris_hash; - unordered_map<vector<int>, int, boost::hash<vector<int> > > orit_hash; - unordered_map<vector<int>, int, boost::hash<vector<int> > > doms_hash; - unordered_map<vector<int>, int, boost::hash<vector<int> > > domt_hash; - unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > simplify_hash; - unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > prepare_hash; - - int _J; // effective source length; - int _I; // effective target length; - bool _matrix[MAX_WORDS][MAX_WORDS]; // true if aligned - short _sSpan[MAX_WORDS][2]; //the source span of a target index; 0->min, 1->max - short _tSpan[MAX_WORDS][2]; //the target span of a source index; 0->min, 2->max - int _freq_cutoff; - int SourceFWRuleIdxs[40]; //the indexes of function words in the rule; - // The following applies to all *FW*Idxs - // *FW*Idxs[0] = size - // *FW*Idxs[idx*3-2] = index in the alignment, where idx starts from 1 to size - // *FW*Idxs[idx*3-1] = source WordID - // *FW*Idxs[idx*3] = target WordID - 
int SourceFWRuleAbsIdxs[40]; - int TargetFWRuleIdxs[40]; //the indexes of function words in the rule; zeroth element is the count - int ** SourceFWAntsIdxs; //the indexes of function words in antecedents - int ** SourceFWAntsAbsIdxs; - int ** TargetFWAntsIdxs; //the indexes of function words in antecedents - int SourceRuleIdxs[40]; //the indexes of SOURCE tokens (zeroth element is the number of source tokens) - //>0 means terminal, -i means the i-th Xs - int TargetRuleIdxs[40]; //the indexes of TARGET tokens (zeroth element is the number of target tokens) - int ** SourceAntsIdxs; //the array of indexes of a particular antecedent's SOURCE tokens - int ** TargetAntsIdxs; //the array of indexes of a particular antecedent's TARGET tokens - int SourceFWIdxs[40]; - int SourceFWAbsIdxs[40]; - int TargetFWIdxs[40]; - // *sort* and *quickSort* are used to sort *FW*Idxs - void sort(int* num); - void quickSort(int arr[], int top, int bottom); - - // *block(Source|Target)* finds the minimum block that containts two indexes (fw1 and fw2) - inline int least(int i1, int i2) { return (i1<i2)?i1:i2; } - inline int most(int i1, int i2) { return (i1>i2)?i1:i2; } - void simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings); - // used in simplify to check whether an atomic block according to source function words is also atomic according - // to target function words as well, otherwise break it - // the resulting blocks are added into *blocks* - int _Arity; - std::vector<WordID> _f; // the source sentence of the **current** rule (may not consistent with the current alignment) - std::vector<WordID> _e; // the target sentence of the **current** rule - int RuleAl[40]; - int **AntsAl; - int firstSourceAligned(int start); - int firstTargetAligned(int start); - int lastSourceAligned(int end); - int lastTargetAligned(int end); - int fas, las, fat, lat; // first aligned source, last aligned source, first aligned target, last aligned target - bool MemberOf(int* 
FWIdxs, int pos1, int pos2); // whether FWIdxs contains pos1 and pos2 consecutively - // Convert the alignment to vector form, will be used for hashing purposes - vector<int> curr_al; - int GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw); - int GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw); - int GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw); - WordID generalize(WordID original, const map<WordID,WordID>& tags, bool pos=false); -}; - -#endif diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc index efce70a6..d47a6969 100644 --- a/decoder/earley_composer.cc +++ b/decoder/earley_composer.cc @@ -4,8 +4,14 @@ #include <fstream> #include <map> #include <queue> -#include <tr1/unordered_map> -#include <tr1/unordered_set> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +# include <unordered_set> +#else +# include <tr1/unordered_map> +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_multiset; using std::tr1::unordered_set; } +#endif #include <boost/shared_ptr.hpp> #include <boost/program_options.hpp> @@ -19,7 +25,6 @@ #include "hg_remove_eps.h" using namespace std; -using namespace std::tr1; // Define the following macro if you want to see lots of debugging output // when you run the chart parser diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc index 7203b325..e7899215 100644 --- a/decoder/factored_lexicon_helper.cc +++ b/decoder/factored_lexicon_helper.cc @@ -2,6 +2,7 @@ #include "filelib.h" #include "stringlib.h" +#include "sentence_metadata.h" using namespace std; diff --git a/decoder/factored_lexicon_helper.h b/decoder/factored_lexicon_helper.h index 81c75275..7fedc517 100644 --- a/decoder/factored_lexicon_helper.h +++ 
b/decoder/factored_lexicon_helper.h @@ -6,7 +6,8 @@ #include <string> #include <map> #include "tdict.h" -#include "sentence_metadata.h" + +struct SentenceMetadata; // when computing features, it can be advantageous to: // 1) back off to less specific forms (e.g., less highly inflected forms, POS tags, etc) diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc index f2b0e67c..e56f6f1f 100644 --- a/decoder/ff_context.cc +++ b/decoder/ff_context.cc @@ -46,7 +46,7 @@ void RuleContextFeatures::ReplaceMacroWithString( macro << relative_location << "]"; int macro_index = feature_instance.find(macro.str()); if (macro_index == string::npos) { - cerr << "Can't find macro " << macro << " in feature template " + cerr << "Can't find macro " << macro.str() << " in feature template " << feature_instance; abort(); } diff --git a/decoder/ff_dwarf.cc b/decoder/ff_dwarf.cc deleted file mode 100644 index fe7a472e..00000000 --- a/decoder/ff_dwarf.cc +++ /dev/null @@ -1,894 +0,0 @@ -#include <vector> -#include <sstream> -#include <fstream> -#include <string> -#include <iostream> -#include <map> -#include "hg.h" -#include "ff_dwarf.h" -#include "dwarf.h" -#include "wordid.h" -#include "tdict.h" -#include "filelib.h" -#include "sentence_metadata.h" -#include "stringlib.h" - -using namespace std; - -Dwarf::Dwarf(const std::string& param) { -/* Param is a space separated string which contains any or all of the following: - oris|orit|doms|domt=filename - e.g. 
oris=/fs/clip-galep3eval/hendra/z2e/oris128.gz -*/ - sSOS="<s>"; - sEOS="</s>"; - kSOS=TD::Convert(sSOS); - kEOS=TD::Convert(sEOS); - kGOAL=TD::Convert("S")*-1; - _sent_id = (int *)malloc(sizeof(int)); - *_sent_id = -1; - if (DEBUG) cerr << "here = " << *_sent_id << endl; - _fwcount = (int *)malloc(sizeof(int)); - *_fwcount = -1; - cerr << "initializing dwarf" << endl; - flag_oris=false; flag_orit=false; flag_doms=false; flag_domt=false; flag_tfw_count=false; - flag_bdoms=false; flag_porislr=false, flag_porisrl=false, flag_goris=false; flag_pgorislr=false, flag_pgorisrl=false; - flag_pdomslr=false; flag_pdomsrl=false; flag_pgdomslr=false; flag_pgdomsrl=false; flag_gdoms=false; - flag_oris_backward=false; flag_orit_backward=false; - explicit_soseos=false; - SetStateSize(STATE_SIZE*sizeof(int)); - als = new Alignment(); - als->clearAls(Alignment::MAX_WORDS,Alignment::MAX_WORDS); - istringstream iss(param); string w; - while(iss >> w) { - int equal = w.find_first_of("="); - if (equal!=string::npos) { - string model = w.substr(0,equal); - vector<string> params; - Tokenize(w.substr(equal+1),',',&params); - string fn = params[0]; - if (model == "minfreq") { - cerr << "model minfreq " << fn << endl; - als->setFreqCutoff(atoi(fn.c_str())); - } else if (model == "oris") { - flag_oris = readOrientation(&toris,fn,&sfw); - if (flag_oris) { - oris_ = FD::Convert("OrientationSource"); - //oris_bo1_ = FD::Convert("OrientationSource_BO1"); - //oris_bo2_ = FD::Convert("OrientationSource_BO2"); - } - if (params.size()>1) als->setAlphaOris(atof(params[1].c_str())); - if (params.size()>2) als->setBetaOris(atof(params[2].c_str())); - } else if (model == "porislr") { - flag_porislr = readOrientation(&tporislr,fn,&sfw,true); - poris_nlr = 0; - if (flag_porislr) { - porislr_ = FD::Convert("OrientationSourcePositionfulLeftRight"); - } - if (params.size()>1) poris_nlr = atoi(params[1].c_str()); - if (DEBUG) cerr << " maximum poris depth=" << poris_nlr << endl; - } else if (model == "porisrl") { 
- flag_porisrl = readOrientation(&tporisrl,fn,&sfw,true); - poris_nrl = 0; - if (flag_porisrl) { - porisrl_ = FD::Convert("OrientationSourcePositionfulRightLeft"); - } - if (params.size()>1) poris_nrl = atoi(params[1].c_str()); - if (DEBUG) cerr << " maximum poris depth=" << poris_nrl << endl; - } else if (model=="goris") { - flag_goris = readOrientation(&tgoris,fn,&sfw); - if (flag_goris) { - goris_ = FD::Convert("OrientationSourceGeneralized"); - } - if (params.size()>1) { - readTags(params[1],&tags); - generalizeOrientation(&tgoris,tags); - } - } else if (model=="pgorislr") { - flag_pgorislr = readOrientation(&tpgorislr,fn,&sfw,true); - pgoris_nlr = 0; - if (flag_pgorislr) { - pgorislr_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight"); - } - if (DEBUG) { - cerr << "BEFORE GENERALIZATION" << endl; - tpgorislr.print(); - } - if (params.size()>1) pgoris_nlr = atoi(params[1].c_str()); - if (params.size()>2) { - readTags(params[2],&tags); - generalizeOrientation(&tpgorislr,tags,true); - } - if (DEBUG) { - cerr << "AFTER GENERALIZATION" << endl; - tpgorislr.print(); - } - } else if (model=="pgorisrl") { - flag_pgorisrl = readOrientation(&tpgorisrl,fn,&sfw,true); - pgoris_nrl = 0; - if (flag_pgorisrl) { - pgorisrl_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight"); - } - if (params.size()>1) pgoris_nrl = atoi(params[1].c_str()); - if (params.size()>2) { - readTags(params[2],&tags); - generalizeOrientation(&tpgorisrl,tags,true); - } - } else if (model == "oris_backward") { - flag_oris_backward = true; - if (!flag_oris) readOrientation(&toris,fn,&sfw); - oris_backward_ = FD::Convert("OrientationSourceBackward"); - if (params.size()>1) als->setAlphaOris(atof(params[1].c_str())); - if (params.size()>2) als->setBetaOris(atof(params[2].c_str())); - } else if (model == "orit") { - flag_orit = readOrientation(&torit,fn,&tfw); - if (flag_orit) { - orit_ = FD::Convert("OrientationTarget"); - //orit_bo1_ = FD::Convert("OrientationTarget_BO1"); - 
//orit_bo2_ = FD::Convert("OrientationTarget_BO2"); - } - if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str())); - if (params.size()>2) als->setBetaOrit(atof(params[2].c_str())); - } else if (model == "orit_backward") { - flag_orit_backward = true; - if (!flag_orit) readOrientation(&torit,fn,&tfw); - orit_backward_ = FD::Convert("OrientationTargetBackward"); - if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str())); - if (params.size()>2) als->setBetaOrit(atof(params[2].c_str())); - } else if (model == "doms") { - flag_doms = readDominance(&tdoms,fn,&sfw); - if (flag_doms) { - doms_ = FD::Convert("DominanceSource"); - //doms_bo1_ = FD::Convert("DominanceSource_BO1"); - //doms_bo2_ = FD::Convert("DominanceSource_BO2"); - } - if (params.size()>1) als->setAlphaDoms(atof(params[1].c_str())); - if (params.size()>2) als->setBetaDoms(atof(params[2].c_str())); - } else if (model == "pdomsrl") { - flag_pdomsrl = readDominance(&tpdomsrl,fn,&sfw,true); - if (flag_pdomsrl) { - pdomsrl_ = FD::Convert("DominanceSourcePositionfulRightLeft"); - } - if (params.size()>1) pdoms_nrl = atoi(params[1].c_str()); - } else if (model == "pdomslr") { - flag_pdomslr = readDominance(&tpdomslr,fn,&sfw,true); - tpdomslr.print(); - if (flag_pdomslr) { - pdomslr_ = FD::Convert("DominanceSourcePositionfulLeftRight"); - } - if (params.size()>1) pdoms_nlr = atoi(params[1].c_str()); - } else if (model == "pgdomsrl") { - flag_pgdomsrl = readDominance(&tpgdomsrl,fn,&sfw,true); - if (flag_pgdomsrl) { - pgdomsrl_ = FD::Convert("DominanceSourceGeneralizedPositionfulRightLeft"); - } - if (params.size()>1) pgdoms_nrl = atoi(params[1].c_str()); - if (params.size()>2) { - readTags(params[2],&tags); - generalizeDominance(&tpgdomsrl,tags,true); - } - } else if (model == "pgdomslr") { - flag_pgdomslr = readDominance(&tpgdomslr,fn,&sfw,true); - if (flag_pgdomslr) { - pgdomslr_ = FD::Convert("DominanceSourceGeneralizedPositionfulLeftRight"); - } - if (params.size()>1) pgdoms_nlr = 
atoi(params[1].c_str()); - if (params.size()>2) { - readTags(params[2],&tags); - if (DEBUG) { - for (map<WordID,WordID>::const_iterator it=tags.begin(); it!=tags.end(); it++) { - cerr << "tags = " << TD::Convert(it->first) << ", " << TD::Convert(it->second) << endl; - } - } - generalizeDominance(&tpgdomslr,tags,true); - } - if (DEBUG) tpgdomslr.print(); - } else if (model == "bdoms") { - flag_bdoms = readDominance(&tbdoms,fn,&sfw); - if (flag_bdoms) { - bdoms_ = FD::Convert("BorderDominanceSource"); - } - } else if (model == "domt") { - flag_domt = readDominance(&tdomt,fn,&tfw); - if (flag_domt) { - domt_ = FD::Convert("DominanceTarget"); - //domt_bo1_ = FD::Convert("DominanceTarget_BO1"); - //domt_bo2_ = FD::Convert("DominanceTarget_BO2"); - } - if (params.size()>1) als->setAlphaDomt(atof(params[1].c_str())); - if (params.size()>2) als->setBetaDomt(atof(params[2].c_str())); - } else if (model== "tfw_count") { - flag_tfw_count = readList(fn,&tfw); - tfw_count_ = FD::Convert("TargetFunctionWordsCount"); - } else { - cerr << "DWARF doesn't understand this model: " << model << endl; - } - } else { - if (w=="tfw_count") { - flag_tfw_count = true; - tfw_count_ = FD::Convert("TargetFunctionWordsCount"); - } else if (w=="oris_backward") { - flag_oris_backward = true; - oris_backward_ = FD::Convert("OrientationSourceBackward"); - } else if (w=="orit_backward") { - flag_orit_backward = true; - orit_backward_ = FD::Convert("OrientationTargetBackward"); - } else if (w=="explicit_soseos") { - explicit_soseos=true; - } else { - cerr << "DWARF doesn't need this param: " << param << endl; - } - } - } - for (map<WordID,int>::const_iterator it=sfw.begin(); it!=sfw.end() && DEBUG; it++) { - cerr << " FW:" << TD::Convert(it->first) << endl; - } -} - -void Dwarf::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* 
context) const { - if (DEBUG) cerr << "TraversalFeaturesImpl" << endl; - double cost, bonus, bo1, bo2, bo1_bonus, bo2_bonus; - double bdoms_state_mono= 0; double bdoms_state_nonmono = 0; - TRule r = *edge.rule_; - if (DEBUG) cerr << " sent_id=" << *_sent_id << ", " << smeta.GetSentenceID() << endl; - if (DEBUG) cerr << "rule = " << r.AsString() << endl; - if (DEBUG) cerr << "rule[i,j] = " << edge.i_ << "," << edge.j_ << endl; - if (*_sent_id != smeta.GetSentenceID()) { //new sentence - *_sent_id = smeta.GetSentenceID(); - const Lattice l = smeta.GetSourceLattice(); - *_fwcount=0; - for (int i=0; i<smeta.GetSourceLength(); i++) { - if (sfw.find(l[i][0].label)!=sfw.end()) { - *_fwcount+=1; - } - } - if (DEBUG) cerr << "new sentence[" << *_sent_id << "]="<<*_fwcount<<endl; - } - bool nofw = als->prepare(*edge.rule_, ant_contexts, sfw, tfw,smeta.GetSourceLattice(),edge.i_,edge.j_); - bool isFinal = (edge.i_==0 && edge.j_==smeta.GetSourceLength() && r.GetLHS()==kGOAL); - // prepare *nofw* outputs whether the resulting alignment, contains function words or not - // if not, the models do not have to be calcualted and *simplify* is very simple - if (DEBUG) cerr << "nofw = " << nofw << endl; - if (flag_tfw_count) { - double count = 0; - for (int i=0; i<r.e_.size(); i++) { - if (tfw.find(r.e_[i])!=tfw.end()) count++; - } - features->set_value(tfw_count_,count); - } - if (flag_oris) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) als->computeOrientationSource(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(oris_,cost); - //features->set_value(oris_bo1_,bo1); - //features->set_value(oris_bo2_,bo2); - estimated_features->set_value(oris_,bonus); - //estimated_features->set_value(oris_bo1_,bo1_bonus); - //estimated_features->set_value(oris_bo2_,bo2_bonus); - } - if (flag_porislr) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) - 
als->computeOrientationSourcePos(tporislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,poris_nlr,0); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(porislr_,cost); - estimated_features->set_value(porislr_,bonus); - } - if (flag_porisrl) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) - als->computeOrientationSourcePos(tporisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,poris_nrl); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(porisrl_,cost); - estimated_features->set_value(porisrl_,bonus); - } - if (flag_pgorislr) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) - als->computeOrientationSourcePos(tpgorislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgoris_nlr,0); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(pgorislr_,cost); - estimated_features->set_value(pgorislr_,bonus); - } - if (flag_pgorisrl) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) - als->computeOrientationSourcePos(tpgorisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgoris_nrl); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(pgorisrl_,cost); - estimated_features->set_value(pgorisrl_,bonus); - } - if (flag_goris) { - cost=0; bonus=0; - if (!nofw) als->computeOrientationSource(tgoris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(goris_,cost); - estimated_features->set_value(goris_,bonus); - } - if (flag_oris_backward) { - cost=0; bonus=0; - if (!nofw) - als->computeOrientationSourceBackward(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(oris_backward_,cost); - estimated_features->set_value(oris_backward_,bonus); - } 
- WordID _lfw = kSOS; - WordID _rfw = kEOS; - if (flag_doms || flag_pdomslr || flag_pdomsrl || flag_pgdomslr || flag_pgdomsrl) { - if (DEBUG) cerr << " seeking lfw and rfw" << endl; - int start = edge.i_; - int end = edge.j_; - if (DEBUG) cerr << " start=" << start << ", end=" << end << endl; - const Lattice l = smeta.GetSourceLattice(); - for (int idx=start-1; idx>=0; idx--) { - if (DEBUG) cerr << " checking idx=" << idx << ", label=" << l[idx][0].label << "-" << TD::Convert(l[idx][0].label) << endl; - if (sfw.find(l[idx][0].label) !=sfw.end()) { - if (DEBUG) cerr << "+"; - _lfw=l[idx][0].label; break; - } - } - for (int idx=end; idx<l.size(); idx++) { // end or end+1 - if (DEBUG) cerr << " checking idx=" << idx << ", label=" << l[idx][0].label << "-" << TD::Convert(l[idx][0].label) << endl; - if (sfw.find(l[idx][0].label)!=sfw.end()) { - if (DEBUG) cerr << "."; - _rfw=l[idx][0].label; break; - } - } - if (isFinal&&!explicit_soseos) { - _lfw=kSOS; _rfw=kEOS; - } - } - if (flag_doms) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) als->computeDominanceSource(tdoms,_lfw,_rfw,&cost,&bonus, - &bo1,&bo1_bonus,&bo2,&bo2_bonus); - if (DEBUG) cerr << " COST=" << cost << ", BONUS=" << bonus << endl; - if (isFinal&&!explicit_soseos) { - cost += bonus; - if (DEBUG) cerr << " final and !explicit_soseos, thus cost = " << cost << endl; - bonus = 0; - } - features->set_value(doms_,cost); - estimated_features->set_value(doms_,bonus); - } - if (flag_pdomslr) { - if (DEBUG) cerr << " flag_pdomslr true, nofw=" << nofw << endl; - if (DEBUG) cerr << " lfw=" << _lfw << ", rfw=" << _rfw << endl; - if (DEBUG) cerr << " kSOS=" << kSOS << ", kEOS=" << kEOS << endl; - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) als->computeDominanceSourcePos(tpdomslr,_lfw,_rfw,&cost,&bonus, - &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pdoms_nlr,0); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(pdomslr_,cost); - 
estimated_features->set_value(pdomslr_,bonus); - } - if (flag_pdomsrl) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) als->computeDominanceSourcePos(tpdomsrl,_lfw,_rfw,&cost,&bonus, - &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pdoms_nrl); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(pdomsrl_,cost); - estimated_features->set_value(pdomsrl_,bonus); - } - if (flag_pgdomslr) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) als->computeDominanceSourcePos(tpgdomslr,_lfw,_rfw,&cost,&bonus, - &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgdoms_nlr,0); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(pgdomslr_,cost); - estimated_features->set_value(pgdomslr_,bonus); - } - if (flag_pgdomsrl) { cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) als->computeDominanceSourcePos(tpgdomsrl,_lfw,_rfw,&cost,&bonus, - &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgdoms_nrl); - if (isFinal&&!explicit_soseos) { - cost += bonus; - bonus = 0; - } - features->set_value(pgdomsrl_,cost); - estimated_features->set_value(pgdomsrl_,bonus); - } - - - if (flag_bdoms) { - cost=0; bonus=0; bdoms_state_mono=0; bdoms_state_nonmono=0; - if (!nofw) - als->computeBorderDominanceSource(tbdoms,&cost,&bonus, - &bdoms_state_mono, &bdoms_state_nonmono,*edge.rule_, ant_contexts, sfw); - features->set_value(bdoms_,cost); - estimated_features->set_value(bdoms_,bonus); - } - if (flag_orit) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - if (!nofw) als->computeOrientationTarget(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); - if (DEBUG) cerr << "cost=" << cost << ", bonus=" << bonus << ", bo1=" << bo1 << ", bo1_bonus=" << bo1_bonus << ", bo2=" << bo2 << ", bo2_bonus=" << bo2_bonus << endl; - features->set_value(orit_,cost); - //features->set_value(orit_bo1_,bo1); - //features->set_value(orit_bo2_,bo2); - 
estimated_features->set_value(orit_,bonus); - //estimated_features->set_value(orit_bo1_,bo1_bonus); - //estimated_features->set_value(orit_bo2_,bo2_bonus); - } - if (flag_orit_backward) { - cost=0; bonus=0; - if (!nofw) als->computeOrientationTargetBackward(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus); - features->set_value(orit_backward_,cost); - estimated_features->set_value(orit_backward_,bonus); - } - if (flag_domt) { - cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0; - WordID _lfw=-1; int start = edge.i_; - WordID _rfw=-1; int end = edge.j_; - if (smeta.HasReference()) { - const Lattice l = smeta.GetReference(); - for (int idx=start-1; idx>=0; idx--) { - if (l.size()>0) - if (tfw.find(l[idx][0].label) !=tfw.end()) { - _lfw=l[idx][0].label; break; - } - } - for (int idx=end; idx<l.size(); idx++) { // end or end+1 - if (l[idx].size()>0) - if (tfw.find(l[idx][0].label)!=tfw.end()) { - _rfw=l[idx][0].label; break; - } - } - } - //neighboringFWs(smeta.GetReference(),edge.i_,edge.j_,tfw,&_lfw,&_rfw); - if (!nofw) als->computeDominanceTarget(tdomt,_lfw,_rfw,&cost,&bonus, - &bo1,&bo1_bonus,&bo2,&bo2_bonus); - features->set_value(domt_,cost); - //features->set_value(domt_bo1_,bo1); - //features->set_value(domt_bo2_,bo2); - estimated_features->set_value(domt_,bonus); - //estimated_features->set_value(domt_bo1_,bo1_bonus); - //estimated_features->set_value(domt_bo2_,bo2_bonus); - } - int* vcontext = reinterpret_cast<int *>(context); - if (!nofw) { - als->BorderingSFWsOnly(); - als->BorderingTFWsOnly(); - als->simplify(vcontext); - } else { - als->simplify_nofw(vcontext); - } - vcontext[50] = DoubleToInteger(bdoms_state_mono); - vcontext[51] = DoubleToInteger(bdoms_state_nonmono); - vcontext[STATE_SIZE-1] = Alignment::link(edge.i_,edge.j_); - if (DEBUG) { - cerr << "state@traverse = "; - for (int idx=0; idx<STATE_SIZE; idx++) cerr << idx << "." 
<< vcontext[idx] << " "; - cerr << endl; - cerr << "bdoms_state_mono=" << bdoms_state_mono << ", state[50]=" << IntegerToDouble(vcontext[50]) << endl; - cerr << "bdoms_state_nonmono=" << bdoms_state_nonmono << ", state[51]=" << IntegerToDouble(vcontext[51]) << endl; - } -} - -int Dwarf::DoubleToInteger(double val) { - float x = (float)val; - float* px = &x; - int* pix = reinterpret_cast<int *>(px); - return *pix; -} - -double Dwarf::IntegerToDouble(int val) { - int *py = &val; - float* pd = reinterpret_cast<float *>(py); - return (double)*pd; -} - -void Dwarf::neighboringFWs(const Lattice& l, const int& i, const int& j, const map<WordID,int>& fw_hash, int* lfw, int* rfw) { - *lfw=0; *rfw=0; - int idx=i-l[i][0].dist2next; - while (idx>=0) { - if (l[idx].size()>0) { - if (fw_hash.find(l[idx][0].label)!=fw_hash.end()) { - lfw++; - } - } - idx-=l[idx][0].dist2next; - } - idx=j+l[j][0].dist2next; - while (idx<l.size()) { - if (l[idx].size()>0) { - if (fw_hash.find(l[idx][0].label)!=fw_hash.end()) { - rfw++; - } - } - idx+=l[idx][0].dist2next; - } -} - -bool Dwarf::readOrientation(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos) { - // the input format is - // source target 0 1 2 3 4 0 1 2 3 4 - // 0 -> MA, 1 -> RA, 2 -> MG, 3 -> RG, 4 -> NO_NEIGHBOR - // first 01234 corresponds to the left neighbor, the second 01234 corresponds to the right neighbor - // append 2 more at the end as precomputed total - - // TONS of hack here. 
CountTable should be wrapped as a class - // TODO: check whether the file exists or not, return false if not - if (DEBUG) cerr << " readOrientation(" << filename << ", pos=" << pos << ")" << endl; - ReadFile rf(filename); - istream& in = *rf.stream(); - table->setup(24,pos); - table->ultimate = new int[24]; - for (int i=0; i<24; i++) table->ultimate[i]=0; - ostringstream oss; - while (in) { - string line; - getline(in,line); - if (line=="") break; - istringstream tokenizer(line); - string sourceidx, source, target, word; - tokenizer >> source >> target; - if (pos) { - sourceidx = source; - source = sourceidx.substr(0,sourceidx.find_last_of("/")); - } - if (fw->find(TD::Convert(source))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source),1)); - - - int* element = new int[24]; - element[5] = 0; - for (int i=0; i<5; i++) { - element[i] = 0; - if (tokenizer >> word) element[i] = atoi(word.c_str()); - element[5] += element[i]; - } - element[11] = 0; - for (int i=6; i<11; i++) { - element[i] = 0; - if (tokenizer >> word) element[i] = atoi(word.c_str()); - element[11] += element[i]; - } - element[17] = 0; - for (int i=12; i<17; i++) { - element[i] = 0; - if (tokenizer >> word) element[i] = atoi(word.c_str()); - element[17] += element[i]; - } - element[23] = 0; - for (int i=18; i<23; i++) { - element[i] = 0; - if (tokenizer >> word) element[i] = atoi(word.c_str()); - element[23] += element[i]; - } - for (int i=0; i<24; i++) table->ultimate[i] += element[i]; - oss << source << " " << target; - WordID key_id = TD::Convert(oss.str()); - oss.str(""); - if (table->model.find(key_id)!=table->model.end()) { - for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; - } else { - int* el2 = new int[24]; - for (int i=0; i<24; i++) el2[i] = element[i]; - table->model.insert(pair<WordID,int*>(key_id,el2)); - } - - oss << source; - key_id = TD::Convert(oss.str()); - oss.str(""); - if (table->model.find(key_id)!=table->model.end()) { - for (int i=0; i<24; i++) 
table->model[key_id][i]+=element[i]; - } else { - int* el2 = new int[24]; - for (int i=0; i<24; i++) el2[i] = element[i]; - table->model.insert(pair<WordID,int*>(key_id,el2)); - } - - if (pos) { - oss << sourceidx << " " << target; - key_id = TD::Convert(oss.str()); - oss.str(""); - if (table->model.find(key_id)!=table->model.end()) { - for (int i=0; i<24; i++) table->model[key_id][i]+=element[i]; - } else { - int* el2 = new int[24]; - for (int i=0; i<24; i++) el2[i] = element[i]; - table->model.insert(pair<WordID,int*>(key_id,el2)); - } - } - delete[] element; - } - return true; -} - -bool Dwarf::readList(const std::string& filename, std::map<WordID,int>* fw) { - ReadFile rf(filename); - istream& in = *rf.stream(); - while (in) { - string word; - getline(in,word); - if (fw->find(TD::Convert(word))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(word),1)); - } - return true; -} - -bool Dwarf::readDominance(CountTable* table, const std::string& filename, std::map<WordID,int>* fw, bool pos) { - // the input format is - // source1 source2 target1 target2 0 1 2 3 - // 0 -> dontcase 1->leftfirst 2->rightfirst 3->neither - if (DEBUG) cerr << "readDominance(" << filename << ",pos="<< pos << ")" << endl; - ReadFile rf(filename); - istream& in = *rf.stream(); - table->ultimate = new int[5]; - table->setup(5,pos); - for (int i=0; i<5; i++) table->ultimate[i]=0; - while (in) { - string line, word; - getline(in,line); - if (line=="") break; - string source1idx, source2idx, target1, target2, source1, source2; - ostringstream oss; - WordID key_id; - istringstream tokenizer(line); - tokenizer >> source1 >> source2 >> target1 >> target2; - if (pos) { - source1idx = source1; - source2idx = source2; - source1 = source1idx.substr(0,source1idx.find_last_of("/")); - source2 = source2idx.substr(0,source2idx.find_last_of("/")); - } - if (fw->find(TD::Convert(source1))==fw->end()) fw->insert(pair<WordID,int>(TD::Convert(source1),1)); - if (fw->find(TD::Convert(source2))==fw->end()) 
fw->insert(pair<WordID,int>(TD::Convert(source2),1)); - - int* element = new int[5]; - element[4]=0; - for (int i=0; i<4; i++) { - element[i] = 0; - if (tokenizer >> word) element[i] = atoi(word.c_str()); - element[4]+=element[i]; - } - for (int i=0; i<5; i++) table->ultimate[i] += element[i]; - - oss << source1 << " " << source2 << " " << target1 << " " << target2; - key_id = TD::Convert(oss.str()); - oss.str(""); - if (table->model.find(key_id)!=table->model.end()) { - for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; - } else { - int* el2 = new int[5]; - for (int i=0; i<5; i++) el2[i]=element[i]; - table->model.insert(pair<WordID,int*>(key_id,el2)); - } - - oss << source1 << " " << source2; - key_id = TD::Convert(oss.str()); - oss.str(""); - if (table->model.find(key_id)!=table->model.end()) { - for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; - } else { - int* el2 = new int[5]; - for (int i=0; i<5; i++) el2[i]=element[i]; - table->model.insert(pair<WordID,int*>(key_id,el2)); - } - - if (pos) { - oss << source1idx << " " << source2idx << " " << target1 << " " << target2; - key_id = TD::Convert(oss.str()); - oss.str(""); - if (table->model.find(key_id)!=table->model.end()) { - for (int i=0; i<5; i++) table->model[key_id][i]+=element[i]; - } else { - int* el2 = new int[5]; - for (int i=0; i<5; i++) el2[i]=element[i]; - table->model.insert(pair<WordID,int*>(key_id,el2)); - } - } - delete element; - } - - return true; -} - -bool Dwarf::readTags(const std::string& filename, std::map<WordID,WordID>* tags) { - ReadFile rf(filename); - istream& in = *rf.stream(); - while(in) { - string line, word, tag; - getline(in,line); - if (line=="") break; - istringstream tokenizer(line); - tokenizer >> tag >> word; - tags->insert(pair<WordID,WordID>(TD::Convert(word),TD::Convert(tag))); - } - return true; -} - -bool Dwarf::generalizeOrientation(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) { - map<string,int*> generalized; - for 
(map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { - string source, target; - istringstream tokenizer(TD::Convert(it->first)); - tokenizer >> source >> target; - string idx = ""; - if (pos) { - int found = source.find_last_of("/"); - if (found!=string::npos && found>0) { - idx = source.substr(found+1); - source = source.substr(0,found); - } - } - map<WordID,WordID>::const_iterator tags_iter = tags.find(TD::Convert(source)); - if (tags_iter!=tags.end()) { - ostringstream genkey; - genkey << TD::Convert(tags_iter->second); - if (idx!="") genkey << "/" << idx; - if (target!="") genkey << " " << target; - int* model; - if (generalized.find(genkey.str())!=generalized.end()) { - model = generalized[genkey.str()]; - for (int i=0; i<24; i++) model[i] += it->second[i]; - } else { - int* el = new int[24]; - for (int i=0; i<24; i++) el[i] = it->second[i]; - generalized.insert(pair<string,int*>(genkey.str(),el)); - } - } - } - for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { - string source, target; - istringstream tokenizer(TD::Convert(it->first)); - tokenizer >> source >> target; - string idx = ""; - if (pos) { - int found = source.find_last_of("/"); - if (found!=string::npos && found>0) { - idx = source.substr(found+1); - source = source.substr(0,found); - } - } - map<WordID,WordID>::const_iterator tags_iter = tags.find(TD::Convert(source)); - if (tags_iter!=tags.end()) { - ostringstream genkey; - genkey << TD::Convert(tags_iter->second); - if (idx!="") genkey << "/" << idx; - if (target!="") genkey << " " << target; - if (generalized.find(genkey.str())!=generalized.end()) { - delete it->second; - it->second = generalized[genkey.str()]; - } - } - } - return false; // no idea if this is right -} - - - -bool Dwarf::generalizeDominance(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) { - map<string,int*> generalized; - ostringstream oss; - for (map<WordID,int*>::iterator 
it=table->model.begin(); it!=table->model.end(); it++) { - string source1, source2, target1, target2; - string idx1 = ""; string idx2 = ""; - istringstream tokenizer(TD::Convert(it->first)); - tokenizer >> source1 >> source2 >> target1 >> target2; - if (DEBUG) cerr << "source1=|" << source1 << "|, source2=|" << source2 << "|, target1=|" << target1 << "|, target2=|" << target2 << "|" << endl; - if (pos) { - int found1 = source1.find_last_of("/"); - int found2 = source2.find_last_of("/"); - if (found1!=string::npos && found2!=string::npos && found1>0 && found2>0) { - idx1 = source1.substr(found1+1); - source1 = source1.substr(0,found1); - idx2 = source2.substr(found2+1); - source2 = source2.substr(0,found2); - } - } - if (DEBUG) - cerr << "[U]source1='" << source1 << "', idx1='"<< idx1 << "', source2='" << source2 << "', idx2='"<< idx2 << "', target1='" << target1 << "', target2='" << target2 << "'" << endl; - map<WordID,WordID>::const_iterator tags_iter1 = tags.find(TD::Convert(source1)); - map<WordID,WordID>::const_iterator tags_iter2 = tags.find(TD::Convert(source2)); - if (tags_iter1!=tags.end()) - source1 = TD::Convert(tags_iter1->second); - oss << source1; - if (idx1!="") oss << "/" << idx1; - if (tags_iter2!=tags.end()) - source2 = TD::Convert(tags_iter2->second); - oss << " " << source2; - if (idx2!="") oss << "/" << idx2; - if (target1!="" && target2!="") oss << " " << target1 << " " << target2; - - if (DEBUG) cerr << "generalized key = '" << oss.str() << "'" << endl; - if (generalized.find(oss.str())!=generalized.end()) { - int* model = generalized[oss.str()]; - for (int i=0; i<5; i++) model[i] += it->second[i]; - } else { - int* model = new int[5]; - for (int i=0; i<5; i++) model[i] = it->second[i]; - generalized.insert(pair<string,int*>(oss.str(),model)); - } - oss.str(""); - } - - if (DEBUG) { - for (map<string,int*>::const_iterator it=generalized.begin(); it!=generalized.end(); it++) { - cerr << "GENERALIZED = " << it->first << ", "; - for (int i=0; 
i<5; i++) cerr << it->second[i] << " "; - cerr << endl; - } - } - - for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) { - string source1, source2, target1, target2; - string idx1 = ""; string idx2 = ""; - istringstream tokenizer(TD::Convert(it->first)); - tokenizer >> source1 >> source2 >> target1 >> target2; - if (pos) { - int found1 = source1.find_last_of("/"); - int found2 = source2.find_last_of("/"); - if (found1!=string::npos && found2!=string::npos && found1>0 && found2>0) { - idx1 = source1.substr(found1+1); - source1 = source1.substr(0,found1); - idx2 = source2.substr(found2+1); - source2 = source2.substr(0,found2); - } - } - map<WordID,WordID>::const_iterator tags_iter1 = tags.find(TD::Convert(source1)); - map<WordID,WordID>::const_iterator tags_iter2 = tags.find(TD::Convert(source2)); - if (tags_iter1!=tags.end()) - source1 = TD::Convert(tags_iter1->second); - oss << source1; - if (idx1!="") oss << "/" << idx1; - if (tags_iter2!=tags.end()) - source2 = TD::Convert(tags_iter2->second); - oss << " " << source2; - if (idx2!="") oss << "/" << idx2; - if (target1!="" && target2!="") oss << " " << target1 << " " << target2; - - if (generalized.find(oss.str())!=generalized.end()) { - if (DEBUG) cerr << " generalizing "<< TD::Convert(it->first) << " into " << oss.str() << endl; - if (DEBUG) { - cerr << " model from "; - for (int i=0; i<5; i++) cerr << it->second[i] << " "; - cerr << endl; - } - delete it->second; - it->second = generalized[oss.str()]; - if (DEBUG) { - cerr << " into "; - for (int i=0; i<5; i++) cerr << it->second[i] << " "; - cerr << endl; - } - } - oss.str(""); - } - -} diff --git a/decoder/ff_dwarf.h b/decoder/ff_dwarf.h deleted file mode 100644 index 3d6a7da6..00000000 --- a/decoder/ff_dwarf.h +++ /dev/null @@ -1,100 +0,0 @@ -#include <vector> -#include <map> -#include <string> -#include "ff.h" -#include "dwarf.h" -#include "lattice.h" - -using namespace std; - -class Dwarf : public FeatureFunction { - 
public: - Dwarf(const std::string& param); - /* State-related param - STATE_SIZE: the number of ints - MAXIMUM_ALIGNMENTS: the maximum number of alignments in the states, - each alignment point is encoded in one int - (the first two bytes for source, and the remaining one for target) - */ - static const int STATE_SIZE=53; - static const int IMPOSSIBLY_LARGE_POS = 9999999; - static const int MAXIMUM_ALIGNMENTS=37; - /* Read from file the Orientation(Source|Target model parameter. */ - static bool readOrientation(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos=false); - /* Read from file the Dominance(Source|Target) model parameter. */ - static bool readDominance(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos=false); - static bool readList(const std::string& filename, std::map<WordID,int>* fw); - static double IntegerToDouble(int val); - static int DoubleToInteger(double val); - bool readTags(const std::string& filename, std::map<WordID,WordID>* tags); - bool generalizeOrientation(CountTable* table, const std::map<WordID,WordID>& tags, bool pos=false); - bool generalizeDominance(CountTable* table, const std::map<WordID,WordID>& tags, bool pos=false); - static void stripIndex(const string& source, string* pkey, string* pidx) { - if (DEBUG) cerr << " stripIndex(" << source << ")" << endl; - int found = source.find_last_of("/"); - string idx = source.substr(found+1); - string key = source.substr(0,found); - if (DEBUG) cerr << " found=" << found << "," << key << "," << idx << endl; - pkey = &key; - pidx = &idx; - } - - - protected: - /* The high-level workflow is as follow: - 1. call *als->prepare*, which constructs the full alignment of the edge while taking into account the antecedents - also in this call, function words are identified. Most of the work in this call is to make sure the indexes - of the alignments (including the function words) are consistent with the newly created alignment - 2. 
call *als->computeOrientationSource*, *als->computeOrientationTarget*, - *als->computeDominanceSource*, or *als->computeDominanceTarget* - and pass the resulting score to either *features* or to *estimated_features* - 3. call *als->BorderingSFWsOnly()* and *als->BorderingTFWsOnly()*, which removes records of all function word - alignments except those at the borders. Note that fw alignments kept may be more than two on each side - for examples if there are a number of unaligned fw alignments before the leftmost alignment or the rightmost one - 4. call *als->simplify()*, which assigns the state of this edge (*context*). It simplifies the alignment space to - its most compact representation, enough to compute the unscored models. This is done by observing the surviving - function word alignments set by 3. - */ - void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - private: - Alignment* als; - /* Feature IDs set by calling FD::Convert(model's string) */ - int oris_, oris_bo1_, oris_bo2_, orit_, orit_bo1_, orit_bo2_; - int oris_backward_, orit_backward_, porislr_, porisrl_, goris_, pgorislr_, pgorisrl_; - int pdomslr_, pdomsrl_, pgdomslr_, pgdomsrl_; - int doms_, doms_bo1_, doms_bo2_, domt_, domt_bo1_, domt_bo2_; - int tfw_count_; - int bdoms_; - int poris_count; - int pgoris_count; - int poris_nlr, poris_nrl; // maximum depth (1->from the beginning of the sentence, 2-> from the end of the sentence) - int pgoris_nlr, pgoris_nrl; - int pdoms_nlr, pdoms_nrl; - int pgdoms_nlr, pgdoms_nrl; - int* _sent_id; - int* _fwcount; - WordID kSOS; - WordID kEOS; - string sSOS; - string sEOS; - WordID kGOAL; - /* model's flag, if set true will invoke the model scoring */ - bool flag_oris, flag_orit, flag_doms, flag_domt, flag_tfw_count, flag_oris_backward, flag_orit_backward, flag_bdoms; - bool 
flag_porislr, flag_porisrl, flag_goris, flag_pgorislr, flag_pgorisrl; - bool explicit_soseos; - bool flag_pdomslr, flag_pdomsrl, flag_pgdomslr, flag_pgdomsrl, flag_gdoms; - /* a collection of Source function words (sfw) and Target function words (tfw) */ - std::map<WordID,int> sfw; - std::map<WordID,int> tfw; - std::map<WordID,WordID> tags; - /* a collection of model's parameter */ - CountTable toris, torit, tdoms, tbdoms, tdomt, tporislr, tporisrl, tgoris, tpgorislr, tpgorisrl; - CountTable tpdomslr, tpdomsrl, tpgdomslr, tpgdomsrl; - void neighboringFWs(const Lattice& l, const int& i, const int& j, const map<WordID,int>& fw_hash, int* lfw, int* rfw); -}; - diff --git a/decoder/ff_external.cc b/decoder/ff_external.cc index dea0e20f..6ee4b2cf 100644 --- a/decoder/ff_external.cc +++ b/decoder/ff_external.cc @@ -19,7 +19,7 @@ ExternalFeature::ExternalFeature(const string& param) { cerr << "External requires a path to a dynamic library!\n"; abort(); } - lib_handle = dlopen(file.c_str(), RTLD_LAZY); + lib_handle = dlopen(file.c_str(), RTLD_LAZY | RTLD_GLOBAL); if (!lib_handle) { cerr << "dlopen reports: " << dlerror() << endl; cerr << "Did you provide a full path to the dynamic library?\n"; diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc index 6ec7b4f3..bc51076f 100644 --- a/decoder/ff_lm.cc +++ b/decoder/ff_lm.cc @@ -61,11 +61,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight #include "hg.h" #include "stringlib.h" -#ifdef HAVE_RANDLM -// http://randlm.sourceforge.net/ -#include "RandLM.h" -#endif - using namespace std; string LanguageModel::usage(bool param,bool verbose) { @@ -542,99 +537,3 @@ void LanguageModel::FinalTraversalFeatures(const void* ant_state, features->set_value(fid_, imp().FinalTraversalCost(ant_state)); } -#ifdef HAVE_RANDLM -struct RandLMImpl : public LanguageModelImpl { - RandLMImpl(int order, randlm::RandLM* rlm) : - LanguageModelImpl(order), - rlm_(rlm), - oov_(rlm->getWordID(rlm->getOOV())), - rb_(1000, oov_) 
{ - map<int, randlm::WordID> map_cdec2randlm; - int max_wordid = 0; - for(map<randlm::Word, randlm::WordID>::const_iterator it = rlm->vocabStart(); - it != rlm->vocabEnd(); ++it) { - const int cur = TD::Convert(it->first); - map_cdec2randlm[TD::Convert(it->first)] = it->second; - if (cur > max_wordid) max_wordid = cur; - } - cdec2randlm_.resize(max_wordid + 1, oov_); - for (map<int, randlm::WordID>::iterator it = map_cdec2randlm.begin(); - it != map_cdec2randlm.end(); ++it) - cdec2randlm_[it->first] = it->second; - map_cdec2randlm.clear(); - } - - inline randlm::WordID Convert2RandLM(int w) { - return (w < cdec2randlm_.size() ? cdec2randlm_[w] : oov_); - } - - virtual double WordProb(int word, int* context) { - int i = order_; - int c = 1; - rb_[i] = Convert2RandLM(word); - while (i > 1 && *context > 0) { - --i; - rb_[i] = Convert2RandLM(*context); - ++context; - ++c; - } - const void* finalState = 0; - int found; - //cerr << "I = " << i << endl; - return rlm_->getProb(&rb_[i], c, &found, &finalState); - } - private: - boost::shared_ptr<randlm::RandLM> rlm_; - randlm::WordID oov_; - vector<randlm::WordID> cdec2randlm_; - vector<randlm::WordID> rb_; -}; - -LanguageModelRandLM::LanguageModelRandLM(const string& param) : - fid_(FD::Convert("RandLM")) { - vector<string> argv; - int argc = SplitOnWhitespace(param, &argv); - int order = 3; - // TODO add support for -n FeatureName - string filename; - if (argc < 1) { cerr << "RandLM requires a filename, minimally!\n"; abort(); } - else if (argc == 1) { filename = argv[0]; } - else if (argc == 2 || argc > 3) { cerr << "Don't understand 'RandLM " << param << "'\n"; } - else if (argc == 3) { - if (argv[0] == "-o") { - order = atoi(argv[1].c_str()); - filename = argv[2]; - } else if (argv[1] == "-o") { - order = atoi(argv[2].c_str()); - filename = argv[0]; - } - } -// set_order(order); - int cache_MB = 200; // increase cache size - randlm::RandLM* rlm = randlm::RandLM::initRandLM(filename, order, cache_MB); - assert(rlm != 
NULL); - pimpl_ = new RandLMImpl(order, rlm); -} - -LanguageModelRandLM::~LanguageModelRandLM() { - delete pimpl_; -} - -void LanguageModelRandLM::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_states, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* state) const { - (void) smeta; - features->set_value(fid_, imp().LookupWords(*edge.rule_, ant_states, state)); - estimated_features->set_value(fid_, imp().EstimateProb(state)); -} - -void LanguageModelRandLM::FinalTraversalFeatures(const void* ant_state, - SparseVector<double>* features) const { - features->set_value(fid_, imp().FinalTraversalCost(ant_state)); -} - -#endif - diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h index 94e18f00..85e79704 100644 --- a/decoder/ff_lm.h +++ b/decoder/ff_lm.h @@ -69,26 +69,4 @@ class LanguageModel : public FeatureFunction { /* mutable */ LanguageModelInterface* pimpl_; }; -#ifdef HAVE_RANDLM -class LanguageModelRandLM : public FeatureFunction { - public: - // param = "filename.lm [-o n]" - LanguageModelRandLM(const std::string& param); - ~LanguageModelRandLM(); - virtual void FinalTraversalFeatures(const void* context, - SparseVector<double>* features) const; - std::string DebugStateToString(const void* state) const; - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* out_context) const; - private: - const int fid_; - mutable LanguageModelImpl* pimpl_; -}; -#endif - #endif diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc index ed556b91..58026975 100644 --- a/decoder/ff_parse_match.cc +++ b/decoder/ff_parse_match.cc @@ -42,10 +42,8 @@ struct ParseMatchFeaturesImpl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); 
fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -112,7 +110,7 @@ struct ParseMatchFeaturesImpl { int fid_ef = FD::Convert("PM"); int min_dist; // minimal distance to next syntactic constituent of this rule's LHS int summed_min_dists; // minimal distances of LHS and NTs summed up - if (TD::Convert(lhs).compare("XX") != 0) + if (TD::Convert(lhs).compare("XX") != 0) min_dist= 0; // compute the distance to the next syntactical constituent else { @@ -131,7 +129,7 @@ struct ParseMatchFeaturesImpl { ok = 1; break; } - // check if removing k words from the rule span will + // check if removing k words from the rule span will // lead to a syntactical constituent else { //cerr << "Hilfe...!" << endl; @@ -144,7 +142,7 @@ struct ParseMatchFeaturesImpl { ok = 1; break; } - } + } } if (ok) break; } @@ -183,9 +181,9 @@ struct ParseMatchFeaturesImpl { return min_dist; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type + Array2D<WordID> src_tree; // src_tree(i,j) NT = type unsigned int src_sent_len; - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized int scoring_method; }; @@ -214,5 +212,9 @@ void ParseMatchFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void ParseMatchFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } + diff --git a/decoder/ff_parse_match.h b/decoder/ff_parse_match.h index fa73481a..7820b418 100644 --- a/decoder/ff_parse_match.h +++ b/decoder/ff_parse_match.h @@ -23,3 +23,4 @@ class ParseMatchFeatures : public FeatureFunction { }; #endif 
+ diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc index 9981fa45..23fe87bd 100644 --- a/decoder/ff_soft_syntax.cc +++ b/decoder/ff_soft_syntax.cc @@ -13,16 +13,15 @@ using namespace std; -// Implements the soft syntactic features described in +// Implements the soft syntactic features described in // Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". // Source trees must be represented in Penn Treebank format, // e.g. (S (NP John) (VP (V left))). -struct SoftSyntacticFeaturesImpl { - SoftSyntacticFeaturesImpl(const string& param) { +struct SoftSyntaxFeaturesImpl { + SoftSyntaxFeaturesImpl(const string& param) { vector<string> labels = SplitOnWhitespace(param); - for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; + //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; } for (unsigned int i = 0; i < labels.size(); i++) { string label = labels.at(i); pair<string, string> feat_label; @@ -34,10 +33,8 @@ struct SoftSyntacticFeaturesImpl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -99,7 +96,7 @@ struct SoftSyntacticFeaturesImpl { const WordID lhs = src_tree(i,j); string lhs_str = TD::Convert(lhs); //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; + //cerr << "RULE :"<< rule << endl; int& fid_ef = fids_ef(i,j)[&rule]; for (unsigned int i = 0; i < feat_labels.size(); i++) { ostringstream os; @@ -110,10 +107,10 @@ struct SoftSyntacticFeaturesImpl { switch(feat_type) { case '2': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; } else 
{ - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { @@ -122,11 +119,11 @@ struct SoftSyntacticFeaturesImpl { } break; case '_': - os << "SYN:" << label; + os << "SOFT:" << label; fid_ef = FD::Convert(os.str()); if (lhs_str.compare(label) == 0) { if (fid_ef > 0) { - //cerr << "Feature: " << os.str() << endl; + //cerr << "Feature: " << os.str() << endl; feats->set_value(fid_ef, 1.0); } } @@ -139,7 +136,7 @@ struct SoftSyntacticFeaturesImpl { break; case '+': if (lhs_str.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFT:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { //cerr << "Feature: " << os.str() << endl; @@ -147,10 +144,10 @@ struct SoftSyntacticFeaturesImpl { } } break; - case '-': - //cerr << "-" << endl; + case '-': + //cerr << "-" << endl; if (lhs_str.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFT:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (fid_ef > 0) { //cerr << "Feature :" << os.str() << endl; @@ -167,22 +164,22 @@ struct SoftSyntacticFeaturesImpl { return lhs; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized vector<pair<string, string> > feat_labels; }; -SoftSyntacticFeatures::SoftSyntacticFeatures(const string& param) : +SoftSyntaxFeatures::SoftSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeaturesImpl(param); + impl = new SoftSyntaxFeaturesImpl(param); } -SoftSyntacticFeatures::~SoftSyntacticFeatures() { +SoftSyntaxFeatures::~SoftSyntaxFeatures() { delete impl; impl = NULL; } -void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, +void 
SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector<const void*>& ant_contexts, SparseVector<double>* features, @@ -196,6 +193,10 @@ void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); } -void SoftSyntacticFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); +void SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } + diff --git a/decoder/ff_soft_syntax.h b/decoder/ff_soft_syntax.h index 79352f49..e71825d5 100644 --- a/decoder/ff_soft_syntax.h +++ b/decoder/ff_soft_syntax.h @@ -1,15 +1,15 @@ -#ifndef _FF_SOFTSYNTAX_H_ -#define _FF_SOFTSYNTAX_H_ +#ifndef _FF_SOFT_SYNTAX_H_ +#define _FF_SOFT_SYNTAX_H_ #include "ff.h" #include "hg.h" -struct SoftSyntacticFeaturesImpl; +struct SoftSyntaxFeaturesImpl; -class SoftSyntacticFeatures : public FeatureFunction { +class SoftSyntaxFeatures : public FeatureFunction { public: - SoftSyntacticFeatures(const std::string& param); - ~SoftSyntacticFeatures(); + SoftSyntaxFeatures(const std::string& param); + ~SoftSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -19,9 +19,9 @@ class SoftSyntacticFeatures : public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: - SoftSyntacticFeaturesImpl* impl; + SoftSyntaxFeaturesImpl* impl; }; - #endif + diff --git a/decoder/ff_soft_syntax2.cc b/decoder/ff_soft_syntax_mindist.cc index 121bc39b..a23f70f8 100644 --- a/decoder/ff_soft_syntax2.cc +++ b/decoder/ff_soft_syntax_mindist.cc @@ -1,4 +1,4 @@ -#include "ff_soft_syntax2.h" +#include 
"ff_soft_syntax_mindist.h" #include <cstdio> #include <sstream> @@ -13,16 +13,18 @@ using namespace std; -// Implements the soft syntactic features described in +// Implements the soft syntactic features described in // Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation". // Source trees must be represented in Penn Treebank format, // e.g. (S (NP John) (VP (V left))). +// +// This variant accepts fuzzy matches, choosing the constituent with +// minimum distance. -struct SoftSyntacticFeatures2Impl { - SoftSyntacticFeatures2Impl(const string& param) { +struct SoftSyntaxFeaturesMindistImpl { + SoftSyntaxFeaturesMindistImpl(const string& param) { vector<string> labels = SplitOnWhitespace(param); - //for (unsigned int i = 0; i < labels.size(); i++) - //cerr << "Labels: " << labels.at(i) << endl; + //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; } for (unsigned int i = 0; i < labels.size(); i++) { string label = labels.at(i); pair<string, string> feat_label; @@ -30,14 +32,12 @@ struct SoftSyntacticFeatures2Impl { feat_label.second = label.at(label.size() - 1); feat_labels.push_back(feat_label); } - } + } void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -99,14 +99,14 @@ struct SoftSyntacticFeatures2Impl { const WordID lhs = src_tree(i,j); string lhs_str = TD::Convert(lhs); //cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl; - //cerr << "RULE :"<< rule << endl; + //cerr << "RULE :"<< rule << endl; int& fid_ef = fids_ef(i,j)[&rule]; string lhs_to_str = TD::Convert(lhs); int min_dist; string min_dist_label; if (lhs_to_str.compare("XX") != 0) { min_dist = 0; - min_dist_label = lhs_to_str; + 
min_dist_label = lhs_to_str; } else { int ok = 0; @@ -128,7 +128,7 @@ struct SoftSyntacticFeatures2Impl { min_dist_label = (TD::Convert(src_tree(l_rem, r_rem))); break; } - } + } } if (ok) break; } @@ -146,10 +146,10 @@ struct SoftSyntacticFeatures2Impl { case '2': if (min_dist_label.compare(label) == 0) { if (min_dist == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; } else { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; } fid_ef = FD::Convert(os.str()); //cerr << "Feature :" << os.str() << endl; @@ -157,7 +157,7 @@ struct SoftSyntacticFeatures2Impl { } break; case '_': - os << "SYN:" << label; + os << "SOFTM:" << label; fid_ef = FD::Convert(os.str()); if (min_dist_label.compare(label) == 0) { //cerr << "Feature: " << os.str() << endl; @@ -172,7 +172,7 @@ struct SoftSyntacticFeatures2Impl { break; case '+': if (min_dist_label.compare(label) == 0) { - os << "SYN:" << label << "_conform"; + os << "SOFTM:" << label << "_conform"; fid_ef = FD::Convert(os.str()); if (min_dist == 0) { //cerr << "Feature: " << os.str() << endl; @@ -180,10 +180,10 @@ struct SoftSyntacticFeatures2Impl { } } break; - case '-': - //cerr << "-" << endl; + case '-': + //cerr << "-" << endl; if (min_dist_label.compare(label) != 0) { - os << "SYN:" << label << "_cross"; + os << "SOFTM:" << label << "_cross"; fid_ef = FD::Convert(os.str()); if (min_dist > 0) { //cerr << "Feature :" << os.str() << endl; @@ -200,22 +200,22 @@ struct SoftSyntacticFeatures2Impl { return lhs; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized vector<pair<string, string> > feat_labels; }; -SoftSyntacticFeatures2::SoftSyntacticFeatures2(const string& param) : +SoftSyntaxFeaturesMindist::SoftSyntaxFeaturesMindist(const string& 
param) : FeatureFunction(sizeof(WordID)) { - impl = new SoftSyntacticFeatures2Impl(param); + impl = new SoftSyntaxFeaturesMindistImpl(param); } -SoftSyntacticFeatures2::~SoftSyntacticFeatures2() { +SoftSyntaxFeaturesMindist::~SoftSyntaxFeaturesMindist() { delete impl; impl = NULL; } -void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, +void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, const vector<const void*>& ant_contexts, SparseVector<double>* features, @@ -229,6 +229,10 @@ void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); } -void SoftSyntacticFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); +void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) { + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } + diff --git a/decoder/ff_soft_syntax2.h b/decoder/ff_soft_syntax_mindist.h index 4de91d86..bf938b38 100644 --- a/decoder/ff_soft_syntax2.h +++ b/decoder/ff_soft_syntax_mindist.h @@ -1,15 +1,15 @@ -#ifndef _FF_SOFTSYNTAX2_H_ -#define _FF_SOFTSYNTAX2_H_ +#ifndef _FF_SOFT_SYNTAX_MINDIST_H_ +#define _FF_SOFT_SYNTAX_MINDIST_H_ #include "ff.h" #include "hg.h" -struct SoftSyntacticFeatures2Impl; +struct SoftSyntaxFeaturesMindistImpl; -class SoftSyntacticFeatures2 : public FeatureFunction { +class SoftSyntaxFeaturesMindist : public FeatureFunction { public: - SoftSyntacticFeatures2(const std::string& param); - ~SoftSyntacticFeatures2(); + SoftSyntaxFeaturesMindist(const std::string& param); + ~SoftSyntaxFeaturesMindist(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, const Hypergraph::Edge& edge, @@ -19,9 +19,9 @@ class SoftSyntacticFeatures2 : 
public FeatureFunction { void* context) const; virtual void PrepareForInput(const SentenceMetadata& smeta); private: - SoftSyntacticFeatures2Impl* impl; + SoftSyntaxFeaturesMindistImpl* impl; }; - #endif + diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc index a1997695..6b183863 100644 --- a/decoder/ff_source_syntax.cc +++ b/decoder/ff_source_syntax.cc @@ -2,8 +2,13 @@ #include <sstream> #include <stack> +#ifndef HAVE_OLD_CPP +# include <unordered_set> +#else +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_set; } +#endif -#include "hg.h" #include "sentence_metadata.h" #include "array2d.h" #include "filelib.h" @@ -24,6 +29,17 @@ inline int SpanSizeTransform(unsigned span_size) { struct SourceSyntaxFeaturesImpl { SourceSyntaxFeaturesImpl() {} + SourceSyntaxFeaturesImpl(const string& param) { + if (!(param.compare("") == 0)) { + string triggered_features_fn = param; + ReadFile triggered_features(triggered_features_fn); + string in; + while(getline(*triggered_features, in)) { + feature_filter.insert(FD::Convert(in)); + } + } + } + void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); //fids_cat.clear(); @@ -93,7 +109,7 @@ struct SourceSyntaxFeaturesImpl { if (fid_ef <= 0) { ostringstream os; //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN:" << TD::Convert(lhs); //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); //fid_cat = FD::Convert(os2.str()); os << ':'; @@ -118,21 +134,28 @@ struct SourceSyntaxFeaturesImpl { } fid_ef = FD::Convert(os.str()); } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0) - feats->set_value(fid_ef, 1.0); + if (fid_ef > 0) { + if (feature_filter.size()>0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + feats->set_value(fid_ef, 1.0); + } + } else { + feats->set_value(fid_ef, 1.0); + } + } + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D<WordID> src_tree; // 
src_tree(i,j) NT = type - // mutable Array2D<int> fids_cat; // this tends to overfit baddly - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + // mutable Array2D<int> fids_cat; // this tends to overfit baddly + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + unordered_set<int> feature_filter; }; SourceSyntaxFeatures::SourceSyntaxFeatures(const string& param) : FeatureFunction(sizeof(WordID)) { - impl = new SourceSyntaxFeaturesImpl; + impl = new SourceSyntaxFeaturesImpl(param); } SourceSyntaxFeatures::~SourceSyntaxFeatures() { @@ -155,7 +178,10 @@ void SourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } struct SourceSpanSizeFeaturesImpl { @@ -230,4 +256,3 @@ void SourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { impl->InitializeGrids(smeta.GetSourceLength()); } - diff --git a/decoder/ff_source_syntax.h b/decoder/ff_source_syntax.h index a8c7150a..bdd638c1 100644 --- a/decoder/ff_source_syntax.h +++ b/decoder/ff_source_syntax.h @@ -1,7 +1,8 @@ -#ifndef _FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ +#ifndef _FF_SOURCE_SYNTAX_H_ +#define _FF_SOURCE_SYNTAX_H_ #include "ff.h" +#include "hg.h" struct SourceSyntaxFeaturesImpl; @@ -11,7 +12,7 @@ class SourceSyntaxFeatures : public FeatureFunction { ~SourceSyntaxFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector<const void*>& ant_contexts, SparseVector<double>* features, SparseVector<double>* estimated_features, @@ -28,7 +29,7 @@ class 
SourceSpanSizeFeatures : public FeatureFunction { ~SourceSpanSizeFeatures(); protected: virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, + const Hypergraph::Edge& edge, const std::vector<const void*>& ant_contexts, SparseVector<double>* features, SparseVector<double>* estimated_features, @@ -39,3 +40,4 @@ class SourceSpanSizeFeatures : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2.cc b/decoder/ff_source_syntax2.cc index 08ece917..a97e31d8 100644 --- a/decoder/ff_source_syntax2.cc +++ b/decoder/ff_source_syntax2.cc @@ -3,7 +3,6 @@ #include <sstream> #include <stack> #include <string> -#include <tr1/unordered_set> #include "sentence_metadata.h" #include "array2d.h" @@ -17,7 +16,7 @@ using namespace std; struct SourceSyntaxFeatures2Impl { SourceSyntaxFeatures2Impl(const string& param) { - if (!(param.compare("") == 0)) { + if (param.compare("") != 0) { string triggered_features_fn = param; ReadFile triggered_features(triggered_features_fn); string in; @@ -29,10 +28,8 @@ struct SourceSyntaxFeatures2Impl { void InitializeGrids(const string& tree, unsigned src_len) { assert(tree.size() > 0); - //fids_cat.clear(); fids_ef.clear(); src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); fids_ef.resize(src_len, src_len + 1); src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); ParseTreeString(tree, src_len); @@ -40,7 +37,7 @@ struct SourceSyntaxFeatures2Impl { void ParseTreeString(const string& tree, unsigned src_len) { //cerr << "TREE: " << tree << endl; - stack<pair<int, WordID> > stk; // first = i, second = category + stack<pair<int, WordID> > stk; // first = i, second = category pair<int, WordID> cur_cat; cur_cat.first = -1; unsigned i = 0; unsigned p = 0; @@ -92,7 +89,7 @@ struct SourceSyntaxFeatures2Impl { const WordID lhs = src_tree(i,j); int& fid_ef = fids_ef(i,j)[&rule]; ostringstream os; - os << "SYN:" << TD::Convert(lhs); + os << "SSYN2:" << TD::Convert(lhs); os << ':'; unsigned ntc 
= 0; for (unsigned k = 0; k < rule.f_.size(); ++k) { @@ -100,7 +97,7 @@ struct SourceSyntaxFeatures2Impl { if (k > 0 && fj <= 0) os << '_'; if (fj <= 0) { os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { + }/*else { os << TD::Convert(fj); }*/ } @@ -116,18 +113,23 @@ struct SourceSyntaxFeatures2Impl { fid_ef = FD::Convert(os.str()); //cerr << "FEATURE: " << os.str() << endl; //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.find(fid_ef) != feature_filter.end()) { - cerr << "SYN-Feature was trigger more than once on training set." << endl; + if (feature_filter.size() > 0) { + if (feature_filter.find(fid_ef) != feature_filter.end()) { + //cerr << "SYN-Feature was trigger more than once on training set." << endl; + feats->set_value(fid_ef, 1.0); + } + //else cerr << "SYN-Feature was triggered less than once on training set." << endli; + } + else { feats->set_value(fid_ef, 1.0); } - else cerr << "SYN-Feature was triggered less than once on training set." << endl; + cerr << FD::Convert(fid_ef) << endl; return lhs; } - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized - tr1::unordered_set<int> feature_filter; - + Array2D<WordID> src_tree; // src_tree(i,j) NT = type + mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized + unordered_set<int> feature_filter; }; SourceSyntaxFeatures2::SourceSyntaxFeatures2(const string& param) : @@ -155,5 +157,9 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, } void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); + ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree")); + string tree; + f.ReadAll(tree); + impl->InitializeGrids(tree, smeta.GetSourceLength()); } + diff --git a/decoder/ff_source_syntax2.h b/decoder/ff_source_syntax2.h index b6b7dc3d..f606c2bf 100644 --- 
a/decoder/ff_source_syntax2.h +++ b/decoder/ff_source_syntax2.h @@ -1,5 +1,5 @@ -#ifndef _FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ +#ifndef _FF_SOURCE_SYNTAX2_H_ +#define _FF_SOURCE_SYNTAX2_H_ #include "ff.h" #include "hg.h" @@ -23,3 +23,4 @@ class SourceSyntaxFeatures2 : public FeatureFunction { }; #endif + diff --git a/decoder/ff_source_syntax2_p.cc b/decoder/ff_source_syntax2_p.cc deleted file mode 100644 index dfa791ea..00000000 --- a/decoder/ff_source_syntax2_p.cc +++ /dev/null @@ -1,166 +0,0 @@ -#include "ff_source_syntax2_p.h" - -#include <sstream> -#include <stack> -#include <string> -#include <tr1/unordered_set> - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. -// (S (NP John) (VP (V left))) - -struct PSourceSyntaxFeatures2Impl { - PSourceSyntaxFeatures2Impl(const string& param) { - if (param.compare("") != 0) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - /*cerr << "find(\"One\") == " << boolalpha << (table.find("One") != table.end()) << endl; - cerr << "find(\"Three\") == " << boolalpha << (table.find("Three") != table.end()) << endl;*/ - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - //cerr << "TREE: " << tree << endl; - stack<pair<int, WordID> > stk; // first = i, second = category - pair<int, WordID> cur_cat; cur_cat.first = -1; - 
unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); - } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - } - //cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - ostringstream os; - os << "SYN:" << TD::Convert(lhs); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - int fj = rule.f_[k]; - if (k > 0 && fj <= 0) os << '_'; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } /*else { - os << TD::Convert(fj); - }*/ - 
} - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - //cerr << "FEATURE: " << os.str() << endl; - //cerr << "FID_EF: " << fid_ef << endl; - if (feature_filter.size() > 0) { - if (feature_filter.find(fid_ef) != feature_filter.end()) { - //cerr << "SYN-Feature was trigger more than once on training set." << endl; - feats->set_value(fid_ef, 1.0); - } - //else cerr << "SYN-Feature was triggered less than once on training set." << endli; - } - else { - feats->set_value(fid_ef, 1.0); - } - return lhs; - } - - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized - tr1::unordered_set<int> feature_filter; - -}; - -PSourceSyntaxFeatures2::PSourceSyntaxFeatures2(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeatures2Impl(param); -} - -PSourceSyntaxFeatures2::~PSourceSyntaxFeatures2() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast<const WordID*>(ant_contexts[i]); - - *static_cast<WordID*>(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} diff --git a/decoder/ff_source_syntax2_p.h b/decoder/ff_source_syntax2_p.h deleted file mode 100644 index d56ecab0..00000000 --- a/decoder/ff_source_syntax2_p.h +++ /dev/null @@ -1,25 +0,0 @@ -#ifndef 
_FF_SOURCE_TOOLS2_H_ -#define _FF_SOURCE_TOOLS2_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeatures2Impl; - -class PSourceSyntaxFeatures2 : public FeatureFunction { - public: - PSourceSyntaxFeatures2(const std::string& param); - ~PSourceSyntaxFeatures2(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeatures2Impl* impl; -}; - -#endif diff --git a/decoder/ff_source_syntax_p.cc b/decoder/ff_source_syntax_p.cc deleted file mode 100644 index cd081544..00000000 --- a/decoder/ff_source_syntax_p.cc +++ /dev/null @@ -1,245 +0,0 @@ -#include "ff_source_syntax_p.h" - -#include <sstream> -#include <stack> -#include <tr1/unordered_set> - -#include "sentence_metadata.h" -#include "array2d.h" -#include "filelib.h" - -using namespace std; - -// implements the source side syntax features described in Blunsom et al. (EMNLP 2008) -// source trees must be represented in Penn Treebank format, e.g. 
-// (S (NP John) (VP (V left))) - -// log transform to make long spans cluster together -// but preserve differences -inline int SpanSizeTransform(unsigned span_size) { - if (!span_size) return 0; - return static_cast<int>(log(span_size+1) / log(1.39)) - 1; -} - -struct PSourceSyntaxFeaturesImpl { - PSourceSyntaxFeaturesImpl() {} - - PSourceSyntaxFeaturesImpl(const string& param) { - if (!(param.compare("") == 0)) { - string triggered_features_fn = param; - ReadFile triggered_features(triggered_features_fn); - string in; - while(getline(*triggered_features, in)) { - feature_filter.insert(FD::Convert(in)); - } - } - } - - void InitializeGrids(const string& tree, unsigned src_len) { - assert(tree.size() > 0); - //fids_cat.clear(); - fids_ef.clear(); - src_tree.clear(); - //fids_cat.resize(src_len, src_len + 1); - fids_ef.resize(src_len, src_len + 1); - src_tree.resize(src_len, src_len + 1, TD::Convert("XX")); - ParseTreeString(tree, src_len); - } - - void ParseTreeString(const string& tree, unsigned src_len) { - stack<pair<int, WordID> > stk; // first = i, second = category - pair<int, WordID> cur_cat; cur_cat.first = -1; - unsigned i = 0; - unsigned p = 0; - while(p < tree.size()) { - const char cur = tree[p]; - if (cur == '(') { - stk.push(cur_cat); - ++p; - unsigned k = p + 1; - while (k < tree.size() && tree[k] != ' ') { ++k; } - cur_cat.first = i; - cur_cat.second = TD::Convert(tree.substr(p, k - p)); - // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - p = k + 1; - } else if (cur == ')') { - unsigned k = p; - while (k < tree.size() && tree[k] == ')') { ++k; } - const unsigned num_closes = k - p; - for (unsigned ci = 0; ci < num_closes; ++ci) { - // cur_cat.second spans from cur_cat.first to i - // cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl; - // NOTE: unary rule chains end up being labeled with the top-most category - src_tree(cur_cat.first, i) = cur_cat.second; - cur_cat = stk.top(); - stk.pop(); 
- } - p = k; - while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; } - } else if (cur == ' ' || cur == '\t') { - cerr << "Unexpected whitespace in: " << tree << endl; - abort(); - } else { // terminal symbol - unsigned k = p + 1; - do { - while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; } - // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n"; - ++i; - assert(i <= src_len); - while (k < tree.size() && tree[k] == ' ') { ++k; } - p = k; - } while (p < tree.size() && tree[p] != ')'); - } - } - // cerr << "i=" << i << " src_len=" << src_len << endl; - assert(i == src_len); // make sure tree specified in src_tree is - // the same length as the source sentence - } - - WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) { - //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl; - const WordID lhs = src_tree(i,j); - //int& fid_cat = fids_cat(i,j); - int& fid_ef = fids_ef(i,j)[&rule]; - if (fid_ef <= 0) { - ostringstream os; - //ostringstream os2; - os << "SYN:" << TD::Convert(lhs); - //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i); - //fid_cat = FD::Convert(os2.str()); - os << ':'; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(ants[ntc++]) << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid_ef = FD::Convert(os.str()); - } - //if (fid_cat > 0) - // feats->set_value(fid_cat, 1.0); - if (fid_ef > 0 && (feature_filter.find(fid_ef) != feature_filter.end())) - feats->set_value(fid_ef, 1.0); - return lhs; - } - - Array2D<WordID> src_tree; // src_tree(i,j) NT = type - // mutable Array2D<int> fids_cat; // 
this tends to overfit baddly - mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized - tr1::unordered_set<int> feature_filter; -}; - -PSourceSyntaxFeatures::PSourceSyntaxFeatures(const string& param) : - FeatureFunction(sizeof(WordID)) { - impl = new PSourceSyntaxFeaturesImpl(param); -} - -PSourceSyntaxFeatures::~PSourceSyntaxFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const { - WordID ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast<const WordID*>(ant_contexts[i]); - - *static_cast<WordID*>(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength()); -} - -struct PSourceSpanSizeFeaturesImpl { - PSourceSpanSizeFeaturesImpl() {} - - void InitializeGrids(unsigned src_len) { - fids.clear(); - fids.resize(src_len, src_len + 1); - } - - int FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) { - if (rule.Arity() > 0) { - int& fid = fids(i,j)[&rule]; - if (fid <= 0) { - ostringstream os; - os << "SSS:"; - unsigned ntc = 0; - for (unsigned k = 0; k < rule.f_.size(); ++k) { - if (k > 0) os << '_'; - int fj = rule.f_[k]; - if (fj <= 0) { - os << '[' << TD::Convert(-fj) << ants[ntc++] << ']'; - } else { - os << TD::Convert(fj); - } - } - os << ':'; - for (unsigned k = 0; k < rule.e_.size(); ++k) { - const int ei = rule.e_[k]; - if (k > 0) os << '_'; - if (ei <= 0) - os << '[' << (1-ei) << ']'; - else - os << TD::Convert(ei); - } - fid = FD::Convert(os.str()); - } - if (fid > 0) - feats->set_value(fid, 1.0); - } - return 
SpanSizeTransform(j - i); - } - - mutable Array2D<map<const TRule*, int> > fids; -}; - -PSourceSpanSizeFeatures::PSourceSpanSizeFeatures(const string& param) : - FeatureFunction(sizeof(char)) { - impl = new PSourceSpanSizeFeaturesImpl; -} - -PSourceSpanSizeFeatures::~PSourceSpanSizeFeatures() { - delete impl; - impl = NULL; -} - -void PSourceSpanSizeFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const { - int ants[8]; - for (unsigned i = 0; i < ant_contexts.size(); ++i) - ants[i] = *static_cast<const char*>(ant_contexts[i]); - - *static_cast<char*>(context) = - impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features); -} - -void PSourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) { - impl->InitializeGrids(smeta.GetSourceLength()); -} - - diff --git a/decoder/ff_source_syntax_p.h b/decoder/ff_source_syntax_p.h deleted file mode 100644 index 2dd9094a..00000000 --- a/decoder/ff_source_syntax_p.h +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef _FF_SOURCE_TOOLS_H_ -#define _FF_SOURCE_TOOLS_H_ - -#include "ff.h" -#include "hg.h" - -struct PSourceSyntaxFeaturesImpl; - -class PSourceSyntaxFeatures : public FeatureFunction { - public: - PSourceSyntaxFeatures(const std::string& param); - ~PSourceSyntaxFeatures(); - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSyntaxFeaturesImpl* impl; -}; - -struct PSourceSpanSizeFeaturesImpl; -class PSourceSpanSizeFeatures : public FeatureFunction { - public: - PSourceSpanSizeFeatures(const std::string& param); - ~PSourceSpanSizeFeatures(); - 
protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* context) const; - virtual void PrepareForInput(const SentenceMetadata& smeta); - private: - PSourceSpanSizeFeaturesImpl* impl; -}; - -#endif diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc index 1491819d..dcb80110 100644 --- a/decoder/ff_wordalign.cc +++ b/decoder/ff_wordalign.cc @@ -7,7 +7,12 @@ #include <string> #include <cmath> #include <bitset> -#include <tr1/unordered_map> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif #include <boost/tuple/tuple.hpp> #include "boost/tuple/tuple_comparison.hpp" @@ -249,7 +254,7 @@ void NewJump::FireFeature(const SentenceMetadata& smeta, if (fp1_) get<6>(key) = GetSourceWord(id, cur_src_index + 1); if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index); - static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids; + static std::unordered_map<NewJumpFeatureKey, int, KeyHash> fids; int& fid = fids[key]; if (!fid) { ostringstream os; diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h index ba3d0b9b..0161f603 100644 --- a/decoder/ff_wordalign.h +++ b/decoder/ff_wordalign.h @@ -5,8 +5,16 @@ #include "array2d.h" #include "factored_lexicon_helper.h" +#include <boost/functional/hash.hpp> +#include <cassert> #include <boost/scoped_ptr.hpp> #include <boost/multi_array.hpp> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif class RelativeSentencePosition : public FeatureFunction { public: @@ -124,9 +132,6 @@ class LexicalTranslationTrigger : public FeatureFunction { std::vector<std::vector<WordID> > triggers_; }; -#include <tr1/unordered_map> -#include 
<boost/functional/hash.hpp> -#include <cassert> class BlunsomSynchronousParseHack : public FeatureFunction { public: BlunsomSynchronousParseHack(const std::string& param); @@ -196,7 +201,7 @@ class BlunsomSynchronousParseHack : public FeatureFunction { const int fid_; mutable int cur_sent_; - typedef std::tr1::unordered_map<std::vector<WordID>, int, boost::hash<std::vector<WordID> > > Vec2Int; + typedef std::unordered_map<std::vector<WordID>, int, boost::hash<std::vector<WordID> > > Vec2Int; mutable Vec2Int cur_map_; const std::vector<WordID> mutable * cur_ref_; mutable std::vector<std::vector<WordID> > refs_; diff --git a/decoder/ff_wordset.cc b/decoder/ff_wordset.cc index 70cea7de..9be6f2e0 100644 --- a/decoder/ff_wordset.cc +++ b/decoder/ff_wordset.cc @@ -2,21 +2,67 @@ #include "hg.h" #include "fdict.h" +#include "filelib.h" +#include <boost/algorithm/string.hpp> #include <sstream> #include <iostream> using namespace std; +void WordSet::parseArgs(const string& args, string* featName, string* vocabFile, bool* oovMode) { + vector<string> toks(10); + boost::split(toks, args, boost::is_any_of(" ")); + + *oovMode = false; + + // skip initial feature name + for(vector<string>::const_iterator it = toks.begin(); it != toks.end(); ++it) { + if(*it == "-v") { + *vocabFile = *++it; // copy + + } else if(*it == "-N") { + *featName = *++it; + } else if(*it == "--oov") { + *oovMode = true; + } else { + cerr << "Unrecognized argument: " << *it << endl; + exit(1); + } + } + + if(*featName == "") { + cerr << "featName (-N) not specified for WordSet" << endl; + exit(1); + } + if(*vocabFile == "") { + cerr << "vocabFile (-v) not specified for WordSet" << endl; + exit(1); + } +} + +void WordSet::loadVocab(const string& vocabFile, unordered_set<WordID>* vocab) { + ReadFile rf(vocabFile); + if (!rf) { + cerr << "Unable to open file: " << vocabFile; + abort(); + } + string line; + while (getline(*rf.stream(), line)) { + boost::trim(line); + if(line.empty()) continue; + WordID vocabId 
= TD::Convert(line); + vocab->insert(vocabId); + } +} + void WordSet::TraversalFeaturesImpl(const SentenceMetadata& /*smeta*/ , const Hypergraph::Edge& edge, const vector<const void*>& /* ant_contexts */, SparseVector<double>* features, SparseVector<double>* /* estimated_features */, void* /* context */) const { - double addScore = 0.0; - for(std::vector<WordID>::const_iterator it = edge.rule_->e_.begin(); it != edge.rule_->e_.end(); ++it) { - + for(vector<WordID>::const_iterator it = edge.rule_->e_.begin(); it != edge.rule_->e_.end(); ++it) { bool inVocab = (vocab_.find(*it) != vocab_.end()); if(oovMode_ && !inVocab) { addScore += 1.0; diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h index 639e1514..e78cd2fb 100644 --- a/decoder/ff_wordset.h +++ b/decoder/ff_wordset.h @@ -4,14 +4,18 @@ #include "ff.h" #include "tdict.h" -#include <tr1/unordered_set> -#include <boost/algorithm/string.hpp> - #include <vector> #include <string> #include <iostream> #include <fstream> +#ifndef HAVE_OLD_CPP +# include <unordered_set> +#else +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_set; } +#endif + class WordSet : public FeatureFunction { public: // we depend on the order of the initializer list @@ -42,69 +46,12 @@ class WordSet : public FeatureFunction { void* context) const; private: - static void loadVocab(const std::string& vocabFile, std::tr1::unordered_set<WordID>* vocab) { - - std::ifstream file; - std::string line; - - file.open(vocabFile.c_str(), std::fstream::in); - if (file.is_open()) { - unsigned lineNum = 0; - while (!file.eof()) { - ++lineNum; - getline(file, line); - boost::trim(line); - if(line.empty()) { - continue; - } - - WordID vocabId = TD::Convert(line); - vocab->insert(vocabId); - } - file.close(); - } else { - std::cerr << "Unable to open file: " << vocabFile; - exit(1); - } - } - - static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode) { - - std::vector<std::string> 
toks(10); - boost::split(toks, args, boost::is_any_of(" ")); - - *oovMode = false; - - // skip initial feature name - for(std::vector<std::string>::const_iterator it = toks.begin(); it != toks.end(); ++it) { - if(*it == "-v") { - *vocabFile = *++it; // copy - - } else if(*it == "-N") { - *featName = *++it; - - } else if(*it == "--oov") { - *oovMode = true; - - } else { - std::cerr << "Unrecognized argument: " << *it << std::endl; - exit(1); - } - } - - if(*featName == "") { - std::cerr << "featName (-N) not specified for WordSet" << std::endl; - exit(1); - } - if(*vocabFile == "") { - std::cerr << "vocabFile (-v) not specified for WordSet" << std::endl; - exit(1); - } - } + static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode); + static void loadVocab(const std::string& vocabFile, std::unordered_set<WordID>* vocab); int fid_; bool oovMode_; - std::tr1::unordered_set<WordID> vocab_; + std::unordered_set<WordID> vocab_; }; #endif diff --git a/decoder/grammar.cc b/decoder/grammar.cc index ee43f537..160d00e6 100644 --- a/decoder/grammar.cc +++ b/decoder/grammar.cc @@ -3,15 +3,20 @@ #include <algorithm> #include <utility> #include <map> -#include <tr1/unordered_map> -#include <tr1/unordered_set> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +# include <unordered_set> +#else +# include <tr1/unordered_map> +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } +#endif #include "rule_lexer.h" #include "filelib.h" #include "tdict.h" using namespace std; -using namespace std::tr1; const vector<TRulePtr> Grammar::NO_RULES; diff --git a/decoder/hg_intersect.cc b/decoder/hg_intersect.cc index ad5b701a..31a9a1ce 100644 --- a/decoder/hg_intersect.cc +++ b/decoder/hg_intersect.cc @@ -1,7 +1,12 @@ #include "hg_intersect.h" #include <vector> -#include <tr1/unordered_map> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace 
std { using std::tr1::unordered_map; } +#endif #include "fast_lexical_cast.hpp" #include <boost/functional/hash.hpp> @@ -13,7 +18,6 @@ #include "bottom_up_parser.h" using boost::lexical_cast; -using namespace std::tr1; using namespace std; struct RuleFilter { diff --git a/decoder/kbest.h b/decoder/kbest.h index 44c23151..c7194c7e 100644 --- a/decoder/kbest.h +++ b/decoder/kbest.h @@ -3,7 +3,12 @@ #include <vector> #include <utility> -#include <tr1/unordered_set> +#ifndef HAVE_OLD_CPP +# include <unordered_set> +#else +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_set; } +#endif #include <boost/shared_ptr.hpp> #include <boost/type_traits.hpp> @@ -22,7 +27,7 @@ namespace KBest { // optional, filter unique yield strings struct FilterUnique { - std::tr1::unordered_set<std::vector<WordID>, boost::hash<std::vector<WordID> > > unique; + std::unordered_set<std::vector<WordID>, boost::hash<std::vector<WordID> > > unique; bool operator()(const std::vector<WordID>& yield) { return !unique.insert(yield).second; @@ -111,7 +116,7 @@ namespace KBest { }; typedef std::vector<Derivation*> CandidateHeap; typedef std::vector<Derivation*> DerivationList; - typedef std::tr1::unordered_set< + typedef std::unordered_set< const Derivation*, DerivationUniquenessHash, DerivationUniquenessEquals> UniqueDerivationSet; struct NodeDerivationState { diff --git a/decoder/maxtrans_blunsom.cc b/decoder/maxtrans_blunsom.cc index 774e4170..a9f65fab 100644 --- a/decoder/maxtrans_blunsom.cc +++ b/decoder/maxtrans_blunsom.cc @@ -2,8 +2,14 @@ #include <vector> #include <algorithm> -#include <tr1/unordered_map> -#include <tr1/unordered_set> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +# include <unordered_set> +#else +# include <tr1/unordered_map> +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } +#endif #include <boost/tuple/tuple.hpp> #include <boost/functional/hash.hpp> @@ -14,7 +20,6 @@ using boost::tuple; 
using namespace std; -using namespace std::tr1; namespace Hack { diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc index d65e44d1..8048248e 100644 --- a/decoder/phrasebased_translator.cc +++ b/decoder/phrasebased_translator.cc @@ -2,8 +2,14 @@ #include <queue> #include <iostream> -#include <tr1/unordered_map> -#include <tr1/unordered_set> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +# include <unordered_set> +#else +# include <tr1/unordered_map> +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } +#endif #include <boost/tuple/tuple.hpp> #include <boost/functional/hash.hpp> @@ -17,7 +23,6 @@ #include "array2d.h" using namespace std; -using namespace std::tr1; using namespace boost::tuples; struct Coverage : public vector<bool> { @@ -49,10 +54,13 @@ struct Coverage : public vector<bool> { }; struct CoverageHash { size_t operator()(const Coverage& cov) const { - return hasher_(static_cast<const vector<bool>&>(cov)); + int seed = 131; + size_t res = 0; + for (vector<bool>::const_iterator it = cov.begin(); it != cov.end(); ++it) { + res = (res * seed) + (*it + 1); + } + return res; } - private: - boost::hash<vector<bool> > hasher_; }; ostream& operator<<(ostream& os, const Coverage& cov) { os << '['; diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc index 6f0b003b..a506c591 100644 --- a/decoder/scfg_translator.cc +++ b/decoder/scfg_translator.cc @@ -1,13 +1,9 @@ -//TODO: bottom-up pruning, with actual final models' (appropriately weighted) heuristics and local scores. 
- -//TODO: grammar heuristic (min cost of reachable rule set) for binarizations (active edges) if we wish to prune those also - -#include "hash.h" -#include "translator.h" #include <algorithm> #include <vector> #include <boost/foreach.hpp> #include <boost/functional/hash.hpp> +#include "hash.h" +#include "translator.h" #include "hg.h" #include "grammar.h" #include "bottom_up_parser.h" @@ -16,13 +12,11 @@ #include "tdict.h" #include "viterbi.h" #include "verbose.h" -#include <tr1/unordered_map> #define foreach BOOST_FOREACH #define reverse_foreach BOOST_REVERSE_FOREACH using namespace std; -using namespace std::tr1; static bool printGrammarsUsed = false; struct GlueGrammar : public TextGrammar { diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h index eab9f15d..52586331 100644 --- a/decoder/sentence_metadata.h +++ b/decoder/sentence_metadata.h @@ -5,7 +5,9 @@ #include <map> #include <cassert> #include "lattice.h" -#include "scorer.h" + +struct DocScorer; // deprecated, will be removed +struct Score; // deprecated, will be removed struct SentenceMetadata { friend class DecoderImpl; diff --git a/decoder/tromble_loss.cc b/decoder/tromble_loss.cc deleted file mode 100644 index 24cfef5f..00000000 --- a/decoder/tromble_loss.cc +++ /dev/null @@ -1,309 +0,0 @@ -#include "tromble_loss.h" -#include "fast_lexical_cast.hpp" - -#include <boost/algorithm/string/predicate.hpp> -#include <boost/circular_buffer.hpp> -#include <boost/functional/hash.hpp> -#include <boost/range/iterator_range.hpp> -#include <boost/tokenizer.hpp> -#include <boost/unordered_map.hpp> - -#include <cmath> -#include <fstream> -#include <vector> - -#include "sentence_metadata.h" -#include "trule.h" -#include "tdict.h" - -using namespace std; - -namespace { - -typedef unsigned char GramCount; - -struct RefCounts { - GramCount max; - std::vector<GramCount> refs; - size_t length; -}; - -typedef boost::unordered_map<std::vector<WordID>, size_t, boost::hash<std::vector<WordID> > > NGramMap; - 
-// Take all the n-grams in the references and stuff them into ngrams. -void MakeNGramMapFromReferences(const vector<vector<WordID> > &references, - int n, - vector<RefCounts> *counts, - NGramMap *ngrams) { - ngrams->clear(); - std::pair<vector<WordID>, size_t> insert_me; - vector<WordID> &ngram = insert_me.first; - ngram.reserve(n); - size_t &id = insert_me.second; - id = 0; - for (int refi = 0; refi < references.size(); ++refi) { - const vector<WordID>& ref = references[refi]; - const int s = ref.size(); - for (int j=0; j<s; ++j) { - const int remaining = s-j; - const int k = (n < remaining ? n : remaining); - ngram.clear(); - for (unsigned int i = 0; i < k; ++i) { - ngram.push_back(ref[j + i]); - std::pair<NGramMap::iterator, bool> ret(ngrams->insert(insert_me)); - if (ret.second) { - counts->resize(id + 1); - RefCounts &ref_counts = counts->back(); - ref_counts.max = 1; - ref_counts.refs.resize(references.size()); - ref_counts.refs[refi] = 1; - ref_counts.length = ngram.size(); - ++id; - } else { - RefCounts &ref_counts = (*counts)[ret.first->second]; - ref_counts.max = std::max(ref_counts.max, ++ref_counts.refs[refi]); - } - } - } - } -} - -struct MutableState { - MutableState(void *from, size_t n) : length(reinterpret_cast<size_t*>(from)), left(reinterpret_cast<WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<GramCount *>(right + n - 1)) {} - size_t *length; - WordID *left, *right; - GramCount *counts; - static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); } -}; - -struct ConstState { - ConstState(const void *from, size_t n) : length(reinterpret_cast<const size_t*>(from)), left(reinterpret_cast<const WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<const GramCount *>(right + n - 1)) {} - const size_t *length; - const WordID *left, *right; - const GramCount *counts; - static size_t Size(size_t n, size_t bound_ngram_id) { return 
sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); } -}; - -template <class T> struct CompatibleHashRange : public std::unary_function<const boost::iterator_range<T> &, size_t> { - size_t operator()(const boost::iterator_range<T> &range) const { - return boost::hash_range(range.begin(), range.end()); - } -}; - -template <class T> struct CompatibleEqualsRange : public std::binary_function<const boost::iterator_range<T> &, const std::vector<WordID> &, size_t> { - size_t operator()(const boost::iterator_range<T> &range, const std::vector<WordID> &vec) const { - return boost::algorithm::equals(range, vec); - } - size_t operator()(const std::vector<WordID> &vec, const boost::iterator_range<T> &range) const { - return boost::algorithm::equals(range, vec); - } -}; - -void AddWord(const boost::circular_buffer<WordID> &segment, size_t min_length, const NGramMap &ref_grams, GramCount *counters) { - typedef boost::circular_buffer<WordID>::const_iterator BufferIt; - typedef boost::iterator_range<BufferIt> SegmentRange; - if (segment.size() < min_length) return; -#if 0 - CompatibleHashRange<BufferIt> hasher; - CompatibleEqualsRange<BufferIt> equals; - for (BufferIt seg_start(segment.end() - min_length); ; --seg_start) { - NGramMap::const_iterator found = ref_grams.find(SegmentRange(seg_start, segment.end())); - if (found == ref_grams.end()) break; - ++counters[found->second]; - if (seg_start == segment.begin()) break; - } -#endif -} - -} // namespace - -class TrombleLossComputerImpl { - public: - explicit TrombleLossComputerImpl(const std::string ¶ms) : star_(TD::Convert("<{STAR}>")) { - typedef boost::tokenizer<boost::char_separator<char> > Tokenizer; - // Argument parsing - std::string ref_file_name; - Tokenizer tok(params, boost::char_separator<char>(" ")); - Tokenizer::iterator i = tok.begin(); - if (i == tok.end()) { - std::cerr << "TrombleLossComputer needs a reference file name." 
<< std::endl; - exit(1); - } - ref_file_name = *i++; - if (i == tok.end()) { - std::cerr << "TrombleLossComputer needs to know how many references." << std::endl; - exit(1); - } - num_refs_ = boost::lexical_cast<unsigned int>(*i++); - for (; i != tok.end(); ++i) { - thetas_.push_back(boost::lexical_cast<double>(*i)); - } - if (thetas_.empty()) { - std::cerr << "TrombleLossComputer is pointless with no weight on n-grams." << std::endl; - exit(1); - } - - // Read references file. - std::ifstream ref_file(ref_file_name.c_str()); - if (!ref_file) { - std::cerr << "Could not open TrombleLossComputer file " << ref_file_name << std::endl; - exit(1); - } - std::string ref; - vector<vector<WordID> > references(num_refs_); - bound_ngram_id_ = 0; - for (unsigned int sentence = 0; ref_file; ++sentence) { - for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) { - if (!getline(ref_file, ref)) { - if (refidx == 0) break; - std::cerr << "Short read of " << refidx << " references for sentence " << sentence << std::endl; - exit(1); - } - TD::ConvertSentence(ref, &references[refidx]); - } - ref_ids_.resize(sentence + 1); - ref_counts_.resize(sentence + 1); - MakeNGramMapFromReferences(references, thetas_.size(), &ref_counts_.back(), &ref_ids_.back()); - bound_ngram_id_ = std::max(bound_ngram_id_, ref_ids_.back().size()); - } - } - - size_t StateSize() const { - // n-1 boundary words plus counts for n-grams currently rendered as bytes even though most would fit in bits. - // Also, this is cached by higher up classes so no need to cache here. - return MutableState::Size(thetas_.size(), bound_ngram_id_); - } - - double Traversal( - const SentenceMetadata &smeta, - const TRule &rule, - const vector<const void*> &ant_contexts, - void *out_context) const { - // TODO: get refs from sentence metadata. - // This will require resizable features. 
- if (smeta.GetSentenceID() >= ref_ids_.size()) { - std::cerr << "Sentence ID " << smeta.GetSentenceID() << " doesn't have references; there are only " << ref_ids_.size() << " references." << std::endl; - exit(1); - } - const NGramMap &ngrams = ref_ids_[smeta.GetSentenceID()]; - MutableState out_state(out_context, thetas_.size()); - memset(out_state.counts, 0, bound_ngram_id_ * sizeof(GramCount)); - boost::circular_buffer<WordID> history(thetas_.size()); - std::vector<const void*>::const_iterator ant_context = ant_contexts.begin(); - *out_state.length = 0; - size_t pushed = 0; - const size_t keep = thetas_.size() - 1; - for (vector<WordID>::const_iterator rhs = rule.e().begin(); rhs != rule.e().end(); ++rhs) { - if (*rhs < 1) { - assert(ant_context != ant_contexts.end()); - // Constituent - ConstState rhs_state(*ant_context, thetas_.size()); - *out_state.length += *rhs_state.length; - { - GramCount *accum = out_state.counts; - for (const GramCount *c = rhs_state.counts; c != rhs_state.counts + ngrams.size(); ++c, ++accum) { - *accum += *c; - } - } - const WordID *w = rhs_state.left; - bool long_constit = true; - for (size_t i = 1; i <= keep; ++i, ++w) { - if (*w == star_) { - long_constit = false; - break; - } - history.push_back(*w); - if (++pushed == keep) { - std::copy(history.begin(), history.end(), out_state.left); - } - // Now i is the length of the history coming from this constituent. So it needs at least i+1 words to have a cross-child add. - AddWord(history, i + 1, ngrams, out_state.counts); - } - // If the consituent is shorter than thetas_.size(), then the - // constituent's left is the entire constituent, so history is already - // correct. Otherwise, the entire right hand side is the entire - // history. 
- if (long_constit) { - history.assign(thetas_.size(), rhs_state.right, rhs_state.right + keep); - } - ++ant_context; - } else { - // Word - ++*out_state.length; - history.push_back(*rhs); - if (++pushed == keep) { - std::copy(history.begin(), history.end(), out_state.left); - } - AddWord(history, 1, ngrams, out_state.counts); - } - } - // Fill in left and right constituents. - if (pushed < keep) { - std::copy(history.begin(), history.end(), out_state.left); - for (WordID *i = out_state.left + pushed; i != out_state.left + keep; ++i) { - *i = star_; - } - std::copy(out_state.left, out_state.left + keep, out_state.right); - } else if(pushed == keep) { - std::copy(history.begin(), history.end(), out_state.right); - } else if ((pushed > keep) && !history.empty()) { - std::copy(history.begin() + 1, history.end(), out_state.right); - } - std::vector<RefCounts>::const_iterator ref_info = ref_counts_[smeta.GetSentenceID()].begin(); - // Clip the counts and count matches. - // Indexed by reference then by length. - std::vector<std::vector<unsigned int> > matches(num_refs_, std::vector<unsigned int>(thetas_.size())); - for (GramCount *c = out_state.counts; c != out_state.counts + ngrams.size(); ++c, ++ref_info) { - *c = std::min(*c, ref_info->max); - if (*c) { - for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) { - assert(ref_info->length >= 1); - assert(ref_info->length - 1 < thetas_.size()); - matches[refidx][ref_info->length - 1] += std::min(*c, ref_info->refs[refidx]); - } - } - } - double best_score = 0.0; - for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) { - double score = 0.0; - for (unsigned int j = 0; j < std::min(*out_state.length, thetas_.size()); ++j) { - score += thetas_[j] * static_cast<double>(matches[refidx][j]) / static_cast<double>(*out_state.length - j); - } - best_score = std::max(best_score, score); - } - return best_score; - } - - private: - unsigned int num_refs_; - // Indexed by sentence id. 
- std::vector<NGramMap> ref_ids_; - // Then by id from ref_ids_. - std::vector<std::vector<RefCounts> > ref_counts_; - - // thetas_[0] is the weight for 1-grams - std::vector<double> thetas_; - - // All ngram ids in ref_ids_ are < this value. - size_t bound_ngram_id_; - - const WordID star_; -}; - -TrombleLossComputer::TrombleLossComputer(const std::string ¶ms) : - boost::base_from_member<PImpl>(new TrombleLossComputerImpl(params)), - FeatureFunction(boost::base_from_member<PImpl>::member->StateSize()), - fid_(FD::Convert("TrombleLossComputer")) {} - -TrombleLossComputer::~TrombleLossComputer() {} - -void TrombleLossComputer::TraversalFeaturesImpl(const SentenceMetadata& smeta, - const Hypergraph::Edge& edge, - const vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* out_context) const { - (void) estimated_features; - const double loss = boost::base_from_member<PImpl>::member->Traversal(smeta, *edge.rule_, ant_contexts, out_context); - features->set_value(fid_, loss); -} diff --git a/decoder/tromble_loss.h b/decoder/tromble_loss.h deleted file mode 100644 index fde33100..00000000 --- a/decoder/tromble_loss.h +++ /dev/null @@ -1,40 +0,0 @@ -#ifndef _TROMBLE_LOSS_H_ -#define _TROMBLE_LOSS_H_ - -#include <vector> -#include <boost/scoped_ptr.hpp> -#include <boost/utility/base_from_member.hpp> - -#include "ff.h" -#include "wordid.h" - -// this may not be the most elegant way to implement this computation, but since we -// may need cube pruning and state splitting, we reuse the feature detector framework. -// the loss is then stored in a feature #0 (which is guaranteed to have weight 0 and -// never be a "real" feature). 
-class TrombleLossComputerImpl; -class TrombleLossComputer : private boost::base_from_member<boost::scoped_ptr<TrombleLossComputerImpl> >, public FeatureFunction { - private: - typedef boost::scoped_ptr<TrombleLossComputerImpl> PImpl; - typedef FeatureFunction Base; - - public: - // String parameters are ref.txt num_ref weight1 weight2 ... weightn - // where ref.txt contains references on per line, with num_ref references per sentence - // The weights are the weight on each length n-gram. - explicit TrombleLossComputer(const std::string ¶ms); - - ~TrombleLossComputer(); - - protected: - virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta, - const HG::Edge& edge, - const std::vector<const void*>& ant_contexts, - SparseVector<double>* features, - SparseVector<double>* estimated_features, - void* out_context) const; - private: - const int fid_; -}; - -#endif diff --git a/extractor/Makefile.am b/extractor/Makefile.am index e94a9b91..65a3d436 100644 --- a/extractor/Makefile.am +++ b/extractor/Makefile.am @@ -1,7 +1,8 @@ -if HAVE_CXX11 bin_PROGRAMS = compile run_extractor +if HAVE_CXX11 + EXTRA_PROGRAMS = alignment_test \ data_array_test \ fast_intersector_test \ @@ -113,7 +114,29 @@ libcompile_a_SOURCES = \ precomputation.cc \ suffix_array.cc \ time_util.cc \ - translation_table.cc + translation_table.cc \ + alignment.h \ + data_array.h \ + fast_intersector.h \ + grammar.h \ + grammar_extractor.h \ + matchings_finder.h \ + matchings_trie.h \ + phrase.h \ + phrase_builder.h \ + phrase_location.h \ + precomputation.h \ + rule.h \ + rule_extractor.h \ + rule_extractor_helper.h \ + rule_factory.h \ + sampler.h \ + scorer.h \ + suffix_array.h \ + target_phrase_extractor.h \ + time_util.h \ + translation_table.h \ + vocabulary.h libextractor_a_SOURCES = \ alignment.cc \ diff --git a/extractor/grammar_extractor.cc b/extractor/grammar_extractor.cc index 8050ce7b..487abcaf 100644 --- a/extractor/grammar_extractor.cc +++ b/extractor/grammar_extractor.cc @@ -3,11 
+3,13 @@ #include <iterator> #include <sstream> #include <vector> +#include <unordered_set> #include "grammar.h" #include "rule.h" #include "rule_factory.h" #include "vocabulary.h" +#include "data_array.h" using namespace std; @@ -32,10 +34,10 @@ GrammarExtractor::GrammarExtractor( vocabulary(vocabulary), rule_factory(rule_factory) {} -Grammar GrammarExtractor::GetGrammar(const string& sentence) { +Grammar GrammarExtractor::GetGrammar(const string& sentence, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) { vector<string> words = TokenizeSentence(sentence); vector<int> word_ids = AnnotateWords(words); - return rule_factory->GetGrammar(word_ids); + return rule_factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); } vector<string> GrammarExtractor::TokenizeSentence(const string& sentence) { diff --git a/extractor/grammar_extractor.h b/extractor/grammar_extractor.h index b36ceeb9..ae407b47 100644 --- a/extractor/grammar_extractor.h +++ b/extractor/grammar_extractor.h @@ -4,6 +4,7 @@ #include <memory> #include <string> #include <vector> +#include <unordered_set> using namespace std; @@ -44,7 +45,7 @@ class GrammarExtractor { // Converts the sentence to a vector of word ids and uses the RuleFactory to // extract the SCFG rules which may be used to decode the sentence. - Grammar GetGrammar(const string& sentence); + Grammar GetGrammar(const string& sentence, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array); private: // Splits the sentence in a vector of words. 
diff --git a/extractor/grammar_extractor_test.cc b/extractor/grammar_extractor_test.cc index 823bb8b4..f32a9599 100644 --- a/extractor/grammar_extractor_test.cc +++ b/extractor/grammar_extractor_test.cc @@ -39,12 +39,15 @@ TEST(GrammarExtractorTest, TestAnnotatingWords) { vector<Rule> rules; vector<string> feature_names; Grammar grammar(rules, feature_names); - EXPECT_CALL(*factory, GetGrammar(word_ids)) + unordered_set<int> blacklisted_sentence_ids; + shared_ptr<DataArray> source_data_array; + EXPECT_CALL(*factory, GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array)) .WillOnce(Return(grammar)); GrammarExtractor extractor(vocabulary, factory); string sentence = "Anna has many many apples ."; - extractor.GetGrammar(sentence); + + extractor.GetGrammar(sentence, blacklisted_sentence_ids, source_data_array); } } // namespace diff --git a/extractor/mocks/mock_rule_factory.h b/extractor/mocks/mock_rule_factory.h index 7389b396..86a084b5 100644 --- a/extractor/mocks/mock_rule_factory.h +++ b/extractor/mocks/mock_rule_factory.h @@ -7,7 +7,7 @@ namespace extractor { class MockHieroCachingRuleFactory : public HieroCachingRuleFactory { public: - MOCK_METHOD1(GetGrammar, Grammar(const vector<int>& word_ids)); + MOCK_METHOD3(GetGrammar, Grammar(const vector<int>& word_ids, const unordered_set<int> blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array)); }; } // namespace extractor diff --git a/extractor/rule_factory.cc b/extractor/rule_factory.cc index 8c30fb9e..6ae2d792 100644 --- a/extractor/rule_factory.cc +++ b/extractor/rule_factory.cc @@ -17,6 +17,7 @@ #include "suffix_array.h" #include "time_util.h" #include "vocabulary.h" +#include "data_array.h" using namespace std; using namespace chrono; @@ -100,7 +101,7 @@ HieroCachingRuleFactory::HieroCachingRuleFactory() {} HieroCachingRuleFactory::~HieroCachingRuleFactory() {} -Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) { +Grammar 
HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) { Clock::time_point start_time = Clock::now(); double total_extract_time = 0; double total_intersect_time = 0; @@ -192,7 +193,7 @@ Grammar HieroCachingRuleFactory::GetGrammar(const vector<int>& word_ids) { Clock::time_point extract_start = Clock::now(); if (!state.starts_with_x) { // Extract rules for the sampled set of occurrences. - PhraseLocation sample = sampler->Sample(next_node->matchings); + PhraseLocation sample = sampler->Sample(next_node->matchings, blacklisted_sentence_ids, source_data_array); vector<Rule> new_rules = rule_extractor->ExtractRules(next_phrase, sample); rules.insert(rules.end(), new_rules.begin(), new_rules.end()); diff --git a/extractor/rule_factory.h b/extractor/rule_factory.h index 52e8712a..df63a9d8 100644 --- a/extractor/rule_factory.h +++ b/extractor/rule_factory.h @@ -3,6 +3,7 @@ #include <memory> #include <vector> +#include <unordered_set> #include "matchings_trie.h" @@ -71,7 +72,7 @@ class HieroCachingRuleFactory { // Constructs SCFG rules for a given sentence. // (See class description for more details.) 
- virtual Grammar GetGrammar(const vector<int>& word_ids); + virtual Grammar GetGrammar(const vector<int>& word_ids, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array); protected: HieroCachingRuleFactory(); diff --git a/extractor/rule_factory_test.cc b/extractor/rule_factory_test.cc index 08af3dcd..f26cc567 100644 --- a/extractor/rule_factory_test.cc +++ b/extractor/rule_factory_test.cc @@ -76,7 +76,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarDifferentWords) { .WillRepeatedly(Return(PhraseLocation(0, 1))); vector<int> word_ids = {2, 3, 4}; - Grammar grammar = factory->GetGrammar(word_ids); + unordered_set<int> blacklisted_sentence_ids; + shared_ptr<DataArray> source_data_array; + Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(7, grammar.GetRules().size()); } @@ -94,7 +96,9 @@ TEST_F(RuleFactoryTest, TestGetGrammarRepeatingWords) { .WillRepeatedly(Return(PhraseLocation(0, 1))); vector<int> word_ids = {2, 3, 4, 2, 3}; - Grammar grammar = factory->GetGrammar(word_ids); + unordered_set<int> blacklisted_sentence_ids; + shared_ptr<DataArray> source_data_array; + Grammar grammar = factory->GetGrammar(word_ids, blacklisted_sentence_ids, source_data_array); EXPECT_EQ(feature_names, grammar.GetFeatureNames()); EXPECT_EQ(28, grammar.GetRules().size()); } diff --git a/extractor/run_extractor.cc b/extractor/run_extractor.cc index 8a9ca89d..6eb55073 100644 --- a/extractor/run_extractor.cc +++ b/extractor/run_extractor.cc @@ -75,7 +75,9 @@ int main(int argc, char** argv) { ("max_samples", po::value<int>()->default_value(300), "Maximum number of samples") ("tight_phrases", po::value<bool>()->default_value(true), - "False if phrases may be loose (better, but slower)"); + "False if phrases may be loose (better, but slower)") + ("leave_one_out", po::value<bool>()->zero_tokens(), + "do leave-one-out estimation of grammars (e.g. 
for extracting grammars for the training set"); po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); @@ -96,6 +98,11 @@ int main(int argc, char** argv) { return 1; } + bool leave_one_out = false; + if (vm.count("leave_one_out")) { + leave_one_out = true; + } + int num_threads = vm["threads"].as<int>(); cerr << "Grammar extraction will use " << num_threads << " threads." << endl; @@ -223,7 +230,9 @@ int main(int argc, char** argv) { } suffixes[i] = suffix; - Grammar grammar = extractor.GetGrammar(sentences[i]); + unordered_set<int> blacklisted_sentence_ids; + if (leave_one_out) blacklisted_sentence_ids.insert(i); + Grammar grammar = extractor.GetGrammar(sentences[i], blacklisted_sentence_ids, source_data_array); ofstream output(GetGrammarFilePath(grammar_path, i).c_str()); output << grammar; } diff --git a/extractor/sample_source.txt b/extractor/sample_source.txt new file mode 100644 index 00000000..971baf6d --- /dev/null +++ b/extractor/sample_source.txt @@ -0,0 +1,2 @@ +ana are mere . +ana bea mult lapte . 
diff --git a/extractor/sampler.cc b/extractor/sampler.cc index d81956b5..963afa7a 100644 --- a/extractor/sampler.cc +++ b/extractor/sampler.cc @@ -12,7 +12,7 @@ Sampler::Sampler() {} Sampler::~Sampler() {} -PhraseLocation Sampler::Sample(const PhraseLocation& location) const { +PhraseLocation Sampler::Sample(const PhraseLocation& location, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const { vector<int> sample; int num_subpatterns; if (location.matchings == NULL) { @@ -20,8 +20,37 @@ PhraseLocation Sampler::Sample(const PhraseLocation& location) const { num_subpatterns = 1; int low = location.sa_low, high = location.sa_high; double step = max(1.0, (double) (high - low) / max_samples); - for (double i = low; i < high && sample.size() < max_samples; i += step) { - sample.push_back(suffix_array->GetSuffix(Round(i))); + double i = low, last = i; + bool found; + while (sample.size() < max_samples && i < high) { + int x = suffix_array->GetSuffix(Round(i)); + int id = source_data_array->GetSentenceId(x); + if (find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) != blacklisted_sentence_ids.end()) { + found = false; + double backoff_step = 1; + while (true) { + if ((double)backoff_step >= step) break; + double j = i - backoff_step; + x = suffix_array->GetSuffix(Round(j)); + id = source_data_array->GetSentenceId(x); + if (x >= 0 && j > last && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { + found = true; last = i; break; + } + double k = i + backoff_step; + x = suffix_array->GetSuffix(Round(k)); + id = source_data_array->GetSentenceId(x); + if (k < min(i+step, (double)high) && find(blacklisted_sentence_ids.begin(), blacklisted_sentence_ids.end(), id) == blacklisted_sentence_ids.end()) { + found = true; last = k; break; + } + if (j <= last && k >= high) break; + backoff_step++; + } + } else { + found = true; + last = i; + } + if 
(found) sample.push_back(x); + i += step; } } else { // Sample vector of occurrences. diff --git a/extractor/sampler.h b/extractor/sampler.h index be4aa1bb..de450c48 100644 --- a/extractor/sampler.h +++ b/extractor/sampler.h @@ -2,6 +2,9 @@ #define _SAMPLER_H_ #include <memory> +#include <unordered_set> + +#include "data_array.h" using namespace std; @@ -20,7 +23,7 @@ class Sampler { virtual ~Sampler(); // Samples uniformly at most max_samples phrase occurrences. - virtual PhraseLocation Sample(const PhraseLocation& location) const; + virtual PhraseLocation Sample(const PhraseLocation& location, const unordered_set<int>& blacklisted_sentence_ids, const shared_ptr<DataArray> source_data_array) const; protected: Sampler(); diff --git a/extractor/sampler_test.cc b/extractor/sampler_test.cc index e9abebfa..965567ba 100644 --- a/extractor/sampler_test.cc +++ b/extractor/sampler_test.cc @@ -3,6 +3,7 @@ #include <memory> #include "mocks/mock_suffix_array.h" +#include "mocks/mock_data_array.h" #include "phrase_location.h" #include "sampler.h" @@ -15,6 +16,8 @@ namespace { class SamplerTest : public Test { protected: virtual void SetUp() { + source_data_array = make_shared<MockDataArray>(); + EXPECT_CALL(*source_data_array, GetSentenceId(_)).WillRepeatedly(Return(9999)); suffix_array = make_shared<MockSuffixArray>(); for (int i = 0; i < 10; ++i) { EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); @@ -23,51 +26,54 @@ class SamplerTest : public Test { shared_ptr<MockSuffixArray> suffix_array; shared_ptr<Sampler> sampler; + shared_ptr<MockDataArray> source_data_array; }; TEST_F(SamplerTest, TestSuffixArrayRange) { PhraseLocation location(0, 10); + unordered_set<int> blacklist; sampler = make_shared<Sampler>(suffix_array, 1); vector<int> expected_locations = {0}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = 
make_shared<Sampler>(suffix_array, 2); expected_locations = {0, 5}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 3); expected_locations = {0, 3, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 4); expected_locations = {0, 3, 5, 8}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 100); expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); } TEST_F(SamplerTest, TestSubstringsSample) { vector<int> locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + unordered_set<int> blacklist; PhraseLocation location(locations, 2); sampler = make_shared<Sampler>(suffix_array, 1); vector<int> expected_locations = {0, 1}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 2); expected_locations = {0, 1, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 3); expected_locations = {0, 1, 4, 5, 6, 7}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + 
EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); sampler = make_shared<Sampler>(suffix_array, 7); expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; - EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location)); + EXPECT_EQ(PhraseLocation(expected_locations, 2), sampler->Sample(location, blacklist, source_data_array)); } } // namespace diff --git a/extractor/sampler_test_blacklist.cc b/extractor/sampler_test_blacklist.cc new file mode 100644 index 00000000..3305b990 --- /dev/null +++ b/extractor/sampler_test_blacklist.cc @@ -0,0 +1,102 @@ +#include <gtest/gtest.h> + +#include <memory> + +#include "mocks/mock_suffix_array.h" +#include "mocks/mock_data_array.h" +#include "phrase_location.h" +#include "sampler.h" + +using namespace std; +using namespace ::testing; + +namespace extractor { +namespace { + +class SamplerTestBlacklist : public Test { + protected: + virtual void SetUp() { + source_data_array = make_shared<MockDataArray>(); + for (int i = 0; i < 10; ++i) { + EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(i)); + } + for (int i = -10; i < 0; ++i) { + EXPECT_CALL(*source_data_array, GetSentenceId(i)).WillRepeatedly(Return(0)); + } + suffix_array = make_shared<MockSuffixArray>(); + for (int i = -10; i < 10; ++i) { + EXPECT_CALL(*suffix_array, GetSuffix(i)).WillRepeatedly(Return(i)); + } + } + + shared_ptr<MockSuffixArray> suffix_array; + shared_ptr<Sampler> sampler; + shared_ptr<MockDataArray> source_data_array; +}; + +TEST_F(SamplerTestBlacklist, TestSuffixArrayRange) { + PhraseLocation location(0, 10); + unordered_set<int> blacklist; + vector<int> expected_locations; + + blacklist.insert(0); + sampler = make_shared<Sampler>(suffix_array, 1); + expected_locations = {1}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + for (int i = 0; i < 9; i++) { + blacklist.insert(i); + } + 
sampler = make_shared<Sampler>(suffix_array, 1); + expected_locations = {9}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(5); + sampler = make_shared<Sampler>(suffix_array, 2); + expected_locations = {1, 4}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(1); + blacklist.insert(2); + blacklist.insert(3); + sampler = make_shared<Sampler>(suffix_array, 2); + expected_locations = {4, 5}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(3); + blacklist.insert(7); + sampler = make_shared<Sampler>(suffix_array, 3); + expected_locations = {1, 2, 6}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + blacklist.insert(3); + blacklist.insert(5); + blacklist.insert(8); + sampler = make_shared<Sampler>(suffix_array, 4); + expected_locations = {1, 2, 4, 7}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(0); + sampler = make_shared<Sampler>(suffix_array, 100); + expected_locations = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); + blacklist.clear(); + + blacklist.insert(9); + sampler = make_shared<Sampler>(suffix_array, 100); + expected_locations = {0, 1, 2, 3, 4, 5, 6, 7, 8}; + EXPECT_EQ(PhraseLocation(expected_locations, 1), sampler->Sample(location, blacklist, source_data_array)); +} + +} // namespace +} // namespace extractor diff --git a/klm/lm/builder/Makefile.am b/klm/lm/builder/Makefile.am index 
317e03ce..38259c51 100644 --- a/klm/lm/builder/Makefile.am +++ b/klm/lm/builder/Makefile.am @@ -1,6 +1,6 @@ -bin_PROGRAMS = builder +bin_PROGRAMS = lmplz -builder_SOURCES = \ +lmplz_SOURCES = \ lmplz_main.cc \ adjust_counts.cc \ adjust_counts.hh \ @@ -22,7 +22,7 @@ builder_SOURCES = \ print.hh \ sort.hh -builder_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_THREAD_LIBS) +lmplz_LDADD = ../libklm.a ../../util/double-conversion/libklm_util_double.a ../../util/stream/libklm_util_stream.a ../../util/libklm_util.a $(BOOST_THREAD_LIBS) AM_CPPFLAGS = -W -Wall -I$(top_srcdir)/klm diff --git a/mteval/mbr_kbest.cc b/mteval/mbr_kbest.cc index 2519bc01..76d2c7fc 100644 --- a/mteval/mbr_kbest.cc +++ b/mteval/mbr_kbest.cc @@ -1,9 +1,14 @@ #include <iostream> #include <vector> -#include <tr1/unordered_map> #include <boost/program_options.hpp> #include <boost/functional/hash.hpp> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif #include "prob.h" #include "tdict.h" @@ -12,7 +17,6 @@ #include "stringlib.h" using namespace std; -using namespace std::tr1; namespace po = boost::program_options; diff --git a/mteval/ns.cc b/mteval/ns.cc index b64d4798..c1ea238b 100644 --- a/mteval/ns.cc +++ b/mteval/ns.cc @@ -106,7 +106,7 @@ struct BleuSegmentEvaluator : public SegmentEvaluator { } struct NGramCompare { - int operator() (const vector<WordID>& a, const vector<WordID>& b) { + int operator() (const vector<WordID>& a, const vector<WordID>& b) const { const size_t as = a.size(); const size_t bs = b.size(); const size_t s = (as < bs ? 
as : bs); diff --git a/mteval/ns_ter.cc b/mteval/ns_ter.cc index 0e1008db..680fb7b4 100644 --- a/mteval/ns_ter.cc +++ b/mteval/ns_ter.cc @@ -4,7 +4,12 @@ #include <cassert> #include <iostream> #include <limits> -#include <tr1/unordered_map> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif #include <set> #include <boost/functional/hash.hpp> #include "tdict.h" @@ -20,7 +25,6 @@ static const unsigned kREF_WORDCOUNT = 4; static const unsigned kDUMMY_LAST_ENTRY = 5; using namespace std; -using namespace std::tr1; bool TERMetric::IsErrorMetric() const { return true; diff --git a/mteval/scorer.cc b/mteval/scorer.cc index de84e076..fedbca91 100644 --- a/mteval/scorer.cc +++ b/mteval/scorer.cc @@ -198,7 +198,7 @@ class BLEUScorerBase : public SentenceScorer { virtual float ComputeRefLength(const vector<WordID>& hyp) const = 0; private: struct NGramCompare { - int operator() (const vector<WordID>& a, const vector<WordID>& b) { + int operator() (const vector<WordID>& a, const vector<WordID>& b) const { size_t as = a.size(); size_t bs = b.size(); const size_t s = (as < bs ? as : bs); @@ -594,26 +594,29 @@ void DocScorer::Init( const ScoreType type, const vector<string>& ref_files, const string& src_file, bool verbose) { + cerr << "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n" + "!!! 
This code is using the deprecated DocScorer interface, please fix !!!\n" + "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!\n"; scorers_.clear(); - // TODO stop using valarray, start using ReadFile + static const WordID kDIV = TD::Convert("|||"); cerr << "Loading references (" << ref_files.size() << " files)\n"; ReadFile srcrf; if (type == AER && src_file.size() > 0) { cerr << " (source=" << src_file << ")\n"; srcrf.Init(src_file); } + std::vector<WordID> tmp; std::vector<ReadFile> ifs(ref_files.begin(),ref_files.end()); for (int i=0; i < ref_files.size(); ++i) ifs[i].Init(ref_files[i]); char buf[64000]; bool expect_eof = false; int line=0; while (ifs[0].get()) { - vector<vector<WordID> > refs(ref_files.size()); + vector<vector<WordID> > refs; for (int i=0; i < ref_files.size(); ++i) { istream &in=ifs[i].get(); if (in.eof()) break; in.getline(buf, 64000); - refs[i].clear(); if (strlen(buf) == 0) { if (in.eof()) { if (!expect_eof) { @@ -622,9 +625,17 @@ void DocScorer::Init( } break; } - } else { - TD::ConvertSentence(buf, &refs[i]); - assert(!refs[i].empty()); + } else { // process reference + tmp.clear(); + TD::ConvertSentence(buf, &tmp); + unsigned last = 0; + for (unsigned j = 0; j < tmp.size(); ++j) { + if (tmp[j] == kDIV) { + refs.push_back(vector<WordID>(tmp.begin() + last, tmp.begin() + j)); + last = j + 1; + } + } + refs.push_back(vector<WordID>(tmp.begin() + last, tmp.end())); } assert(!expect_eof); } diff --git a/mteval/ter.cc b/mteval/ter.cc index cacc5b00..19b90bbc 100644 --- a/mteval/ter.cc +++ b/mteval/ter.cc @@ -5,7 +5,12 @@ #include <iostream> #include <limits> #include <sstream> -#include <tr1/unordered_map> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif #include <set> #include <valarray> #include <boost/functional/hash.hpp> @@ -16,7 +21,6 @@ const bool ter_use_average_ref_len = true; const int ter_short_circuit_long_sentences 
= -1; using namespace std; -using namespace std::tr1; struct COSTS { static const float substitution; diff --git a/python/cdec/sa/_sa.c b/python/cdec/sa/_sa.cpp index a9f7855e..a9f7855e 100644 --- a/python/cdec/sa/_sa.c +++ b/python/cdec/sa/_sa.cpp diff --git a/python/cdec/sa/compile.py b/python/cdec/sa/compile.py index d4cd8387..caa93f8b 100644 --- a/python/cdec/sa/compile.py +++ b/python/cdec/sa/compile.py @@ -75,6 +75,13 @@ def main(): a_bin = os.path.join(args.output, 'a.bin') lex_bin = os.path.join(args.output, 'lex.bin') + config = cdec.configobj.ConfigObj(args.config, unrepr=True) + config['f_sa_file'] = os.path.abspath(f_sa_bin) + config['e_file'] = os.path.abspath(e_bin) + config['a_file'] = os.path.abspath(a_bin) + config['lex_file'] = os.path.abspath(lex_bin) + config['precompute_file'] = os.path.abspath(precomp_bin) + start_time = monitor_cpu() logger.info('Compiling source suffix array') if args.bitext: @@ -116,12 +123,6 @@ def main(): logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time) # Write configuration - config = cdec.configobj.ConfigObj(args.config, unrepr=True) - config['f_sa_file'] = os.path.abspath(f_sa_bin) - config['e_file'] = os.path.abspath(e_bin) - config['a_file'] = os.path.abspath(a_bin) - config['lex_file'] = os.path.abspath(lex_bin) - config['precompute_file'] = os.path.abspath(precomp_bin) for name, value in zip(param_names, params): config[name] = value config.write() diff --git a/python/cdec/sa/strmap.cc b/python/cdec/sa/strmap.cc index 5040477e..b6debfb0 100644 --- a/python/cdec/sa/strmap.cc +++ b/python/cdec/sa/strmap.cc @@ -2,11 +2,16 @@ #include <vector> #include <string> -#include <tr1/unordered_map> -#include <stdint.h> +#include <cstdint> + +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif using namespace std; -using namespace std::tr1; #undef HAVE_64_BITS diff --git a/python/setup.py.in 
b/python/setup.py.in index ce1eb2ed..a2aa28f6 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -7,7 +7,7 @@ LIB = ['../decoder', '../utils', '../mteval', '../training/utils', '../klm/lm', # Set automatically by configure LIBS = re.findall('-l([^\s]+)', '@LIBS@') -CPPFLAGS = re.findall('-[^\s]+', '@CPPFLAGS@') +CPPFLAGS = re.findall('-[^\s]+', '@CPPFLAGS@ @CXXFLAGS@') LDFLAGS = re.findall('-[^\s]+', '@LDFLAGS@') # Make sure linker flags go only to the linker LDFLAGS = [opt.replace('-R', '-Wl,-rpath,') for opt in LDFLAGS] @@ -21,7 +21,8 @@ ext_modules = [ extra_compile_args=CPPFLAGS, extra_link_args=LDFLAGS), Extension(name='cdec.sa._sa', - sources=['cdec/sa/_sa.c', 'cdec/sa/strmap.cc']) + sources=['cdec/sa/_sa.cpp', 'cdec/sa/strmap.cc'], + extra_compile_args=CPPFLAGS) ] setup( diff --git a/training/crf/mpi_online_optimize.cc b/training/crf/mpi_online_optimize.cc index 9e1ae34c..6b5b7d64 100644 --- a/training/crf/mpi_online_optimize.cc +++ b/training/crf/mpi_online_optimize.cc @@ -4,11 +4,11 @@ #include <vector> #include <cassert> #include <cmath> -#include <tr1/memory> #include <ctime> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include <boost/shared_ptr.hpp> #include "stringlib.h" #include "verbose.h" @@ -219,7 +219,7 @@ int main(int argc, char** argv) { #endif if (size > 1) SetSilent(true); // turn off verbose decoder output register_feature_functions(); - std::tr1::shared_ptr<MT19937> rng; + boost::shared_ptr<MT19937> rng; po::variables_map conf; if (!InitCommandLine(argc, argv, &conf)) @@ -264,8 +264,8 @@ int main(int argc, char** argv) { ReadTrainingCorpus(conf["training_data"].as<string>(), rank, size, &corpus, &ids); assert(corpus.size() > 0); - std::tr1::shared_ptr<OnlineOptimizer> o; - std::tr1::shared_ptr<LearningRateSchedule> lr; + boost::shared_ptr<OnlineOptimizer> o; + boost::shared_ptr<LearningRateSchedule> lr; const unsigned size_per_proc = conf["minibatch_size_per_proc"].as<unsigned>(); if 
(size_per_proc > corpus.size()) { diff --git a/training/dtrain/Makefile.am b/training/dtrain/Makefile.am index 844c790d..ecb6c128 100644 --- a/training/dtrain/Makefile.am +++ b/training/dtrain/Makefile.am @@ -1,7 +1,7 @@ bin_PROGRAMS = dtrain dtrain_SOURCES = dtrain.cc score.cc dtrain.h kbestget.h ksampler.h pairsampling.h score.h -dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a +dtrain_LDADD = ../../decoder/libcdec.a ../../klm/search/libksearch.a ../../mteval/libmteval.a ../../utils/libutils.a ../../klm/lm/libklm.a ../../klm/util/libklm_util.a ../../klm/util/double-conversion/libklm_util_double.a -lboost_regex AM_CPPFLAGS = -W -Wall -Wno-sign-compare -I$(top_srcdir)/utils -I$(top_srcdir)/decoder -I$(top_srcdir)/mteval diff --git a/training/dtrain/README.md b/training/dtrain/README.md index 2bae6b48..aa1ab3e7 100644 --- a/training/dtrain/README.md +++ b/training/dtrain/README.md @@ -1,10 +1,15 @@ This is a simple (and parallelizable) tuning method for cdec -which is able to train the weights of very many (sparse) features. -It was used here: - "Joint Feature Selection in Distributed Stochastic - Learning for Large-Scale Discriminative Training in - SMT" -(Simianer, Riezler, Dyer; ACL 2012) +which is able to train the weights of very many (sparse) features +on the training set. + +It was used in these papers: +> "Joint Feature Selection in Distributed Stochastic +> Learning for Large-Scale Discriminative Training in +> SMT" (Simianer, Riezler, Dyer; ACL 2012) +> +> "Multi-Task Learning for Improved Discriminative +> Training in SMT" (Simianer, Riezler; WMT 2013) +> Building @@ -17,20 +22,9 @@ To build only parts needed for dtrain do cd training/dtrain/; make ``` -Ideas ------ - * get approx_bleu to work? 
- * implement minibatches (Minibatch and Parallelization for Online Large Margin Structured Learning) - * learning rate 1/T? - * use an oracle? mira-like (model vs. BLEU), feature repr. of reference!? - * implement lc_bleu properly - * merge kbest lists of previous epochs (as MERT does) - * ``walk entire regularization path'' - * rerank after each update? - Running ------- -See directories under test/ . +See directories under examples/ . Legal ----- diff --git a/training/dtrain/dtrain.cc b/training/dtrain/dtrain.cc index 0ee2f124..0a27a068 100644 --- a/training/dtrain/dtrain.cc +++ b/training/dtrain/dtrain.cc @@ -12,8 +12,9 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) { po::options_description ini("Configuration File Options"); ini.add_options() - ("input", po::value<string>()->default_value("-"), "input file (src)") + ("input", po::value<string>(), "input file (src)") ("refs,r", po::value<string>(), "references") + ("bitext,b", po::value<string>(), "bitext: 'src ||| tgt'") ("output", po::value<string>()->default_value("-"), "output weights file, '-' for STDOUT") ("input_weights", po::value<string>(), "input weights file (e.g. from previous iteration)") ("decoder_config", po::value<string>(), "configuration file for cdec") @@ -40,6 +41,10 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) ("scale_bleu_diff", po::value<bool>()->zero_tokens(), "learning rate <- bleu diff of a misranked pair") ("loss_margin", po::value<weight_t>()->default_value(0.), "update if no error in pref pair but model scores this near") ("max_pairs", po::value<unsigned>()->default_value(std::numeric_limits<unsigned>::max()), "max. 
# of pairs per Sent.") + ("pclr", po::value<string>()->default_value("no"), "use a (simple|adagrad) per-coordinate learning rate") + ("batch", po::value<bool>()->zero_tokens(), "do batch optimization") + ("repeat", po::value<unsigned>()->default_value(1), "repeat optimization over kbest list this number of times") + //("test-k-best", po::value<bool>()->zero_tokens(), "check if optimization works (use repeat >= 2)") ("noup", po::value<bool>()->zero_tokens(), "do not update weights"); po::options_description cl("Command Line Options"); cl.add_options() @@ -72,13 +77,17 @@ dtrain_init(int argc, char** argv, po::variables_map* cfg) cerr << "Wrong 'pair_sampling' param: '" << (*cfg)["pair_sampling"].as<string>() << "'." << endl; return false; } - if(cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") { + if (cfg->count("hi_lo") && (*cfg)["pair_sampling"].as<string>() != "XYX") { cerr << "Warning: hi_lo only works with pair_sampling XYX." << endl; } - if((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) { + if ((*cfg)["hi_lo"].as<float>() > 0.5 || (*cfg)["hi_lo"].as<float>() < 0.01) { cerr << "hi_lo must lie in [0.01, 0.5]" << endl; return false; } + if ((cfg->count("input")>0 || cfg->count("refs")>0) && cfg->count("bitext")>0) { + cerr << "Provide 'input' and 'refs' or 'bitext', not both." << endl; + return false; + } if ((*cfg)["pair_threshold"].as<score_t>() < 0) { cerr << "The threshold must be >= 0!" << endl; return false; @@ -120,10 +129,16 @@ main(int argc, char** argv) const float hi_lo = cfg["hi_lo"].as<float>(); const score_t approx_bleu_d = cfg["approx_bleu_d"].as<score_t>(); const unsigned max_pairs = cfg["max_pairs"].as<unsigned>(); + int repeat = cfg["repeat"].as<unsigned>(); + //bool test_k_best = false; + //if (cfg.count("test-k-best")) test_k_best = true; weight_t loss_margin = cfg["loss_margin"].as<weight_t>(); + bool batch = false; + if (cfg.count("batch")) batch = true; if (loss_margin > 9998.) 
loss_margin = std::numeric_limits<float>::max(); bool scale_bleu_diff = false; if (cfg.count("scale_bleu_diff")) scale_bleu_diff = true; + const string pclr = cfg["pclr"].as<string>(); bool average = false; if (select_weights == "avg") average = true; @@ -131,7 +146,6 @@ main(int argc, char** argv) if (cfg.count("print_weights")) boost::split(print_weights, cfg["print_weights"].as<string>(), boost::is_any_of(" ")); - // setup decoder register_feature_functions(); SetSilent(true); @@ -178,17 +192,16 @@ main(int argc, char** argv) observer->SetScorer(scorer); // init weights - vector<weight_t>& dense_weights = decoder.CurrentWeightVector(); + vector<weight_t>& decoder_weights = decoder.CurrentWeightVector(); SparseVector<weight_t> lambdas, cumulative_penalties, w_average; - if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &dense_weights); - Weights::InitSparseVector(dense_weights, &lambdas); + if (cfg.count("input_weights")) Weights::InitFromFile(cfg["input_weights"].as<string>(), &decoder_weights); + Weights::InitSparseVector(decoder_weights, &lambdas); // meta params for perceptron, SVM weight_t eta = cfg["learning_rate"].as<weight_t>(); weight_t gamma = cfg["gamma"].as<weight_t>(); // faster perceptron: consider only misranked pairs, see - // DO NOT ENABLE WITH SVM (gamma > 0) OR loss_margin! 
bool faster_perceptron = false; if (gamma==0 && loss_margin==0) faster_perceptron = true; @@ -208,13 +221,24 @@ main(int argc, char** argv) // output string output_fn = cfg["output"].as<string>(); // input - string input_fn = cfg["input"].as<string>(); + bool read_bitext = false; + string input_fn; + if (cfg.count("bitext")) { + read_bitext = true; + input_fn = cfg["bitext"].as<string>(); + } else { + input_fn = cfg["input"].as<string>(); + } ReadFile input(input_fn); // buffer input for t > 0 vector<string> src_str_buf; // source strings (decoder takes only strings) vector<vector<WordID> > ref_ids_buf; // references as WordID vecs - string refs_fn = cfg["refs"].as<string>(); - ReadFile refs(refs_fn); + ReadFile refs; + string refs_fn; + if (!read_bitext) { + refs_fn = cfg["refs"].as<string>(); + refs.Init(refs_fn); + } unsigned in_sz = std::numeric_limits<unsigned>::max(); // input index, input size vector<pair<score_t, score_t> > all_scores; @@ -229,6 +253,7 @@ main(int argc, char** argv) cerr << setw(25) << "k " << k << endl; cerr << setw(25) << "N " << N << endl; cerr << setw(25) << "T " << T << endl; + cerr << setw(25) << "batch " << batch << endl; cerr << setw(26) << "scorer '" << scorer_str << "'" << endl; if (scorer_str == "approx_bleu") cerr << setw(25) << "approx. 
B discount " << approx_bleu_d << endl; @@ -249,10 +274,14 @@ main(int argc, char** argv) cerr << setw(25) << "l1 reg " << l1_reg << " '" << cfg["l1_reg"].as<string>() << "'" << endl; if (rescale) cerr << setw(25) << "rescale " << rescale << endl; + cerr << setw(25) << "pclr " << pclr << endl; cerr << setw(25) << "max pairs " << max_pairs << endl; + cerr << setw(25) << "repeat " << repeat << endl; + //cerr << setw(25) << "test k-best " << test_k_best << endl; cerr << setw(25) << "cdec cfg " << "'" << cfg["decoder_config"].as<string>() << "'" << endl; cerr << setw(25) << "input " << "'" << input_fn << "'" << endl; - cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; + if (!read_bitext) + cerr << setw(25) << "refs " << "'" << refs_fn << "'" << endl; cerr << setw(25) << "output " << "'" << output_fn << "'" << endl; if (cfg.count("input_weights")) cerr << setw(25) << "weights in " << "'" << cfg["input_weights"].as<string>() << "'" << endl; @@ -261,6 +290,11 @@ main(int argc, char** argv) if (!verbose) cerr << "(a dot represents " << DTRAIN_DOTS << " inputs)" << endl; } + // pclr + SparseVector<weight_t> learning_rates; + // batch + SparseVector<weight_t> batch_updates; + score_t batch_loss; for (unsigned t = 0; t < T; t++) // T epochs { @@ -269,16 +303,24 @@ main(int argc, char** argv) time(&start); score_t score_sum = 0.; score_t model_sum(0); - unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0; + unsigned ii = 0, rank_errors = 0, margin_violations = 0, npairs = 0, f_count = 0, list_sz = 0, kbest_loss_improve = 0; + batch_loss = 0.; if (!quiet) cerr << "Iteration #" << t+1 << " of " << T << "." 
<< endl; while(true) { string in; + string ref; bool next = false, stop = false; // next iteration or premature stop if (t == 0) { if(!getline(*input, in)) next = true; + if(read_bitext) { + vector<string> strs; + boost::algorithm::split_regex(strs, in, boost::regex(" \\|\\|\\| ")); + in = strs[0]; + ref = strs[1]; + } } else { if (ii == in_sz) next = true; // stop if we reach the end of our input } @@ -310,15 +352,16 @@ main(int argc, char** argv) if (next || stop) break; // weights - lambdas.init_vector(&dense_weights); + lambdas.init_vector(&decoder_weights); // getting input vector<WordID> ref_ids; // reference as vector<WordID> if (t == 0) { - string r_; - getline(*refs, r_); + if (!read_bitext) { + getline(*refs, ref); + } vector<string> ref_tok; - boost::split(ref_tok, r_, boost::is_any_of(" ")); + boost::split(ref_tok, ref, boost::is_any_of(" ")); register_and_convert(ref_tok, ref_ids); ref_ids_buf.push_back(ref_ids); src_str_buf.push_back(in); @@ -348,8 +391,10 @@ main(int argc, char** argv) } } - score_sum += (*samples)[0].score; // stats for 1best - model_sum += (*samples)[0].model; + if (repeat == 1) { + score_sum += (*samples)[0].score; // stats for 1best + model_sum += (*samples)[0].model; + } f_count += observer->get_f_count(); list_sz += observer->get_sz(); @@ -364,30 +409,74 @@ main(int argc, char** argv) partXYX(samples, pairs, pair_threshold, max_pairs, faster_perceptron, hi_lo); if (pair_sampling == "PRO") PROsampling(samples, pairs, pair_threshold, max_pairs); - npairs += pairs.size(); + int cur_npairs = pairs.size(); + npairs += cur_npairs; + + score_t kbest_loss_first, kbest_loss_last = 0.0; - SparseVector<weight_t> lambdas_copy; + for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); + it != pairs.end(); it++) { + score_t model_diff = it->first.model - it->second.model; + kbest_loss_first += max(0.0, -1.0 * model_diff); + } + + for (int ki=0; ki < repeat; ki++) { + + score_t kbest_loss = 0.0; // test-k-best + 
SparseVector<weight_t> lambdas_copy; // for l1 regularization + SparseVector<weight_t> sum_up; // for pclr if (l1naive||l1clip||l1cumul) lambdas_copy = lambdas; for (vector<pair<ScoredHyp,ScoredHyp> >::iterator it = pairs.begin(); it != pairs.end(); it++) { - bool rank_error; + score_t model_diff = it->first.model - it->second.model; + if (repeat > 1) { + model_diff = lambdas.dot(it->first.f) - lambdas.dot(it->second.f); + kbest_loss += max(0.0, -1.0 * model_diff); + } + bool rank_error = false; score_t margin; if (faster_perceptron) { // we only have considering misranked pairs rank_error = true; // pair sampling already did this for us margin = std::numeric_limits<float>::max(); } else { - rank_error = it->first.model <= it->second.model; - margin = fabs(it->first.model - it->second.model); + rank_error = model_diff<=0.0; + margin = fabs(model_diff); if (!rank_error && margin < loss_margin) margin_violations++; } - if (rank_error) rank_errors++; + if (rank_error && ki==1) rank_errors++; if (scale_bleu_diff) eta = it->first.score - it->second.score; if (rank_error || margin < loss_margin) { SparseVector<weight_t> diff_vec = it->first.f - it->second.f; - lambdas.plus_eq_v_times_s(diff_vec, eta); - if (gamma) - lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); + if (batch) { + batch_loss += max(0., -1.0*model_diff); + batch_updates += diff_vec; + continue; + } + if (pclr != "no") { + sum_up += diff_vec; + } else { + lambdas.plus_eq_v_times_s(diff_vec, eta); + if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./cur_npairs)); + } + } + } + + // per-coordinate learning rate + if (pclr != "no") { + SparseVector<weight_t>::iterator it = sum_up.begin(); + for (; it != sum_up.end(); ++it) { + if (pclr == "simple") { + lambdas[it->first] += it->second / max(1.0, learning_rates[it->first]); + learning_rates[it->first]++; + } else if (pclr == "adagrad") { + if (learning_rates[it->first] == 0) { + lambdas[it->first] += it->second * eta; + } else { + 
lambdas[it->first] += it->second * eta * learning_rates[it->first]; + } + learning_rates[it->first] += pow(it->second, 2.0); + } } } @@ -395,14 +484,16 @@ main(int argc, char** argv) // please note that this regularizations happen // after a _sentence_ -- not after each example/pair! if (l1naive) { - FastSparseVector<weight_t>::iterator it = lambdas.begin(); + SparseVector<weight_t>::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { + it->second *= max(0.0000001, eta/(eta+learning_rates[it->first])); // FIXME + learning_rates[it->first]++; it->second -= sign(it->second) * l1_reg; } } } else if (l1clip) { - FastSparseVector<weight_t>::iterator it = lambdas.begin(); + SparseVector<weight_t>::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { if (it->second != 0) { @@ -417,7 +508,7 @@ main(int argc, char** argv) } } else if (l1cumul) { weight_t acc_penalty = (ii+1) * l1_reg; // ii is the index of the current input - FastSparseVector<weight_t>::iterator it = lambdas.begin(); + SparseVector<weight_t>::iterator it = lambdas.begin(); for (; it != lambdas.end(); ++it) { if (!lambdas_copy.get(it->first) || lambdas_copy.get(it->first)!=it->second) { if (it->second != 0) { @@ -435,7 +526,28 @@ main(int argc, char** argv) } } - } + if (ki==repeat-1) { // done + kbest_loss_last = kbest_loss; + if (repeat > 1) { + score_t best_score = -1.; + score_t best_model = -std::numeric_limits<score_t>::max(); + unsigned best_idx; + for (unsigned i=0; i < samples->size(); i++) { + score_t s = lambdas.dot((*samples)[i].f); + if (s > best_model) { + best_idx = i; + best_model = s; + } + } + score_sum += (*samples)[best_idx].score; + model_sum += best_model; + } + } + } // repeat + + if ((kbest_loss_first - kbest_loss_last) >= 0) kbest_loss_improve++; + + } // noup if (rescale) lambdas /= 
lambdas.l2norm(); @@ -443,14 +555,19 @@ main(int argc, char** argv) } // input loop - if (average) w_average += lambdas; + if (t == 0) in_sz = ii; // remember size of input (# lines) - if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); - if (t == 0) { - in_sz = ii; // remember size of input (# lines) + if (batch) { + lambdas.plus_eq_v_times_s(batch_updates, eta); + if (gamma) lambdas.plus_eq_v_times_s(lambdas, -2*gamma*eta*(1./npairs)); + batch_updates.clear(); } + if (average) w_average += lambdas; + + if (scorer_str == "approx_bleu" || scorer_str == "lc_bleu") scorer->Reset(); + // print some stats score_t score_avg = score_sum/(score_t)in_sz; score_t model_avg = model_sum/(score_t)in_sz; @@ -477,13 +594,15 @@ main(int argc, char** argv) cerr << _np << " 1best avg model score: " << model_avg; cerr << _p << " (" << model_diff << ")" << endl; cerr << " avg # pairs: "; - cerr << _np << npairs/(float)in_sz; + cerr << _np << npairs/(float)in_sz << endl; + cerr << " avg # rank err: "; + cerr << rank_errors/(float)in_sz; if (faster_perceptron) cerr << " (meaningless)"; cerr << endl; - cerr << " avg # rank err: "; - cerr << rank_errors/(float)in_sz << endl; cerr << " avg # margin viol: "; cerr << margin_violations/(float)in_sz << endl; + if (batch) cerr << " batch loss: " << batch_loss << endl; + cerr << " k-best loss imp: " << ((float)kbest_loss_improve/in_sz)*100 << "%" << endl; cerr << " non0 feature count: " << nonz << endl; cerr << " avg list sz: " << list_sz/(float)in_sz << endl; cerr << " avg f count: " << f_count/(float)list_sz << endl; @@ -510,9 +629,9 @@ main(int argc, char** argv) // write weights to file if (select_weights == "best" || keep) { - lambdas.init_vector(&dense_weights); + lambdas.init_vector(&decoder_weights); string w_fn = "weights." 
+ boost::lexical_cast<string>(t) + ".gz"; - Weights::WriteToFile(w_fn, dense_weights, true); + Weights::WriteToFile(w_fn, decoder_weights, true); } } // outer loop diff --git a/training/dtrain/dtrain.h b/training/dtrain/dtrain.h index 3981fb39..ccb5ad4d 100644 --- a/training/dtrain/dtrain.h +++ b/training/dtrain/dtrain.h @@ -9,6 +9,8 @@ #include <string.h> #include <boost/algorithm/string.hpp> +#include <boost/regex.hpp> +#include <boost/algorithm/string/regex.hpp> #include <boost/program_options.hpp> #include "decoder.h" diff --git a/training/dtrain/examples/standard/dtrain.ini b/training/dtrain/examples/standard/dtrain.ini index 23e94285..fc83f08e 100644 --- a/training/dtrain/examples/standard/dtrain.ini +++ b/training/dtrain/examples/standard/dtrain.ini @@ -1,5 +1,6 @@ -input=./nc-wmt11.de.gz -refs=./nc-wmt11.en.gz +#input=./nc-wmt11.de.gz +#refs=./nc-wmt11.en.gz +bitext=./nc-wmt11.gz output=- # a weights file (add .gz for gzip compression) or STDOUT '-' select_weights=VOID # output average (over epochs) weight vector decoder_config=./cdec.ini # config for cdec @@ -10,11 +11,11 @@ print_weights=Glue WordPenalty LanguageModel LanguageModel_OOV PhraseModel_0 Phr stop_after=10 # stop epoch after 10 inputs # interesting stuff -epochs=2 # run over input 2 times +epochs=3 # run over input 3 times k=100 # use 100best lists N=4 # optimize (approx) BLEU4 scorer=fixed_stupid_bleu # use 'stupid' BLEU+1 -learning_rate=1.0 # learning rate, don't care if gamma=0 (perceptron) +learning_rate=0.1 # learning rate, don't care if gamma=0 (perceptron) and loss_margin=0 (not margin perceptron) gamma=0 # use SVM reg sample_from=kbest # use kbest lists (as opposed to forest) filter=uniq # only unique entries in kbest (surface form) @@ -22,3 +23,5 @@ pair_sampling=XYX # hi_lo=0.1 # 10 vs 80 vs 10 and 80 vs 10 here pair_threshold=0 # minimum distance in BLEU (here: > 0) loss_margin=0 # update if correctly ranked, but within this margin +repeat=1 # repeat training on a kbest list 1 times 
+#batch=true # batch tuning, update after accumulating over all sentences and all kbest lists diff --git a/training/dtrain/examples/standard/expected-output b/training/dtrain/examples/standard/expected-output index 21f91244..75f47337 100644 --- a/training/dtrain/examples/standard/expected-output +++ b/training/dtrain/examples/standard/expected-output @@ -4,17 +4,18 @@ Reading ./nc-wmt11.en.srilm.gz ----5---10---15---20---25---30---35---40---45---50---55---60---65---70---75---80---85---90---95--100 **************************************************************************************************** Example feature: Shape_S00000_T00000 -Seeding random number sequence to 970626287 +Seeding random number sequence to 3751911392 dtrain Parameters: k 100 N 4 - T 2 + T 3 + batch 0 scorer 'fixed_stupid_bleu' sample from 'kbest' filter 'uniq' - learning rate 1 + learning rate 0.1 gamma 0 loss margin 0 faster perceptron 1 @@ -23,69 +24,99 @@ Parameters: pair threshold 0 select weights 'VOID' l1 reg 0 'none' + pclr no max pairs 4294967295 + repeat 1 cdec cfg './cdec.ini' - input './nc-wmt11.de.gz' - refs './nc-wmt11.en.gz' + input './nc-wmt11.gz' output '-' stop_after 10 (a dot represents 10 inputs) -Iteration #1 of 2. +Iteration #1 of 3. . 10 Stopping after 10 input sentences. 
WEIGHTS - Glue = -614 - WordPenalty = +1256.8 - LanguageModel = +5610.5 - LanguageModel_OOV = -1449 - PhraseModel_0 = -2107 - PhraseModel_1 = -4666.1 - PhraseModel_2 = -2713.5 - PhraseModel_3 = +4204.3 - PhraseModel_4 = -1435.8 - PhraseModel_5 = +916 - PhraseModel_6 = +190 - PassThrough = -2527 + Glue = -110 + WordPenalty = -8.2082 + LanguageModel = -319.91 + LanguageModel_OOV = -19.2 + PhraseModel_0 = +312.82 + PhraseModel_1 = -161.02 + PhraseModel_2 = -433.65 + PhraseModel_3 = +291.03 + PhraseModel_4 = +252.32 + PhraseModel_5 = +50.6 + PhraseModel_6 = +146.7 + PassThrough = -38.7 --- - 1best avg score: 0.17874 (+0.17874) - 1best avg model score: 88399 (+88399) - avg # pairs: 798.2 (meaningless) - avg # rank err: 798.2 + 1best avg score: 0.16966 (+0.16966) + 1best avg model score: 29874 (+29874) + avg # pairs: 906.3 + avg # rank err: 0 (meaningless) avg # margin viol: 0 - non0 feature count: 887 + k-best loss imp: 100% + non0 feature count: 832 avg list sz: 91.3 - avg f count: 126.85 -(time 0.33 min, 2 s/S) + avg f count: 139.77 +(time 0.35 min, 2.1 s/S) -Iteration #2 of 2. +Iteration #2 of 3. . 
10 WEIGHTS - Glue = -1025 - WordPenalty = +1751.5 - LanguageModel = +10059 - LanguageModel_OOV = -4490 - PhraseModel_0 = -2640.7 - PhraseModel_1 = -3757.4 - PhraseModel_2 = -1133.1 - PhraseModel_3 = +1837.3 - PhraseModel_4 = -3534.3 - PhraseModel_5 = +2308 - PhraseModel_6 = +1677 - PassThrough = -6222 + Glue = -122.1 + WordPenalty = +83.689 + LanguageModel = +233.23 + LanguageModel_OOV = -145.1 + PhraseModel_0 = +150.72 + PhraseModel_1 = -272.84 + PhraseModel_2 = -418.36 + PhraseModel_3 = +181.63 + PhraseModel_4 = -289.47 + PhraseModel_5 = +140.3 + PhraseModel_6 = +3.5 + PassThrough = -109.7 --- - 1best avg score: 0.30764 (+0.12891) - 1best avg model score: -2.5042e+05 (-3.3882e+05) - avg # pairs: 725.9 (meaningless) - avg # rank err: 725.9 + 1best avg score: 0.17399 (+0.004325) + 1best avg model score: 4936.9 (-24937) + avg # pairs: 662.4 + avg # rank err: 0 (meaningless) avg # margin viol: 0 - non0 feature count: 1499 + k-best loss imp: 100% + non0 feature count: 1240 avg list sz: 91.3 - avg f count: 114.34 -(time 0.32 min, 1.9 s/S) + avg f count: 125.11 +(time 0.27 min, 1.6 s/S) + +Iteration #3 of 3. + . 10 +WEIGHTS + Glue = -157.4 + WordPenalty = -1.7372 + LanguageModel = +686.18 + LanguageModel_OOV = -399.7 + PhraseModel_0 = -39.876 + PhraseModel_1 = -341.96 + PhraseModel_2 = -318.67 + PhraseModel_3 = +105.08 + PhraseModel_4 = -290.27 + PhraseModel_5 = -48.6 + PhraseModel_6 = -43.6 + PassThrough = -298.5 + --- + 1best avg score: 0.30742 (+0.13343) + 1best avg model score: -15393 (-20329) + avg # pairs: 623.8 + avg # rank err: 0 (meaningless) + avg # margin viol: 0 + k-best loss imp: 100% + non0 feature count: 1776 + avg list sz: 91.3 + avg f count: 118.58 +(time 0.28 min, 1.7 s/S) Writing weights file to '-' ... done --- -Best iteration: 2 [SCORE 'fixed_stupid_bleu'=0.30764]. -This took 0.65 min. +Best iteration: 3 [SCORE 'fixed_stupid_bleu'=0.30742]. +This took 0.9 min. 
diff --git a/training/dtrain/examples/standard/nc-wmt11.gz b/training/dtrain/examples/standard/nc-wmt11.gz Binary files differnew file mode 100644 index 00000000..c39c5aef --- /dev/null +++ b/training/dtrain/examples/standard/nc-wmt11.gz diff --git a/training/dtrain/parallelize.rb b/training/dtrain/parallelize.rb index 285f3c9b..60ca9422 100755 --- a/training/dtrain/parallelize.rb +++ b/training/dtrain/parallelize.rb @@ -21,6 +21,8 @@ opts = Trollop::options do opt :qsub, "use qsub", :type => :bool, :default => false opt :dtrain_binary, "path to dtrain binary", :type => :string opt :extra_qsub, "extra qsub args", :type => :string, :default => "" + opt :per_shard_decoder_configs, "give special decoder config per shard", :type => :string, :short => '-o' + opt :first_input_weights, "input weights for first iter", :type => :string, :default => '', :short => '-w' end usage if not opts[:config]&&opts[:shards]&&opts[:input]&&opts[:references] @@ -41,9 +43,11 @@ epochs = opts[:epochs] rand = opts[:randomize] reshard = opts[:reshard] predefined_shards = false +per_shard_decoder_configs = false if opts[:shards] == 0 predefined_shards = true num_shards = 0 + per_shard_decoder_configs = true if opts[:per_shard_decoder_configs] else num_shards = opts[:shards] end @@ -51,6 +55,7 @@ input = opts[:input] refs = opts[:references] use_qsub = opts[:qsub] shards_at_once = opts[:processes_at_once] +first_input_weights = opts[:first_input_weights] `mkdir work` @@ -101,6 +106,9 @@ refs_files = [] if predefined_shards input_files = File.new(input).readlines.map {|i| i.strip } refs_files = File.new(refs).readlines.map {|i| i.strip } + if per_shard_decoder_configs + decoder_configs = File.new(opts[:per_shard_decoder_configs]).readlines.map {|i| i.strip} + end num_shards = input_files.size else input_files, refs_files = make_shards input, refs, num_shards, 0, rand @@ -126,10 +134,18 @@ end else local_end = "2>work/out.#{shard}.#{epoch}" end + if per_shard_decoder_configs + cdec_cfg = 
"--decoder_config #{decoder_configs[shard]}" + else + cdec_cfg = "" + end + if first_input_weights!='' && epoch == 0 + input_weights = "--input_weights #{first_input_weights}" + end pids << Kernel.fork { - `#{qsub_str_start}#{dtrain_bin} -c #{ini}\ + `#{qsub_str_start}#{dtrain_bin} -c #{ini} #{cdec_cfg} #{input_weights}\ --input #{input_files[shard]}\ - --refs #{refs_files[shard]} #{input_weights}\ + --refs #{refs_files[shard]}\ --output work/weights.#{shard}.#{epoch}#{qsub_str_end} #{local_end}` } weights_files << "work/weights.#{shard}.#{epoch}" diff --git a/training/latent_svm/latent_svm.cc b/training/latent_svm/latent_svm.cc index ab9c1d5d..60e52550 100644 --- a/training/latent_svm/latent_svm.cc +++ b/training/latent_svm/latent_svm.cc @@ -32,7 +32,6 @@ total_loss and prev_loss actually refer not to loss, but the metric (usually BLE #include "sampler.h" using namespace std; -using boost::shared_ptr; namespace po = boost::program_options; bool invert_score; @@ -128,7 +127,7 @@ struct HypothesisInfo { }; struct GoodOracle { - shared_ptr<HypothesisInfo> good; + boost::shared_ptr<HypothesisInfo> good; }; struct TrainingObserver : public DecoderObserver { @@ -143,9 +142,9 @@ struct TrainingObserver : public DecoderObserver { const DocScorer& ds; const vector<weight_t>& feature_weights; vector<GoodOracle>& oracles; - shared_ptr<HypothesisInfo> cur_best; - shared_ptr<HypothesisInfo> cur_costaug_best; - shared_ptr<HypothesisInfo> cur_ref; + boost::shared_ptr<HypothesisInfo> cur_best; + boost::shared_ptr<HypothesisInfo> cur_costaug_best; + boost::shared_ptr<HypothesisInfo> cur_ref; const int kbest_size; const double mt_metric_scale; const double mu; @@ -168,8 +167,8 @@ struct TrainingObserver : public DecoderObserver { UpdateOracles(smeta.GetSentenceID(), *hg); } - shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double metric) { - shared_ptr<HypothesisInfo> h(new HypothesisInfo); + boost::shared_ptr<HypothesisInfo> 
MakeHypothesisInfo(const SparseVector<double>& feats, const double metric) { + boost::shared_ptr<HypothesisInfo> h(new HypothesisInfo); h->features = feats; h->mt_metric_score = metric; return h; diff --git a/training/mira/kbest_cut_mira.cc b/training/mira/kbest_cut_mira.cc index 59fa860a..990609d7 100644 --- a/training/mira/kbest_cut_mira.cc +++ b/training/mira/kbest_cut_mira.cc @@ -30,7 +30,6 @@ #include "sparse_vector.h" using namespace std; -using boost::shared_ptr; namespace po = boost::program_options; bool invert_score; @@ -50,13 +49,6 @@ bool sent_approx; bool checkloss; bool stream; -void SanityCheck(const vector<double>& w) { - for (int i = 0; i < w.size(); ++i) { - assert(!isnan(w[i])); - assert(!isinf(w[i])); - } -} - struct FComp { const vector<double>& w_; FComp(const vector<double>& w) : w_(w) {} @@ -149,7 +141,7 @@ struct HypothesisInfo { double alpha; double oracle_loss; SparseVector<double> oracle_feat_diff; - shared_ptr<HypothesisInfo> oracleN; + boost::shared_ptr<HypothesisInfo> oracleN; }; bool ApproxEqual(double a, double b) { @@ -157,7 +149,7 @@ bool ApproxEqual(double a, double b) { return (fabs(a-b)/fabs(b)) < EPSILON; } -typedef shared_ptr<HypothesisInfo> HI; +typedef boost::shared_ptr<HypothesisInfo> HI; bool HypothesisCompareB(const HI& h1, const HI& h2 ) { return h1->mt_metric > h2->mt_metric; @@ -185,11 +177,11 @@ bool HypothesisCompareG(const HI& h1, const HI& h2 ) }; -void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vector<shared_ptr<HypothesisInfo> >& all_hyp, vector<weight_t> dense_weights) +void CuttingPlane(vector<boost::shared_ptr<HypothesisInfo> >* cur_c, bool* again, vector<boost::shared_ptr<HypothesisInfo> >& all_hyp, vector<weight_t> dense_weights) { bool DEBUG_CUT = false; - shared_ptr<HypothesisInfo> max_fear, max_fear_in_set; - vector<shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c; + boost::shared_ptr<HypothesisInfo> max_fear, max_fear_in_set; + vector<boost::shared_ptr<HypothesisInfo> >& 
cur_constraint = *cur_c; if(no_reweight) { @@ -235,9 +227,9 @@ void CuttingPlane(vector<shared_ptr<HypothesisInfo> >* cur_c, bool* again, vecto } -double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_size,vector<weight_t> dense_weights ) +double ComputeDelta(vector<boost::shared_ptr<HypothesisInfo> >* cur_p, double max_step_size,vector<weight_t> dense_weights ) { - vector<shared_ptr<HypothesisInfo> >& cur_pair = *cur_p; + vector<boost::shared_ptr<HypothesisInfo> >& cur_pair = *cur_p; double loss = cur_pair[0]->oracle_loss - cur_pair[1]->oracle_loss; double margin = -(cur_pair[0]->oracleN->features.dot(dense_weights)- cur_pair[0]->features.dot(dense_weights)) + (cur_pair[1]->oracleN->features.dot(dense_weights) - cur_pair[1]->features.dot(dense_weights)); @@ -261,12 +253,12 @@ double ComputeDelta(vector<shared_ptr<HypothesisInfo> >* cur_p, double max_step_ } -vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> >* cur_c) +vector<boost::shared_ptr<HypothesisInfo> > SelectPair(vector<boost::shared_ptr<HypothesisInfo> >* cur_c) { bool DEBUG_SELECT= false; - vector<shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c; + vector<boost::shared_ptr<HypothesisInfo> >& cur_constraint = *cur_c; - vector<shared_ptr<HypothesisInfo> > pair; + vector<boost::shared_ptr<HypothesisInfo> > pair; if (no_select || optimizer == 2){ //skip heuristic search and return oracle and fear for pa-mira @@ -278,7 +270,7 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> for(int u=0;u != cur_constraint.size();u++) { - shared_ptr<HypothesisInfo> max_fear; + boost::shared_ptr<HypothesisInfo> max_fear; if(DEBUG_SELECT) cerr<< "cur alpha " << u << " " << cur_constraint[u]->alpha; for(int i=0; i < cur_constraint.size();i++) //select maximal violator @@ -323,8 +315,8 @@ vector<shared_ptr<HypothesisInfo> > SelectPair(vector<shared_ptr<HypothesisInfo> } struct GoodBadOracle { - vector<shared_ptr<HypothesisInfo> > good; 
- vector<shared_ptr<HypothesisInfo> > bad; + vector<boost::shared_ptr<HypothesisInfo> > good; + vector<boost::shared_ptr<HypothesisInfo> > bad; }; struct BasicObserver: public DecoderObserver { @@ -367,8 +359,8 @@ struct TrainingObserver : public DecoderObserver { const DocScorer& ds; vector<ScoreP>& corpus_bleu_sent_stats; vector<GoodBadOracle>& oracles; - vector<shared_ptr<HypothesisInfo> > cur_best; - shared_ptr<HypothesisInfo> cur_oracle; + vector<boost::shared_ptr<HypothesisInfo> > cur_best; + boost::shared_ptr<HypothesisInfo> cur_oracle; const int kbest_size; Hypergraph forest; int cur_sent; @@ -386,7 +378,7 @@ struct TrainingObserver : public DecoderObserver { return *cur_best[0]; } - const vector<shared_ptr<HypothesisInfo> > GetCurrentBest() const { + const vector<boost::shared_ptr<HypothesisInfo> > GetCurrentBest() const { return cur_best; } @@ -411,8 +403,8 @@ struct TrainingObserver : public DecoderObserver { } - shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score, const vector<WordID>& hyp) { - shared_ptr<HypothesisInfo> h(new HypothesisInfo); + boost::shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score, const vector<WordID>& hyp) { + boost::shared_ptr<HypothesisInfo> h(new HypothesisInfo); h->features = feats; h->mt_metric = score; h->hyp = hyp; @@ -424,14 +416,14 @@ struct TrainingObserver : public DecoderObserver { if (stream) sent_id = 0; bool PRINT_LIST= false; - vector<shared_ptr<HypothesisInfo> >& cur_good = oracles[sent_id].good; - vector<shared_ptr<HypothesisInfo> >& cur_bad = oracles[sent_id].bad; + vector<boost::shared_ptr<HypothesisInfo> >& cur_good = oracles[sent_id].good; + vector<boost::shared_ptr<HypothesisInfo> >& cur_bad = oracles[sent_id].bad; //TODO: look at keeping previous iterations hypothesis lists around cur_best.clear(); cur_good.clear(); cur_bad.clear(); - vector<shared_ptr<HypothesisInfo> > all_hyp; + 
vector<boost::shared_ptr<HypothesisInfo> > all_hyp; typedef KBest::KBestDerivations<vector<WordID>, ESentenceTraversal,Filter> K; K kbest(forest,kbest_size); @@ -527,7 +519,7 @@ struct TrainingObserver : public DecoderObserver { if(PRINT_LIST) { cerr << "GOOD" << endl; for(int u=0;u!=cur_good.size();u++) cerr << cur_good[u]->mt_metric << " " << cur_good[u]->hope << endl;} //use hope for fear selection - shared_ptr<HypothesisInfo>& oracleN = cur_good[0]; + boost::shared_ptr<HypothesisInfo>& oracleN = cur_good[0]; if(fear_select == 1){ //compute fear hyps with model - bleu if (PRINT_LIST) cerr << "FEAR " << endl; @@ -663,13 +655,13 @@ int main(int argc, char** argv) { invert_score = false; } - shared_ptr<DocScorer> ds; + boost::shared_ptr<DocScorer> ds; //normal: load references, stream: start stream scorer if (stream) { - ds = shared_ptr<DocScorer>(new DocStreamScorer(type, vector<string>(0), "")); + ds = boost::shared_ptr<DocScorer>(new DocStreamScorer(type, vector<string>(0), "")); cerr << "Scoring doc stream with " << metric_name << endl; } else { - ds = shared_ptr<DocScorer>(new DocScorer(type, conf["reference"].as<vector<string> >(), "")); + ds = boost::shared_ptr<DocScorer>(new DocScorer(type, conf["reference"].as<vector<string> >(), "")); cerr << "Loaded " << ds->size() << " references for scoring with " << metric_name << endl; } vector<ScoreP> corpus_bleu_sent_stats; @@ -774,9 +766,9 @@ int main(int argc, char** argv) { const HypothesisInfo& cur_good = *oracles[cur_sent].good[0]; const HypothesisInfo& cur_bad = *oracles[cur_sent].bad[0]; - vector<shared_ptr<HypothesisInfo> >& cur_good_v = oracles[cur_sent].good; - vector<shared_ptr<HypothesisInfo> >& cur_bad_v = oracles[cur_sent].bad; - vector<shared_ptr<HypothesisInfo> > cur_best_v = observer.GetCurrentBest(); + vector<boost::shared_ptr<HypothesisInfo> >& cur_good_v = oracles[cur_sent].good; + vector<boost::shared_ptr<HypothesisInfo> >& cur_bad_v = oracles[cur_sent].bad; + 
vector<boost::shared_ptr<HypothesisInfo> > cur_best_v = observer.GetCurrentBest(); tot_loss += cur_hyp.mt_metric; @@ -824,13 +816,13 @@ int main(int argc, char** argv) { } else if(optimizer == 5) //full mira with n-best list of constraints from hope, fear, model best { - vector<shared_ptr<HypothesisInfo> > cur_constraint; + vector<boost::shared_ptr<HypothesisInfo> > cur_constraint; cur_constraint.insert(cur_constraint.begin(), cur_bad_v.begin(), cur_bad_v.end()); cur_constraint.insert(cur_constraint.begin(), cur_best_v.begin(), cur_best_v.end()); cur_constraint.insert(cur_constraint.begin(), cur_good_v.begin(), cur_good_v.end()); bool optimize_again; - vector<shared_ptr<HypothesisInfo> > cur_pair; + vector<boost::shared_ptr<HypothesisInfo> > cur_pair; //SMO for(int u=0;u!=cur_constraint.size();u++) cur_constraint[u]->alpha =0; @@ -879,7 +871,7 @@ int main(int argc, char** argv) { else if(optimizer == 2 || optimizer == 3) //PA and Cutting Plane MIRA update { bool DEBUG_SMO= true; - vector<shared_ptr<HypothesisInfo> > cur_constraint; + vector<boost::shared_ptr<HypothesisInfo> > cur_constraint; cur_constraint.push_back(cur_good_v[0]); //add oracle to constraint set bool optimize_again = true; int cut_plane_calls = 0; @@ -919,7 +911,7 @@ int main(int argc, char** argv) { while (iter < smo_iter) { //select pair to optimize from constraint set - vector<shared_ptr<HypothesisInfo> > cur_pair = SelectPair(&cur_constraint); + vector<boost::shared_ptr<HypothesisInfo> > cur_pair = SelectPair(&cur_constraint); if(cur_pair.empty()){ iter=MAX_SMO; diff --git a/training/mira/kbest_mira.cc b/training/mira/kbest_mira.cc index d59b4224..2868de0c 100644 --- a/training/mira/kbest_mira.cc +++ b/training/mira/kbest_mira.cc @@ -3,10 +3,10 @@ #include <vector> #include <cassert> #include <cmath> -#include <tr1/memory> #include <boost/program_options.hpp> #include <boost/program_options/variables_map.hpp> +#include <boost/shared_ptr.hpp> #include "stringlib.h" #include "hg_sampler.h" @@ 
-30,7 +30,7 @@ using namespace std; namespace po = boost::program_options; bool invert_score; -std::tr1::shared_ptr<MT19937> rng; +boost::shared_ptr<MT19937> rng; void RandomPermutation(int len, vector<int>* p_ids) { vector<int>& ids = *p_ids; @@ -88,8 +88,8 @@ struct HypothesisInfo { }; struct GoodBadOracle { - std::tr1::shared_ptr<HypothesisInfo> good; - std::tr1::shared_ptr<HypothesisInfo> bad; + boost::shared_ptr<HypothesisInfo> good; + boost::shared_ptr<HypothesisInfo> bad; }; struct TrainingObserver : public DecoderObserver { @@ -97,7 +97,7 @@ struct TrainingObserver : public DecoderObserver { const DocumentScorer& ds; const EvaluationMetric& metric; vector<GoodBadOracle>& oracles; - std::tr1::shared_ptr<HypothesisInfo> cur_best; + boost::shared_ptr<HypothesisInfo> cur_best; const int kbest_size; const bool sample_forest; @@ -109,16 +109,16 @@ struct TrainingObserver : public DecoderObserver { UpdateOracles(smeta.GetSentenceID(), *hg); } - std::tr1::shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score) { - std::tr1::shared_ptr<HypothesisInfo> h(new HypothesisInfo); + boost::shared_ptr<HypothesisInfo> MakeHypothesisInfo(const SparseVector<double>& feats, const double score) { + boost::shared_ptr<HypothesisInfo> h(new HypothesisInfo); h->features = feats; h->mt_metric = score; return h; } void UpdateOracles(int sent_id, const Hypergraph& forest) { - std::tr1::shared_ptr<HypothesisInfo>& cur_good = oracles[sent_id].good; - std::tr1::shared_ptr<HypothesisInfo>& cur_bad = oracles[sent_id].bad; + boost::shared_ptr<HypothesisInfo>& cur_good = oracles[sent_id].good; + boost::shared_ptr<HypothesisInfo>& cur_bad = oracles[sent_id].bad; cur_bad.reset(); // TODO get rid of?? 
if (sample_forest) { diff --git a/training/mira/mira.py b/training/mira/mira.py index 29c51e1d..d5a1d9f8 100755 --- a/training/mira/mira.py +++ b/training/mira/mira.py @@ -4,8 +4,19 @@ import subprocess, shlex, glob import argparse import logging import random, time -import cdec.score import gzip, itertools +try: + import cdec.score +except ImportError: + sys.stderr.write('Could not import pycdec, see cdec/python/README.md for details\n') + sys.exit(1) +have_mpl = True +try: + import matplotlib + matplotlib.use('Agg') + import matplotlib.pyplot as plt +except ImportError: + have_mpl = False #mira run script #requires pycdec to be built, since it is used for scoring hypothesis @@ -16,17 +27,17 @@ import gzip, itertools #scoring function using pycdec scoring def fast_score(hyps, refs, metric): scorer = cdec.score.Scorer(metric) - logging.info('loaded {0} references for scoring with {1}\n'.format( + logging.info('loaded {0} references for scoring with {1}'.format( len(refs), metric)) if metric=='BLEU': logging.warning('BLEU is ambiguous, assuming IBM_BLEU\n') metric = 'IBM_BLEU' elif metric=='COMBI': logging.warning('COMBI metric is no longer supported, switching to ' - 'COMB:TER=-0.5;BLEU=0.5\n') + 'COMB:TER=-0.5;BLEU=0.5') metric = 'COMB:TER=-0.5;BLEU=0.5' stats = sum(scorer(r).evaluate(h) for h,r in itertools.izip(hyps,refs)) - logging.info(stats.detail+'\n') + logging.info('Score={} ({})'.format(stats.score, stats.detail)) return stats.score #create new parallel input file in output directory in sgml format @@ -71,6 +82,8 @@ def main(): #set logging to write all info messages to stderr logging.basicConfig(level=logging.INFO) script_dir = os.path.dirname(os.path.abspath(sys.argv[0])) + if not have_mpl: + logging.warning('Failed to import matplotlib, graphs will not be generated.') parser= argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -181,10 +194,11 @@ def main(): dev_size = enseg(args.devset, newdev, args.grammar_prefix) 
args.devset = newdev - write_config(args) + log_config(args) args.weights, hope_best_fear = optimize(args, script_dir, dev_size) - graph_file = graph(args.output_dir, hope_best_fear, args.metric) + graph_file = '' + if have_mpl: graph_file = graph(args.output_dir, hope_best_fear, args.metric) dev_results, dev_bleu = evaluate(args.devset, args.weights, args.config, script_dir, args.output_dir) @@ -205,17 +219,12 @@ def main(): if graph_file: logging.info('A graph of the best/hope/fear scores over the iterations ' - 'has been saved to {}\n'.format(graph_file)) + 'has been saved to {}'.format(graph_file)) print 'final weights:\n{}\n'.format(args.weights) #graph of hope/best/fear metric values across all iterations def graph(output_dir, hope_best_fear, metric): - try: - import matplotlib.pyplot as plt - except ImportError: - logging.error('Error importing matplotlib. Graphing disabled.\n') - return '' max_y = float(max(hope_best_fear['best']))*1.5 plt.plot(hope_best_fear['best'], label='best') plt.plot(hope_best_fear['hope'], label='hope') @@ -308,6 +317,7 @@ def optimize(args, script_dir, dev_size): decoder = script_dir+'/kbest_cut_mira' (source, refs) = split_devset(args.devset, args.output_dir) port = random.randint(15000,50000) + logging.info('using port {}'.format(port)) num_features = 0 last_p_score = 0 best_score_iter = -1 @@ -316,8 +326,8 @@ def optimize(args, script_dir, dev_size): hope_best_fear = {'hope':[],'best':[],'fear':[]} #main optimization loop while i<args.max_iterations: - logging.info('\n\nITERATION {}\n========\n'.format(i)) - logging.info('using port {}\n'.format(port)) + logging.info('======= STARTING ITERATION {} ======='.format(i)) + logging.info('Starting at {}'.format(time.asctime())) #iteration specific files runfile = args.output_dir+'/run.raw.'+str(i) @@ -327,10 +337,8 @@ def optimize(args, script_dir, dev_size): weightdir = args.output_dir+'/weights.pass'+str(i) os.mkdir(logdir) os.mkdir(weightdir) - - logging.info('RUNNING DECODER AT 
{}'.format(time.asctime())) weightsfile = args.output_dir+'/weights.'+str(i) - logging.info('ITER {}\n'.format(i)) + logging.info(' log directory={}'.format(logdir)) curr_pass = '0{}'.format(i) decoder_cmd = ('{0} -c {1} -w {2} -r{3} -m {4} -s {5} -b {6} -k {7} -o {8}' ' -p {9} -O {10} -D {11} -h {12} -f {13} -C {14}').format( @@ -350,7 +358,7 @@ def optimize(args, script_dir, dev_size): parallelize, logdir, args.jobs) cmd = parallel_cmd + ' ' + decoder_cmd - logging.info('COMMAND: \n{}\n'.format(cmd)) + logging.info('OPTIMIZATION COMMAND: {}'.format(cmd)) dlog = open(decoderlog,'w') runf = open(runfile,'w') @@ -365,27 +373,26 @@ def optimize(args, script_dir, dev_size): p1.stdout.close() if exit_code: - logging.error('Failed with exit code {}\n'.format(exit_code)) + logging.error('Failed with exit code {}'.format(exit_code)) sys.exit(exit_code) try: f = open(runfile) except IOError, msg: - logging.error('Unable to open {}\n'.format(runfile)) + logging.error('Unable to open {}'.format(runfile)) sys.exit() num_topbest = sum(1 for line in f) f.close() if num_topbest == dev_size: break - logging.warning('Incorrect number of top best. ' - 'Waiting for distributed filesystem and retrying.') + logging.warning('Incorrect number of top best. Sleeping for 10 seconds and retrying...') time.sleep(10) retries += 1 if dev_size != num_topbest: logging.error("Dev set contains "+dev_size+" sentences, but we don't " "have topbest for all of these. Decoder failure? 
" - " Check "+decoderlog+'\n') + " Check "+decoderlog) sys.exit() dlog.close() runf.close() @@ -427,7 +434,7 @@ def optimize(args, script_dir, dev_size): hope_best_fear['hope'].append(dec_score) hope_best_fear['best'].append(dec_score_h) hope_best_fear['fear'].append(dec_score_f) - logging.info('DECODER SCORE: {0} HOPE: {1} FEAR: {2}\n'.format( + logging.info('DECODER SCORE: {0} HOPE: {1} FEAR: {2}'.format( dec_score, dec_score_h, dec_score_f)) if dec_score > best_score: best_score_iter = i @@ -436,12 +443,13 @@ def optimize(args, script_dir, dev_size): new_weights_file = '{}/weights.{}'.format(args.output_dir, i+1) last_weights_file = '{}/weights.{}'.format(args.output_dir, i) i += 1 - weight_files = weightdir+'/weights.mira-pass*.*[0-9].gz' + weight_files = args.output_dir+'/weights.pass*/weights.mira-pass*[0-9].gz' average_weights(new_weights_file, weight_files) - logging.info('\nBEST ITER: {} :: {}\n\n'.format( + logging.info('BEST ITERATION: {} (SCORE={})'.format( best_score_iter, best_score)) weights_final = args.output_dir+'/weights.final' + logging.info('WEIGHTS FILE: {}'.format(weights_final)) shutil.copy(last_weights_file, weights_final) average_final_weights(args.output_dir) @@ -481,15 +489,15 @@ def gzip_file(filename): #average the weights for a given pass def average_weights(new_weights, weight_files): - logging.info('AVERAGE {} {}\n'.format(new_weights, weight_files)) + logging.info('AVERAGE {} {}'.format(new_weights, weight_files)) feature_weights = {} total_mult = 0.0 for path in glob.glob(weight_files): score = gzip.open(path) mult = 0 - logging.info('FILE {}\n'.format(path)) + logging.info(' FILE {}'.format(path)) msg, ran, mult = score.readline().strip().split(' ||| ') - logging.info('Processing {} {}'.format(ran, mult)) + logging.info(' Processing {} {}'.format(ran, mult)) for line in score: f,w = line.split(' ',1) if f in feature_weights: @@ -500,34 +508,30 @@ def average_weights(new_weights, weight_files): score.close() #write new weights to 
outfile + logging.info('Writing averaged weights to {}'.format(new_weights)) out = open(new_weights, 'w') for f in iter(feature_weights): avg = feature_weights[f]/total_mult - logging.info('{} {} {} ||| Printing {} {}\n'.format(f,feature_weights[f], - total_mult, f, avg)) out.write('{} {}\n'.format(f,avg)) -def write_config(args): - config = ('\n' - 'DECODER: ' - '/usr0/home/eschling/cdec/training/mira/kbest_cut_mira\n' - 'INI FILE: '+args.config+'\n' - 'WORKING DIRECTORY: '+args.output_dir+'\n' - 'DEVSET: '+args.devset+'\n' - 'EVAL METRIC: '+args.metric+'\n' - 'MAX ITERATIONS: '+str(args.max_iterations)+'\n' - 'DECODE NODES: '+str(args.jobs)+'\n' - 'INITIAL WEIGHTS: '+args.weights+'\n') +def log_config(args): + logging.info('WORKING DIRECTORY={}'.format(args.output_dir)) + logging.info('INI FILE={}'.format(args.config)) + logging.info('DEVSET={}'.format(args.devset)) + logging.info('EVAL METRIC={}'.format(args.metric)) + logging.info('MAX ITERATIONS={}'.format(args.max_iterations)) + logging.info('PARALLEL JOBS={}'.format(args.jobs)) + logging.info('INITIAL WEIGHTS={}'.format(args.weights)) if args.grammar_prefix: - config += 'GRAMMAR PREFIX: '+str(args.grammar_prefix)+'\n' + logging.info('GRAMMAR PREFIX={}'.format(args.grammar_prefix)) if args.test: - config += 'TEST SET: '+args.test+'\n' + logging.info('TEST SET={}'.format(args.test)) + else: + logging.info('TEST SET=none specified') if args.test_config: - config += 'TEST CONFIG: '+args.test_config+'\n' + logging.info('TEST CONFIG={}'.format(args.test_config)) if args.email: - config += 'EMAIL: '+args.email+'\n' - - logging.info(config) + logging.info('EMAIL={}'.format(args.email)) if __name__=='__main__': main() diff --git a/training/pro/mr_pro_map.cc b/training/pro/mr_pro_map.cc index eef40b8a..a5e6e48f 100644 --- a/training/pro/mr_pro_map.cc +++ b/training/pro/mr_pro_map.cc @@ -2,7 +2,6 @@ #include <iostream> #include <fstream> #include <vector> -#include <tr1/unordered_map> #include 
<boost/functional/hash.hpp> #include <boost/shared_ptr.hpp> diff --git a/training/utils/candidate_set.cc b/training/utils/candidate_set.cc index 087efec3..33dae9a3 100644 --- a/training/utils/candidate_set.cc +++ b/training/utils/candidate_set.cc @@ -1,6 +1,11 @@ #include "candidate_set.h" -#include <tr1/unordered_set> +#ifndef HAVE_OLD_CPP +# include <unordered_set> +#else +# include <tr1/unordered_set> +namespace std { using std::tr1::unordered_set; } +#endif #include <boost/functional/hash.hpp> @@ -139,12 +144,12 @@ void CandidateSet::ReadFromFile(const string& file) { void CandidateSet::Dedup() { if(!SILENT) cerr << "Dedup in=" << cs.size(); - tr1::unordered_set<Candidate, CandidateHasher, CandidateCompare> u; + unordered_set<Candidate, CandidateHasher, CandidateCompare> u; while(cs.size() > 0) { u.insert(cs.back()); cs.pop_back(); } - tr1::unordered_set<Candidate, CandidateHasher, CandidateCompare>::iterator it = u.begin(); + unordered_set<Candidate, CandidateHasher, CandidateCompare>::iterator it = u.begin(); while (it != u.end()) { cs.push_back(*it); it = u.erase(it); diff --git a/training/utils/online_optimizer.h b/training/utils/online_optimizer.h index 28d89344..19223e9d 100644 --- a/training/utils/online_optimizer.h +++ b/training/utils/online_optimizer.h @@ -1,10 +1,10 @@ #ifndef _ONL_OPTIMIZE_H_ #define _ONL_OPTIMIZE_H_ -#include <tr1/memory> #include <set> #include <string> #include <cmath> +#include <boost/shared_ptr.hpp> #include "sparse_vector.h" struct LearningRateSchedule { @@ -56,7 +56,7 @@ struct ExponentialDecayLearningRate : public LearningRateSchedule { class OnlineOptimizer { public: virtual ~OnlineOptimizer(); - OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s, + OnlineOptimizer(const boost::shared_ptr<LearningRateSchedule>& s, size_t batch_size, const std::vector<int>& frozen_feats = std::vector<int>()) : N_(batch_size),schedule_(s),k_() { @@ -77,13 +77,13 @@ class OnlineOptimizer { std::set<int> frozen_; // frozen 
(non-optimizing) features private: - std::tr1::shared_ptr<LearningRateSchedule> schedule_; + boost::shared_ptr<LearningRateSchedule> schedule_; int k_; // iteration count }; class CumulativeL1OnlineOptimizer : public OnlineOptimizer { public: - CumulativeL1OnlineOptimizer(const std::tr1::shared_ptr<LearningRateSchedule>& s, + CumulativeL1OnlineOptimizer(const boost::shared_ptr<LearningRateSchedule>& s, size_t training_instances, double C, const std::vector<int>& frozen) : OnlineOptimizer(s, training_instances, frozen), C_(C), u_() {} diff --git a/training/utils/optimize_test.cc b/training/utils/optimize_test.cc index bff2ca03..72fcef6d 100644 --- a/training/utils/optimize_test.cc +++ b/training/utils/optimize_test.cc @@ -2,6 +2,7 @@ #include <iostream> #include <sstream> #include <boost/program_options/variables_map.hpp> +#include <boost/shared_ptr.hpp> #include "optimize.h" #include "online_optimizer.h" #include "sparse_vector.h" @@ -96,14 +97,11 @@ void TestOptimizerVariants(int num_vars) { cerr << oa.Name() << " SUCCESS\n"; } -using namespace std::tr1; - void TestOnline() { size_t N = 20; double C = 1.0; double eta0 = 0.2; - std::tr1::shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85)); - //shared_ptr<LearningRateSchedule> r(new StandardLearningRate(N, eta0)); + boost::shared_ptr<LearningRateSchedule> r(new ExponentialDecayLearningRate(N, eta0, 0.85)); CumulativeL1OnlineOptimizer opt(r, N, C, std::vector<int>()); assert(r->eta(10) < r->eta(1)); } diff --git a/utils/atools.cc b/utils/atools.cc index 1726c4ac..559eadef 100644 --- a/utils/atools.cc +++ b/utils/atools.cc @@ -203,14 +203,16 @@ struct RefineCommand : public Command { bool keep_going = !p.empty(); while (keep_going) { keep_going = false; - for (set<pair<int, int> >::iterator pi = p.begin(); - pi != p.end(); ++pi) { + set<pair<int, int> > added; + for (set<pair<int, int> >::iterator pi = p.begin(); pi != p.end(); ++pi) { if ((this->*pred)(pi->first, pi->second)) { 
Align(pi->first, pi->second); - p.erase(pi); + added.insert(make_pair(pi->first, pi->second)); keep_going = true; } } + for (set<pair<int, int> >::iterator ai = added.begin(); ai != added.end(); ++ai) + p.erase(*ai); } } Array2D<bool> res_; // refined alignment diff --git a/utils/filelib.h b/utils/filelib.h index b9ea3940..4fa69760 100644 --- a/utils/filelib.h +++ b/utils/filelib.h @@ -75,7 +75,10 @@ class ReadFile : public BaseFile<std::istream> { } } } - + void ReadAll(std::string& s) { + getline(*stream(), s, (char) EOF); + if (s.size() > 0) s.resize(s.size()-1); + } }; class WriteFile : public BaseFile<std::ostream> { diff --git a/utils/hash.h b/utils/hash.h index 189ed1ae..e1426ffb 100644 --- a/utils/hash.h +++ b/utils/hash.h @@ -20,11 +20,17 @@ # define HASH_MAP_RESERVED(h,empty,deleted) do { (h).set_empty_key(empty); (h).set_deleted_key(deleted); } while(0) # define HASH_MAP_EMPTY(h,empty) do { (h).set_empty_key(empty); } while(0) #else +#ifndef HAVE_OLD_CPP +# include <unordered_map> +# include <unordered_set> +#else # include <tr1/unordered_map> # include <tr1/unordered_set> -# define SPARSE_HASH_MAP std::tr1::unordered_map -# define HASH_MAP std::tr1::unordered_map -# define HASH_SET std::tr1::unordered_set +namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; } +#endif +# define SPARSE_HASH_MAP std::unordered_map +# define HASH_MAP std::unordered_map +# define HASH_SET std::unordered_set # define HASH_MAP_DELETED(h,deleted) # define HASH_MAP_RESERVED(h,empty,deleted) # define HASH_MAP_EMPTY(h,empty) diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index cbccb94a..08d95162 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -86,10 +86,17 @@ PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15 #MPIRUN = mpirun -np $(MPIJOBS) MPIRUN= +USE_AFFIXES = 0 + WALLTIME=90 export +generate-wordpair-features: + \@failcom='exit 1'; \\ + (cd grammars && make USE_AFFIXES=\$(USE_AFFIXES) ) || 
eval \$\$failcom; + cd .. + all: \@failcom='exit 1'; \\ list='\$(TARGETS)'; for subdir in \$\$list; do \\ diff --git a/word-aligner/fast_align.cc b/word-aligner/fast_align.cc index fddcba9c..f54233eb 100644 --- a/word-aligner/fast_align.cc +++ b/word-aligner/fast_align.cc @@ -1,7 +1,12 @@ #include <iostream> #include <cmath> #include <utility> -#include <tr1/unordered_map> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif #include <boost/functional/hash.hpp> #include <boost/program_options.hpp> @@ -17,7 +22,6 @@ namespace po = boost::program_options; using namespace std; -using namespace std::tr1; bool InitCommandLine(int argc, char** argv, po::variables_map* conf) { po::options_description opts("Configuration options"); diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index 8d3ea8cb..1db516f1 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -19,6 +19,8 @@ MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl MODEL1 = $(SCRIPT_DIR)/fast_align MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl +USE_AFFIXES = 0 + e.voc: corpus.e $(EXTRACT_VOCAB) < corpus.e > $@ @@ -66,20 +68,20 @@ corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS) corpus.e corpus.f > $@ corpus.f-e.model1: corpus.f-e - $(MODEL1) -p -v -i corpus.f-e > $@ + $(MODEL1) -p corpus.f-e.model1 -v -i corpus.f-e > $@ corpus.e-f.model1: corpus.e-f - $(MODEL1) -p -v -V -i corpus.e-f > $@ + $(MODEL1) -p corpus.e-f.model1 -v -V -i corpus.e-f > $@ corpus.f-e.full-model1: corpus.f-e - $(MODEL1) -p -t -999999 -v -V -i corpus.f-e > $@ + $(MODEL1) -p corpus.f-e.full-model1 -t -999999 -v -V -i corpus.f-e > $@ corpus.e-f.full-model1: corpus.e-f - $(MODEL1) -p -t -999999 -v -V -i corpus.e-f > $@ + $(MODEL1) -p corpus.e-f.full-model1 -t -999999 -v -V -i corpus.e-f > $@ corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 
corpus.e-f.model1 $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 | $(GZIP) -9 > $@ wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 - $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@ + $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 $(USE_AFFIXES) $(USE_AFFIXES) | $(GZIP) -9 > $@ diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl index 54b89ce1..f3fdf149 100755 --- a/word-aligner/support/generate_word_pair_features.pl +++ b/word-aligner/support/generate_word_pair_features.pl @@ -2,7 +2,7 @@ use utf8; use strict; -my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1) = @ARGV; +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1, $use_prefixes, $use_suffixes) = @ARGV; die "Usage: $0 corpus.fr-en corpus.f-e.full-model1 corpus.e-f.full-model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f corpus.f-e.model1\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && $sparse_m1 && -f $sparse_m1; my %eclass = (); @@ -253,10 +253,71 @@ for my $f (sort keys %fdict) { push @feats, "PuncMiss=1"; } } + if ($use_prefixes) { + my $prefix1 = prefix_to_type($f, $e, 1); + if (length $prefix1 > 0 && !$is_null) { push @feats, $prefix1."=1";} + my $prefix2 = prefix_to_type($f, $e, 2); + if (length $prefix2 > 0 && !$is_null) { push @feats, $prefix2."=1";} + my $prefix3 = prefix_to_type($f, $e, 3); + if (length $prefix3 > 0 && !$is_null) { push @feats, $prefix3."=1";} + my 
$prefix1_reverse = prefix_to_type($e, $f, 1); + if (length $prefix1_reverse > 0 && !$is_null) { push @feats, $prefix1_reverse."=1";} + my $prefix2_reverse = prefix_to_type($e, $f, 2); + if (length $prefix2_reverse > 0 && !$is_null) { push @feats, $prefix2_reverse."=1";} + my $prefix3_reverse = prefix_to_type($e, $f, 3); + if (length $prefix3_reverse > 0 && !$is_null) { push @feats, $prefix3_reverse."=1";} + } + if ($use_suffixes) { + my $suffix1 = suffix_to_type($f, $e, 1); + if (length $suffix1 > 0 && !$is_null) { push @feats, $suffix1."=1";} + my $suffix2 = suffix_to_type($f, $e, 2); + if (length $suffix2 > 0 && !$is_null) { push @feats, $suffix2."=1";} + my $suffix3 = suffix_to_type($f, $e, 3); + if (length $suffix3 > 0 && !$is_null) { push @feats, $suffix3."=1";} + my $suffix1_reverse = suffix_to_type($e, $f, 1); + if (length $suffix1_reverse > 0 && !$is_null) { push @feats, $suffix1_reverse."=1";} + my $suffix2_reverse = suffix_to_type($e, $f, 2); + if (length $suffix2_reverse > 0 && !$is_null) { push @feats, $suffix2_reverse."=1";} + my $suffix3_reverse = suffix_to_type($e, $f, 3); + if (length $suffix3_reverse > 0 && !$is_null) { push @feats, $suffix3_reverse."=1";} + } print "$f ||| $e ||| @feats\n"; } } +# returns a feature string instantiating the pattern "(source_prefix,target)" +sub prefix_to_type +{ + # $f => src token + # $e => tgt token + my ($f, $e, $len_prefix) = @_; + + if (length $f > $len_prefix && index($e.$f, '=') < 0) + { + return substr($f, 0, $len_prefix)."-".$e; + } + else + { + return ""; + } +} + +# returns a feature string instantiating the pattern "(source_prefix,target)" +sub suffix_to_type +{ + # $f => src token + # $e => tgt token + my ($f, $e, $len_prefix) = @_; + + if ( (length $f) > $len_prefix && index($e.$f, '=') < 0) + { + return substr($f, (length $f)-$len_prefix, $len_prefix)."_".$e; + } + else + { + return ""; + } +} sub levenshtein { diff --git a/word-aligner/ttables.cc b/word-aligner/ttables.cc index c177aa30..a56bbcef 
100644 --- a/word-aligner/ttables.cc +++ b/word-aligner/ttables.cc @@ -5,7 +5,6 @@ #include "dict.h" using namespace std; -using namespace std::tr1; void TTable::DeserializeProbsFromText(std::istream* in) { int c = 0; diff --git a/word-aligner/ttables.h b/word-aligner/ttables.h index 507f591a..d82aff72 100644 --- a/word-aligner/ttables.h +++ b/word-aligner/ttables.h @@ -2,7 +2,12 @@ #define _TTABLES_H_ #include <iostream> -#include <tr1/unordered_map> +#ifndef HAVE_OLD_CPP +# include <unordered_map> +#else +# include <tr1/unordered_map> +namespace std { using std::tr1::unordered_map; } +#endif #include "sparse_vector.h" #include "m.h" @@ -12,8 +17,8 @@ class TTable { public: TTable() {} - typedef std::tr1::unordered_map<WordID, double> Word2Double; - typedef std::tr1::unordered_map<WordID, Word2Double> Word2Word2Double; + typedef std::unordered_map<WordID, double> Word2Double; + typedef std::unordered_map<WordID, Word2Double> Word2Word2Double; inline double prob(const int& e, const int& f) const { const Word2Word2Double::const_iterator cit = ttable.find(e); if (cit != ttable.end()) { |