summaryrefslogtreecommitdiff
path: root/decoder
diff options
context:
space:
mode:
authorPaul Baltescu <pauldb89@gmail.com>2013-11-23 17:33:47 +0000
committerPaul Baltescu <pauldb89@gmail.com>2013-11-23 17:33:47 +0000
commitcc6313b23cac25eb05976b6cf64f96faf1ed4163 (patch)
tree3dc28060ad25b43773e875bea7388ab1cefcd927 /decoder
parent7990c750829af93f0a1e0fc14534582f52ee9e8c (diff)
parentf2fb69b10a897e8beb4e6e6d6cbb4327096235ef (diff)
Merge branch 'master' of https://github.com/redpony/cdec
Diffstat (limited to 'decoder')
-rw-r--r--decoder/Makefile.am92
-rw-r--r--decoder/apply_models.cc11
-rw-r--r--decoder/cdec_ff.cc36
-rw-r--r--decoder/decoder.cc10
-rw-r--r--decoder/dwarf.cc3209
-rw-r--r--decoder/dwarf.h286
-rw-r--r--decoder/earley_composer.cc11
-rw-r--r--decoder/factored_lexicon_helper.cc1
-rw-r--r--decoder/factored_lexicon_helper.h3
-rw-r--r--decoder/ff_context.cc2
-rw-r--r--decoder/ff_dwarf.cc894
-rw-r--r--decoder/ff_dwarf.h100
-rw-r--r--decoder/ff_external.cc2
-rw-r--r--decoder/ff_lm.cc101
-rw-r--r--decoder/ff_lm.h22
-rw-r--r--decoder/ff_parse_match.cc18
-rw-r--r--decoder/ff_parse_match.h1
-rw-r--r--decoder/ff_soft_syntax.cc49
-rw-r--r--decoder/ff_soft_syntax.h16
-rw-r--r--decoder/ff_soft_syntax_mindist.cc (renamed from decoder/ff_soft_syntax2.cc)58
-rw-r--r--decoder/ff_soft_syntax_mindist.h (renamed from decoder/ff_soft_syntax2.h)16
-rw-r--r--decoder/ff_source_syntax.cc49
-rw-r--r--decoder/ff_source_syntax.h10
-rw-r--r--decoder/ff_source_syntax2.cc36
-rw-r--r--decoder/ff_source_syntax2.h5
-rw-r--r--decoder/ff_source_syntax2_p.cc166
-rw-r--r--decoder/ff_source_syntax2_p.h25
-rw-r--r--decoder/ff_source_syntax_p.cc245
-rw-r--r--decoder/ff_source_syntax_p.h42
-rw-r--r--decoder/ff_wordalign.cc9
-rw-r--r--decoder/ff_wordalign.h13
-rw-r--r--decoder/ff_wordset.cc52
-rw-r--r--decoder/ff_wordset.h73
-rw-r--r--decoder/grammar.cc11
-rw-r--r--decoder/hg_intersect.cc8
-rw-r--r--decoder/kbest.h11
-rw-r--r--decoder/maxtrans_blunsom.cc11
-rw-r--r--decoder/phrasebased_translator.cc20
-rw-r--r--decoder/scfg_translator.cc10
-rw-r--r--decoder/sentence_metadata.h4
-rw-r--r--decoder/tromble_loss.cc309
-rw-r--r--decoder/tromble_loss.h40
42 files changed, 351 insertions, 5736 deletions
diff --git a/decoder/Makefile.am b/decoder/Makefile.am
index 914faaea..8280b22c 100644
--- a/decoder/Makefile.am
+++ b/decoder/Makefile.am
@@ -41,7 +41,6 @@ libcdec_a_SOURCES = \
cfg_options.h \
csplit.h \
decoder.h \
- dwarf.h \
earley_composer.h \
exp_semiring.h \
factored_lexicon_helper.h \
@@ -51,18 +50,21 @@ libcdec_a_SOURCES = \
ff_charset.h \
ff_context.h \
ff_csplit.h \
- ff_dwarf.h \
ff_external.h \
ff_factory.h \
ff_klm.h \
ff_lm.h \
ff_ngrams.h \
+ ff_parse_match.h \
ff_register.h \
ff_rules.h \
ff_ruleshape.h \
ff_sample_fsa.h \
+ ff_soft_syntax.h \
+ ff_soft_syntax_mindist.h \
ff_source_path.h \
ff_source_syntax.h \
+ ff_source_syntax2.h \
ff_spans.h \
ff_tagger.h \
ff_wordalign.h \
@@ -96,68 +98,64 @@ libcdec_a_SOURCES = \
sentences.h \
tagger.h \
translator.h \
- tromble_loss.h \
trule.h \
viterbi.h \
- forest_writer.cc \
- maxtrans_blunsom.cc \
+ aligner.cc \
+ apply_models.cc \
+ bottom_up_parser.cc \
+ cdec.cc \
cdec_ff.cc \
cfg.cc \
- dwarf.cc \
- ff_dwarf.cc \
- ff_external.cc \
- rule_lexer.cc \
- fst_translator.cc \
csplit.cc \
- translator.cc \
- rescore_translator.cc \
- scfg_translator.cc \
- hg.cc \
- hg_io.cc \
- hg_remove_eps.cc \
decoder.cc \
- hg_intersect.cc \
- hg_union.cc \
- hg_sampler.cc \
- factored_lexicon_helper.cc \
- viterbi.cc \
- lattice.cc \
- aligner.cc \
- apply_models.cc \
earley_composer.cc \
- phrasetable_fst.cc \
- trule.cc \
+ factored_lexicon_helper.cc \
ff.cc \
- ffset.cc \
ff_basic.cc \
- ff_rules.cc \
- ff_wordset.cc \
- ff_context.cc \
+ ff_bleu.cc \
ff_charset.cc \
- ff_lm.cc \
+ ff_context.cc \
+ ff_csplit.cc \
+ ff_external.cc \
+ ff_factory.cc \
ff_klm.cc \
+ ff_lm.cc \
ff_ngrams.cc \
- ff_spans.cc \
+ ff_parse_match.cc \
+ ff_rules.cc \
ff_ruleshape.cc \
- ff_wordalign.cc \
- ff_csplit.cc \
- ff_tagger.cc \
+ ff_soft_syntax.cc \
+ ff_soft_syntax_mindist.cc \
ff_source_path.cc \
- ff_parse_match.cc \
- ff_soft_syntax.cc \
- ff_soft_syntax2.cc \
ff_source_syntax.cc \
- ff_source_syntax_p.cc \
ff_source_syntax2.cc \
- ff_source_syntax2_p.cc \
- ff_bleu.cc \
- ff_factory.cc \
+ ff_spans.cc \
+ ff_tagger.cc \
+ ff_wordalign.cc \
+ ff_wordset.cc \
+ ffset.cc \
+ forest_writer.cc \
+ fst_translator.cc \
+ grammar.cc \
+ hg.cc \
+ hg_intersect.cc \
+ hg_io.cc \
+ hg_remove_eps.cc \
+ hg_sampler.cc \
+ hg_union.cc \
incremental.cc \
+ json_parse.cc \
+ lattice.cc \
lexalign.cc \
lextrans.cc \
- tagger.cc \
- bottom_up_parser.cc \
+ maxtrans_blunsom.cc \
phrasebased_translator.cc \
- JSON_parser.c \
- json_parse.cc \
- grammar.cc
+ phrasetable_fst.cc \
+ rescore_translator.cc \
+ rule_lexer.cc \
+ scfg_translator.cc \
+ tagger.cc \
+ translator.cc \
+ trule.cc \
+ viterbi.cc \
+ JSON_parser.c
diff --git a/decoder/apply_models.cc b/decoder/apply_models.cc
index 330de9e2..4cd8b36f 100644
--- a/decoder/apply_models.cc
+++ b/decoder/apply_models.cc
@@ -8,8 +8,14 @@
#include <vector>
#include <algorithm>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+# include <unordered_set>
+#else
+# include <tr1/unordered_map>
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
+#endif
#include <boost/functional/hash.hpp>
@@ -23,7 +29,6 @@
#define FAST_CP_2 3
using namespace std;
-using namespace std::tr1;
struct Candidate;
typedef SmallVectorInt JVector;
diff --git a/decoder/cdec_ff.cc b/decoder/cdec_ff.cc
index e7b31f50..d586c1d1 100644
--- a/decoder/cdec_ff.cc
+++ b/decoder/cdec_ff.cc
@@ -15,26 +15,16 @@
#include "ff_ruleshape.h"
#include "ff_bleu.h"
#include "ff_soft_syntax.h"
-#include "ff_soft_syntax2.h"
+#include "ff_soft_syntax_mindist.h"
#include "ff_source_path.h"
-
-
#include "ff_parse_match.h"
#include "ff_source_syntax.h"
-#include "ff_source_syntax_p.h"
#include "ff_source_syntax2.h"
-#include "ff_source_syntax2_p.h"
-
-
#include "ff_register.h"
#include "ff_charset.h"
#include "ff_wordset.h"
-#include "ff_dwarf.h"
#include "ff_external.h"
-#ifdef HAVE_GLC
-#include <cdec/ff_glc.h>
-#endif
void register_feature_functions() {
static bool registered = false;
@@ -51,30 +41,16 @@ void register_feature_functions() {
RegisterFF<BLEUModel>();
//TODO: use for all features the new Register which requires static FF::usage(false,false) give name
-#ifdef HAVE_RANDLM
- ff_registry.Register("RandLM", new FFFactory<LanguageModelRandLM>);
-#endif
ff_registry.Register("SpanFeatures", new FFFactory<SpanFeatures>());
ff_registry.Register("NgramFeatures", new FFFactory<NgramDetector>());
ff_registry.Register("RuleContextFeatures", new FFFactory<RuleContextFeatures>());
ff_registry.Register("RuleIdentityFeatures", new FFFactory<RuleIdentityFeatures>());
-
-
ff_registry.Register("ParseMatchFeatures", new FFFactory<ParseMatchFeatures>);
-
- ff_registry.Register("SoftSyntacticFeatures", new FFFactory<SoftSyntacticFeatures>);
- ff_registry.Register("SoftSyntacticFeatures2", new FFFactory<SoftSyntacticFeatures2>);
-
+ ff_registry.Register("SoftSyntaxFeatures", new FFFactory<SoftSyntaxFeatures>);
+ ff_registry.Register("SoftSyntaxFeaturesMindist", new FFFactory<SoftSyntaxFeaturesMindist>);
ff_registry.Register("SourceSyntaxFeatures", new FFFactory<SourceSyntaxFeatures>);
- ff_registry.Register("SourceSyntaxFeatures2", new FFFactory<SourceSyntaxFeatures2>);
-
ff_registry.Register("SourceSpanSizeFeatures", new FFFactory<SourceSpanSizeFeatures>);
-
- //ff_registry.Register("PSourceSyntaxFeatures", new FFFactory<PSourceSyntaxFeatures>);
- //ff_registry.Register("PSourceSpanSizeFeatures", new FFFactory<PSourceSpanSizeFeatures>);
- //ff_registry.Register("PSourceSyntaxFeatures2", new FFFactory<PSourceSyntaxFeatures2>);
-
-
+ ff_registry.Register("SourceSyntaxFeatures2", new FFFactory<SourceSyntaxFeatures2>);
ff_registry.Register("CMR2008ReorderingFeatures", new FFFactory<CMR2008ReorderingFeatures>());
ff_registry.Register("RuleSourceBigramFeatures", new FFFactory<RuleSourceBigramFeatures>());
ff_registry.Register("RuleTargetBigramFeatures", new FFFactory<RuleTargetBigramFeatures>());
@@ -98,10 +74,6 @@ void register_feature_functions() {
ff_registry.Register("WordPairFeatures", new FFFactory<WordPairFeatures>);
ff_registry.Register("SourcePathFeatures", new FFFactory<SourcePathFeatures>);
ff_registry.Register("WordSet", new FFFactory<WordSet>);
- ff_registry.Register("Dwarf", new FFFactory<Dwarf>);
ff_registry.Register("External", new FFFactory<ExternalFeature>);
-#ifdef HAVE_GLC
- ff_registry.Register("ContextCRF", new FFFactory<Model1Features>);
-#endif
}
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index 31e6dc46..da65713a 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -1,6 +1,11 @@
#include "decoder.h"
-#include <tr1/unordered_map>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+#else
+# include <tr1/unordered_map>
+namespace std { using std::tr1::unordered_map; }
+#endif
#include <boost/program_options.hpp>
#include <boost/program_options/variables_map.hpp>
#include <boost/make_shared.hpp>
@@ -61,7 +66,6 @@
static const double kMINUS_EPSILON = -1e-6; // don't be too strict
using namespace std;
-using namespace std::tr1;
namespace po = boost::program_options;
static bool verbose_feature_functions=true;
@@ -90,7 +94,7 @@ struct ELengthWeightFunction {
}
};
inline void ShowBanner() {
- cerr << "cdec v1.0 (c) 2009-2011 by Chris Dyer\n";
+ cerr << "cdec (c) 2009--2013 by Chris Dyer\n";
}
inline string str(char const* name,po::variables_map const& conf) {
diff --git a/decoder/dwarf.cc b/decoder/dwarf.cc
deleted file mode 100644
index fb0404a6..00000000
--- a/decoder/dwarf.cc
+++ /dev/null
@@ -1,3209 +0,0 @@
-#include "dwarf.h"
-#include "tdict.h"
-#include "wordid.h"
-#include "lattice.h"
-#include "ff_dwarf.h"
-#include <assert.h>
-#include <algorithm>
-#include <ostream>
-#include <sstream>
-#include <iostream>
-#include <fstream>
-#include <vector>
-#include <map>
-#include <set>
-#include <boost/functional/hash.hpp>
-#include <tr1/unordered_map>
-#include <boost/tuple/tuple.hpp>
-
-using namespace std;
-using namespace std::tr1;
-using namespace boost::tuples;
-using namespace boost;
-
-Alignment::Alignment() {
- //unordered_map<std::vector<WordID>,int> XX;
- _I=0;
- _J=0;
- kSOS = TD::Convert("<s>");
- kEOS = TD::Convert("</s>");
- kUNK = TD::Convert("**UNKNOWN**");
- SourceFWAntsIdxs = new int*[MAX_ARITY];
- SourceFWAntsAbsIdxs = new int*[MAX_ARITY];
- TargetFWAntsIdxs = new int*[MAX_ARITY];
- SourceAntsIdxs = new int*[MAX_ARITY];
- TargetAntsIdxs = new int*[MAX_ARITY];
- AntsAl = new int*[MAX_ARITY];
- for (int idx=0; idx<MAX_ARITY; idx++) {
- SourceAntsIdxs[idx] = new int[40];
- SourceFWAntsIdxs[idx] = new int[40];
- SourceFWAntsAbsIdxs[idx] = new int[40];
- TargetAntsIdxs[idx] = new int[40];
- TargetFWAntsIdxs[idx] = new int[40];
- AntsAl[idx] = new int[40];
- }
- for (int j=0; j<MAX_WORDS; j++)
- for (int i=0; i<MAX_WORDS; i++) _matrix[j][i]=false;
- for (int j=0; j<MAX_WORDS; j++) {
- _tSpan[j][0]=MINIMUM_INIT;
- _sSpan[j][1]=MAXIMUM_INIT;
- }
- for (int i=0; i<MAX_WORDS; i++) {
- _sSpan[i][0]=MINIMUM_INIT;
- _sSpan[i][1]=MAXIMUM_INIT;
- }
- alpha_oris=0.1;
- alpha_orit=0.1;
- alpha_doms=0.1;
- alpha_domt=0.1;
- beta_oris=0.1;
- beta_orit=0.1;
- beta_doms=0.1;
- beta_domt=0.1;
-}
-
-void Alignment::set(int j,int i) {
-// create a link between j and i, update their corresponding span accordingly
- if (DEBUG) cerr << "set(" << j << "," << i << ")" << endl;
- assert(0<=j && j<MAX_WORDS);
- assert(0<=i && i<MAX_WORDS);
- if (0<=j && j<MAX_WORDS && 0<=i && i<MAX_WORDS) {
- _matrix[j][i] = true;
- _tSpan[j][0]=least(i,_tSpan[j][0]);
- _tSpan[j][1]=most(i,_tSpan[j][1]);
- _sSpan[i][0]=least(j,_sSpan[i][0]);
- _sSpan[i][1]=most(j,_sSpan[i][1]);
- }
- _J=most(j+1,_J);
- _I=most(i+1,_I);
-}
-
-void Alignment::reset(int j,int i) { //probably won't be used, since the alignment is not dynamic
-// remove the link between j and i, update their corresponding span accordingly
- if (DEBUG) cerr << "reset(" << j << "," << i << ")" << endl;
- assert(0<=j && j<MAX_WORDS);
- assert(0<=i && i<MAX_WORDS);
- _matrix[j][i] = false;
- if (j==_sSpan[i][0] || j==_sSpan[i][1]) {
- int min=MINIMUM_INIT;
- int max=MAXIMUM_INIT;
- for (int idx=_sSpan[i][0]; idx<=_sSpan[i][1]; idx++) {
- if (_matrix[idx][i]) {
- min=least(min,idx);
- max=most(max,idx);
- }
- }
- _sSpan[i][0]=min;
- _sSpan[i][1]=max;
- }
- if (i==_tSpan[j][0] || i==_tSpan[j][1]) {
- int min=MINIMUM_INIT;
- int max=MAXIMUM_INIT;
- for (int idx=_tSpan[j][0]; idx<=_tSpan[j][1]; idx++) {
- if (_matrix[j][idx]) {
- min=least(min,idx);
- max=most(max,idx);
- }
- }
- _tSpan[j][0]=min;
- _tSpan[j][1]=max;
- }
-}
-
-int Alignment::targetOf(int j, int start) {
- assert(j>=0);
- if (start==-1) start = _tSpan[j][0];
- if (_tSpan[j][0]==MINIMUM_INIT) return -1;
- for (int idx=start; idx<=_tSpan[j][1]; idx++) {
- if (_matrix[j][idx]) return idx;
- }
- return -1;
-}
-
-int Alignment::sourceOf(int i, int start) {
- assert(i>=0);
- if (start==-1) start = _sSpan[i][0];
- if (_sSpan[i][0]==MINIMUM_INIT) return -1;
- for (int idx=start; idx<=_sSpan[i][1]; idx++) {
- if (_matrix[idx][i]) return idx;
- }
- return -1;
-}
-
-void Alignment::clearAls(int prevJ, int prevI) {
- for (int j=0; j<=prevJ; j++) {
- for (int i=0; i<prevI; i++) {
- _matrix[j][i]=false;
- }
- }
- for (int j=0; j<=prevJ; j++) {
- _tSpan[j][0] = MINIMUM_INIT;
- _tSpan[j][1] = MAXIMUM_INIT;
- }
- for (int i=0; i<=prevI; i++) {
- _sSpan[i][0] = MINIMUM_INIT;
- _sSpan[i][1] = MAXIMUM_INIT;
- }
- _J=0;
- _I=0;
-}
-
-int Alignment::DominanceSource(int fw1, int fw2) {
- // Dominance of fw1 and fw2
- // 0 -> neither, 1 -> leftFirst, 2 -> rightFirst, 3 -> dontCare
- if (DEBUG) cerr << "DominanceSource(" << fw1 << "," << fw2 << ")" << endl;
- //cerr << TD::Convert(_f[fw1]) << "," << TD::Convert(_f[fw2]) << endl;
- //cerr << AsString() << endl;
- int dom = 0;
- curr_al.push_back(fw1); curr_al.push_back(fw2);
- if (doms_hash.find(curr_al)==doms_hash.end()) {
- int* block = blockSource(fw1,fw2);
- //cerr << "block = " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[0]==fw1) {
- int tfw10 = _tSpan[fw1][0];
- int tfw11 = _tSpan[fw1][1];
- //cerr << "tfw = " << tfw10 << "," << tfw11 << endl;
- if (tfw11<0) {
- dom+=1;
- } else {
- if ((block[2]==tfw10 || block[3]==tfw11)) dom+=1;
- }
- }
- if (block[1]==fw2) {
- int tfw20 = _tSpan[fw2][0];
- int tfw21 = _tSpan[fw2][1];
- //cerr << "tfw = " << tfw20 << "," << tfw21 << endl;
- if (tfw21<0) {
- dom+=2;
- } else {
- if ((block[2]==tfw20 || block[3]==tfw21)) dom+=2;
- }
- }
- delete block;
- doms_hash.insert(pair<vector<int>,int>(curr_al,dom));
- } else {
- dom = doms_hash[curr_al];
- }
- if (DEBUG) cerr << " dom = " << dom << endl;
- curr_al.pop_back(); curr_al.pop_back();
- return dom;
-}
-
-vector<int> Alignment::DominanceSource4Sampler(int fw1, int fw2) {
- if (DEBUG) cerr << "DominanceSource4Sampler(" << fw1 << "," << fw2 << ")" << endl;
- int dom = 0;
- int* block = blockSource(fw1,fw2);
- //cerr << "block = " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[0]==fw1) {
- int tfw10 = _tSpan[fw1][0];
- int tfw11 = _tSpan[fw1][1];
- //cerr << "tfw = " << tfw10 << "," << tfw11 << endl;
- if (tfw11<0) {
- dom+=1;
- } else {
- if ((block[2]==tfw10 || block[3]==tfw11)) dom+=1;
- }
- }
- if (block[1]==fw2) {
- int tfw20 = _tSpan[fw2][0];
- int tfw21 = _tSpan[fw2][1];
- //cerr << "tfw = " << tfw20 << "," << tfw21 << endl;
- if (tfw21<0) {
- dom+=2;
- } else {
- if ((block[2]==tfw20 || block[3]==tfw21)) dom+=2;
- }
- }
- if (DEBUG) cerr << "doms = " << dom << endl;
- vector<int> ret;
- ret.push_back(dom); ret.push_back(block[0]); ret.push_back(block[1]);
- ret.push_back(block[2]); ret.push_back(block[3]);
- delete block;
- return ret;
-}
-
-int Alignment::DominanceTarget(int fw1, int fw2) {
- int dom = 0;
- curr_al.push_back(fw1); curr_al.push_back(fw2);
- if (domt_hash.find(curr_al)==domt_hash.end()) {
- int* block = blockTarget(fw1,fw2);
- if (block[2]==fw1) {
- int sfw10 = _sSpan[fw1][0];
- int sfw11 = _sSpan[fw1][1];
- if (sfw11<0) {
- dom+=1;
- } else {
- if (block[0]==sfw10 || block[1]==sfw11) dom+=1;
- }
- }
- if (block[3]==fw2) {
- int sfw20 = _sSpan[fw2][0];
- int sfw21 = _sSpan[fw2][0];
- if (sfw21<0) {
- dom+=2;
- } else {
- if (block[0]==sfw20 || block[1]==sfw21) dom+=2;
- }
- }
- delete block;
- domt_hash.insert(pair<vector<int>,int>(curr_al,dom));
- } else {
- dom = domt_hash[curr_al];
- }
- curr_al.pop_back(); curr_al.pop_back();
- return dom;
-}
-
-vector<int> Alignment::DominanceTarget4Sampler(int fw1, int fw2) {
- int dom = 0;
- int* block = blockTarget(fw1,fw2);
- if (block[2]==fw1) {
- int sfw10 = _sSpan[fw1][0];
- int sfw11 = _sSpan[fw1][1];
- if (sfw11<0) {
- dom+=1;
- } else {
- if (block[0]==sfw10 || block[1]==sfw11) dom+=1;
- }
- }
- if (block[3]==fw2) {
- int sfw20 = _sSpan[fw2][0];
- int sfw21 = _sSpan[fw2][0];
- if (sfw21<0) {
- dom+=2;
- } else {
- if (block[0]==sfw20 || block[1]==sfw21) dom+=2;
- }
- }
- vector<int> ret;
- ret.push_back(dom); ret.push_back(block[0]); ret.push_back(block[1]);
- ret.push_back(block[2]); ret.push_back(block[3]);
- delete block;
- return ret;
-}
-
-void Alignment::OrientationSource(int fw, int* oril, int* orir, bool Lcompute, bool Rcompute) {
- OrientationSource(fw,fw,oril,orir,Lcompute,Rcompute);
-}
-
-vector<int> Alignment::OrientationSourceLeft4Sampler(int fw) {
- return OrientationSourceLeft4Sampler(fw,fw);
-}
-
-vector<int> Alignment::OrientationSourceLeft4Sampler(int fw0, int fw1) {
- if (DEBUG) cerr << "OrientationSourceLeft4Sampler(" << fw0 << "," << fw1 << ")" << endl;
- int oril = 0;
- int N0=fw0-1;
- while (N0>=0) {
- if (minTSpan(N0)!=MINIMUM_INIT) break;
- N0--;
- }
- int N1=fw1+1;
- while (N1<_J) {
- if (minTSpan(N1)!=MINIMUM_INIT) break;
- N1++;
- }
- if (minTSpan(fw0)==MINIMUM_INIT && minTSpan(fw1)==MINIMUM_INIT) {
- fw0 = N1; fw1 = N0;
- }
- if (DEBUG) cerr << "fw0=" << fw0 << ", fw1=" << fw1 << ", N0=" << N0 << ", N1=" << N1 << endl;
- if (maxTSpan(N0)<minTSpan(fw0) || maxTSpan(fw0)<minTSpan(N0)) {
- if (DEBUG) cerr << "N0=" << minTSpan(N0) << "-" << maxTSpan(N0);
- if (DEBUG) cerr << "fw=" << minTSpan(fw0) << "-" << maxTSpan(fw0) << endl;
- int *block = blockTarget(minTSpan(N0),maxTSpan(N0));
- if (block[0]<=fw0 && fw0<=block[1]) oril=5;
- delete block;
- if (oril==0) {
- block = blockTarget(minTSpan(fw0),maxTSpan(fw0));
- if (block[0]<=N0 && N0<=block[1]) oril=5;
- delete block;
- }
- if (oril==0) {
- if (maxTSpan(N0)<minTSpan(fw0)) {// if N0 is monotone
- oril=1;
- block = blockTarget(maxTSpan(N0),minTSpan(fw0)-1);
- if (block[0] <= fw0 && fw0 <= block[1]) oril+=2;
- delete block;
- } else { //if (maxTSpan(fw0)<minTSpan(N0)) { // if NO is non-monotone
- oril=2;
- block = blockTarget(maxTSpan(fw0)+1,minTSpan(N0));
- if (block[0] <= fw0 && fw0 <= block[1]) oril+=2;
- delete block;
- }
- }
- } else {
- oril=5;
- }
- if (DEBUG) cerr << "oril = " << oril << endl;
- int* block = blockSource(N0,fw0);
- if (DEBUG) {
- for (int i=0; i<4; i++) cerr << "block[" << i << "]=" << block[i] << endl;
- }
- vector<int> ret;
- ret.push_back(oril); ret.push_back(block[0]); ret.push_back(block[1]);
- ret.push_back(block[2]); ret.push_back(block[3]);
- delete block;
- return ret;
-}
-
-vector<int> Alignment::OrientationSourceRight4Sampler(int fw) {
- return OrientationSourceRight4Sampler(fw,fw);
-}
-
-vector<int> Alignment::OrientationSourceRight4Sampler(int fw0, int fw1) {
- if (DEBUG) cerr << "OrientationSourceLeft4Sampler(" << fw0 << "," << fw1 << ")" << endl;
- int orir = 0;
- int N0=fw0-1;
- while (N0>=0) {
- if (minTSpan(N0)!=MINIMUM_INIT) break;
- N0--;
- }
- int N1=fw1+1;
- while (N1<_J) {
- if (minTSpan(N1)!=MINIMUM_INIT) break;
- N1++;
- }
- if (minTSpan(fw0)==MINIMUM_INIT && minTSpan(fw1)==MINIMUM_INIT) {
- fw0 = N1; fw1 = N0;
- }
- if (DEBUG) cerr << "fw0=" << fw0 << ", fw1=" << fw1 << ", N0=" << N0 << ", N1=" << N1 << endl;
- if (maxTSpan(N1)<minTSpan(fw1) || maxTSpan(fw1)<minTSpan(N1)) {
- int* block = blockTarget(minTSpan(N1),maxTSpan(N1));
- if (block[0]<=fw1 && fw1<=block[2]) orir=5;
- delete block;
- if (orir==0) {
- block = blockTarget(minTSpan(fw1),maxTSpan(fw1));
- if (block[0]<=N1 && N1 <=block[1]) orir=5;
- delete block;
- }
- if (DEBUG) cerr << "N1=" << minTSpan(N1) << "-" << maxTSpan(N1);
- if (DEBUG) cerr << "fw1=" << minTSpan(fw1) << "-" << maxTSpan(fw1) << endl;
- if (orir==0) {
- if (maxTSpan(fw1)<minTSpan(N1)) { // if N1 is monotone
- orir = 1;
- block = blockTarget(maxTSpan(fw1)+1,minTSpan(N1));
- if (block[0] <= fw1 && fw1 <= block[1]) orir+=2;
- delete block;
- } else {// if (maxTSpan(N1)<minTSpan(fw1)) { // if N1 is non-monotone
- orir = 2;
- block = blockTarget(maxTSpan(N1),minTSpan(fw1)-1);
- if (block[0] <= fw1 && fw1 <= block[1]) orir+=2;
- delete block;
- }
- }
- } else {
- orir = 5;
- }
- if (DEBUG) cerr << "orir = " << orir << endl;
- int* block = blockSource(fw1,N1);
- vector<int> ret;
- ret.push_back(orir); ret.push_back(block[0]); ret.push_back(block[1]);
- ret.push_back(block[2]); ret.push_back(block[3]);
- delete block;
- return ret;
-}
-
-void Alignment::OrientationSource(int fw0, int fw1, int* oril, int* orir, bool Lcompute, bool Rcompute) {
- // Orientation
- // A bit tricky since fw can be 1) unaligned 2) aligned to many
- // 1 -> MA, 2 -> RA, 3 -> MG, 4 -> RG, 5 -> Other
- if (DEBUG) cerr << "OrientationSource(" << fw0 << "," << fw1 << ")" << endl;
- if (!Lcompute && !Rcompute) return;
- curr_al.push_back(fw0);
- curr_al.push_back(fw1);
- *oril=0;
- *orir=0;
- int lr=0;
- if (oris_hash.find(curr_al)==oris_hash.end()) {
- // Find first aligned word N0 to the left of fw
- int N0=fw0-1;
- while (N0>=0) {
- if (minTSpan(N0)!=MINIMUM_INIT) break;
- N0--;
- }
- int N1=fw1+1;
- while (N1<_J) {
- if (minTSpan(N1)!=MINIMUM_INIT) break;
- N1++;
- }
- if (minTSpan(fw0)==MINIMUM_INIT && minTSpan(fw1)==MINIMUM_INIT) {
- fw0 = N1; fw1 = N0;
- //cerr << "minTSpan(fw)==MINIMUM_INIT, thus fw0=" << fw0 << ", fw1=" << fw1 << endl;
- }
- if (DEBUG) cerr << "fw0=" << fw0 << ", fw1=" << fw1 << ", N0=" << N0 << ", N1=" << N1 << endl;
- if (maxTSpan(N0)<minTSpan(fw0) || maxTSpan(fw0)<minTSpan(N0)) {
- if (DEBUG) cerr << "N0=" << minTSpan(N0) << "-" << maxTSpan(N0);
- if (DEBUG) cerr << "fw=" << minTSpan(fw0) << "-" << maxTSpan(fw0) << endl;
- int *block = blockTarget(minTSpan(N0),maxTSpan(N0));
- if (block[0]<=fw0 && fw0<=block[1]) *oril=5;
- delete block;
- if (*oril==0) {
- block = blockTarget(minTSpan(fw0),maxTSpan(fw0));
- if (block[0]<=N0 && N0<=block[1]) *oril=5;
- delete block;
- }
- if (*oril==0) {
- if (maxTSpan(N0)<minTSpan(fw0)) {// if N0 is monotone
- *oril=1;
- block = blockTarget(maxTSpan(N0),minTSpan(fw0)-1);
- if (block[0] <= fw0 && fw0 <= block[1]) *oril+=2;
- delete block;
- } else { //if (maxTSpan(fw0)<minTSpan(N0)) { // if NO is non-monotone
- *oril=2;
- block = blockTarget(maxTSpan(fw0)+1,minTSpan(N0));
- if (block[0] <= fw0 && fw0 <= block[1]) *oril+=2;
- delete block;
- }
- }
- } else {
- *oril=5;
- }
- if (DEBUG) cerr << "oril =" << *oril << endl;
- // Right neighbor
- if (maxTSpan(N1)<minTSpan(fw1) || maxTSpan(fw1)<minTSpan(N1)) {
- int* block = blockTarget(minTSpan(N1),maxTSpan(N1));
- if (block[0]<=fw1 && fw1<=block[2]) *orir=5;
- delete block;
- if (*orir==0) {
- block = blockTarget(minTSpan(fw1),maxTSpan(fw1));
- if (block[0]<=N1 && N1 <=block[1]) *orir=5;
- delete block;
- }
- if (DEBUG) cerr << "N1=" << minTSpan(N1) << "-" << maxTSpan(N1);
- if (DEBUG) cerr << "fw1=" << minTSpan(fw1) << "-" << maxTSpan(fw1) << endl;
- if (*orir==0) {
- if (maxTSpan(fw1)<minTSpan(N1)) { // if N1 is monotone
- *orir = 1;
- block = blockTarget(maxTSpan(fw1)+1,minTSpan(N1));
- if (block[0] <= fw1 && fw1 <= block[1]) *orir+=2;
- delete block;
- } else {// if (maxTSpan(N1)<minTSpan(fw1)) { // if N1 is non-monotone
- *orir = 2;
- block = blockTarget(maxTSpan(N1),minTSpan(fw1)-1);
- if (block[0] <= fw1 && fw1 <= block[1]) *orir+=2;
- delete block;
- }
- }
- } else {
- *orir = 5;
- }
- if (DEBUG) cerr << "orir =" << *orir << endl;
- lr = link(*oril,*orir);
- oris_hash.insert(pair<vector<int>,int>(curr_al,lr));
- } else {
- lr = oris_hash[curr_al];
- }
- if (DEBUG) cerr << "Lcompute=" << Lcompute << ", Rcompute=" << Rcompute << endl;
- if (Lcompute) *oril = source(lr);
- if (Rcompute) *orir = target(lr);
- curr_al.pop_back();
- curr_al.pop_back();
-}
-
-int Alignment::OrientationSource(int* left, int* right) {
- if (DEBUG) {
- cerr << " OrientationSource(";
- cerr << "left="<<left[0]<<","<<left[1]<<","<<left[2]<<","<<left[3];
- cerr << " right="<<right[0]<<","<<right[1]<<","<<right[2]<<","<<right[3];
- cerr << ")" << endl;
- }
- //if ((right[1]<=left[0]) return 5;
- if (!(left[1]<right[0])) return 5;
- int ori = 1;
- if (right[3]<left[2]) ori=2;
- int gapstart = left[3]+1; int gapend = right[2]-1;
- if (ori==2) { gapstart = right[3]+1; gapend = left[2]-1; }
- for (int j=gapstart; j<=gapend; j++) {
- if (sourceOf(j)!=-1) {
- ori+=2; break;
- }
- }
- return ori;
-}
-
-void Alignment::OrientationTarget(int fw, int *oril, int *orir, bool Lcompute, bool Rcompute) {
- OrientationTarget(fw,fw,oril,orir,Lcompute,Rcompute);
-}
-
-vector<int> Alignment::OrientationTargetLeft4Sampler(int fw) {
- return OrientationTargetLeft4Sampler(fw,fw);
-}
-
-vector<int> Alignment::OrientationTargetLeft4Sampler(int fw0, int fw1) {
- if (DEBUG) cerr << "OrientationTargetLeft4Sampler " << fw0 << "," << fw1 << endl;
- int oril=0;
- int N0=fw0-1;
- while (N0>=0) {
- if (minSSpan(N0)!=MINIMUM_INIT) break;
- N0--;
- }
- int N1=fw1+1;
- while (N1<_I) {
- if (minSSpan(N1)!=MINIMUM_INIT) break;
- N1++;
- }
- if (minSSpan(fw0)==MINIMUM_INIT && minSSpan(fw1)==MINIMUM_INIT) {
- fw0=N1; fw1=N0;
- }
- if (maxSSpan(N0)<minSSpan(fw0) || maxSSpan(fw0)<minSSpan(N0)) {
- int *block = blockSource(minSSpan(N0),maxSSpan(N0));
- if (DEBUG) cerr << "block1[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2]<=fw0 && fw0<=block[3]) //source span of fw0 subsumes NO's or the other way around
- oril=5;
- delete block;
- if (oril==0) {
- block = blockSource(minSSpan(fw0), maxSSpan(fw0));
- if (DEBUG) cerr << "block2[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2] <= N0 && N0 <= block[3]) oril=5;
- delete block;
- }
- if (oril==0) {
- if (maxSSpan(N0)<minSSpan(fw0)) {// if N0 is monotone
- oril=1;
- block = blockSource(maxSSpan(N0),minSSpan(fw0)-1);
- if (DEBUG) cerr << "block3[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2] <= fw0 && fw0 <= block[3]) oril+=2;
- delete block;
- } else { // (maxSSpan(fw0)<minSSpan(N0)) // if NO is non-monotone
- oril=2;
- block = blockSource(maxSSpan(fw0)+1,minSSpan(N0));
- if (DEBUG) cerr << "block4[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2] <= fw0 && fw0 <= block[3]) oril+=2;
- delete block;
- }
- }
- } else { //source span of fw0 subsumes NO's or the other way around
- oril=5;
- }
- if (DEBUG) cerr << "oril = " << oril << endl;
- int* block = blockSource(N0,fw0);
- vector<int> ret;
- ret.push_back(oril); ret.push_back(block[0]); ret.push_back(block[1]);
- ret.push_back(block[2]); ret.push_back(block[3]);
- delete block;
- return ret;
-}
-
-vector<int> Alignment::OrientationTargetRight4Sampler(int fw) {
- return OrientationTargetRight4Sampler(fw,fw);
-}
-
-vector<int> Alignment::OrientationTargetRight4Sampler(int fw0, int fw1) {
- if (DEBUG) cerr << "OrientationTargetRight4Sampler " << fw0 << "," << fw1 << endl;
- int orir=0;
- int N0=fw0-1;
- while (N0>=0) {
- if (minSSpan(N0)!=MINIMUM_INIT) break;
- N0--;
- }
- int N1=fw1+1;
- while (N1<_I) {
- if (minSSpan(N1)!=MINIMUM_INIT) break;
- N1++;
- }
- if (minSSpan(fw0)==MINIMUM_INIT && minSSpan(fw1)==MINIMUM_INIT) {
- fw0=N1; fw1=N0;
- }
- if (maxSSpan(N1)<minSSpan(fw1) || maxSSpan(fw1)<minSSpan(N1)) {
- int *block = blockSource(minSSpan(N1),maxSSpan(N1));
- if (block[2]<=fw1 && fw1<=block[3]) orir=5;
- delete block;
- if (orir==0) {
- block = blockSource(minSSpan(fw1),maxSSpan(fw1));
- if (block[2] <= N1 && N1 <= block[3]) orir=5;
- delete block;
- }
- if (orir==0) {
- if (maxSSpan(fw1)<minSSpan(N1)) { // if N1 is monotone
- orir=1;
- block = blockSource(maxSSpan(fw1)+1,minSSpan(N1));
- if (block[2] <= fw1 && fw1 <= block[3]) orir+=2;
- delete block;
- } else { //if (maxSSpan(N1)<minSSpan(fw1)) { // if N1 is non-monotone
- orir=2;
- block = blockSource(maxSSpan(N1),minSSpan(fw1)-1);
- if (block[2] <= fw1 && fw1 <= block[3]) orir+=2;
- delete block;
- }
- }
- } else {
- orir=5;
- }
- if (DEBUG) cerr << "orir = " << orir << endl;
- int* block = blockSource(fw1,N1);
- vector<int> ret;
- ret.push_back(orir); ret.push_back(block[0]); ret.push_back(block[1]);
- ret.push_back(block[2]); ret.push_back(block[3]);
- delete block;
- return ret;
-
-}
-
-void Alignment::OrientationTarget(int fw0, int fw1, int*oril, int*orir, bool Lcompute, bool Rcompute) {
- if (DEBUG) cerr << "OrientationTarget " << fw0 << "," << fw1 << endl;
- // Left Neighbor
- if (!Lcompute && !Rcompute) return;
- *oril=0;
- *orir=0;
- curr_al.push_back(fw0);
- curr_al.push_back(fw1);
- int lr = 0;
- if (orit_hash.find(curr_al)==orit_hash.end()) {
- // Find first aligned word N0 to the left of fw
- //int fw0 = fw; int fw1 = fw;
- int N0=fw0-1;
- while (N0>=0) {
- if (minSSpan(N0)!=MINIMUM_INIT) break;
- N0--;
- }
- int N1=fw1+1;
- while (N1<_I) {
- if (minSSpan(N1)!=MINIMUM_INIT) break;
- N1++;
- }
- if (minSSpan(fw0)==MINIMUM_INIT && minSSpan(fw1)==MINIMUM_INIT) {
- fw0=N1; fw1=N0;
- }
- if (DEBUG) {
- cerr << "fw0:" << fw0 << ", fw1:" << fw1 << ", N0:" << N0 << ", N1:" << N1 << endl ;
- cerr << "minSSpan(N0)=" << minSSpan(N0) << " maxSSpan(N0)=" << maxSSpan(N0);
- cerr << " minSSpan(fw0)="<< minSSpan(fw0) << " maxSSpan(fw0)=" << maxSSpan(fw0) << endl;
- cerr << "minSSpan(fw1)=" << minSSpan(fw1) << " maxSSpan(fw1)=" << maxSSpan(fw1);
- cerr << " minSSpan(N1)="<< minSSpan(N1) << " maxSSpan(N1)=" << maxSSpan(N1) << endl;
- }
- if (maxSSpan(N0)<minSSpan(fw0) || maxSSpan(fw0)<minSSpan(N0)) {
- int *block = blockSource(minSSpan(N0),maxSSpan(N0));
- if (DEBUG) cerr << "block1[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2]<=fw0 && fw0<=block[3]) //source span of fw0 subsumes NO's or the other way around
- *oril=5;
- delete block;
- if (*oril==0) {
- block = blockSource(minSSpan(fw0), maxSSpan(fw0));
- if (DEBUG) cerr << "block2[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2] <= N0 && N0 <= block[3]) *oril=5;
- delete block;
- }
- if (*oril==0) {
- if (maxSSpan(N0)<minSSpan(fw0)) {// if N0 is monotone
- *oril=1;
- block = blockSource(maxSSpan(N0),minSSpan(fw0)-1);
- if (DEBUG) cerr << "block3[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2] <= fw0 && fw0 <= block[3]) *oril+=2;
- delete block;
- } else { // (maxSSpan(fw0)<minSSpan(N0)) // if NO is non-monotone
- *oril=2;
- block = blockSource(maxSSpan(fw0)+1,minSSpan(N0));
- if (DEBUG) cerr << "block4[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
- if (block[2] <= fw0 && fw0 <= block[3]) *oril+=2;
- delete block;
- }
- }
- } else { //source span of fw0 subsumes NO's or the other way around
- *oril=5;
- }
- if (DEBUG) cerr << "oril = " << *oril << endl;
- // Right Neighbor
- if (maxSSpan(N1)<minSSpan(fw1) || maxSSpan(fw1)<minSSpan(N1)) {
- int *block = blockSource(minSSpan(N1),maxSSpan(N1));
- if (block[2]<=fw1 && fw1<=block[3]) *orir=5;
- delete block;
- if (*orir==0) {
- block = blockSource(minSSpan(fw1),maxSSpan(fw1));
- if (block[2] <= N1 && N1 <= block[3]) *orir=5;
- delete block;
- }
- if (*orir==0) {
- if (maxSSpan(fw1)<minSSpan(N1)) { // if N1 is monotone
- *orir=1;
- block = blockSource(maxSSpan(fw1)+1,minSSpan(N1));
- if (block[2] <= fw1 && fw1 <= block[3]) *orir+=2;
- delete block;
- } else { //if (maxSSpan(N1)<minSSpan(fw1)) { // if N1 is non-monotone
- *orir=2;
- block = blockSource(maxSSpan(N1),minSSpan(fw1)-1);
- if (block[2] <= fw1 && fw1 <= block[3]) *orir+=2;
- delete block;
- }
- }
- } else {
- *orir=5;
- }
- if (DEBUG) cerr << "orir = " << *orir << endl;
- lr = link(*oril,*orir);
- orit_hash.insert(pair<vector<int>,int>(curr_al,lr));
- } else {
- lr = orit_hash[curr_al];
- }
- if (DEBUG) cerr << "Lcompute=" << Lcompute << ", Rcompute=" << Rcompute << endl;
- if (DEBUG) cerr << "lr=" << lr << ", l=" << source(lr) << ", r=" << target(lr) << endl;
- if (Lcompute>0) *oril=source(lr);
- if (Rcompute>0) *orir=target(lr);
- curr_al.pop_back();
- curr_al.pop_back();
-}
-
-int* Alignment::blockSource(int idx1, int idx2) {
-// Grows the source range [idx1,idx2] (idx1 <= idx2) into a minimal block
-// {s1,s2,t1,t2} by alternately widening the target range with _tSpan and the
-// source range with _sSpan until the target range stops changing.
-// Returns a freshly allocated int[4]; the caller is responsible for freeing it.
-// If no position in the range contributes a span, t1/t2 keep their
-// MINIMUM_INIT / MAXIMUM_INIT initial values.
-  if (DEBUG) cerr << "blockSource[" << idx1 << "," << idx2 << "]" << endl;
-  int* blk = new int[4];
-  blk[0] = idx1;
-  blk[1] = idx2;
-  blk[2] = MINIMUM_INIT;
-  blk[3] = MAXIMUM_INIT;
-  // target range covered by the initial source range
-  for (int s = idx1; s <= idx2; ++s) {
-    blk[2] = least(blk[2], _tSpan[s][0]);
-    blk[3] = most(blk[3], _tSpan[s][1]);
-  }
-  // candidate source range implied by that target range
-  int srcLo = blk[0];
-  int srcHi = blk[1];
-  for (int t = blk[2]; t <= blk[3]; ++t) {
-    srcLo = least(srcLo, _sSpan[t][0]);
-    srcHi = most(srcHi, _sSpan[t][1]);
-  }
-  // target range that has already been folded into srcLo/srcHi
-  int tgtLo = blk[2];
-  int tgtHi = blk[3];
-  for (;;) {
-    // absorb the newly added source positions on both sides
-    for (int s = srcLo; s < blk[0]; ++s) {
-      blk[2] = least(blk[2], _tSpan[s][0]);
-      blk[3] = most(blk[3], _tSpan[s][1]);
-    }
-    for (int s = blk[1] + 1; s <= srcHi; ++s) {
-      blk[2] = least(blk[2], _tSpan[s][0]);
-      blk[3] = most(blk[3], _tSpan[s][1]);
-    }
-    blk[0] = srcLo;
-    blk[1] = srcHi;
-    if (blk[2] == tgtLo && blk[3] == tgtHi) break;  // fixed point reached
-    // absorb the newly added target positions on both sides
-    for (int t = blk[2]; t < tgtLo; ++t) {
-      srcLo = least(srcLo, _sSpan[t][0]);
-      srcHi = most(srcHi, _sSpan[t][1]);
-    }
-    for (int t = tgtHi + 1; t <= blk[3]; ++t) {
-      srcLo = least(srcLo, _sSpan[t][0]);
-      srcHi = most(srcHi, _sSpan[t][1]);
-    }
-    tgtLo = blk[2];
-    tgtHi = blk[3];
-  }
-  return blk;
-}
-
-int* Alignment::blockTarget(int idx1, int idx2) {
-// Target-side counterpart of blockSource: grows the target range [idx1,idx2]
-// (idx1 <= idx2) into a minimal block {s1,s2,t1,t2}, alternating between
-// _sSpan and _tSpan until the source range stops changing.
-// Returns a freshly allocated int[4]; the caller is responsible for freeing it.
-  int* blk = new int[4];
-  blk[0] = MINIMUM_INIT;
-  blk[1] = MAXIMUM_INIT;
-  blk[2] = idx1;
-  blk[3] = idx2;
-  // source range covered by the initial target range
-  for (int t = idx1; t <= idx2; ++t) {
-    blk[0] = least(blk[0], _sSpan[t][0]);
-    blk[1] = most(blk[1], _sSpan[t][1]);
-  }
-  // candidate target range implied by that source range
-  int tgtLo = blk[2];
-  int tgtHi = blk[3];
-  for (int s = blk[0]; s <= blk[1]; ++s) {
-    tgtLo = least(tgtLo, _tSpan[s][0]);
-    tgtHi = most(tgtHi, _tSpan[s][1]);
-  }
-  // source range that has already been folded into tgtLo/tgtHi
-  int srcLo = blk[0];
-  int srcHi = blk[1];
-  for (;;) {
-    // absorb the newly added target positions on both sides
-    for (int t = tgtLo; t < blk[2]; ++t) {
-      blk[0] = least(blk[0], _sSpan[t][0]);
-      blk[1] = most(blk[1], _sSpan[t][1]);
-    }
-    for (int t = blk[3] + 1; t <= tgtHi; ++t) {
-      blk[0] = least(blk[0], _sSpan[t][0]);
-      blk[1] = most(blk[1], _sSpan[t][1]);
-    }
-    blk[2] = tgtLo;
-    blk[3] = tgtHi;
-    if (blk[0] == srcLo && blk[1] == srcHi) break;  // fixed point reached
-    // absorb the newly added source positions on both sides
-    for (int s = blk[0]; s < srcLo; ++s) {
-      tgtLo = least(tgtLo, _tSpan[s][0]);
-      tgtHi = most(tgtHi, _tSpan[s][1]);
-    }
-    for (int s = srcHi + 1; s <= blk[1]; ++s) {
-      tgtLo = least(tgtLo, _tSpan[s][0]);
-      tgtHi = most(tgtHi, _tSpan[s][1]);
-    }
-    srcLo = blk[0];
-    srcHi = blk[1];
-  }
-  return blk;
-}
-
-int Alignment::firstSourceAligned(int start) {
-// Scans source positions start.._J-1 upwards and returns the first one whose
-// _tSpan lower bound differs from MINIMUM_INIT; returns -1 if there is none.
-  int pos = start;
-  while (pos < _J) {
-    if (_tSpan[pos][0] != MINIMUM_INIT) return pos;
-    ++pos;
-  }
-  return -1;
-}
-
-int Alignment::lastSourceAligned(int end) {
-// Scans source positions end..0 downwards and returns the first one whose
-// _tSpan lower bound differs from MINIMUM_INIT; returns -1 if there is none.
-  int pos = end;
-  while (pos >= 0) {
-    if (_tSpan[pos][0] != MINIMUM_INIT) return pos;
-    --pos;
-  }
-  return -1;
-}
-
-int Alignment::firstTargetAligned(int start) {
-// Scans target positions start.._I-1 upwards and returns the first one whose
-// _sSpan lower bound differs from MINIMUM_INIT; returns -1 if there is none.
-  int pos = start;
-  while (pos < _I) {
-    if (_sSpan[pos][0] != MINIMUM_INIT) return pos;
-    ++pos;
-  }
-  return -1;
-}
-
-int Alignment::lastTargetAligned(int end) {
-// Scans target positions end..0 downwards and returns the first one whose
-// _sSpan lower bound differs from MINIMUM_INIT; returns -1 if there is none.
-  int pos = end;
-  while (pos >= 0) {
-    if (_sSpan[pos][0] != MINIMUM_INIT) return pos;
-    --pos;
-  }
-  return -1;
-}
-
-void Alignment::BorderingSFWsOnly() {
-// Removes the record of all source function word alignments except those at
-// the borders. More than two records may survive: when the leftmost /
-// rightmost function words are unaligned, records are kept up to the first
-// aligned source word (fas) and from the last aligned source word (las).
-//
-// SourceFWIdxs layout: [0] holds the number of records; record j (1-based)
-// occupies slots 3*j-2 (position), 3*j-1 and 3*j.
-  const int total = SourceFWIdxs[0];
-  if (total <= 2) return;
-  // last record of the left border: positions not beyond fas
-  int firstCut = 1;
-  for (int j=2; j<=total; j++) {
-    if (SourceFWIdxs[3*j-2]>fas) break;
-    firstCut=j;
-  }
-  // first record of the right border: positions not before las.
-  // Records are 1-based, so the scan must stop at j==1; the previous bound
-  // (j>=0) read SourceFWIdxs[-2] when every record was at or after las.
-  int lastCut = total;
-  for (int j=total-1; j>=1; j--) {
-    if (SourceFWIdxs[3*j-2]<las) break;
-    lastCut=j;
-  }
-  if (firstCut>=lastCut) return;  // borders touch or overlap: nothing to drop
-  // slide the right border down so it directly follows the left border
-  int delta = 0;
-  for (int j=lastCut; j<=total; j++) {
-    delta++;
-    SourceFWIdxs[3*(firstCut+delta)-2]=SourceFWIdxs[3*j-2];
-    SourceFWIdxs[3*(firstCut+delta)-1]=SourceFWIdxs[3*j-1];
-    SourceFWIdxs[3*(firstCut+delta)]  =SourceFWIdxs[3*j];
-  }
-  SourceFWIdxs[0]=firstCut+delta;
-}
-
-void Alignment::BorderingTFWsOnly() {
-// Similar to BorderingSFWsOnly() except this looks at the target side:
-// keeps only the target function word records up to the first aligned target
-// word (fat) and from the last aligned target word (lat).
-//
-// TargetFWIdxs layout: [0] holds the number of records; record j (1-based)
-// occupies slots 3*j-2 (position), 3*j-1 and 3*j.
-  const int total = TargetFWIdxs[0];
-  if (total <= 2) return;
-  // last record of the left border: positions not beyond fat
-  int firstCut = 1;
-  for (int j=2; j<=total; j++) {
-    if (TargetFWIdxs[3*j-2]>fat) break;
-    firstCut=j;
-  }
-  // first record of the right border: positions not before lat.
-  // Records are 1-based, so the scan must stop at j==1; the previous bound
-  // (j>=0) read TargetFWIdxs[-2] when every record was at or after lat.
-  int lastCut = total;
-  for (int j=total-1; j>=1; j--) {
-    if (TargetFWIdxs[3*j-2]<lat) break;
-    lastCut=j;
-  }
-  if (firstCut>=lastCut) return;  // borders touch or overlap: nothing to drop
-  // slide the right border down so it directly follows the left border
-  int delta = 0;
-  for (int j=lastCut; j<=total; j++) {
-    delta++;
-    TargetFWIdxs[3*(firstCut+delta)-2]=TargetFWIdxs[3*j-2];
-    TargetFWIdxs[3*(firstCut+delta)-1]=TargetFWIdxs[3*j-1];
-    TargetFWIdxs[3*(firstCut+delta)]  =TargetFWIdxs[3*j];
-  }
-  TargetFWIdxs[0]=firstCut+delta;
-}
-
-void Alignment::FillFWIdxsState(int* state, int fas, int las, int fat, int lat) {
-// Fills state[0..11] with up to four function word records of three ints each:
-//   state[0..2]  left source border     state[3..5]   right source border
-//   state[6..8]  left target border     state[9..11]  right target border
-// Unused slots are left at -1. When a side has more than two records, several
-// records may be merged into one compound record whose words are joined with
-// single spaces and interned via TD::Convert; the compound is marked by
-// negating the source id (source side) or the target id (target side).
-// fas/las/fat/lat are the first/last aligned source/target positions; note
-// that these parameters shadow the members of the same name.
-  if (DEBUG) cerr << "FillFWIdxsState ("<< fas <<","<< las<<"," << fat <<"," << lat << ")" << endl;
-  if (fas==las) las+=1;
-  if (fat==lat) lat+=1;
-  for (int idx=0; idx<12; idx++) state[idx]=-1;
-  if (SourceFWIdxs[0]<=2) {
-    // at most two records: copy them verbatim
-    if (SourceFWIdxs[0]>=1) {state[0]=SourceFWIdxs[1]; state[1]=SourceFWIdxs[2]; state[2]=SourceFWIdxs[3];}
-    if (SourceFWIdxs[0]==2) {state[3]=SourceFWIdxs[4]; state[4]=SourceFWIdxs[5]; state[5]=SourceFWIdxs[6];}
-  } else {
-    // left source border
-    if (SourceFWIdxs[1]>fas) {
-      state[0]=SourceFWIdxs[1]; state[1]=SourceFWIdxs[2]; state[2]=SourceFWIdxs[3];
-    } else {
-      ostringstream issf; ostringstream isse;
-      for (int idx=1; idx<=SourceFWIdxs[0]; idx++) {
-        if (SourceFWIdxs[3*idx-2]>las) break;
-        if (idx>1) { issf << " "; isse << " "; }
-        issf << TD::Convert(SourceFWIdxs[3*idx-1]);
-        isse << TD::Convert(SourceFWIdxs[3*idx]);
-        state[0]=SourceFWIdxs[3*idx-2];
-        if (state[0]>=fas) break;
-      }
-      if (state[0]>=0) {
-        state[1]=TD::Convert(issf.str())*-1; state[2]=TD::Convert(isse.str()); //multiplying source with -1 as marker
-      }
-    }
-    // right source border
-    if (SourceFWIdxs[SourceFWIdxs[0]*3-2]==las) {
-      state[3]=SourceFWIdxs[SourceFWIdxs[0]*3-2];
-      state[4]=SourceFWIdxs[SourceFWIdxs[0]*3-1];
-      state[5]=SourceFWIdxs[SourceFWIdxs[0]*3];
-    } else {
-      int lastCut = SourceFWIdxs[0];
-      for (int j=lastCut-1; j>=state[0]+1; j--) {
-        if (SourceFWIdxs[3*j-2]==state[0]) break;
-        if (SourceFWIdxs[3*j-2]<las) break;
-        lastCut=j;
-      }
-      state[3]=SourceFWIdxs[3*lastCut-2];
-      ostringstream issf; ostringstream isse;
-      for (int idx=lastCut; idx<=SourceFWIdxs[0]; idx++) {
-        if (idx>lastCut) { issf << " "; isse << " "; }
-        issf << TD::Convert(SourceFWIdxs[3*idx-1]);
-        isse << TD::Convert(SourceFWIdxs[3*idx]);
-      }
-      if (state[3]>=0) {
-        //multiplying source with -1 as compound marker
-        state[4]=TD::Convert(issf.str())*-1; state[5]=TD::Convert(isse.str());
-      }
-    }
-  }
-  if (TargetFWIdxs[0]<=2) {
-    // at most two records: copy them verbatim
-    if (TargetFWIdxs[0]>=1) {state[6]=TargetFWIdxs[1]; state[7]=TargetFWIdxs[2]; state[8]=TargetFWIdxs[3];}
-    if (TargetFWIdxs[0]==2) {state[9]=TargetFWIdxs[4]; state[10]=TargetFWIdxs[5]; state[11]=TargetFWIdxs[6];}
-  } else {
-    // left target border
-    if (TargetFWIdxs[1]>fat) { //shouldn't come here if SetTargetBorderingFW is invoked
-      state[6]=TargetFWIdxs[1]; state[7]=TargetFWIdxs[2]; state[8]=TargetFWIdxs[3];
-    } else {
-      ostringstream issf; ostringstream isse;
-      for (int idx=1; idx<=TargetFWIdxs[0]; idx++) {
-        if (TargetFWIdxs[3*idx-2]>fat) break;
-        if (idx>1) { issf << " "; isse << " "; }
-        issf << TD::Convert(TargetFWIdxs[3*idx-1]);
-        isse << TD::Convert(TargetFWIdxs[3*idx]);
-        state[6]=TargetFWIdxs[3*idx-2];
-      }
-      state[7]=TD::Convert(issf.str()); state[8]=TD::Convert(isse.str())*-1;
-      //multiplying target with -1 as compound marker
-    }
-    // right target border
-    if (TargetFWIdxs[TargetFWIdxs[0]*3-2]==lat) {
-      state[9]=TargetFWIdxs[TargetFWIdxs[0]*3-2];
-      state[10]=TargetFWIdxs[TargetFWIdxs[0]*3-1];
-      state[11]=TargetFWIdxs[TargetFWIdxs[0]*3];
-    } else {
-      int lastCut = TargetFWIdxs[0];
-      for (int j=lastCut-1; j>=1; j--) {
-        // NOTE(review): state[9] is still -1 here, unlike the source side,
-        // which compares against the left border (state[0]) -- confirm intent.
-        if (TargetFWIdxs[3*j-2]<=state[9]) break;
-        if (TargetFWIdxs[3*j-2]<lat) break;
-        lastCut=j;
-      }
-      state[9]=TargetFWIdxs[3*lastCut-2];
-      ostringstream issf; ostringstream isse;
-      for (int idx=lastCut; idx<=TargetFWIdxs[0]; idx++) {
-        // Both separators belong under the guard. Previously the braces were
-        // missing, so isse received a separator on every iteration (including
-        // a leading one), unlike the three parallel loops in this function.
-        if (idx>lastCut) { issf << " "; isse << " "; }
-        issf << TD::Convert(TargetFWIdxs[3*idx-1]);
-        isse << TD::Convert(TargetFWIdxs[3*idx]);
-      }
-      state[10]=TD::Convert(issf.str()); state[11]=TD::Convert(isse.str())*-1;
-    }
-  }
-}
-
-void Alignment::simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings) {
-// given a *block* {s1,s2,t1,t2}, see whether its target span contains any index inside *danglings*.
-// if yes, break it; otherwise, keep it. put the result(s) to *blocks*
-//
-// Ownership: when *danglings* is empty, *block* itself is appended to *blocks*.
-// Otherwise only newly allocated int[4] arrays are appended and *block* is
-// neither stored nor freed here, so it remains the caller's to release.
-  if (DEBUG) cerr << "simplifyBackward[" << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << "]" << endl;
-  const int numDanglings = static_cast<int>(danglings.size());
-  if (DEBUG) for (int i=0; i<numDanglings; i++) cerr << "danglings[" << i << "] = " << danglings[i] << endl;
-  if (numDanglings==0) {
-    blocks->push_back(block);
-    if (DEBUG) cerr << "pushing(0) " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
-    return;
-  }
-  int currIdx = block[2];
-  int i_dangling = 0;
-  // skip danglings that lie before the block's target span
-  while (block[2]>danglings[i_dangling]) {
-    if (i_dangling+1 >= numDanglings) break;
-    i_dangling++;
-  }
-  // skip danglings sitting exactly at the current start; the bound check keeps
-  // the scan from reading past the end of danglings
-  while (i_dangling<numDanglings && danglings[i_dangling]==currIdx) {
-    i_dangling++;
-    currIdx++;
-  }
-  if (DEBUG) cerr << "i_dangling = " << i_dangling << endl;
-  // the next dangling acts as anchor only if another dangling follows it and
-  // it is not beyond the block; otherwise anchor just past the block's end
-  int anchorIdx = block[3]+1;
-  if (i_dangling+1<numDanglings && danglings[i_dangling]<=block[3]+1) anchorIdx=danglings[i_dangling];
-  if (DEBUG) cerr << "anchorIdx = " << anchorIdx << ", currIdx = " << currIdx << endl;
-  do {
-    while(currIdx<anchorIdx) {
-      if (DEBUG) cerr << "currIdx = " << currIdx << ", anchorIdx = " << anchorIdx << endl;
-      bool isMoved = false;
-      // try the widest target range [currIdx,idx] first and shrink until the
-      // minimal block around it does not extend beyond the range
-      for (int idx=anchorIdx-1; idx>=currIdx; idx--) {
-        int *nublock = blockTarget(currIdx,idx);
-        if (nublock[2]==currIdx && nublock[3]==idx) {
-          if (nublock[0]!=MINIMUM_INIT) {
-            blocks->push_back(nublock);
-            if (DEBUG) cerr << "pushing(3) " << nublock[0] << "," << nublock[1] << "," << nublock[2] << "," << nublock[3] << endl;
-          } else {
-            delete[] nublock;  // allocated with new int[4]
-          }
-          isMoved = true;
-          currIdx=idx+1; break;
-        } else {
-          delete[] nublock;  // allocated with new int[4]
-        }
-      }
-      if (DEBUG) cerr << "isMoved=" << isMoved << ", currIdx=" << currIdx << endl;
-      if (!isMoved) {
-        // no consistent range starts here: emit one single-link block per
-        // source position reported by sourceOf that is not left of the block
-        int source = sourceOf(currIdx);
-        while (source>=0) {
-          if (source >= block[0]) {
-            int* nublock = new int[4];
-            nublock[0]=source; nublock[1]=source; nublock[2]=currIdx; nublock[3]=currIdx;
-            blocks->push_back(nublock);
-            if (DEBUG) cerr << "pushing(4) " << nublock[0] << "," << nublock[1] << "," << nublock[2] << "," << nublock[3] << endl;
-          }
-          source = sourceOf(currIdx,source+1);
-        }
-        currIdx++;
-      }
-    }
-    // step over the anchor and pick the next one
-    currIdx=anchorIdx+1;
-    anchorIdx=block[3]+1;
-    if (i_dangling+1<numDanglings) anchorIdx=danglings[++i_dangling];
-  } while(currIdx<=block[3]);
-}
-
-void Alignment::simplify(int* ret) {
-  // the idea is to create blocks of maximal consistent alignment in between a pair of function words
-  // exceptional cases include: one to non-contiguous many (or vice versa) -> treat this as one alignment each
-  // record all function word alignments first, important because it may be unaligned
-  //
-  // Output layout of *ret*: ret[0..11] is filled by FillFWIdxsState, ret[12]
-  // holds the number of simplified links and ret[13..] the links themselves.
-  // Results are memoised in simplify_hash, keyed by curr_al extended with the
-  // function word positions. As a side effect the positions stored in
-  // SourceFWIdxs / TargetFWIdxs are rewritten to simplified block indices.
-  if (DEBUG) cerr << "begin simplify" << endl;
-  reset(0,0); reset(_J-1,_I-1); // remove the phrase boundary alignments, NEED TO CHECK AGAIN !!!
-  if (SourceFWIdxs[0]+TargetFWIdxs[0]==0) { // return singleton
-    if (DEBUG) cerr << "no function words" << endl;
-    for (int idx=0; idx<12; idx++) ret[idx]=-1;
-    ret[12]=1; ret[13]=0; ret[14]=0; // 0-0
-    FillFWIdxsState(ret,0,0,0,0);
-    return;
-  }
-  // build the cache key: [#links, links..., #srcFW, srcFW positions..., #tgtFW, tgtFW positions...]
-  curr_al.insert(curr_al.begin(),curr_al.size());
-  curr_al.push_back(SourceFWIdxs[0]);
-  for (int i=1; i<=SourceFWIdxs[0]; i++) curr_al.push_back(SourceFWIdxs[3*i-2]);
-  curr_al.push_back(TargetFWIdxs[0]);
-  for (int i=1; i<=TargetFWIdxs[0]; i++) curr_al.push_back(TargetFWIdxs[3*i-2]);
-  vector<int> el;
-  if (simplify_hash.find(curr_al)==simplify_hash.end()) {
-    if (DEBUG) {
-      cerr << "SourceFWIdxs:" << SourceFWIdxs[0] << endl;
-      for (int i=1; i<=SourceFWIdxs[0]; i++)
-        cerr << SourceFWIdxs[3*i-2] << "," << SourceFWIdxs[3*i-1] << "," << SourceFWIdxs[3*i] << endl;
-      cerr << "TargetFWIdxs:" << TargetFWIdxs[0] << endl;
-      for (int i=1; i<=TargetFWIdxs[0]; i++) {
-        cerr << TargetFWIdxs[3*i-2] << "," << TargetFWIdxs[3*i-1] << "," << TargetFWIdxs[3*i] << endl;
-      }
-    }
-
-    // each element is a heap-allocated int[4] {s1,s2,t1,t2}; every entry is
-    // owned by this vector and released once the links have been extracted
-    vector< int* > blocks;
-    int currIdx = 1; // start from 1 to avoid considering phrase start
-    std::set<int> FWIdxs;
-    std::vector<int> DanglingTargetFWIdxs;
-    for (int i=1; i<= SourceFWIdxs[0]; i++) FWIdxs.insert(SourceFWIdxs[3*i-2]);
-    for (int i=1; i<= TargetFWIdxs[0]; i++) {
-      int source = sourceOf(TargetFWIdxs[3*i-2]);
-      if (source>=0) {
-        // aligned target function word: every source it links to becomes an anchor
-        do {
-          FWIdxs.insert(source);
-          source = sourceOf(TargetFWIdxs[3*i-2],source+1);
-        } while(source >=0);
-      } else {
-        // unaligned target function word: record it as a dangling block
-        int *block = new int[4];
-        block[0]=-1; block[1]=-1; block[2]=TargetFWIdxs[3*i-2]; block[3]=TargetFWIdxs[3*i-2];
-        blocks.push_back(block);
-        if (DEBUG) cerr << "pushing[1] " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
-        DanglingTargetFWIdxs.push_back(TargetFWIdxs[3*i-2]);
-      }
-    }
-    if (DEBUG)
-      for (std::set<int>::const_iterator iter=FWIdxs.begin(); iter!=FWIdxs.end(); iter++) {
-        cerr << "FWIdxs=" << *iter << endl;
-      }
-    // simplifyBackward stores the block it is given only when the dangling
-    // list is empty; in every other case the block stays ours and is freed
-    // right after the call.
-    const bool blockIsKept = DanglingTargetFWIdxs.empty();
-    std::set<int>::const_iterator currFWIdx = FWIdxs.begin();
-    if (currFWIdx == FWIdxs.end()) {
-      int* block = new int[4];
-      block[0]=1; block[1]=_J-2; block[2]=1; block[3]=_I-2; // no need to consider phrase boundaries
-      simplifyBackward(&blocks,block,DanglingTargetFWIdxs);
-      if (!blockIsKept) delete[] block;
-    } else {
-      int anchorIdx = *currFWIdx; // also used to denote _J+1
-      do {
-        // add alignments whose source from currIdx to currFWIdx-1
-        while (currIdx<anchorIdx) {
-          bool isMoved = false;
-          //cerr << "anchorIdx = " << anchorIdx << ", currIdx = " << currIdx << endl;
-          for (int idx=anchorIdx-1; idx>=currIdx; idx--) {
-            int* block = blockSource(currIdx,idx);
-            if (block[0]==currIdx&&block[1]==idx) {
-              if (block[2]!=MINIMUM_INIT) { // must be aligned
-                simplifyBackward(&blocks,block,DanglingTargetFWIdxs);
-                if (!blockIsKept) delete[] block;
-              } else {
-                delete[] block;  // allocated with new int[4]
-              }
-              currIdx = idx+1; isMoved = true;
-              break;
-            } else {
-              delete[] block;  // allocated with new int[4]
-            }
-          }
-          if (!isMoved) {
-            // no consistent range starts here: one single-link block per target
-            int target = targetOf(currIdx);
-            while (target>=0) {
-              int* block = new int[4];
-              block[0]=currIdx; block[1]=currIdx; block[2]=target; block[3]=target;
-              blocks.push_back(block);
-              if (DEBUG) cerr << "pushing[2] " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
-              target = targetOf(currIdx,target+1);
-            }
-            currIdx++;
-          }
-        }
-        // add function word alignments (anchorIdx)
-        if (anchorIdx==getJ()) break;
-        int target = targetOf(anchorIdx);
-        do {
-          int* block = new int[4];
-          block[0]=anchorIdx; block[1]=anchorIdx; block[2]=target; block[3]=target;
-          blocks.push_back(block);
-          if (DEBUG) cerr << "pushing[3] " << block[0] << "," << block[1] << "," << block[2] << "," << block[3] << endl;
-          if (target>=0) target = targetOf(anchorIdx,target+1);
-        } while (target>=0);
-        // advance indexes
-        currIdx = anchorIdx+1;
-        anchorIdx = getJ()-1; // was minus 2
-        if (++currFWIdx!=FWIdxs.end()) anchorIdx = *currFWIdx;
-      } while (currIdx<=getJ()-2);
-    }
-
-
-    // mark every position that starts a block, then renumber the marked
-    // positions consecutively from 1
-    vector<int> source_block_mapper(getJ(),-1);
-    vector<int> target_block_mapper(getI(),-1);
-    for (size_t i = 0; i<blocks.size(); i++) {
-      if (DEBUG) cerr << "blocks[" << i << "]=" << blocks[i][0] << "," << blocks[i][1] << "," << blocks[i][2] << "," << blocks[i][3] << endl;
-      if (blocks[i][0]>=0) source_block_mapper[blocks[i][0]]=1;
-      if (blocks[i][2]>=0) target_block_mapper[blocks[i][2]]=1;
-    }
-    int curr = 1;
-    int prev = -1;
-    for (size_t idx=0; idx<source_block_mapper.size(); idx++) {
-      if (source_block_mapper[idx]>0) {
-        source_block_mapper[idx]=curr++;
-        prev = curr;
-      } else {
-        source_block_mapper[idx]=prev;
-      }
-    }
-    curr = 1;
-    // NOTE(review): prev is not reset to -1 here, so leading unmarked target
-    // positions inherit the last value from the source pass -- confirm intent.
-    for (size_t idx=0; idx<target_block_mapper.size(); idx++) {
-      if (target_block_mapper[idx]>0) {
-        target_block_mapper[idx]=curr++;
-        prev = curr;
-      } else {
-        target_block_mapper[idx]=prev;
-      }
-    }
-
-    //assert(blocks.size()<=50);
-    if (DEBUG) cerr << "resulting alignment:" << endl;
-    for (size_t i = 0; i<blocks.size(); i++) {
-      if (blocks[i][2]<0 || blocks[i][0]<0) continue;
-      int source = source_block_mapper[blocks[i][0]]-1;
-      int target = target_block_mapper[blocks[i][2]]-1;
-      el.push_back(link(source,target));
-      if (DEBUG) cerr << source << "-" << target << " ";
-    }
-    // the blocks are no longer needed; release them (previously leaked)
-    for (size_t i = 0; i<blocks.size(); i++) delete[] blocks[i];
-    blocks.clear();
-    // el layout: [#links, links..., #srcFW, srcFW..., #tgtFW, tgtFW..., fas, las, fat, lat]
-    el.insert(el.begin(),el.size());
-    if (DEBUG) cerr << endl;
-    el.push_back(SourceFWIdxs[0]);
-    for (int idx=1; idx<=SourceFWIdxs[0]; idx++) {
-      if (DEBUG) cerr << "SourceFWIdxs[" << (3*idx-2) << "] from " << SourceFWIdxs[3*idx-2] << endl;
-      el.push_back(source_block_mapper[SourceFWIdxs[3*idx-2]]-1);
-    }
-    el.push_back(TargetFWIdxs[0]);
-    for (int idx=1; idx<=TargetFWIdxs[0]; idx++) {
-      if (DEBUG) cerr << "TargetFWIdxs[" << (3*idx-2) << "] from " << TargetFWIdxs[3*idx-2] << endl;
-      el.push_back(target_block_mapper[TargetFWIdxs[3*idx-2]]-1);
-    }
-    el.push_back(source_block_mapper[fas]-1);
-    el.push_back(source_block_mapper[las]-1);
-    el.push_back(target_block_mapper[fat]-1);
-    el.push_back(target_block_mapper[lat]-1);
-    if (DEBUG) {
-      cerr << "insert key:el = ";
-      for (size_t ii=0; ii<el.size(); ii++)
-        cerr << ii << "." << el[ii] << " ";
-      cerr << " || " << endl;
-    }
-    if (DEBUG) cerr << "trying to insertL " << endl;
-    if (DEBUG) {
-      cerr << "size=" << curr_al.size() << " ";
-      for (size_t ii=0; ii<curr_al.size(); ii++) cerr << "curr_al[" << ii << "]=" << curr_al[ii] << " ";
-      cerr << endl;
-    }
-    simplify_hash.insert(pair<vector<int>, vector<int> > (curr_al,el));
-    if (DEBUG) cerr << "inserted" << endl;
-  } else {
-    el = simplify_hash[curr_al];
-  }
-  if (DEBUG) {
-    cerr << "pull key:el = ";
-    for (size_t ii=0; ii<el.size(); ii++)
-      cerr << ii << "." << el[ii] << " ";
-    cerr << endl;
-  }
-  // unpack el into ret and overwrite the function word positions with their
-  // simplified block indices
-  ret[12] = el[0];
-  for (int i=1; i<=el[0]; i++) ret[12+i] = el[i];
-  int istart = el[0]+1;
-  assert(el[istart]==SourceFWIdxs[0]);
-  for (int i=1; i<=el[istart]; i++) SourceFWIdxs[3*i-2]=el[istart+i];
-  istart += el[istart]+1;
-  assert(el[istart]==TargetFWIdxs[0]);
-  for (int i=1; i<=el[istart]; i++) TargetFWIdxs[3*i-2]=el[istart+i];
-  istart += el[istart]+1;
-  FillFWIdxsState(ret,el[istart],el[istart+1],el[istart+2],el[istart+3]);
-}
-
-void Alignment::simplify_nofw(int* ret) {
-// Writes the result used when function words are ignored: the twelve state
-// slots ret[0..11] are set to -1, ret[12] (link count) to 1 and ret[13] to 0.
-  int slot = 0;
-  while (slot < 12) {
-    ret[slot] = -1;
-    ++slot;
-  }
-  ret[12] = 1;
-  ret[13] = 0;
-}
-
-void Alignment::sort(int* num) {
-// Sorts the records of a function word index array in place. num[0] holds the
-// record count; arrays with fewer than two records are left untouched.
-  const int count = num[0];
-  if (count <= 1) return;
-  quickSort(num, 1, count);
-}
-
-void Alignment::quickSort(int arr[], int left, int right) {
-// In-place quicksort over 1-based records of three ints each: record r
-// occupies arr[3*r-2 .. 3*r] and is ordered by its first slot (arr[3*r-2]).
-// Sorts records left..right inclusive, pivoting on the middle record's key.
-  int lo = left;
-  int hi = right;
-  const int pivot = arr[3*((left + right) / 2)-2];
-
-  /* partition */
-  while (lo <= hi) {
-    while (arr[3*lo-2] < pivot) lo++;
-    while (arr[3*hi-2] > pivot) hi--;
-    if (lo > hi) continue;  // pointers crossed; the loop condition ends the scan
-    // exchange the two whole records, slot by slot
-    for (int k = 0; k < 3; k++) {
-      const int held = arr[3*lo-2+k];
-      arr[3*lo-2+k] = arr[3*hi-2+k];
-      arr[3*hi-2+k] = held;
-    }
-    lo++;
-    hi--;
-  }
-
-  /* recursion */
-  if (left < hi) quickSort(arr, left, hi);
-  if (lo < right) quickSort(arr, lo, right);
-}
-
-double Alignment::ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2) {
-// Returns a smoothed estimate for orientation *ori* read from *table*.
-// Each count row stores the orientation count at [offset+ori-1] and the total
-// at [offset+5]. The estimate starts from the table-wide row (table.ultimate)
-// and is refined with increasingly specific rows from table.model, each level
-// interpolated with the previous estimate using weight alpha = 0.1:
-//   level1: source word     level2: "source target"
-//   level3: "source/idx target" (only when table.mode == 1)
-// Refinement stops at the first level whose row is missing or has a negative
-// total. In mode 1, cond1 is expected in the form "word/idx"; an idx of "X"
-// yields 0 immediately.
-  string source = TD::Convert(cond1);
-  string sourceidx;
-  const bool positional = (table.mode == 1);
-  if (positional) {
-    sourceidx = source;
-    int slashidx = sourceidx.find_last_of("/");
-    source = sourceidx.substr(0,slashidx);
-    string idx = sourceidx.substr(slashidx+1);
-    if (DEBUG) cerr << " sourceidx = " << sourceidx << ", idx = " << idx << endl;
-    if (idx == "X") {
-      if (DEBUG) cerr << " idx == X, returning 0" << endl;
-      return 0;
-    }
-  }
-  string target = TD::Convert(cond2);
-  if (DEBUG) cerr << "sourceidx='" << sourceidx << "', source='" << source << "', target='" << target << "'" << endl;
-  const double alpha = 0.1;
-
-  // level 0: table-wide distribution
-  double count = table.ultimate[offset+ori-1];
-  double total = table.ultimate[offset+5];
-  double prob = count/total;
-  if (DEBUG) cerr << "level0 " << count << "/" << total << "=" << prob << endl;
-
-  // level 1: conditioned on the source word
-  WordID key_id = positional ? TD::Convert(source) : cond1;
-  map<WordID,int*>::const_iterator it = table.model.find(key_id);
-  if (it==table.model.end() || it->second[offset+5]<0) return prob;
-  count = it->second[offset+ori-1] + alpha * prob;
-  total = it->second[offset+5] + alpha;
-  prob = count/total;
-  if (DEBUG) cerr << "level1 " << count << "/" << total << "=" << prob << endl;
-
-  // level 2: conditioned on the source and target words
-  string key = source + " " + target;
-  it = table.model.find(TD::Convert(key));
-  if (it==table.model.end() || it->second[offset+5]<0) return prob;
-  count = it->second[offset+ori-1] + alpha * prob;
-  total = it->second[offset+5] + alpha;
-  prob = count/total;
-  if (DEBUG) cerr << "level2 " << count << "/" << total << "=" << prob << endl;
-
-  if (!positional) return prob;
-
-  // level 3: conditioned on the position-tagged source and the target
-  key = sourceidx + " " + target;
-  it = table.model.find(TD::Convert(key));
-  if (it!=table.model.end() && it->second[offset+5]>=0) {
-    count = it->second[offset+ori-1] + alpha * prob;
-    total = it->second[offset+5] + alpha;
-    prob = count/total;
-    if (DEBUG) cerr << "level3 " << count << "/" << total << "=" << prob << endl;
-  }
-
-  return prob;
-}
-
-void Alignment::ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2,
-    bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus,
-    double alpha1, double beta1) {
-// Scores one orientation event and adds it to *bonus (when isBonus) or *cost.
-// In table mode 0 the log of the estimate is accumulated, otherwise the raw
-// estimate. bo1, bo1_bonus, bo2 and bo2_bonus are not touched here; alpha1 and
-// beta1 are only reported in the debug trace.
-  if (DEBUG) cerr << "ScoreOrientation:" << TD::Convert(cond1) << "," << TD::Convert(cond2) << ", alpha1 = " << alpha1 << ", beta1 = " << beta1 << endl;
-  const double estimate = ScoreOrientation(table,offset,ori,cond1,cond2);
-  double* accumulator = isBonus ? bonus : cost;
-  if (table.mode == 0) {
-    *accumulator += log(estimate);
-  } else {
-    *accumulator += estimate;
-  }
-}
-
-double Alignment::ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond2) {
-// Left-neighbour orientation score, read at table offset 0.
-// Returns the log of the estimate in table mode 0, the raw estimate otherwise.
-  const double estimate = ScoreOrientation(table,0,ori,cond1,cond2);
-  return (table.mode == 0) ? log(estimate) : estimate;
-}
-
-double Alignment::ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond2) {
-// Backward left-neighbour orientation score, read at table offset 12.
-// Returns the log of the estimate in table mode 0, the raw estimate otherwise.
-  const double estimate = ScoreOrientation(table,12,ori,cond1,cond2);
-  return (table.mode == 0) ? log(estimate) : estimate;
-}
-
-double Alignment::ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2) {
-// Right-neighbour orientation score, read at table offset 6.
-// Returns the log of the estimate in table mode 0, the raw estimate otherwise.
-  const double estimate = ScoreOrientation(table,6,ori,cond1,cond2);
-  return (table.mode == 0) ? log(estimate) : estimate;
-}
-
-double Alignment::ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2) {
-// Backward right-neighbour orientation score, read at table offset 18.
-// Returns the log of the estimate in table mode 0, the raw estimate otherwise.
-  const double estimate = ScoreOrientation(table,18,ori,cond1,cond2);
-  return (table.mode == 0) ? log(estimate) : estimate;
-}
-
-void Alignment::ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond2,
-    bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) {
-// Accumulating variant for the left neighbour: forwards every argument to the
-// accumulating ScoreOrientation with table offset 0.
-  if (DEBUG) cerr << "ScoreOrientationLeft(" << isBonus << ")" << endl;
-  const int kOffset = 0;
-  ScoreOrientation(table, kOffset, ori, cond1, cond2,
-                   isBonus, cost, bonus, bo1, bo1_bonus, bo2, bo2_bonus,
-                   alpha1, beta1);
-}
-
-void Alignment::ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond2,
-    bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) {
-// Accumulating variant for the backward left neighbour: forwards every
-// argument to the accumulating ScoreOrientation with table offset 12.
-  if (DEBUG) cerr << "ScoreOrientationLeftBackward" << endl;
-  const int kOffset = 12;
-  ScoreOrientation(table, kOffset, ori, cond1, cond2,
-                   isBonus, cost, bonus, bo1, bo1_bonus, bo2, bo2_bonus,
-                   alpha1, beta1);
-}
-
-void Alignment::ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2,
-    bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) {
-// Accumulating variant for the right neighbour: forwards every argument to
-// the accumulating ScoreOrientation with table offset 6.
-  if (DEBUG) cerr << "ScoreOrientationRight(" << isBonus << ")" << endl;
-  const int kOffset = 6;
-  ScoreOrientation(table, kOffset, ori, cond1, cond2,
-                   isBonus, cost, bonus, bo1, bo1_bonus, bo2, bo2_bonus,
-                   alpha1, beta1);
-}
-
-void Alignment::ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2,
-    bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, double alpha1, double beta1) {
-// Accumulating variant for the backward right neighbour: forwards every
-// argument to the accumulating ScoreOrientation with table offset 18.
-  if (DEBUG) cerr << "ScoreOrientationRightBackward" << endl;
-  const int kOffset = 18;
-  ScoreOrientation(table, kOffset, ori, cond1, cond2,
-                   isBonus, cost, bonus, bo1, bo1_bonus, bo2, bo2_bonus,
-                   alpha1, beta1);
-}
-
-void Alignment::computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus,
-    double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2) {
-// Scores the backward left/right orientation of every source function word,
-// first those recorded for the rule (SourceFWRuleIdxs) and then those recorded
-// for each antecedent (SourceFWAntsIdxs), accumulating into *cost / *bonus.
-// Lookup keys are position-tagged: "word/k" when the word's absolute index k
-// is within maxdepth1, and "word/-m" when its distance m from maxfwidx
-// (m = maxfwidx-k+1) is within maxdepth2. Words matching neither are skipped;
-// words matching both are scored once per key.
-  if (DEBUG) cerr << "computeOrientationSourceBackward" << endl;
-  int oril, orir;
-  for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) {
-    if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl;
-    if (!(SourceFWRuleAbsIdxs[idx]<=maxdepth1 || maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2)) continue;
-    int* fwblock = blockSource(SourceFWRuleIdxs[3*idx-2],SourceFWRuleIdxs[3*idx-2]);
-    bool aligned = (fwblock[2]!=MINIMUM_INIT);
-    if (aligned) {
-      OrientationTarget(fwblock[2],fwblock[3],&oril,&orir);
-    } else {
-      OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir);
-    }
-    if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl;
-    bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word
-    if ((aligned && fwblock[2]<=fat)||
-        (!aligned && SourceFWRuleIdxs[3*idx-2]<=fas)) isBonus=true;
-    if (SourceFWRuleAbsIdxs[idx]<=maxdepth1) {
-      ostringstream nusource;
-      nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << SourceFWRuleAbsIdxs[idx];
-      ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWRuleIdxs[3*idx],
-          isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-    }
-    if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2) {
-      ostringstream nusource;
-      nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1);
-      ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWRuleIdxs[3*idx],
-          isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-    }
-    isBonus = false;
-    if ((aligned && lat<=fwblock[3])||
-        (!aligned && las<=SourceFWRuleIdxs[3*idx-2])) isBonus=true;
-    // The right-side calls previously built the position-tagged key and then
-    // passed the bare word id, leaving the key unused; they now pass the tagged
-    // key like the left-side calls above and the antecedent loop below.
-    if (SourceFWRuleAbsIdxs[idx]<=maxdepth1) {
-      ostringstream nusource;
-      nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << SourceFWRuleAbsIdxs[idx];
-      ScoreOrientationRightBackward(table,orir,TD::Convert(nusource.str()),SourceFWRuleIdxs[3*idx],
-          isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-    }
-    if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2) {
-      ostringstream nusource;
-      nusource << TD::Convert(SourceFWRuleIdxs[3*idx-1]) << "/" << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1);
-      ScoreOrientationRightBackward(table,orir,TD::Convert(nusource.str()),SourceFWRuleIdxs[3*idx],
-          isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-    }
-    delete[] fwblock;  // allocated with new int[4] by blockSource
-  }
-  for (int i_ant=0; i_ant<_Arity; i_ant++) {
-    // antfas -> first aligned source word antecedent-wise
-    // antlas -> last aligned source word antecedent-wise
-    int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]);
-    int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]);
-    int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]);
-    int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]);
-    assert(antfat <= antlat);
-    assert(antfas <= antlas);
-    for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) {
-      if (DEBUG)
-        cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl;
-      if (!(SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1 || maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2)) continue;
-      int* fwblock = blockSource(SourceFWAntsIdxs[i_ant][3*idx-2],SourceFWAntsIdxs[i_ant][3*idx-2]);
-      //bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT);
-      bool aligned = (fwblock[2]!=MINIMUM_INIT);
-      bool Lcompute = true; bool Rcompute = true;
-      if (DEBUG) {
-        cerr << " aligned = " << aligned << endl;
-        cerr << " fwblock = " << fwblock[0] << "," << fwblock[1] << "," << fwblock[2] << "," << fwblock[3] << endl;
-        cerr << " antfas=" << antfas << ", antlas=" << antlas << ", antfat=" << antfat << ", antlat=" << antlat << endl;
-      }
-      // a side is skipped when the word lies strictly inside the antecedent's
-      // aligned span on that side
-      if (aligned) {
-        if (DEBUG) cerr << "laligned" << endl;
-        if (antfat<fwblock[2]) {
-          if (DEBUG) cerr << antfat << "<" << fwblock[2] << endl;
-          Lcompute=false;
-        }
-      } else {
-        if (DEBUG) cerr << "!laligned" << endl;
-        if (antfas<fwblock[0] && fwblock[1] < antlas) Lcompute=false;
-      }
-      if (aligned) {
-        if (DEBUG) cerr << "raligned" << endl;
-        if (fwblock[3]<antlat) {
-          if (DEBUG) cerr << fwblock[3] << "<" << antlat << endl;
-          Rcompute=false;
-        }
-      } else {
-        if (DEBUG) cerr << "!raligned" << endl;
-        // NOTE(review): both operands are the same test; the left-side check
-        // above uses antfas<fwblock[0] as its first operand -- confirm intent.
-        if (fwblock[1]<antlas && fwblock[1] < antlas) Rcompute=false;
-      }
-      if (!Lcompute && !Rcompute) {
-        delete[] fwblock;  // previously leaked on this early exit
-        continue;
-      }
-      if (!aligned) {
-        OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute,Rcompute);
-      } else {
-        OrientationTarget(fwblock[2],fwblock[3],&oril,&orir,Lcompute,Rcompute);
-      }
-      if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl;
-      bool isBonus = false;
-      if (Lcompute) {
-        if ((aligned && fwblock[3]<=fat) ||
-            (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas)) isBonus = true;
-        if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1) {
-          ostringstream nusource;
-          nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << SourceFWAntsAbsIdxs[i_ant][idx];
-          ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx],
-              isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-        }
-        if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2) {
-          ostringstream nusource;
-          nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << (-1*(maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1));
-          ScoreOrientationLeftBackward(table,oril,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx],
-              isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-        }
-      }
-      isBonus = false;
-      if (Rcompute) {
-        if ((aligned && lat<=fwblock[2]) ||
-            (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]))isBonus = true;
-        if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1) {
-          ostringstream nusource;
-          nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << SourceFWAntsAbsIdxs[i_ant][idx];
-          ScoreOrientationRightBackward(table,orir,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx],
-              isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-        }
-        if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2) {
-          ostringstream nusource;
-          nusource << TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]) << "/" << (-1*(maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1));
-          ScoreOrientationRightBackward(table,orir,TD::Convert(nusource.str()),SourceFWAntsIdxs[i_ant][3*idx],
-              isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
-        }
-      }
-      delete[] fwblock;  // allocated with new int[4] by blockSource
-    }
-  }
-}
-
-
-void Alignment::computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) {
- if (DEBUG) cerr << "computeOrientationSourceBackward" << endl;
- int oril, orir;
- for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl;
- int* fwblock = blockSource(SourceFWRuleIdxs[3*idx-2],SourceFWRuleIdxs[3*idx-2]);
- bool aligned = (fwblock[2]!=MINIMUM_INIT);
- if (aligned) {
- OrientationTarget(fwblock[2],fwblock[3],&oril,&orir);
- } else {
- OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir);
- }
- if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl;
- bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word
- if ((aligned && fwblock[2]<=fat)||
- (!aligned && SourceFWRuleIdxs[3*idx-2]<=fas)) isBonus=true;
- ScoreOrientationLeftBackward(table,oril,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- isBonus = false;
- if ((aligned && lat<=fwblock[3])||
- (!aligned && las<=SourceFWRuleIdxs[3*idx-2])) isBonus=true;
- ScoreOrientationRightBackward(table,orir,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- delete fwblock;
- }
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- // antfas -> first aligned source word antecedent-wise
- // antlas -> last aligned source word antecedent-wise
- int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]);
- int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]);
- int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]);
- int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]);
- assert(antfat <= antlat);
- assert(antfas <= antlas);
- for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG)
- cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl;
- int* fwblock = blockSource(SourceFWAntsIdxs[i_ant][3*idx-2],SourceFWAntsIdxs[i_ant][3*idx-2]);
- //bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT);
- bool aligned = (fwblock[2]!=MINIMUM_INIT);
- bool Lcompute = true; bool Rcompute = true;
- if (DEBUG) {
- cerr << " aligned = " << aligned << endl;
- cerr << " fwblock = " << fwblock[0] << "," << fwblock[1] << "," << fwblock[2] << "," << fwblock[3] << endl;
- cerr << " antfas=" << antfas << ", antlas=" << antlas << ", antfat=" << antfat << ", antlat=" << antlat << endl;
- }
- if (aligned) {
- if (DEBUG) cerr << "laligned" << endl;
- if (antfat<fwblock[2]) {
- if (DEBUG) cerr << antfat << "<" << fwblock[2] << endl;
- Lcompute=false;
- }
- } else {
- if (DEBUG) cerr << "!laligned" << endl;
- if (antfas<fwblock[0] && fwblock[1] < antlas) Lcompute=false;
- }
- if (aligned) {
- if (DEBUG) cerr << "raligned" << endl;
- if (fwblock[3]<antlat) {
- if (DEBUG) cerr << fwblock[3] << "<" << antlat << endl;
- Rcompute=false;
- }
- } else {
- if (DEBUG) cerr << "!raligned" << endl;
- if (fwblock[1]<antlas && fwblock[1] < antlas) Rcompute=false;
- }
- if (!Lcompute && !Rcompute) continue;
- if (!aligned) {
- OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute,Rcompute);
- } else {
- OrientationTarget(fwblock[2],fwblock[3],&oril,&orir,Lcompute,Rcompute);
- }
- if (DEBUG) cerr << "oril = " << oril << ", orir = " << orir << endl;
- bool isBonus = false;
- if (Lcompute) {
- if ((aligned && fwblock[3]<=fat) ||
- (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas)) isBonus = true;
- ScoreOrientationLeftBackward(table,oril,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- isBonus = false;
- if (Rcompute) {
- if ((aligned && lat<=fwblock[2]) ||
- (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]))isBonus = true;
- ScoreOrientationRightBackward(table,orir,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- delete fwblock;
- }
- }
-}
-
-void Alignment::computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2) {
- // This implementation is actually really bad, not reusing codes at all
- if (DEBUG) cerr << "computeOrientationSourcePos(maxfwidx=" << maxfwidx << ",maxdepth=" << maxdepth1 << "," << maxdepth2 << ")" << endl;
- if (maxdepth1+maxdepth2==0) return;
- int oril, orir;
- ostringstream oss;
- WordID sourceID;
- for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl;
- //if (!((SourceFWRuleAbsIdxs[idx]<=maxdepth1) || (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2))) continue;
- string source = TD::Convert(SourceFWRuleIdxs[3*idx-1]);
- OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir);
- bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word
- if (SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true;
- if (!isBonus) // this is unnecessary because fas <= las assertion
- if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true;
- if (maxdepth1>0) {
- oss << source << "/";
- if (SourceFWRuleAbsIdxs[idx]<=maxdepth1)
- oss << SourceFWRuleAbsIdxs[idx];
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationLeft(table,oril,sourceID,SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- if (maxdepth2>0) {
- oss << source << "/";
- if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1);
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationLeft(table,oril,sourceID,SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- isBonus = false;
- if (las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true;
- if (!isBonus) // this is unnecessary becuase fas <= las assertion
- if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true;
- if (maxdepth1>0) {
- oss << source << "/";
- if (SourceFWRuleAbsIdxs[idx]<=maxdepth1)
- oss << SourceFWRuleAbsIdxs[idx];
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationRight(table,orir,sourceID,SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- if (maxdepth2>0) {
- oss << source << "/";
- if (maxfwidx-SourceFWRuleAbsIdxs[idx]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWRuleAbsIdxs[idx]+1)*-1);
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationRight(table,orir,sourceID,SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
-
- }
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG)
- cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl;
- //if (!((SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1)||(maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2))) continue;
- // antfas -> first aligned source word antecedent-wise
- // antlas -> last aligned source word antecedent-wise
- int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]);
- int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]);
- if (DEBUG) cerr << " SourceFWAntsAbsIdxs[i_ant][3*idx-1]=" << SourceFWAntsAbsIdxs[i_ant][3*idx-1] << endl;
- string source = TD::Convert(SourceFWAntsIdxs[i_ant][3*idx-1]);
- assert(antfas <= antlas);
- bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT);
- bool Lcompute = true;bool Rcompute = true;
- if ((aligned && antfas<SourceFWAntsIdxs[i_ant][3*idx-2]) ||
- (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas))
- Lcompute=false;
- if ((aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<antlas) ||
- (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas))
- Rcompute=false;
- if (!Lcompute && !Rcompute) continue;
- OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute, Rcompute);
- bool isBonus = false;
- if (Lcompute) {
- if (SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus = true;
- //if (!isBonus) // this is unnecessary
- // if (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus=true;
- if (maxdepth1>0) {
- oss << source << "/";
- if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1)
- oss << SourceFWAntsAbsIdxs[i_ant][idx];
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationLeft(table,oril,sourceID,SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- if (maxdepth2>0) {
- oss << source << "/";
- if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1)*-1);
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationLeft(table,oril,sourceID,SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- }
- isBonus = false;
- if (Rcompute) {
- if (las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus = true;
- //if (!isBonus) // this is unnecessary
- // if (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus=true;
- if (maxdepth1>0) {
- oss << source << "/";
- if (SourceFWAntsAbsIdxs[i_ant][idx]<=maxdepth1)
- oss << SourceFWAntsAbsIdxs[i_ant][idx];
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationRight(table,orir,sourceID,SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- if (maxdepth2>0) {
- oss << source << "/";
- if (maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWAntsAbsIdxs[i_ant][idx]+1)*-1);
- else
- oss << "X";
- sourceID = TD::Convert(oss.str());
- oss.str("");
- ScoreOrientationRight(table,orir,sourceID,SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- }
- }
- }
-}
-
-void Alignment::computeOrientationSource(const CountTable& table, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) {
-// a bit complex due to imperfect state (TO DO!!!)
-// 1. there are cases where function word alignments come from antecedents, which orientation
-// (either its left or its right) has been computed earlier.
-// 2. some orientation will go as bonus
- if (DEBUG) cerr << "computeOrientationSource" << endl;
- int oril, orir;
- for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl;
- OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir);
- bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word
- if (SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true;
- if (!isBonus) // this is unnecessary because fas <= las assertion
- if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true;
- ScoreOrientationLeft(table,oril,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- isBonus = false;
- if (las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true;
- if (!isBonus) // this is unnecessary becuase fas <= las assertion
- if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true;
- ScoreOrientationRight(table,orir,SourceFWRuleIdxs[3*idx-1],SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG)
- cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl;
- // antfas -> first aligned source word antecedent-wise
- // antlas -> last aligned source word antecedent-wise
- int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]);
- int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]);
- assert(antfas <= antlas);
- bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT);
- bool Lcompute = true;bool Rcompute = true;
- if ((aligned && antfas<SourceFWAntsIdxs[i_ant][3*idx-2]) ||
- (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas))
- Lcompute=false;
- if ((aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<antlas) ||
- (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas))
- Rcompute=false;
- if (!Lcompute && !Rcompute) continue;
- OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute, Rcompute);
- bool isBonus = false;
- if (Lcompute) {
- if (SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus = true;
- //if (!isBonus) // this is unnecessary
- // if (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus=true;
- ScoreOrientationLeft(table,oril,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- isBonus = false;
- if (Rcompute) {
- if (las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus = true;
- //if (!isBonus) // this is unnecessary
- // if (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus=true;
- ScoreOrientationRight(table,orir,SourceFWAntsIdxs[i_ant][3*idx-1],SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- }
- }
-}
-
-void Alignment::computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags) {
- if (DEBUG) cerr << "computeOrientationSourceGen" << endl;
- int oril, orir;
- for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << "considering SourceFWRuleIdxs[" << idx << "]: " << SourceFWRuleIdxs[3*idx-2] << endl;
- OrientationSource(SourceFWRuleIdxs[3*idx-2],&oril,&orir);
- bool isBonus = false; // fas -> first aligned source word, las -> last aligned source word
- if (SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true;
- if (!isBonus) // this is unnecessary because fas <= las assertion
- if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true;
- ScoreOrientationLeft(table,oril,generalize(SourceFWRuleIdxs[3*idx-1],tags),SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- isBonus = false;
- if (las<=SourceFWRuleIdxs[3*idx-2]) isBonus=true;
- if (!isBonus) // this is unnecessary becuase fas <= las assertion
- if (minTSpan(SourceFWRuleIdxs[3*idx-2])==MINIMUM_INIT && SourceFWRuleIdxs[3*idx-2]<=fas) isBonus=true;
- ScoreOrientationRight(table,orir,generalize(SourceFWRuleIdxs[3*idx-1],tags),SourceFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG)
- cerr << "considering SourceFWAntsIdxs[" << i_ant << "][" << idx << "]: " << SourceFWAntsIdxs[i_ant][3*idx-2] << endl;
- // antfas -> first aligned source word antecedent-wise
- // antlas -> last aligned source word antecedent-wise
- int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]);
- int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]);
- assert(antfas <= antlas);
- bool aligned = (minTSpan(SourceFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT);
- bool Lcompute = true;bool Rcompute = true;
- if ((aligned && antfas<SourceFWAntsIdxs[i_ant][3*idx-2]) ||
- (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas))
- Lcompute=false;
- if ((aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<antlas) ||
- (!aligned && antfas < SourceFWAntsIdxs[i_ant][3*idx-2] && SourceFWAntsIdxs[i_ant][3*idx-2] < antlas))
- Rcompute=false;
- if (!Lcompute && !Rcompute) continue;
- OrientationSource(SourceFWAntsIdxs[i_ant][3*idx-2],&oril,&orir,Lcompute, Rcompute);
- bool isBonus = false;
- if (Lcompute) {
- if (SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus = true;
- //if (!isBonus) // this is unnecessary
- // if (!aligned && las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus=true;
- ScoreOrientationLeft(table,oril,generalize(SourceFWAntsIdxs[i_ant][3*idx-1],tags),SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- isBonus = false;
- if (Rcompute) {
- if (las<=SourceFWAntsIdxs[i_ant][3*idx-2]) isBonus = true;
- //if (!isBonus) // this is unnecessary
- // if (!aligned && SourceFWAntsIdxs[i_ant][3*idx-2]<=fas) isBonus=true;
- ScoreOrientationRight(table,orir,generalize(SourceFWAntsIdxs[i_ant][3*idx-1],tags),SourceFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_oris,beta_oris);
- }
- }
- }
-}
-void Alignment::computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) {
- if (DEBUG) cerr << "computeOrientationTarget" << endl;
- int oril, orir;
- for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << "considering TargetFWRuleIdxs[" << idx << "]: " << TargetFWRuleIdxs[3*idx-2] << endl;
- OrientationTarget(TargetFWRuleIdxs[3*idx-2],&oril,&orir);
- // the second and the third parameters of ScoreOrientationLeft must be e and f (not f and then e)
- bool isBonus = false;
- if (TargetFWRuleIdxs[3*idx-2]<=fat) isBonus = true;
- if (!isBonus)
- if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true;
- ScoreOrientationLeft(table,oril,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- isBonus = false;
- if (lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true;
- if (!isBonus)
- if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && TargetFWRuleIdxs[3*idx-2]<=fat) isBonus=true;
- ScoreOrientationRight(table,orir,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- }
-
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG) cerr << "considering TargetFWAntsIdxs[" << i_ant << "][" << idx << "]: " << TargetFWAntsIdxs[i_ant][3*idx-2] << endl;
- int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]);
- int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]);
- int aligned = (minSSpan( TargetFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT);
- bool Lcompute = true; bool Rcompute = true;
- if ((aligned && antfat<TargetFWAntsIdxs[i_ant][3*idx-2]) ||
- (!aligned && antfat < TargetFWAntsIdxs[i_ant][3*idx-2] && TargetFWAntsIdxs[i_ant][3*idx-2] < antlat))
- Lcompute=false;
- if ((aligned && TargetFWAntsIdxs[i_ant][3*idx-2]<antlat) ||
- (!aligned && antfat < TargetFWAntsIdxs[i_ant][3*idx-2] && TargetFWAntsIdxs[i_ant][3*idx-2] < antlat))
- Rcompute=false;
- if (!Lcompute && !Rcompute) continue;
- bool isBonus = false;
- OrientationTarget(TargetFWAntsIdxs[i_ant][3*idx-2],&oril,&orir, Lcompute, Rcompute);
- if (Lcompute) {
- if (TargetFWAntsIdxs[i_ant][3*idx-2]<=fat) isBonus=true;
- //if (!isBonus)
- // if (!aligned && lat<=TargetFWAntsIdxs[i_ant][3*idx-2]) isBonus=true;
- ScoreOrientationLeft(table,oril,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- }
- isBonus = false;
- if (Rcompute) {
- if (lat<=TargetFWAntsIdxs[i_ant][3*idx-2]) isBonus=true;
- if (!isBonus)
- //if (!aligned && TargetFWAntsIdxs[i_ant][3*idx-2]<=fat) isBonus=true;
- ScoreOrientationRight(table,orir,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- }
- }
- }
-}
-
-void Alignment::computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) {
- if (DEBUG) cerr << "computeOrientationTargetBackward" << endl;
- int oril, orir;
- for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << "considering TargetFWRuleIdxs[" << idx << "]: " << TargetFWRuleIdxs[3*idx-2] << endl;
- int* fwblock = blockSource(TargetFWRuleIdxs[3*idx-2],TargetFWRuleIdxs[3*idx-2]);
- bool aligned = (fwblock[0] == MINIMUM_INIT);
- if (aligned) {
- OrientationSource(fwblock[0],fwblock[1],&oril,&orir);
- } else {
- OrientationTarget(TargetFWRuleIdxs[3*idx-2],&oril,&orir);
- }
- delete fwblock;
- // the second and the third parameters of ScoreOrientationLeft must be e and f (not f and then e)
- bool isBonus = false;
- if (TargetFWRuleIdxs[3*idx-2]<=fat) isBonus = true;
- //if (!isBonus) // unnecessary
- //if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true;
- ScoreOrientationLeftBackward(table,oril,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- isBonus = false;
- if (lat<=TargetFWRuleIdxs[3*idx-2]) isBonus = true;
- //if (!isBonus) // unnecessary
- //if (minSSpan(TargetFWRuleIdxs[3*idx-2])==MINIMUM_INIT && TargetFWRuleIdxs[3*idx-2]<=fat) isBonus=true;
- ScoreOrientationRightBackward(table,orir,TargetFWRuleIdxs[3*idx-1],TargetFWRuleIdxs[3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- }
-
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- int antfat = firstTargetAligned(TargetAntsIdxs[i_ant][1]);
- int antlat = lastTargetAligned(TargetAntsIdxs[i_ant][TargetAntsIdxs[i_ant][0]]);
- int antfas = firstSourceAligned(SourceAntsIdxs[i_ant][1]);
- int antlas = lastSourceAligned(SourceAntsIdxs[i_ant][SourceAntsIdxs[i_ant][0]]);
- for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG) cerr << "considering TargetFWAntsIdxs[" << i_ant << "][" << idx << "]: " << TargetFWAntsIdxs[i_ant][3*idx-2] << endl;
- int* fwblock = blockTarget(TargetFWAntsIdxs[i_ant][3*idx-2],TargetFWAntsIdxs[i_ant][3*idx-2]);
- bool aligned = (fwblock[0]!=MINIMUM_INIT);
- //bool aligned = (minSSpan( TargetFWAntsIdxs[i_ant][3*idx-2])!=MINIMUM_INIT);
- bool Lcompute = true; bool Rcompute = true;
- if ((aligned && antfas<fwblock[0]) ||
- (!aligned && antfat < fwblock[2]))
- Lcompute=false;
- if ((aligned && fwblock[0]<antlas) ||
- (!aligned && fwblock[3] < antlat))
- Rcompute=false;
- if (!Lcompute && !Rcompute) continue;
- bool isBonus = false;
- if (aligned) {
- OrientationSource(fwblock[0],fwblock[1],&oril,&orir,Lcompute,Rcompute);
- } else {
- OrientationTarget(TargetFWAntsIdxs[i_ant][3*idx-2],&oril,&orir, Lcompute, Rcompute);
- }
- if (Lcompute) {
- if ((aligned && fwblock[1]<=fas) ||
- (!aligned && fwblock[3]<=fat))
- isBonus=true;
- //if (!isBonus)
- // if (!aligned && lat<=TargetFWAntsIdxs[i_ant][3*idx-2]) isBonus=true;
- ScoreOrientationLeftBackward(table,oril,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- }
- isBonus = false;
- if (Rcompute) {
- if ((aligned && las<=fwblock[0]) ||
- (!aligned && lat<=fwblock[2]))
- isBonus=true;
- if (!isBonus)
- //if (!aligned && TargetFWAntsIdxs[i_ant][3*idx-2]<=fat) isBonus=true;
- ScoreOrientationRightBackward(table,orir,TargetFWAntsIdxs[i_ant][3*idx-1],TargetFWAntsIdxs[i_ant][3*idx],
- isBonus,cost,bonus,bo1,bo1_bonus,bo2,bo2_bonus,alpha_orit,beta_orit);
- }
- delete fwblock;
- }
- }
-}
-
-bool Alignment::MemberOf(int* FWIdxs, int pos1, int pos2) {
- for (int idx=2; idx<=FWIdxs[0]; idx++) {
- if (FWIdxs[3*(idx-1)-2]==pos1 && FWIdxs[3*idx-2]==pos2) return true;
- }
- return false;
-}
-
-void Alignment::computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw,
- double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) {
- // no bonus yet
- if (DEBUG) cerr << "computeDominanceSource" << endl;
- if (DEBUG) cerr << " initial cost=" << *cost << ", initial bonus=" << *bonus << endl;
- for (int idx=2; idx<=SourceFWIdxs[0]; idx++) {
- if (DEBUG) {
- cerr << "PrevSourceFWIdxs :" << SourceFWIdxs[3*(idx-1)-2] << "," << SourceFWIdxs[3*(idx-1)-1]
- << "," << SourceFWIdxs[3*(idx-1)] << endl;
- cerr << "CurrSourceFWIdxs :" << SourceFWIdxs[3*(idx)-2] << "," << SourceFWIdxs[3*(idx)-1]
- << "," << SourceFWIdxs[3*(idx)] << endl;
- }
- bool compute = true;
- for (int i_ant=0; i_ant<_Arity && compute; i_ant++) {
- if (MemberOf(SourceFWAntsIdxs[i_ant],SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*(idx)-2])) {
- //cerr << "Skipping, they have been calculated in the " << (i_ant+1) << "-th branch" << endl;
- compute=false;
- }
- }
- if (compute) {
- int dom = DominanceSource(SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*idx-2]);
- if (DEBUG) cerr << "dom = " << dom << endl;
- ScoreDominance(table,dom,SourceFWIdxs[3*(idx-1)-1],SourceFWIdxs[3*idx-1],SourceFWIdxs[3*(idx-1)],SourceFWIdxs[3*idx],
- cost,bo1,bo2,false,alpha_doms,beta_doms);
- if (DEBUG) cerr << "cost now is " << *cost << endl;
- }
- }
- if (SourceFWIdxs[0]>0) {
- if (lfw>=0) {
- int dom = DominanceSource(0,SourceFWIdxs[1]);
- if (DEBUG) cerr << " --> lfw = " << lfw << "-" << TD::Convert(lfw) << endl;
- if (DEBUG) cerr << " --> rfw = " << rfw << "-" << TD::Convert(rfw) << endl;
- ScoreDominance(table,dom,lfw,SourceFWIdxs[2],lfw,SourceFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms);
- }
- if (rfw>=0) {
- int dom = DominanceSource(SourceFWIdxs[3*SourceFWIdxs[0]-2],_J-1);
- ScoreDominance(table,dom,SourceFWIdxs[3*SourceFWIdxs[0]-1],rfw,SourceFWIdxs[3*SourceFWIdxs[0]],
- rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms);
- }
- }
-}
-
-void Alignment::computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw,
- double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2) {
- if (DEBUG) cerr << "computeDominanceSourcePos" << endl;
- if (DEBUG) cerr << " initial cost=" << *cost << ", initial bonus=" << *bonus << endl;
- ostringstream oss;
- for (int idx=2; idx<=SourceFWIdxs[0]; idx++) {
- if (DEBUG) {
- cerr << "PrevSourceFWIdxs :" << SourceFWIdxs[3*(idx-1)-2] << "," << SourceFWIdxs[3*(idx-1)-1]
- << "," << SourceFWIdxs[3*(idx-1)] << endl;
- cerr << "CurrSourceFWIdxs :" << SourceFWIdxs[3*(idx)-2] << "," << SourceFWIdxs[3*(idx)-1]
- << "," << SourceFWIdxs[3*(idx)] << endl;
- }
- //if (!((SourceFWAbsIdxs[3*(idx-1)-2]<=maxdepth1 && SourceFWAbsIdxs[3*idx-2]<=maxdepth1) ||
- // (maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1<=maxdepth2 && maxfwidx-SourceFWAbsIdxs[3*idx-2]+1<=maxdepth2))) continue;
- bool compute = true;
- for (int i_ant=0; i_ant<_Arity && compute; i_ant++) {
- if (MemberOf(SourceFWAntsIdxs[i_ant],SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*(idx)-2])) {
- //cerr << "Skipping, they have been calculated in the " << (i_ant+1) << "-th branch" << endl;
- compute=false;
- }
- }
- if (compute) {
- int dom = DominanceSource(SourceFWIdxs[3*(idx-1)-2],SourceFWIdxs[3*idx-2]);
- if (DEBUG) cerr << "dom = " << dom << endl;
- if (maxdepth1+maxdepth2>0) {
- string source1 = TD::Convert(SourceFWIdxs[3*(idx-1)-1]);
- string source2 = TD::Convert(SourceFWIdxs[3*(idx)-1]);
- if (maxdepth1>0) {
- oss << source1 << "/";
- if (SourceFWAbsIdxs[3*(idx-1)-2]<=maxdepth1)
- oss << SourceFWAbsIdxs[3*(idx-1)-2];
- else
- oss << "X";
- WordID source1id = TD::Convert(oss.str());
- oss.str("");
- oss << source2 << "/";
- if (SourceFWAbsIdxs[3*idx-2]<=maxdepth1)
- oss << SourceFWAbsIdxs[3*idx-2];
- else
- oss << "X";
- WordID source2id = TD::Convert(oss.str());
- oss.str("");
- ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*(idx-1)],SourceFWIdxs[3*idx],
- cost,bo1,bo2,false,alpha_doms,beta_doms);
- }
- if (maxdepth2>0) {
- oss << source1 << "/";
- if (maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1)*-1);
- else
- oss << "X";
- WordID source1id = TD::Convert(oss.str());
- oss.str("");
- oss << source2 << "/";
- if (maxfwidx-SourceFWAbsIdxs[3*idx-2]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWAbsIdxs[3*(idx-1)-2]+1)*-1);
- else
- oss << "X";
- WordID source2id = TD::Convert(oss.str());
- oss.str("");
- ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*(idx-1)],SourceFWIdxs[3*idx],
- cost,bo1,bo2,false,alpha_doms,beta_doms);
- }
- }
- }
- }
- if (SourceFWIdxs[0]>0) {
- if (lfw>=0) {
- int dom = DominanceSource(0,SourceFWIdxs[1]);
- string source1 = TD::Convert(lfw);
- string source2 = TD::Convert(SourceFWIdxs[2]);
- if (maxdepth1>0) {
- oss << source1 << "/";
- if (SourceFWAbsIdxs[1]-1<=maxdepth1)
- oss << (SourceFWAbsIdxs[1]-1);
- else
- oss << "X";
- WordID source1id = TD::Convert(oss.str());
- oss.str("");
- oss << source2 << "/";
- if (SourceFWAbsIdxs[1]<=maxdepth1)
- oss << SourceFWAbsIdxs[1];
- else
- oss << "X";
- WordID source2id = TD::Convert(oss.str());
- oss.str("");
- ScoreDominance(table,dom,source1id,source2id,lfw,SourceFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms);
- }
- if (maxdepth2>0) {
- oss << source1 << "/";
- if (maxfwidx-(SourceFWAbsIdxs[1]-1)+1<=maxdepth2)
- oss << ((maxfwidx-(SourceFWAbsIdxs[1]-1)+1)*-1);
- else
- oss << "X";
- WordID source1id = TD::Convert(oss.str());
- oss.str("");
- oss << source2 << "/";
- if (maxfwidx-SourceFWAbsIdxs[1]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWAbsIdxs[1]+1)*-1);
- else
- oss << "X";
- WordID source2id = TD::Convert(oss.str());
- oss.str("");
- ScoreDominance(table,dom,source1id,source2id,lfw,SourceFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms);
- }
- }
- if (rfw>=0) {
- int dom = DominanceSource(SourceFWIdxs[3*SourceFWIdxs[0]-2],_J-1);
- string source1 = TD::Convert(SourceFWIdxs[3*SourceFWIdxs[0]-1]);
- string source2 = TD::Convert(rfw);
- if (maxdepth1>0) {
- oss << source1 << "/";
- if (SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]<=maxdepth1)
- oss << SourceFWAbsIdxs[3*SourceFWIdxs[0]-2];
- else
- oss << "X";
- WordID source1id = TD::Convert(oss.str());
- oss.str("");
- oss << source2 << "/";
- if (SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1<=maxdepth1)
- oss << (SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1);
- else
- oss << "X";
- WordID source2id = TD::Convert(oss.str());
- ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*SourceFWIdxs[0]],
- rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms);
- }
- if (maxdepth2>0) {
- oss << source1 << "/";
- if (maxfwidx-SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1<=maxdepth2)
- oss << ((maxfwidx-SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1)*-1);
- else
- oss << "X";
- WordID source1id = TD::Convert(oss.str());
- oss.str("");
- oss << source2 << "/";
- if (maxfwidx-(SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1)+1<=maxdepth2)
- oss << ((maxfwidx-(SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]+1)+1)*-1);
- else
- oss << "X";
- WordID source2id = TD::Convert(oss.str());
- oss.str("");
- ScoreDominance(table,dom,source1id,source2id,SourceFWIdxs[3*SourceFWIdxs[0]],
- rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_doms,beta_doms);
- }
- }
- }
-}
-
-
-// Accumulates target-side dominance scores for the function words in TargetFWIdxs.
-// TargetFWIdxs is read as 1-based triples: [0] is the entry count and entry k
-// occupies [3k-2] (position), [3k-1] and [3k] (the two word ids handed to ScoreDominance).
-// Adjacent pairs are scored into *cost unless MemberOf() reports that both positions
-// belong to the same antecedent. When lfw / rfw are non-negative, the first / last
-// function word is additionally scored against them into *bonus.
-void Alignment::computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw,
-    double *cost, double *bonus, double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus) {
-  if (DEBUG) cerr << "computeDominanceTarget" << endl;
-  for (int cur=2; cur<=TargetFWIdxs[0]; ++cur) {
-    const int prevBase = 3*(cur-1);  // last slot of the previous triple
-    const int currBase = 3*cur;      // last slot of the current triple
-    if (DEBUG) {
-      cerr << "PrevTargetFWIdxs :" << TargetFWIdxs[prevBase-2] << "," << TargetFWIdxs[prevBase-1] << "," << TargetFWIdxs[prevBase] << endl;
-      cerr << "CurrTargetFWIdxs :" << TargetFWIdxs[currBase-2] << "," << TargetFWIdxs[currBase-1] << "," << TargetFWIdxs[currBase] << endl;
-    }
-    // Find the first antecedent (if any) that already contains both positions.
-    int owner = -1;
-    for (int i_ant=0; i_ant<_Arity; ++i_ant) {
-      if (MemberOf(TargetFWAntsIdxs[i_ant],TargetFWIdxs[prevBase-2],TargetFWIdxs[currBase-2])) {
-        owner = i_ant;
-        break;
-      }
-    }
-    if (owner>=0) {
-      if (DEBUG) cerr << "Skipping, they have been calculated in the " << (owner+1) << "-th branch" << endl;
-      continue;
-    }
-    const int dom = DominanceTarget(TargetFWIdxs[prevBase-2],TargetFWIdxs[currBase-2]);
-    if (DEBUG) cerr << "dom target = " << dom << endl;
-    ScoreDominance(table,dom,TargetFWIdxs[prevBase],TargetFWIdxs[currBase],TargetFWIdxs[prevBase-1],TargetFWIdxs[currBase-1],
-                   cost,bo1,bo2,false,alpha_domt,beta_domt);
-  }
-
-  // Nothing to back off from when there is no function word at all.
-  if (TargetFWIdxs[0]<=0) return;
-
-  if (DEBUG) cerr << "backoff dominance " << endl;
-  if (lfw>=0) {
-    const int dom = DominanceTarget(0,TargetFWIdxs[1]);
-    if (DEBUG) cerr << "dom target (with left) = " << dom << endl;
-    ScoreDominance(table,dom,lfw,lfw,TargetFWIdxs[2],TargetFWIdxs[3],bonus,bo1_bonus,bo2_bonus,true,alpha_domt,beta_domt);
-  }
-  if (rfw>=0) {
-    const int lastBase = 3*TargetFWIdxs[0];
-    const int dom = DominanceTarget(TargetFWIdxs[lastBase-2],_I-1);
-    if (DEBUG) cerr << "dom target (with right) = " << dom << endl;
-    ScoreDominance(table,dom,TargetFWIdxs[lastBase-1],TargetFWIdxs[lastBase],
-                   rfw,rfw,bonus,bo1_bonus,bo2_bonus,true,alpha_domt,beta_domt);
-  }
-}
-
-// Returns a smoothed estimate for dominance outcome `dom`, backing off through
-// progressively more specific entries of `table`:
-//   level0: table.ultimate[dom] / table.ultimate[4]
-//   level1: entry keyed by "source1 source2"
-//   level2: entry keyed by "source1 source2 target1 target2"
-//   level3: (table.mode==1 only) one further lookup, see the note below
-// At each level the estimate is (entry[dom] + alpha*prev) / (entry[4] + alpha) with
-// alpha = 0.1. The walk stops at the first level whose entry is missing or whose
-// entry[4] is negative, and the estimate computed so far is returned.
-// In mode 1 the source ids are expected to carry a "/suffix", which is stripped
-// before the keys are built.
-double Alignment::ScoreDominance(const CountTable& table, int dom, WordID source1, WordID source2, WordID target1, WordID target2) {
-  if (DEBUG) {
-    cerr << "ScoreDominance(source1=" << TD::Convert(source1) << ",source2=" << TD::Convert(source2)
-         << ",target1=" << TD::Convert(target1) << ",target2=" << TD::Convert(target2) << ", dom=" << dom << endl;
-  }
-  string _source1 = TD::Convert(source1);
-  string _source2 = TD::Convert(source2);
-  string _source1idx; string _source2idx;
-  if (table.mode==1) {
-    _source1idx = _source1; _source2idx = _source2;
-    _source1 = _source1idx.substr(0,_source1idx.find_last_of("/"));
-    _source2 = _source2idx.substr(0,_source2idx.find_last_of("/"));
-  }
-  string _target1 = TD::Convert(target1);
-  string _target2 = TD::Convert(target2);
-
-  // level0: global prior.
-  double count = table.ultimate[dom];
-  double total = table.ultimate[4];
-  double prob = count/total;
-  if (DEBUG) cerr << "level0 " << count << "/" << total << "=" << prob << endl;
-  const double alpha = 0.1;
-
-  // level1: source pair only.
-  string key = _source1 + " " + _source2;
-  WordID key_id = TD::Convert(key);
-  map<WordID,int*>::const_iterator it = table.model.find(key_id);
-  if (it==table.model.end() || it->second[4]<0) return prob;
-  count = it->second[dom] + alpha*prob;
-  total = it->second[4] + alpha;
-  prob = count/total;
-  if (DEBUG) cerr << "level1 " << count << "/" << total << "=" << prob << endl;
-
-  // level2: source pair plus target pair.
-  key = _source1 + " " + _source2 + " " + _target1 + " " + _target2;
-  key_id = TD::Convert(key);
-  it = table.model.find(key_id);
-  if (it==table.model.end() || it->second[4]<0) return prob;
-  count = it->second[dom] + alpha*prob;
-  total = it->second[4] + alpha;
-  prob = count/total;
-  if (DEBUG) cerr << "level2 " << count << "/" << total << "=" << prob << endl;
-
-  if (table.mode!=1) return prob;
-
-  // level3 (mode 1 only).
-  // NOTE(review): this key is identical to the level2 key, and _source1idx/_source2idx
-  // are never used. Presumably the suffixed source ids were meant here -- confirm
-  // against the table format before changing; behaviour is kept as-is.
-  key = _source1 + " " + _source2 + " " + _target1 + " " + _target2;
-  key_id = TD::Convert(key);
-  it = table.model.find(key_id);
-  if (it!=table.model.end() && it->second[4]>=0) {
-    count = it->second[dom] + alpha*prob;
-    total = it->second[4] + alpha;
-    prob = count/total;
-    // Logged after the update so the trace shows the level3 value, not the stale level2 one.
-    if (DEBUG) cerr << "level3 " << count << "/" << total << "=" << prob << endl;
-  }
-
-  return prob;
-}
-
-// Accumulating overload: adds the dominance score for the given word ids to *cost.
-// bo1, bo2, isBonus, alpha2 and beta2 are accepted for interface compatibility;
-// apart from the debug trace they are not used here.
-void Alignment::ScoreDominance(const CountTable& table, int dom, WordID source1, WordID source2, WordID target1, WordID target2, double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2) {
-  if (DEBUG) {
-    cerr << "ScoreDominance(source1=" << TD::Convert(source1) << ",source2=" << TD::Convert(source2)
-         << ",target1=" << TD::Convert(target1) << ",target2=" << TD::Convert(target2) << ",isBonus=" << isBonus << ", alpha2 = " << alpha2 << ", beta2 = " << beta2 << endl;
-    cerr << " BEFORE=" << *cost << endl;
-  }
-  const double score = ScoreDominance(table,dom,source1,source2,target1,target2);
-  *cost += score;
-  if (DEBUG) cerr << " AFTER=" << *cost << endl;
-}
-
-// Projects source position `idx` onto the target side using the alignment points `als`.
-// Returns TD::Convert("NULL") when no point has s_==idx, the single aligned _e word
-// when there is exactly one, and otherwise the id of the aligned _e words joined by
-// `delimiter` (in the order the points appear in `als`).
-WordID Alignment::F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter) {
-  typedef vector<AlignmentPoint>::const_iterator PointIt;
-  if (DEBUG) {
-    cerr << "F2EProjectionFromExternal=" << idx << endl;
-    int n = 0;
-    for (PointIt p=als.begin(); p!=als.end(); ++p, ++n) cerr << "als[" << n << "]=" << *p << " ";
-    cerr << endl;
-  }
-  // Collect every target position linked to idx.
-  vector<int> targets;
-  for (PointIt p=als.begin(); p!=als.end(); ++p) {
-    if (DEBUG) cerr << *p << " ";
-    if (p->s_==idx) targets.push_back(p->t_);
-  }
-  if (DEBUG) {
-    cerr << endl;
-    cerr << "alignedTo = ";
-    for (size_t k=0; k<targets.size(); ++k) cerr << targets[k] << " ";
-    cerr << endl;
-  }
-  if (targets.empty()) {
-    if (DEBUG) cerr << "returns [NULL] : " << TD::Convert("NULL") << endl;
-    return TD::Convert("NULL");
-  }
-  if (targets.size()==1) {
-    // Single link: the word id itself is the projection.
-    if (DEBUG) cerr << "returns [" << TD::Convert(_e[targets[0]]) << "] : " << _e[targets[0]] << endl;
-    return _e[targets[0]];
-  }
-  ostringstream projection;
-  projection << TD::Convert(_e[targets[0]]);
-  for (size_t k=1; k<targets.size(); ++k)
-    projection << delimiter << TD::Convert(_e[targets[k]]);
-  const WordID projected = TD::Convert(projection.str());
-  if (DEBUG) {
-    cerr << "projection = " << projection.str() << endl;
-    cerr << "returns = " << projected << endl;
-  }
-  return projected;
-}
-
-// Projects target position `idx` onto the source side using the alignment points `als`.
-// Returns TD::Convert("NULL") when no point has t_==idx, the single aligned _f word
-// when there is exactly one, and otherwise the id of the aligned _f words joined by
-// `delimiter` (in the order the points appear in `als`).
-WordID Alignment::E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter) {
-  vector<int> sources;
-  for (vector<AlignmentPoint>::const_iterator p=als.begin(); p!=als.end(); ++p) {
-    if (p->t_==idx) sources.push_back(p->s_);
-  }
-  if (sources.empty()) return TD::Convert("NULL");
-  if (sources.size()==1) return _f[sources[0]];  // single link: no string needs building
-  ostringstream projection;
-  projection << TD::Convert(_f[sources[0]]);
-  for (size_t k=1; k<sources.size(); ++k)
-    projection << delimiter << TD::Convert(_f[sources[k]]);
-  return TD::Convert(projection.str());
-}
-
-
-// Projects source position `idx` onto the target side using this object's own links
-// (targetOf). Positions returned by targetOf are shifted by one relative to _e,
-// hence the e-1 when indexing. Returns TD::Convert("NULL") when idx is unaligned,
-// getE(e-1) when it has a single link, otherwise the id of the linked words joined
-// by `delimiter`.
-WordID Alignment::F2EProjection(int idx, const string& delimiter) {
-  if (DEBUG) cerr << "F2EProjection(" << idx << ")" << endl;
-  int e = targetOf(idx);
-  if (e<0) {
-    if (DEBUG) cerr << "projection = NULL" << endl;
-    return TD::Convert("NULL");
-  }
-  if (targetOf(idx,e+1)<0) {
-    // Only one link: skip the string building.
-    if (DEBUG) cerr << "e-1=" << (e-1) << ", size=" << _e.size() << endl;
-    return getE(e-1);
-  }
-  ostringstream projection;
-  projection << TD::Convert(_e[e-1]);
-  for (e = targetOf(idx,e+1); e>=0; e = targetOf(idx,e+1))
-    projection << delimiter << TD::Convert(_e[e-1]);
-  return TD::Convert(projection.str());
-}
-
-// Projects target position `idx` onto the source side using this object's own links
-// (sourceOf). Positions returned by sourceOf are shifted by one relative to _f,
-// hence the f-1 when indexing. Returns TD::Convert("NULL") when idx is unaligned,
-// getF(f-1) when it has a single link, otherwise the id of the linked words joined
-// by `delimiter`.
-WordID Alignment::E2FProjection(int idx, const string& delimiter) {
-  int f = sourceOf(idx);
-  if (f<0) return TD::Convert("NULL");
-  if (sourceOf(idx,f+1)<0) return getF(f-1);  // single link
-  ostringstream projection;
-  projection << TD::Convert(_f[f-1]);
-  for (f = sourceOf(idx,f+1); f>=0; f = sourceOf(idx,f+1))
-    projection << delimiter << TD::Convert(_f[f-1]);
-  return TD::Convert(projection.str());
-}
-// Scores border dominance on the source side of `rule`.
-//  * Source positions spanned by a backwards step in the target-sorted alignment
-//    contribute to *cost: function words (members of sfw) via ScoreDominance, and
-//    nonterminals via slot 51 of their antecedent state.
-//  * All remaining positions contribute to *bonus / *state_mono (slot 50 for
-//    nonterminals) and to *state_nonmono (slot 51 for nonterminals).
-//  * When the rule's LHS is "S", the slot-50 value of every antecedent is added to
-//    *cost and the bonus and both states are reset to 0.
-// Antecedent states are read as int arrays through ant_contexts.
-void Alignment::computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus, double *state_mono,
-    double *state_nonmono, TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw) {
-  // HACK: GOAL is assumed to always be "S"
-  if (DEBUG) cerr << "computeBorderDominanceSource" << endl;
-  const std::vector<WordID>& f = rule.f();
-  const std::vector<WordID>& e = rule.e();
-  const int flen = static_cast<int>(f.size());
-  const int elen = static_cast<int>(e.size());
-  // nt_index[i] is the 1-based antecedent number of source position i, or 0 for a
-  // terminal. Heap-backed vectors replace the former variable-length arrays, which
-  // are not standard C++ and are ill-defined for an empty rule.
-  std::vector<int> nt_index(f.size(), 0);
-  int nt_count=0;
-  for (int i=0; i<flen; i++) if (f[i]<0) nt_index[i] = ++nt_count;
-  if (DEBUG) {
-    cerr << "f = ";
-    for (int i=0; i<flen; i++) cerr << i << "." << "[" << f[i] << "] ";
-    cerr << endl;
-    cerr << "e = ";
-    for (int i=0; i<elen; i++) cerr << i << "." << "[" << e[i] << "] ";
-    cerr << endl;
-  }
-  // flag[s] becomes true once source position s has been charged to *cost.
-  std::vector<bool> flag(f.size(), false);
-  //collect alignments
-  vector<int> als;
-  for (std::vector<AlignmentPoint>::const_iterator i = rule.als().begin(); i != rule.als().end(); ++i) {
-    int s = i->s_; int t = i->t_;
-    als.push_back(link(t,s));
-  }
-  if (DEBUG) cerr << "rule.Arity=" << rule.Arity() << endl;
-  if (rule.Arity()>0) {
-    // Link each source nonterminal to the target slot that refers to it (0, -1, -2, ...).
-    int ntc=0;
-    for (int s=0; s<flen; s++) {
-      if (f[s]<=0) {
-        if (DEBUG) cerr << "f[s]=" << f[s] << "+" << s << " - ";
-        for (int t=0; t<elen; t++) {
-          if (e[t]==ntc) {
-            if (DEBUG) cerr << "e[t]=" << e[t] << "+" << t <<endl;
-            als.push_back(link(t,s));
-            ntc--; break;
-          }
-        }
-      }
-    }
-  }
-  if (DEBUG) {
-    cerr << "unsorted alignments (nonterminals and terminals)" << endl;
-    for (size_t i=0; i<als.size(); i++)
-      cerr << source(als[i]) << "-" << target(als[i]) << " ";
-    cerr << endl;
-  }
-  // sort alignments according to target
-  std::sort(als.begin(),als.end());
-  if (DEBUG) {
-    cerr << "sorted alignments (nonterminals and terminals)" << endl;
-    for (size_t i=0; i<als.size(); i++)
-      cerr << source(als[i]) << "-" << target(als[i]) << " ";
-    cerr << endl;
-  }
-  // 0 -> neither, 1 -> leftFirst, 2 -> rightFirst, 3 -> dontCare
-  // ScoreDominance(const CountTable& table, int dom, WordID source1, WordID source2, WordID target1, WordID target2)
-  int prevs = 0;
-  for (size_t i=0; i<als.size(); i++) {
-    // Links were packed as link(t,s), so target() yields the source position here.
-    int currs = target(als[i]);
-    if (DEBUG) cerr << "prevs=" << prevs << ", currs=" << currs << endl << endl;
-    if (currs<prevs) {
-      if (DEBUG) cerr << "currs<prevs" << endl;
-      for (int s = currs; s <= prevs; s++) {
-        if (sfw.find(f[s])!=sfw.end()) {
-          // The projection is only reported in the debug trace; scoring uses kUNK.
-          WordID projected = F2EProjectionFromExternal(s,rule.a_,"_SEP_");
-          if (DEBUG) cerr<<" f[s]="<<TD::Convert(f[s])<<" is a function word, target="<<TD::Convert(projected)<<endl;
-          *cost += ScoreDominance(table,1,kSOS,f[s],kUNK,kUNK) + ScoreDominance(table,2,f[s],kEOS,kUNK,kUNK);
-          if (DEBUG) cerr << " resulting cost="<< *cost << endl;
-        } else if (nt_index[s]>0) {
-          // Guarded by nt_index (f[s]<0) rather than f[s]<=0 so that ant_contexts is
-          // never indexed with -1.
-          if (DEBUG) cerr << " f[s]= is a nonterminal" << endl;
-          const int* ants = reinterpret_cast<const int *>(ant_contexts[nt_index[s]-1]);
-          *cost += Dwarf::IntegerToDouble(ants[51]); // 50->mono, 51->non-mono
-          if (DEBUG) cerr << " adding "<< Dwarf::IntegerToDouble(ants[51]) << " into cost, resulting = " << *cost << endl;
-        }
-        flag[s] = true;
-      }
-    }
-    prevs = currs;
-  }
-  if (DEBUG) cerr << "bonus and state matter" << endl;
-  for (int s=0; s<flen; s++) {
-    if (flag[s]) continue;
-    if (sfw.find(f[s])!=sfw.end()) {
-      WordID projected = F2EProjectionFromExternal(s,rule.a_,"_SEP_");
-      if (DEBUG) cerr<<" f[s]="<<TD::Convert(f[s])<<" is a function word, target="<<TD::Convert(projected)<<endl;
-      double indbonus = ScoreDominance(table,3,kSOS,f[s],kUNK,kUNK) + ScoreDominance(table,3,f[s],kEOS,kUNK,kUNK);
-      *bonus += indbonus;
-      *state_mono += indbonus;
-      *state_nonmono += ScoreDominance(table,1,kSOS,f[s],kUNK,kUNK) + ScoreDominance(table,2,f[s],kEOS,kUNK,kUNK);
-      if (DEBUG) cerr<<" new bonus="<<*bonus<<", new state="<<*state_mono<<","<<*state_nonmono<<endl;
-    } else if (nt_index[s]>0) {
-      if (DEBUG) cerr << " f[s]="<< f[s] <<" is a nonterminal" << endl;
-      const int* ants = reinterpret_cast<const int *>(ant_contexts[nt_index[s]-1]);
-      double indbonus = Dwarf::IntegerToDouble(ants[50]);
-      *bonus += indbonus;
-      *state_mono += indbonus;
-      *state_nonmono += Dwarf::IntegerToDouble(ants[51]);
-      if (DEBUG) cerr << " propagating state=" << *state_mono <<","<< *state_nonmono<< endl;
-    }
-  }
-  if (DEBUG) cerr << "LHS:" << rule.GetLHS() << ":" << TD::Convert(rule.GetLHS()*-1) <<endl;
-  if (rule.GetLHS()*-1==TD::Convert("S")) {
-    // Goal rule: fold the antecedents' mono state into the cost and clear the rest.
-    *state_mono = 0;
-    *state_nonmono = 0;
-    for (int i=0; i<rule.Arity(); i++) {
-      const int* ants = reinterpret_cast<const int *>(ant_contexts[i]);
-      *cost += Dwarf::IntegerToDouble(ants[50]);
-    }
-    *bonus = 0;
-  }
-  if (DEBUG) cerr << "-->>>> cost="<<*cost<<", bonus="<<*bonus<<", state_mono="<<*state_mono<<", state_nonmono="<<*state_nonmono<<endl;
-}
-
-// Builds the combined alignment state for applying `rule` on top of its antecedents.
-// Reads rule.f(), rule.e(), rule.a_ and one int-array state per antecedent
-// (ant_contexts), then fills the member tables (SourceFWRuleIdxs/TargetFWRuleIdxs,
-// SourceFWAntsIdxs/TargetFWAntsIdxs, SourceFWIdxs/TargetFWIdxs, RuleAl, AntsAl, ...),
-// registers every link through set(), and finally sets fas/las/fat/lat and curr_al.
-// Function-word tables are 1-based triples: [0] holds the entry count and entry k
-// occupies [3k-2] (position), [3k-1] and [3k] (two word ids).
-// Returns true when neither the rule nor any antecedent carries a function word
-// (in that case nothing below the early return is built); returns false once the
-// full state has been built.
-bool Alignment::prepare(TRule& rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw, const map<WordID,int>& tfw,const Lattice& sourcelattice, int spanstart, int spanend) {
- if (DEBUG) cerr << "===Rule===" << rule.AsString() << endl;
- _f = rule.f();
- _e = rule.e();
- _Arity = rule.Arity();
- if (DEBUG) {
- cerr << "F: ";
- for (int idx=0; idx<_f.size(); idx++) cerr << _f[idx] << " ";
- cerr << endl;
- cerr << "F': ";
- for (int idx=0; idx<_f.size(); idx++)
- if (_f[idx]>=0) {
- cerr << TD::Convert(_f[idx]) << " ";
- } else {
- cerr << TD::Convert(_f[idx]*-1);
- }
- cerr << endl;
- cerr << "E: ";
- for (int idx=0; idx<_e.size(); idx++)
- cerr << _e[idx] << " ";
- cerr << endl;
- cerr << "E': ";
- for (int idx=0; idx<_e.size(); idx++)
- if (_e[idx]>0) {
- cerr << TD::Convert(_e[idx]) << " ";
- } else {
- cerr << "[NT]" << " ";
- }
- cerr << endl;
- }
-
- // Step 1: record the function words that occur directly in the rule.
- // Positions are 1-based here because slot 0 is reserved for the phrase-start boundary.
- SourceFWRuleIdxs[0]=0;
- SourceFWRuleAbsIdxs[0]=0;
- for (int idx=1; idx<=_f.size(); idx++) { // in transformed space
- if (sfw.find(_f[idx-1])!=sfw.end()) {
- SourceFWRuleIdxs[0]++;
- SourceFWRuleAbsIdxs[++SourceFWRuleAbsIdxs[0]]=GetFWGlobalIdx(idx,sourcelattice,_f,spanstart,spanend,ant_contexts,sfw);
- SourceFWRuleIdxs[3*SourceFWRuleIdxs[0]-2]=idx;
- SourceFWRuleIdxs[3*SourceFWRuleIdxs[0]-1]=_f[idx-1];
- SourceFWRuleIdxs[3*SourceFWRuleIdxs[0]] =F2EProjectionFromExternal(idx-1,rule.a_,"_SEP_");
- }
- }
- TargetFWRuleIdxs[0]=0;
- for (int idx=1; idx<=_e.size(); idx++) { // in transformed space
- if (tfw.find(_e[idx-1])!=tfw.end()) {
- TargetFWRuleIdxs[0]++;
- TargetFWRuleIdxs[3*TargetFWRuleIdxs[0]-2]=idx;
- TargetFWRuleIdxs[3*TargetFWRuleIdxs[0]-1]=E2FProjectionFromExternal(idx-1,rule.a_,"_SEP_");
- TargetFWRuleIdxs[3*TargetFWRuleIdxs[0]] =_e[idx-1];
- }
- }
-
- if (DEBUG) {
- cerr << "SourceFWRuleIdxs[" << SourceFWRuleIdxs[0] << "]:";
- for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) {
- cerr << " idx:" << SourceFWRuleIdxs[3*idx-2];
- cerr << " absidx:" << SourceFWRuleAbsIdxs[idx];
- cerr << " F:" << SourceFWRuleIdxs[3*idx-1];
- cerr << " E:" << SourceFWRuleIdxs[3*idx];
- cerr << "; ";
- }
- cerr << endl;
- cerr << "TargetFWRuleIdxs[" << TargetFWRuleIdxs[0] << "]:";
- for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) {
- cerr << " idx:" << TargetFWRuleIdxs[3*idx-2];
- cerr << " F:" << TargetFWRuleIdxs[3*idx-1];
- cerr << " E:" << TargetFWRuleIdxs[3*idx];
- }
- cerr << endl;
- }
- // Early exit: no function word in the rule itself and none recorded in any
- // antecedent state (slots 0, 3, 6 and 9 are all negative).
- if (SourceFWRuleIdxs[0]+TargetFWRuleIdxs[0]==0) {
- bool nofw = true;
- for (int i_ant=0; i_ant<_Arity && nofw; i_ant++) {
- const int* ants = reinterpret_cast<const int *>(ant_contexts[i_ant]);
- if (ants[0]>=0||ants[3]>=0||ants[6]>=0||ants[9]>=0) nofw=false;
- }
- if (nofw) return true;
- }
- //cerr << "clearing als first" << endl;
- clearAls(_J,_I);
-
- // Step 2: copy the rule-internal alignment into RuleAl as (source,target) pairs,
- // shifted by one and bracketed by a start link (0,0) and an end link
- // (|f|+1,|e|+1). RuleAl[0] is the pair count; pair k lives at [2k-1],[2k].
- if (DEBUG) cerr << "A["<< rule.a_.size() << "]: " ;
- RuleAl[0]=0;
- // add phrase start boundary
- RuleAl[0]++; RuleAl[RuleAl[0]*2-1]=0; RuleAl[RuleAl[0]*2]=0;
- if (DEBUG) cerr << RuleAl[RuleAl[0]*2-1] << "-" << RuleAl[RuleAl[0]*2] << " ";
- for (int idx=0; idx<rule.a_.size(); idx++) {
- RuleAl[0]++;
- RuleAl[RuleAl[0]*2-1]=rule.a_[idx].s_+1;
- RuleAl[RuleAl[0]*2] =rule.a_[idx].t_+1;
- if (DEBUG) cerr << RuleAl[RuleAl[0]*2-1] << "-" << RuleAl[RuleAl[0]*2] << " ";
- }
- // add phrase end boundary
- RuleAl[0]++; RuleAl[RuleAl[0]*2-1]=_f.size()+1; RuleAl[RuleAl[0]*2]=_e.size()+1;
- if (DEBUG) cerr << RuleAl[RuleAl[0]*2-1] << "-" << RuleAl[RuleAl[0]*2] << " ";
- if (DEBUG) cerr << endl;
-
- // Step 3: provisional position maps. Terminals and boundaries keep their own
- // position; a nonterminal gets a negative placeholder (-1, -2, ... on the source
- // side; _e[idx-1]-1 on the target side) that is resolved to an antecedent below.
- SourceRuleIdxs[0] = _f.size()+2; // +2 (phrase boundaries)
- TargetRuleIdxs[0] = _e.size()+2;
- int ntidx=-1;
- for (int idx=0; idx<_f.size()+2; idx++) { // idx in transformed space
- SourceRuleIdxs[idx+1]=idx;
- if (0<idx && idx<=_f.size()) if (_f[idx-1]<0) SourceRuleIdxs[idx+1]=ntidx--;
- }
- for (int idx=0; idx<_e.size()+2; idx++) {
- TargetRuleIdxs[idx+1]=idx;
- if (0<idx && idx<=_e.size()) {
- //cerr << "_e[" <<(idx-1)<< "]=" << _e[idx-1] << endl;
- if (_e[idx-1]<=0) TargetRuleIdxs[idx+1]=_e[idx-1]-1;
- }
- }
- if (DEBUG) {
- cerr << "SourceRuleIdxs:";
- for (int idx=0; idx<SourceRuleIdxs[0]+1; idx++)
- cerr << " " << SourceRuleIdxs[idx];
- cerr << endl;
- cerr << "TargetRuleIdxs:";
- cerr << "TargetRuleIdxs:";
- for (int idx=0; idx<TargetRuleIdxs[0]+1; idx++)
- cerr << " " << TargetRuleIdxs[idx];
- cerr << endl;
- }
-
- // Step 4: unpack each antecedent state. The state is trusted, not validated.
- // Layout as read by the code below (all slots are ints):
- //   [0..2]   leftmost source function word: position, source word id, target word id
- //   [3..5]   rightmost source function word, same order
- //   [6..8]   leftmost target function word, same order
- //   [9..11]  rightmost target function word, same order
- //   [12]     number of alignment links
- //   [13..]   the links, each packed into one int (unpacked with source()/target())
- //   [Dwarf::STATE_SIZE-1]  the antecedent's span, packed the same way
- // A negative word id in slot 1/4 (source side) or 8/11 (target side) marks a
- // compound entry: its negation names a whitespace-separated list of words.
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- const int* ants = reinterpret_cast<const int *>(ant_contexts[i_ant]);
- int span = ants[Dwarf::STATE_SIZE-1];
- if (DEBUG) {
- cerr << "antcontexts[" << i_ant << "] ";
- for (int idx=0; idx<Dwarf::STATE_SIZE; idx++) cerr << idx << "." << ants[idx] << " ";
- cerr << endl;
- cerr << "i,j = " << source(ants[Dwarf::STATE_SIZE-1]) << "," << target(ants[Dwarf::STATE_SIZE-1]) << endl;
- }
- SourceFWAntsIdxs[i_ant][0]=0;
- SourceFWAntsAbsIdxs[i_ant][0]=0;
- if (ants[0]>=0) {
- // Given a span, give the index of the first function word
- int firstfwidx = GetFirstFWIdx(source(span),target(span),sourcelattice,sfw);
- if (DEBUG) cerr << " firstfwidx = " << firstfwidx << endl;
- int fwcount = 0;
- if (ants[1]>=0) { // one function word
- SourceFWAntsIdxs[i_ant][0]++; SourceFWAntsIdxs[i_ant][1]=ants[0];
- SourceFWAntsIdxs[i_ant][2]=ants[1]; SourceFWAntsIdxs[i_ant][3]=ants[2];
- fwcount++;
- } else { // if ants[1] < 0 then compound fws
- //cerr << "ants[1]<0" << endl;
- istringstream ossf(TD::Convert(ants[1]*-1)); string ffw;
- istringstream osse(TD::Convert(ants[2])); string efw; //projection would be mostly NULL
- // ants[0]-(delta--) yields positions 0,1,2,... for the successive words.
- int delta=ants[0];
- while (osse >> efw && ossf >> ffw) {
- SourceFWAntsIdxs[i_ant][0]++;
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-2]=ants[0]-(delta--);
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw);
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw);
- fwcount++;
- }
- }
- if (DEBUG) cerr << " fwcount=" << fwcount << endl;
- SourceFWAntsAbsIdxs[i_ant][0]=fwcount;
- for (int i=1; i<=fwcount; i++) SourceFWAntsAbsIdxs[i_ant][i]=firstfwidx++;
- }
- if (ants[3]>=0) {
- int lastfwidx = GetLastFWIdx(source(span),target(span),sourcelattice,sfw);
- if (DEBUG) cerr << " lastfwidx = " << lastfwidx << endl;
- int fwcount=0;
- if (ants[4]>=0) {
- fwcount++;
- SourceFWAntsIdxs[i_ant][0]++;
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-2]=ants[3];
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-1]=ants[4];
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3] =ants[5];
- } else { // if ants[4] < 0 then compound fws
- //cerr << "ants[4]<0" << endl;
- istringstream ossf(TD::Convert(ants[4]*-1)); string ffw;
- istringstream osse(TD::Convert(ants[5])); string efw;
- int delta=0;
- while (osse >> efw && ossf >> ffw) {
- fwcount++;
- SourceFWAntsIdxs[i_ant][0]++;
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-2]=ants[3]+(delta++);
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw);
- SourceFWAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw);
- }
- }
- if (DEBUG) cerr << " fwcount=" << fwcount << endl;
- // Append after the entries written for the leftmost word(s); the absolute
- // indices count backwards from lastfwidx.
- for (int i=1; i<=fwcount; i++) SourceFWAntsAbsIdxs[i_ant][SourceFWAntsAbsIdxs[i_ant][0]+i]=lastfwidx-fwcount+i;
- SourceFWAntsAbsIdxs[i_ant][0]+=fwcount;
- }
- TargetFWAntsIdxs[i_ant][0]=0;
- if (ants[6]>=0) {
- if (ants[8]>=0) { // check the e part
- TargetFWAntsIdxs[i_ant][0]++;
- TargetFWAntsIdxs[i_ant][1]=ants[6];
- TargetFWAntsIdxs[i_ant][2]=ants[7];
- TargetFWAntsIdxs[i_ant][3]=ants[8];
- } else { // if ants[8] < 0 then compound fws
- //cerr << "ants[8]<0" << endl;
- //cerr << "ants[7]=" << TD::Convert(ants[7]) << endl;
- //cerr << "ants[8]=" << TD::Convert(ants[8]*-1) << endl;
- istringstream ossf(TD::Convert(ants[7])); string ffw;
- istringstream osse(TD::Convert(ants[8]*-1)); string efw;
- int delta=ants[6];
- while (osse >> efw && ossf >> ffw) {
- //cerr << "efw="<< efw << ",ffw=" << ffw << endl;
- TargetFWAntsIdxs[i_ant][0]++;
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-2]=ants[6]-(delta--);
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw);
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw);
- }
- }
- }
- if (ants[9]>=0) {
- if (ants[11]>=0) {
- TargetFWAntsIdxs[i_ant][0]++;
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-2]=ants[9];
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-1]=ants[10];
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3] =ants[11];
- } else {
- //cerr << "ants[11]<0" << endl;
- //cerr << "ants[10]=" << TD::Convert(ants[10]) << endl;
- //cerr << "ants[11]=" << TD::Convert(ants[11]*-1) << endl;
- istringstream ossf(TD::Convert(ants[10])); string ffw;
- istringstream osse(TD::Convert(ants[11]*-1)); string efw;
- int delta = 0;
- while (osse >> efw && ossf >> ffw) {
- //cerr << "efw="<< efw << ",ffw=" << ffw << endl;
- TargetFWAntsIdxs[i_ant][0]++;
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-2]=ants[9]+(delta++);
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3-1]=TD::Convert(ffw);
- TargetFWAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][0]*3] =TD::Convert(efw);
- }
- }
- }
- AntsAl[i_ant][0]=ants[12];//number of alignments
- for (int idx=1; idx<=AntsAl[i_ant][0]; idx++) {
- AntsAl[i_ant][idx*2-1] = source(ants[12+idx]);
- AntsAl[i_ant][idx*2] = target(ants[12+idx]);
- }
- }
-
- // Step 5: size each antecedent's local index space as (largest position seen
- // among its links and function words) + 1, and fill it with the identity map.
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- int length = AntsAl[i_ant][0];
- int maxs = -1000;
- int maxt = -1000;
- for (int idx=0; idx<length; idx++) {
- if (maxs<AntsAl[i_ant][2*idx+1]) maxs = AntsAl[i_ant][2*idx+1];
- if (maxt<AntsAl[i_ant][2*idx+2]) maxt = AntsAl[i_ant][2*idx+2];
- }
- if (DEBUG) cerr << "SourceFWAntsIdxs[" <<i_ant<<"][0]=" << SourceFWAntsIdxs[i_ant][0] << endl;
- for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG) {
- cerr << "SourceFWAntsIdxs["<<i_ant<<"]["<<(3*idx-2)<<"]="<<SourceFWAntsIdxs[i_ant][3*idx-2];
- cerr << ","<<SourceFWAntsIdxs[i_ant][3*idx-1]<<","<<SourceFWAntsIdxs[i_ant][3*idx]<<endl;
- cerr << "SourceFWAntsAbsIdxs["<<i_ant<<"]["<<idx<<"]="<<SourceFWAntsAbsIdxs[i_ant][idx] << endl;
- }
- if (maxs<SourceFWAntsIdxs[i_ant][3*idx-2]) maxs=SourceFWAntsIdxs[i_ant][3*idx-2];
- }
- if (DEBUG) cerr << "TargetFWAntsIdxs[" <<i_ant<<"][0]=" << TargetFWAntsIdxs[i_ant][0] << endl;
- for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG) {
- cerr << "TargetFWAntsIdxs["<<i_ant<<"]["<<(3*idx-2)<<"]="<<TargetFWAntsIdxs[i_ant][3*idx-2];
- cerr << ","<<TargetFWAntsIdxs[i_ant][3*idx-1]<<","<<TargetFWAntsIdxs[i_ant][3*idx]<<endl;
- }
- if (maxt<TargetFWAntsIdxs[i_ant][3*idx-2]) maxt=TargetFWAntsIdxs[i_ant][3*idx-2];
- }
- SourceAntsIdxs[i_ant][0] = maxs+1;
- if (DEBUG) cerr << "SourceAntsIdxs[" << i_ant << "][0]=" <<SourceAntsIdxs[i_ant][0] << endl;
- for (int idx=0; idx<SourceAntsIdxs[i_ant][0]; idx++) SourceAntsIdxs[i_ant][idx+1]=idx;
- TargetAntsIdxs[i_ant][0] = maxt+1;
- if (DEBUG) cerr << "TargetAntsIdxs[" << i_ant << "][0]=" <<TargetAntsIdxs[i_ant][0] << endl;
- for (int idx=0; idx<TargetAntsIdxs[i_ant][0]; idx++) TargetAntsIdxs[i_ant][idx+1]=idx;
- }
- // TotalSource/TotalTarget are only reported in the debug trace.
- int TotalSource = SourceRuleIdxs[0] - _Arity;
- for (int idx=0; idx<_Arity; idx++) TotalSource += SourceAntsIdxs[idx][0];
- int TotalTarget = TargetRuleIdxs[0] - _Arity;
- for (int idx=0; idx<_Arity; idx++) TotalTarget += TargetAntsIdxs[idx][0];
- if (DEBUG) cerr << "TotalSource = "<< TotalSource << ", TotalTarget = "<< TotalTarget << endl;
- // Step 6: renumber into one combined index space. Walking the rule left to
- // right, a terminal/boundary takes the next free index and a nonterminal
- // placeholder hands a consecutive block of indices to its antecedent.
- int curr = 0;
- for (int idx=1; idx<=SourceRuleIdxs[0]; idx++) {
- if (SourceRuleIdxs[idx]>=0) {
- SourceRuleIdxs[idx]=curr++;
- } else {
- int i_ant = SourceRuleIdxs[idx]*-1-1;
- if (DEBUG) cerr << "SourceAntsIdxs[" << i_ant << "]" << endl;
- for (int idx2=1; idx2<=SourceAntsIdxs[i_ant][0]; idx2++) {
- SourceAntsIdxs[i_ant][idx2]=curr++;
- if (DEBUG) cerr << SourceAntsIdxs[i_ant][idx2] << " ";
- }
- if (DEBUG) cerr << endl;
- }
- }
- if (DEBUG) {
- cerr << "SourceRuleIdxs" << endl;
- for (int idx=1; idx<=SourceRuleIdxs[0]; idx++) cerr << SourceRuleIdxs[idx] << " ";
- cerr << endl;
- }
- curr = 0;
- for (int idx=1; idx<=TargetRuleIdxs[0]; idx++) {
- if (TargetRuleIdxs[idx]>=0) {
- TargetRuleIdxs[idx]=curr++;
- } else {
- int i_ant = TargetRuleIdxs[idx]*-1-1;
- if (DEBUG) cerr << "TargetRuleIdxs[" << i_ant << "]" << endl;
- for (int idx2=1; idx2<=TargetAntsIdxs[i_ant][0]; idx2++) {
- TargetAntsIdxs[i_ant][idx2]=curr++;
- if (DEBUG) cerr << TargetAntsIdxs[i_ant][idx2] << " ";
- }
- if (DEBUG) cerr << endl;
- }
- }
- if (DEBUG) {
- cerr << "TargetRuleIdxs" << endl;
- for (int idx=1; idx<=TargetRuleIdxs[0]; idx++) cerr << TargetRuleIdxs[idx] << " ";
- cerr << endl;
- }
- // Step 7: register the rule's links and every antecedent's links, translated
- // into the combined index space.
- for (int idx=1; idx<=RuleAl[0]; idx++) {
- if (DEBUG) {
- cerr << RuleAl[idx*2-1] << " - " << RuleAl[idx*2] << " to ";
- cerr << SourceRuleIdxs[RuleAl[idx*2-1]+1] << " - " << TargetRuleIdxs[RuleAl[idx*2]+1] << endl;
- }
- set(SourceRuleIdxs[RuleAl[idx*2-1]+1],TargetRuleIdxs[RuleAl[idx*2]+1]);
- }
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- for (int idx=1; idx<=AntsAl[i_ant][0]; idx++) {
- if (DEBUG) {
- cerr << AntsAl[i_ant][2*idx-1] << " - " << AntsAl[i_ant][2*idx] << " to ";
- cerr << SourceAntsIdxs[i_ant][AntsAl[i_ant][2*idx-1]+1] << " - ";
- cerr << TargetAntsIdxs[i_ant][AntsAl[i_ant][2*idx]+1] << endl;
- }
- set(SourceAntsIdxs[i_ant][AntsAl[i_ant][2*idx-1]+1],TargetAntsIdxs[i_ant][AntsAl[i_ant][2*idx]+1]);
- }
- }
- // Step 8: merge the rule's and the antecedents' source function words into
- // SourceFWIdxs (positions translated to the combined space), then sort.
- // Note SourceFWAbsIdxs is written with the same stride-3 indexing ([3k-2]).
- SourceFWIdxs[0]=0;
- SourceFWAbsIdxs[0]=0;
- if (DEBUG) cerr << "SourceFWRuleIdxs:" << endl;
- for (int idx=1; idx<=SourceFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << SourceFWRuleIdxs[3*idx-2] << " to " << SourceRuleIdxs[SourceFWRuleIdxs[3*idx-2]+1] << endl;
- SourceFWRuleIdxs[3*idx-2] = SourceRuleIdxs[SourceFWRuleIdxs[3*idx-2]+1];
- SourceFWAbsIdxs[0]++;
- SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]=SourceFWRuleAbsIdxs[idx];
- SourceFWIdxs[0]++;
- SourceFWIdxs[3*SourceFWIdxs[0]-2]=SourceFWRuleIdxs[3*idx-2];
- SourceFWIdxs[3*SourceFWIdxs[0]-1]=SourceFWRuleIdxs[3*idx-1];
- SourceFWIdxs[3*SourceFWIdxs[0]] =SourceFWRuleIdxs[3*idx];
- }
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- if (DEBUG) cerr << "SourceFWAntsIdxs[" << i_ant << "]" << endl;
- for (int idx=1; idx<=SourceFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG)
- cerr << SourceFWAntsIdxs[i_ant][3*idx-2] << " to " << SourceAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][3*idx-2]+1] << endl;
- SourceFWAntsIdxs[i_ant][3*idx-2] = SourceAntsIdxs[i_ant][SourceFWAntsIdxs[i_ant][3*idx-2]+1];
- SourceFWAbsIdxs[0]++;
- SourceFWAbsIdxs[3*SourceFWAbsIdxs[0]-2]=SourceFWAntsAbsIdxs[i_ant][idx];
- SourceFWIdxs[0]++;
- SourceFWIdxs[3*SourceFWIdxs[0]-2]=SourceFWAntsIdxs[i_ant][3*idx-2];
- SourceFWIdxs[3*SourceFWIdxs[0]-1]=SourceFWAntsIdxs[i_ant][3*idx-1];
- SourceFWIdxs[3*SourceFWIdxs[0]] =SourceFWAntsIdxs[i_ant][3*idx];
- }
- }
- sort(SourceFWIdxs);
- sort(SourceFWAbsIdxs);
- if (DEBUG) {
- cerr << "SourceFWIdxs : ";
- for (int idx=1; idx<=SourceFWIdxs[0]; idx++) {
- cerr << "idx:" << SourceFWIdxs[3*idx-2] << ",";
- cerr << "F:" << SourceFWIdxs[3*idx-1] << ",";
- cerr << "E:" << SourceFWIdxs[3*idx] << " ";
- }
- cerr << endl;
- }
- // Step 9: the same merge for the target-side function words.
- TargetFWIdxs[0]=0;
- if (DEBUG) cerr << "TargetFWRuleIdxs:" << endl;
- for (int idx=1; idx<=TargetFWRuleIdxs[0]; idx++) {
- if (DEBUG) cerr << TargetFWRuleIdxs[3*idx-2] << " to " << TargetRuleIdxs[TargetFWRuleIdxs[3*idx-2]+1] << endl;
- TargetFWRuleIdxs[3*idx-2] = TargetRuleIdxs[TargetFWRuleIdxs[3*idx-2]+1];
- TargetFWIdxs[0]++;
- TargetFWIdxs[3*TargetFWIdxs[0]-2]=TargetFWRuleIdxs[3*idx-2];
- TargetFWIdxs[3*TargetFWIdxs[0]-1]=TargetFWRuleIdxs[3*idx-1];
- TargetFWIdxs[3*TargetFWIdxs[0]] =TargetFWRuleIdxs[3*idx];
- }
- for (int i_ant=0; i_ant<_Arity; i_ant++) {
- if (DEBUG) cerr << "TargetFWAntsIdxs[" << i_ant << "]" << endl;
- for (int idx=1; idx<=TargetFWAntsIdxs[i_ant][0]; idx++) {
- if (DEBUG) cerr << TargetFWAntsIdxs[i_ant][3*idx-2] << " to " << TargetAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][3*idx-2]+1] << endl;
- TargetFWAntsIdxs[i_ant][3*idx-2] = TargetAntsIdxs[i_ant][TargetFWAntsIdxs[i_ant][3*idx-2]+1];
- TargetFWIdxs[0]++;
- TargetFWIdxs[3*TargetFWIdxs[0]-2]=TargetFWAntsIdxs[i_ant][3*idx-2];
- TargetFWIdxs[3*TargetFWIdxs[0]-1]=TargetFWAntsIdxs[i_ant][3*idx-1];
- TargetFWIdxs[3*TargetFWIdxs[0]] =TargetFWAntsIdxs[i_ant][3*idx];
- }
- }
- sort(TargetFWIdxs);
- if (DEBUG) {
- // NOTE(review): the "E:"/"F:" labels below are swapped relative to how slots
- // [3k-1] and [3k] were filled for TargetFWRuleIdxs above (E2F projection, then
- // the target word) -- debug output only.
- cerr << "TargetFWIdxs : ";
- for (int idx=1; idx<=TargetFWIdxs[0]; idx++) {
- cerr << "idx:" << TargetFWIdxs[3*idx-2]<< ",";
- cerr << "E:" << TargetFWIdxs[3*idx-1]<< ",";
- cerr << "F:" << TargetFWIdxs[3*idx]<< " ";
- }
- cerr << endl;
- cerr << AsString() << endl;
- }
- // Step 10: cache the first/last aligned positions, skipping the boundary slots
- // (0 and _J-1 / _I-1), and snapshot the links into curr_al.
- fas = firstSourceAligned(1); las = lastSourceAligned(_J-2);
- fat = firstTargetAligned(1); lat = lastTargetAligned(_I-2);
- if (DEBUG) cerr << "fas=" << fas << ", las=" << las << ", fat=" << fat << ", lat=" << lat << endl;
- assert(fas<=las);
- assert(fat<=lat);
- SetCurrAlVector();
- if (DEBUG) cerr << "end prepare" << endl;
- return false;
-}
-
-// Renders every link as " j-t", walking source positions in order and, for each,
-// its linked target positions starting from minTSpan(j).
-string Alignment::AsStringSimple() {
-  ostringstream out;
-  for (int j=0; j<getJ(); ++j) {
-    for (int t=targetOf(j,minTSpan(j)); t>=0; t=targetOf(j,t+1))
-      out << " " << j << "-" << t;
-  }
-  return out.str();
-}
-
-
-// Renders the dimensions, every link (" j-t"), and the per-position target and
-// source spans. A span whose minimum still equals MINIMUM_INIT is shown as "[-,-]".
-string Alignment::AsString() {
-  ostringstream out;
-  out << "J:" << getJ() << " I:" << getI();
-  for (int j=0; j<getJ(); ++j) {
-    for (int t=targetOf(j,minTSpan(j)); t>=0; t=targetOf(j,t+1))
-      out << " " << j << "-" << t;
-  }
-  out << " TargetSpan:";
-  for (int j=0; j<getJ(); ++j) {
-    if (minTSpan(j)==MINIMUM_INIT) {
-      out << " " << j << "[-,-]";
-    } else {
-      out << " " << j << "[" << minTSpan(j) << "," << maxTSpan(j) << "]";
-    }
-  }
-  out << " SourceSpan:";
-  for (int i=0; i<getI(); ++i) {
-    if (minSSpan(i)==MINIMUM_INIT) {
-      out << " " << i << "[-,-]";
-    } else {
-      out << " " << i << "[" << minSSpan(i) << "," << maxSSpan(i) << "]";
-    }
-  }
-  return out.str();
-}
-
-// Rebuilds curr_al from scratch: one packed link(j,i) per link, visited in
-// source-position order.
-void Alignment::SetCurrAlVector() {
-  curr_al.clear();
-  for (int j=0; j<_J; ++j) {
-    for (int i=targetOf(j); i>=0; i=targetOf(j,i+1))
-      curr_al.push_back(link(j,i));
-  }
-}
-
-// Dumps the table to stderr: one line per model entry (key followed by its
-// numColumn counts), then the numColumn values of `ultimate`.
-void CountTable::print() const {
-  typedef map<WordID,int*>::const_iterator RowIt;
-  cerr << "+++ Model +++" << endl;
-  for (RowIt row=model.begin(); row!=model.end(); ++row) {
-    cerr << TD::Convert(row->first) << " ";
-    const int* counts = row->second;
-    for (int col=0; col<numColumn; ++col) cerr << counts[col] << " ";
-    cerr << endl;
-  }
-  cerr << "+++ Ultimate +++" << endl;
-  for (int col=0; col<numColumn; ++col) cerr << ultimate[col] << " ";
-  cerr << endl;
-}
-
-void Alignment::ToArrayInt(vector<int>* ret) {
- ret->clear();
- for (int i=0; i<_J; i++) {
- int t = targetOf(i);
- while (t>=0) {
- ret->push_back(link(i,t));
- t = targetOf(i,t+1);
- }
- }
-}
-
-int Alignment::GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw) {
- // get the index of the function word in the lattice
- if (DEBUG) cerr << " GetFWGlobalIdx(" << idx << "," << spanstart << "," << spanend << ")" << endl;
- int curr = spanstart; int i_ant = 0;
- for (int i=1; i<sources.size() && i<idx; i++) { // sources contain <s> and </s>
- if (sources[i]<0) {
- const int* ants = reinterpret_cast<const int *>(ant_contexts[i_ant++]);
- int antstate = ants[Dwarf::STATE_SIZE-1];
- if (DEBUG) cerr << " found NT[" << target(antstate) << "," << source(antstate) << "]" << endl;
- curr += target(antstate)-source(antstate);
- } else {
- curr++;
- }
- }
- if (DEBUG) cerr << " curr = " << curr << endl;
- //compute the fw index
- int ret = 1;
- for (int i=0; i<curr; i++) {
- if (sfw.find(sourcelattice[i][0].label)!=sfw.end()) ret++;
- }
- if (DEBUG) cerr << " ret = " << ret << endl;
- return ret;
-}
-
-int Alignment::GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw) {
- if (DEBUG) cerr << " GetFirstFWIdx(" << spanstart << "," << spanend << ")" << endl;
- int curr=0;
- for (int i=0; i<spanend; i++) {
- if (sfw.find(sourcelattice[i][0].label)!=sfw.end()) {
- curr++;
- if (i>=spanstart) return curr;
- }
- }
-// assert(0);
- return curr;
-}
-
-int Alignment::GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw) {
- if (DEBUG) cerr << " GetLastFWIdx(" << spanstart << "," << spanend << ")" << endl;
- int curr=0;
- for (int i=0; i<spanend; i++) {
- if (sfw.find(sourcelattice[i][0].label)!=sfw.end()) {
- curr++;
- }
- }
- return curr;
-}
-
-WordID Alignment::generalize(WordID original, const map<WordID,WordID>& tags, bool pos) {
- if (!pos) {
- map<WordID,WordID>::const_iterator it = tags.find(original);
- if (it!=tags.end()) {
- return it->second;
- }
- } else {
- string key,idx;
- Dwarf::stripIndex(TD::Convert(original),&key,&idx);
- map<WordID,WordID>::const_iterator it = tags.find(TD::Convert(key));
- if (it!=tags.end()) {
- ostringstream oss;
- oss << TD::Convert(it->second) << "/" << idx;
- return TD::Convert(oss.str());
- }
- }
- return original;
-}
-
-int* Alignment::SOS() {
- int* neighbor = new int[4];
- neighbor[0]=0; neighbor[1]=0;
- neighbor[2]=0; neighbor[3]=0;
- return neighbor;
-}
-
-int* Alignment::EOS() {
- int* neighbor = new int[4];
- neighbor[0]=getJ()-1; neighbor[1]=neighbor[0];
- neighbor[2]=getI()-1; neighbor[3]=neighbor[2];
- return neighbor;
-}
-
-int* Alignment::neighborLeft(int startidx, int endidx, bool* getit) {
- if (DEBUG) cerr << " neighborLeft("<<startidx<<","<<endidx<<")"<<endl;
- int lborder = startidx;
- int* ret;
- while(lborder<=endidx) {
- ret = blockSource(lborder,endidx);
- if (ret[0]==lborder && ret[1]==endidx && ret[2]!=MINIMUM_INIT) {
- *getit = true;
- return ret;
- } else {
- delete[] ret;
- lborder++;
- }
- }
- ret = new int[4];
- ret[0]=-1; ret[1]=-1; ret[2]=-1; ret[3]=-1;
- *getit = false;
- return ret;
-}
-
-int* Alignment:: neighborRight(int startidx, int endidx, bool* getit) {
- if (DEBUG) cerr << " neighborRight("<<startidx<<","<<endidx<<")"<<endl;
- int rborder = endidx;
- int* ret;
- while(startidx<=rborder) {
- ret = blockSource(startidx,rborder);
- if (ret[0]==startidx && ret[1]==rborder && ret[2]!=MINIMUM_INIT) {
- *getit = true;
- return ret;
- } else {
- delete[] ret;
- rborder--;
- }
- }
- ret = new int[4];
- ret[0]=-1; ret[1]=-1; ret[2]=-1; ret[3]=-1;
- *getit = false;
- return ret;
-}
diff --git a/decoder/dwarf.h b/decoder/dwarf.h
deleted file mode 100644
index 49d2a3b7..00000000
--- a/decoder/dwarf.h
+++ /dev/null
@@ -1,286 +0,0 @@
-#ifndef DWARF_H
-#define DWARF_H
-
-#include <cstdlib>
-#include <vector>
-#include <map>
-#include <string>
-#include <ostream>
-#include "wordid.h"
-#include "lattice.h"
-#include "trule.h"
-#include "tdict.h"
-#include <boost/functional/hash.hpp>
-#include <tr1/unordered_map>
-#include <boost/tuple/tuple.hpp>
-
-using namespace std;
-using namespace std::tr1;
-using namespace boost::tuples;
-using namespace boost;
-
-const static bool DEBUG = false;
-
-class CountTable {
-public:
- int* ultimate;
- map<WordID,int*> model;
- int mode;
- int numColumn;
- void print() const;
- void setup(int _numcolumn, int _mode) {
- mode = _mode; numColumn = _numcolumn;
- }
-};
-
-class Alignment {
-/* Alignment represents an alignment object in a 2D format to support function word-based models calculation
-
- A note about model's parameter estimation:
- ==========================================
- The model is estimated as a two-level Dirichlet process.
- For orientation model, the first tier estimation is:
- P(o|f,e) where *o* is the orientation value to estimate, *f* is the source function word aligned to *e*
- its second tier is: P(o|f), while its third tier is P(o)
- For dominance model, the first tier estimation is:
- P(d|f1,f2,e1,e2) where *d* is a dominance value to estimate, *f1,f2* are the neighboring function words on the source
- aligned to *e1,e2* on the target side
- its second tier is: P(d|f1,f2) while its third tier is P(d)
-
- Taking orientation model as a case in point, a two level estimation proceeds as follow:
- P(o|f,e) = c(o,f,e) + alpha { c(o,f) + beta [ c (o) / c(.) ] }
- ------------------------------
- c(f) + beta
- -------------------------------------------------
- c(f,e) + alpha
- where c() is a count function, alpha and beta are the concentration parameter
- of the first and second Dirichlet process respectively
- To encourage or penalize the use of second and third tier statistics, bo1 and bo2 binary features are introduced
-*/
-public:
- const static int MAX_WORDS = 200;
- const static int MINIMUM_INIT = 1000;
- const static int MAXIMUM_INIT = -1000;
- const static int MAX_ARITY = 2;
- WordID kSOS;
- WordID kEOS;
- WordID kUNK;
- double alpha_oris; // 1st concentration parameter for orientation model
- double beta_oris; // 2nd concentration parameter for orientation model
- double alpha_orit; // 1st concentration parameter for orientation model
- double beta_orit; // 2nd concentration parameter for orientation model
- double alpha_doms; // idem as above but for dominance model
- double beta_doms;
- double alpha_domt; // idem as above but for dominance model
- double beta_domt;
-
- // ACCESS to alignment
- void set(int j,int i); // j is the source index, while i is the target index
- void reset(int j,int i); // idem as above
- inline bool at(int j, int i) { return _matrix[j][i]; };
- inline int getJ() {return _J;}; // max source of the current alignment
- inline int getI() {return _I;}; // max target of the current alignment
- inline void setI(int I) { _I = I; };
- inline void setJ(int J) { _J = J; };
- inline void setF(vector<WordID> f) { _f=f;};
- inline void setE(vector<WordID> e) { _e=e;};
- inline WordID getF(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_f.size()) return TD::Convert("</s>"); return _f[id];};
- inline WordID getE(int id) { if (id<0) return TD::Convert("<s>"); if (id>=_e.size()) return TD::Convert("</s>"); return _e[id];};
- void clearAls(int prevJ=200, int prevI=200);
- int sourceOf(int i, int start = -1);
- int targetOf(int j, int start = -1);
- inline int minSSpan(int i) { return _sSpan[i][0];}
- inline int maxSSpan(int i) { return _sSpan[i][1];}
- inline int minTSpan(int j) { return _tSpan[j][0];}
- inline int maxTSpan(int j) { return _tSpan[j][1];}
- static inline int link(int s, int t) { return (s << 16) | t; }
- static inline int source(int st) {return st >> 16; }
- static inline int target(int st) {return st & 0xffff; }
- inline void setAlphaOris(double val) { alpha_oris=val; }
- inline void setAlphaOrit(double val) { alpha_orit=val; }
- inline void setAlphaDoms(double val) { alpha_doms=val; }
- inline void setAlphaDomt(double val) { alpha_domt=val; }
- inline void setBetaOris(double val) { beta_oris=val; }
- inline void setBetaOrit(double val) { beta_orit=val; }
- inline void setBetaDoms(double val) { beta_doms=val; }
- inline void setBetaDomt(double val) { beta_domt=val; }
- inline void setFreqCutoff(int val) { cout << _freq_cutoff << " to " << val << endl; _freq_cutoff=val; }
- string AsString();
- string AsStringSimple();
- int* SOS();
- int* EOS();
-
- // Model related function
- Alignment();
- // Given the current *rule* and its antecedents, construct an alignment space and mark the function word alignments
- // according *sfw* and *tfw*
- bool prepare(TRule& rule, const std::vector<const void*>& ant_contexts,
- const map<WordID,int>& sfw, const map<WordID,int>& tfw, const Lattice& sourcelattice, int spanstart, int spanend);
-
- // Compute orientation model score which parameters are stored in *table* and pass the values accordingly
- // will call Orientation(Source|Target) and ScoreOrientation(Source|Target)
- void computeOrientationSource(const CountTable& table, double *cost, double *bonus, double *bo1,
- double *bo1_bonus, double *bo2, double *bo2_bonus);
- void computeOrientationSourcePos(const CountTable& table, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
- void computeOrientationSourceGen(const CountTable& table, double *cost, double *bonus, double *bo1,
- double *bo1_bonus, double *bo2, double *bo2_bonus, const map<WordID,WordID>& tags);
- void computeOrientationSourceBackward(const CountTable& table, double *cost, double *bonus, double *bo1,
- double *bo1_bonus, double *bo2, double *bo2_bonus);
- void computeOrientationSourceBackwardPos(const CountTable& table, double *cost, double *bonus, double *bo1,
- double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
- void computeOrientationTarget(const CountTable& table, double *cost, double *bonus, double *bo1,
- double *bo1_bonus, double *bo2, double *bo2_bonus);
- void computeOrientationTargetBackward(const CountTable& table, double *cost, double *bonus, double *bo1,
- double *bo1_bonus, double *bo2, double *bo2_bonus);
- // Get the orientation value of a function word at a particular index *fw*
- // assign the value to either *oril* or *orir* accoring to *Lcompute* and *Rcompute*
- void OrientationSource(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
- void OrientationSource(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
- int OrientationSource(int* left, int* right);
- void OrientationTarget(int fw, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
- void OrientationTarget(int fw0, int fw1, int*oril, int* orir, bool Lcompute=true, bool Rcompute=true);
-
- vector<int> OrientationSourceLeft4Sampler(int fw0, int fw1);
- vector<int> OrientationSourceLeft4Sampler(int fw);
- vector<int> OrientationSourceRight4Sampler(int fw0, int fw1);
- vector<int> OrientationSourceRight4Sampler(int fw);
- vector<int> OrientationTargetLeft4Sampler(int fw0, int fw1);
- vector<int> OrientationTargetLeft4Sampler(int fw);
- vector<int> OrientationTargetRight4Sampler(int fw0, int fw1);
- vector<int> OrientationTargetRight4Sampler(int fw);
-
- // Given an orientation value *ori*, estimate the score accoding to *cond1*, *cond2*
- // and assign the value accordingly according to *isBonus* and whether the first or the second tier estimation
- // is used or not
- void ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2,
- bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
- double *bo2, double *bo2_bonus, double alpha1, double beta1);
- void ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond,
- bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
- double *bo2, double *bo2_bonus, double alpha1, double beta1);
- double ScoreOrientationRight(const CountTable& table, int ori, WordID cond1, WordID cond2);
- double ScoreOrientationLeft(const CountTable& table, int ori, WordID cond1, WordID cond);
- void ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2,
- bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
- double *bo2, double *bo2_bonus, double alpha1, double beta1);
- void ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond,
- bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
- double *bo2, double *bo2_bonus, double alpha1, double beta1);
- double ScoreOrientationRightBackward(const CountTable& table, int ori, WordID cond1, WordID cond2);
- double ScoreOrientationLeftBackward(const CountTable& table, int ori, WordID cond1, WordID cond);
- void ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2,
- bool isBonus, double *cost, double *bonus, double *bo1, double *bo1_bonus,
- double *bo2, double *bo2_bonus, double alpha1, double beta1);
- double ScoreOrientation(const CountTable& table, int offset, int ori, WordID cond1, WordID cond2);
-
- // idem as above except these are for dominance model
- void computeDominanceSource(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
- void computeDominanceSourcePos(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus, int maxfwidx, int maxdepth1, int maxdepth2);
- void computeDominanceTarget(const CountTable& table, WordID lfw, WordID rfw, double *cost, double *bonus,
- double *bo1, double *bo1_bonus, double *bo2, double *bo2_bonus);
- void computeBorderDominanceSource(const CountTable& table, double *cost, double *bonus,
- double *state_mono, double *state_nonmono,
- TRule &rule, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
- int DominanceSource(int fw1, int fw2);
- int DominanceTarget(int fw1, int fw2);
- vector<int> DominanceSource4Sampler(int fw1, int fw2);
- vector<int> DominanceTarget4Sampler(int fw1, int fw2);
- void ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2,
- double *cost, double *bo1, double *bo2, bool isBonus, double alpha2, double beta2);
- double ScoreDominance(const CountTable& table, int dom, WordID s1, WordID s2, WordID t1, WordID t2);
-
- // Remove all function word alignments except those at the borders
- // May result in more than two function word alignments at each side, because this function
- // will continue keeping function word alignments until the first aligned word at each side
- void BorderingSFWsOnly();
- void BorderingTFWsOnly();
- void simplify(int *ret); // preparing the next state
- void simplify_nofw(int *ret); // preparing the next state when no function word appears
- // set the first part of the next state, which concerns with function word
- // fas, las, fat, lat is the (f)irst or (l)ast function word alignments either on the (s)ource or (t)arget
- // these parameters to anticipate cases where there are more than two function word alignments
- void FillFWIdxsState(int *state, int fas, int las, int fat, int lat);
-
- // Helper function to obtain the aligned words on the other side
- // WARNING!!! Only to be used if the als are in sync with either source or target sentences
- WordID F2EProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
- WordID E2FProjectionFromExternal(int idx, const vector<AlignmentPoint>& als, const string& delimiter=" ");
- // WARNING!!! Only to be used in dwarf_main.cc
- // These two function words assume that the alignment contains phrase boundary
- // but the source and target sentences do not
- WordID F2EProjection(int idx, const string& delimiter=" ");
- WordID E2FProjection(int idx, const string& delimiter=" ");
- void SetCurrAlVector();
- int* blockSource(int fw1, int fw2);
- int* blockTarget(int fw1, int fw2);
- void ToArrayInt(vector<int>* arr);
- int* neighborLeft(int startidx, int endidx, bool* found);
- int* neighborRight(int startidx, int endidx, bool* found);
-private:
- // Hash to avoid redundancy
- unordered_map<vector<int>, int, boost::hash<vector<int> > > oris_hash;
- unordered_map<vector<int>, int, boost::hash<vector<int> > > orit_hash;
- unordered_map<vector<int>, int, boost::hash<vector<int> > > doms_hash;
- unordered_map<vector<int>, int, boost::hash<vector<int> > > domt_hash;
- unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > simplify_hash;
- unordered_map<vector<int>, vector<int>, boost::hash<vector<int> > > prepare_hash;
-
- int _J; // effective source length;
- int _I; // effective target length;
- bool _matrix[MAX_WORDS][MAX_WORDS]; // true if aligned
- short _sSpan[MAX_WORDS][2]; //the source span of a target index; 0->min, 1->max
- short _tSpan[MAX_WORDS][2]; //the target span of a source index; 0->min, 2->max
- int _freq_cutoff;
- int SourceFWRuleIdxs[40]; //the indexes of function words in the rule;
- // The following applies to all *FW*Idxs
- // *FW*Idxs[0] = size
- // *FW*Idxs[idx*3-2] = index in the alignment, where idx starts from 1 to size
- // *FW*Idxs[idx*3-1] = source WordID
- // *FW*Idxs[idx*3] = target WordID
- int SourceFWRuleAbsIdxs[40];
- int TargetFWRuleIdxs[40]; //the indexes of function words in the rule; zeroth element is the count
- int ** SourceFWAntsIdxs; //the indexes of function words in antecedents
- int ** SourceFWAntsAbsIdxs;
- int ** TargetFWAntsIdxs; //the indexes of function words in antecedents
- int SourceRuleIdxs[40]; //the indexes of SOURCE tokens (zeroth element is the number of source tokens)
- //>0 means terminal, -i means the i-th Xs
- int TargetRuleIdxs[40]; //the indexes of TARGET tokens (zeroth element is the number of target tokens)
- int ** SourceAntsIdxs; //the array of indexes of a particular antecedent's SOURCE tokens
- int ** TargetAntsIdxs; //the array of indexes of a particular antecedent's TARGET tokens
- int SourceFWIdxs[40];
- int SourceFWAbsIdxs[40];
- int TargetFWIdxs[40];
- // *sort* and *quickSort* are used to sort *FW*Idxs
- void sort(int* num);
- void quickSort(int arr[], int top, int bottom);
-
- // *block(Source|Target)* finds the minimum block that containts two indexes (fw1 and fw2)
- inline int least(int i1, int i2) { return (i1<i2)?i1:i2; }
- inline int most(int i1, int i2) { return (i1>i2)?i1:i2; }
- void simplifyBackward(vector<int *>*blocks, int* block, const vector<int>& danglings);
- // used in simplify to check whether an atomic block according to source function words is also atomic according
- // to target function words as well, otherwise break it
- // the resulting blocks are added into *blocks*
- int _Arity;
- std::vector<WordID> _f; // the source sentence of the **current** rule (may not consistent with the current alignment)
- std::vector<WordID> _e; // the target sentence of the **current** rule
- int RuleAl[40];
- int **AntsAl;
- int firstSourceAligned(int start);
- int firstTargetAligned(int start);
- int lastSourceAligned(int end);
- int lastTargetAligned(int end);
- int fas, las, fat, lat; // first aligned source, last aligned source, first aligned target, last aligned target
- bool MemberOf(int* FWIdxs, int pos1, int pos2); // whether FWIdxs contains pos1 and pos2 consecutively
- // Convert the alignment to vector form, will be used for hashing purposes
- vector<int> curr_al;
- int GetFWGlobalIdx(int idx, const Lattice& sourcelattice, vector<WordID>& sources, int spanstart, int spanend, const std::vector<const void*>& ant_contexts, const map<WordID,int>& sfw);
- int GetFirstFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
- int GetLastFWIdx(int spanstart,int spanend, const Lattice& sourcelattice, const map<WordID,int>& sfw);
- WordID generalize(WordID original, const map<WordID,WordID>& tags, bool pos=false);
-};
-
-#endif
diff --git a/decoder/earley_composer.cc b/decoder/earley_composer.cc
index efce70a6..d47a6969 100644
--- a/decoder/earley_composer.cc
+++ b/decoder/earley_composer.cc
@@ -4,8 +4,14 @@
#include <fstream>
#include <map>
#include <queue>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+# include <unordered_set>
+#else
+# include <tr1/unordered_map>
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_map; using std::tr1::unordered_multiset; using std::tr1::unordered_set; }
+#endif
#include <boost/shared_ptr.hpp>
#include <boost/program_options.hpp>
@@ -19,7 +25,6 @@
#include "hg_remove_eps.h"
using namespace std;
-using namespace std::tr1;
// Define the following macro if you want to see lots of debugging output
// when you run the chart parser
diff --git a/decoder/factored_lexicon_helper.cc b/decoder/factored_lexicon_helper.cc
index 7203b325..e7899215 100644
--- a/decoder/factored_lexicon_helper.cc
+++ b/decoder/factored_lexicon_helper.cc
@@ -2,6 +2,7 @@
#include "filelib.h"
#include "stringlib.h"
+#include "sentence_metadata.h"
using namespace std;
diff --git a/decoder/factored_lexicon_helper.h b/decoder/factored_lexicon_helper.h
index 81c75275..7fedc517 100644
--- a/decoder/factored_lexicon_helper.h
+++ b/decoder/factored_lexicon_helper.h
@@ -6,7 +6,8 @@
#include <string>
#include <map>
#include "tdict.h"
-#include "sentence_metadata.h"
+
+struct SentenceMetadata;
// when computing features, it can be advantageous to:
// 1) back off to less specific forms (e.g., less highly inflected forms, POS tags, etc)
diff --git a/decoder/ff_context.cc b/decoder/ff_context.cc
index f2b0e67c..e56f6f1f 100644
--- a/decoder/ff_context.cc
+++ b/decoder/ff_context.cc
@@ -46,7 +46,7 @@ void RuleContextFeatures::ReplaceMacroWithString(
macro << relative_location << "]";
int macro_index = feature_instance.find(macro.str());
if (macro_index == string::npos) {
- cerr << "Can't find macro " << macro << " in feature template "
+ cerr << "Can't find macro " << macro.str() << " in feature template "
<< feature_instance;
abort();
}
diff --git a/decoder/ff_dwarf.cc b/decoder/ff_dwarf.cc
deleted file mode 100644
index fe7a472e..00000000
--- a/decoder/ff_dwarf.cc
+++ /dev/null
@@ -1,894 +0,0 @@
-#include <vector>
-#include <sstream>
-#include <fstream>
-#include <string>
-#include <iostream>
-#include <map>
-#include "hg.h"
-#include "ff_dwarf.h"
-#include "dwarf.h"
-#include "wordid.h"
-#include "tdict.h"
-#include "filelib.h"
-#include "sentence_metadata.h"
-#include "stringlib.h"
-
-using namespace std;
-
-Dwarf::Dwarf(const std::string& param) {
-/* Param is a space separated string which contains any or all of the following:
- oris|orit|doms|domt=filename
- e.g. oris=/fs/clip-galep3eval/hendra/z2e/oris128.gz
-*/
- sSOS="<s>";
- sEOS="</s>";
- kSOS=TD::Convert(sSOS);
- kEOS=TD::Convert(sEOS);
- kGOAL=TD::Convert("S")*-1;
- _sent_id = (int *)malloc(sizeof(int));
- *_sent_id = -1;
- if (DEBUG) cerr << "here = " << *_sent_id << endl;
- _fwcount = (int *)malloc(sizeof(int));
- *_fwcount = -1;
- cerr << "initializing dwarf" << endl;
- flag_oris=false; flag_orit=false; flag_doms=false; flag_domt=false; flag_tfw_count=false;
- flag_bdoms=false; flag_porislr=false, flag_porisrl=false, flag_goris=false; flag_pgorislr=false, flag_pgorisrl=false;
- flag_pdomslr=false; flag_pdomsrl=false; flag_pgdomslr=false; flag_pgdomsrl=false; flag_gdoms=false;
- flag_oris_backward=false; flag_orit_backward=false;
- explicit_soseos=false;
- SetStateSize(STATE_SIZE*sizeof(int));
- als = new Alignment();
- als->clearAls(Alignment::MAX_WORDS,Alignment::MAX_WORDS);
- istringstream iss(param); string w;
- while(iss >> w) {
- int equal = w.find_first_of("=");
- if (equal!=string::npos) {
- string model = w.substr(0,equal);
- vector<string> params;
- Tokenize(w.substr(equal+1),',',&params);
- string fn = params[0];
- if (model == "minfreq") {
- cerr << "model minfreq " << fn << endl;
- als->setFreqCutoff(atoi(fn.c_str()));
- } else if (model == "oris") {
- flag_oris = readOrientation(&toris,fn,&sfw);
- if (flag_oris) {
- oris_ = FD::Convert("OrientationSource");
- //oris_bo1_ = FD::Convert("OrientationSource_BO1");
- //oris_bo2_ = FD::Convert("OrientationSource_BO2");
- }
- if (params.size()>1) als->setAlphaOris(atof(params[1].c_str()));
- if (params.size()>2) als->setBetaOris(atof(params[2].c_str()));
- } else if (model == "porislr") {
- flag_porislr = readOrientation(&tporislr,fn,&sfw,true);
- poris_nlr = 0;
- if (flag_porislr) {
- porislr_ = FD::Convert("OrientationSourcePositionfulLeftRight");
- }
- if (params.size()>1) poris_nlr = atoi(params[1].c_str());
- if (DEBUG) cerr << " maximum poris depth=" << poris_nlr << endl;
- } else if (model == "porisrl") {
- flag_porisrl = readOrientation(&tporisrl,fn,&sfw,true);
- poris_nrl = 0;
- if (flag_porisrl) {
- porisrl_ = FD::Convert("OrientationSourcePositionfulRightLeft");
- }
- if (params.size()>1) poris_nrl = atoi(params[1].c_str());
- if (DEBUG) cerr << " maximum poris depth=" << poris_nrl << endl;
- } else if (model=="goris") {
- flag_goris = readOrientation(&tgoris,fn,&sfw);
- if (flag_goris) {
- goris_ = FD::Convert("OrientationSourceGeneralized");
- }
- if (params.size()>1) {
- readTags(params[1],&tags);
- generalizeOrientation(&tgoris,tags);
- }
- } else if (model=="pgorislr") {
- flag_pgorislr = readOrientation(&tpgorislr,fn,&sfw,true);
- pgoris_nlr = 0;
- if (flag_pgorislr) {
- pgorislr_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight");
- }
- if (DEBUG) {
- cerr << "BEFORE GENERALIZATION" << endl;
- tpgorislr.print();
- }
- if (params.size()>1) pgoris_nlr = atoi(params[1].c_str());
- if (params.size()>2) {
- readTags(params[2],&tags);
- generalizeOrientation(&tpgorislr,tags,true);
- }
- if (DEBUG) {
- cerr << "AFTER GENERALIZATION" << endl;
- tpgorislr.print();
- }
- } else if (model=="pgorisrl") {
- flag_pgorisrl = readOrientation(&tpgorisrl,fn,&sfw,true);
- pgoris_nrl = 0;
- if (flag_pgorisrl) {
- pgorisrl_ = FD::Convert("OrientationSourceGeneralizedPositionfulLeftRight");
- }
- if (params.size()>1) pgoris_nrl = atoi(params[1].c_str());
- if (params.size()>2) {
- readTags(params[2],&tags);
- generalizeOrientation(&tpgorisrl,tags,true);
- }
- } else if (model == "oris_backward") {
- flag_oris_backward = true;
- if (!flag_oris) readOrientation(&toris,fn,&sfw);
- oris_backward_ = FD::Convert("OrientationSourceBackward");
- if (params.size()>1) als->setAlphaOris(atof(params[1].c_str()));
- if (params.size()>2) als->setBetaOris(atof(params[2].c_str()));
- } else if (model == "orit") {
- flag_orit = readOrientation(&torit,fn,&tfw);
- if (flag_orit) {
- orit_ = FD::Convert("OrientationTarget");
- //orit_bo1_ = FD::Convert("OrientationTarget_BO1");
- //orit_bo2_ = FD::Convert("OrientationTarget_BO2");
- }
- if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str()));
- if (params.size()>2) als->setBetaOrit(atof(params[2].c_str()));
- } else if (model == "orit_backward") {
- flag_orit_backward = true;
- if (!flag_orit) readOrientation(&torit,fn,&tfw);
- orit_backward_ = FD::Convert("OrientationTargetBackward");
- if (params.size()>1) als->setAlphaOrit(atof(params[1].c_str()));
- if (params.size()>2) als->setBetaOrit(atof(params[2].c_str()));
- } else if (model == "doms") {
- flag_doms = readDominance(&tdoms,fn,&sfw);
- if (flag_doms) {
- doms_ = FD::Convert("DominanceSource");
- //doms_bo1_ = FD::Convert("DominanceSource_BO1");
- //doms_bo2_ = FD::Convert("DominanceSource_BO2");
- }
- if (params.size()>1) als->setAlphaDoms(atof(params[1].c_str()));
- if (params.size()>2) als->setBetaDoms(atof(params[2].c_str()));
- } else if (model == "pdomsrl") {
- flag_pdomsrl = readDominance(&tpdomsrl,fn,&sfw,true);
- if (flag_pdomsrl) {
- pdomsrl_ = FD::Convert("DominanceSourcePositionfulRightLeft");
- }
- if (params.size()>1) pdoms_nrl = atoi(params[1].c_str());
- } else if (model == "pdomslr") {
- flag_pdomslr = readDominance(&tpdomslr,fn,&sfw,true);
- tpdomslr.print();
- if (flag_pdomslr) {
- pdomslr_ = FD::Convert("DominanceSourcePositionfulLeftRight");
- }
- if (params.size()>1) pdoms_nlr = atoi(params[1].c_str());
- } else if (model == "pgdomsrl") {
- flag_pgdomsrl = readDominance(&tpgdomsrl,fn,&sfw,true);
- if (flag_pgdomsrl) {
- pgdomsrl_ = FD::Convert("DominanceSourceGeneralizedPositionfulRightLeft");
- }
- if (params.size()>1) pgdoms_nrl = atoi(params[1].c_str());
- if (params.size()>2) {
- readTags(params[2],&tags);
- generalizeDominance(&tpgdomsrl,tags,true);
- }
- } else if (model == "pgdomslr") {
- flag_pgdomslr = readDominance(&tpgdomslr,fn,&sfw,true);
- if (flag_pgdomslr) {
- pgdomslr_ = FD::Convert("DominanceSourceGeneralizedPositionfulLeftRight");
- }
- if (params.size()>1) pgdoms_nlr = atoi(params[1].c_str());
- if (params.size()>2) {
- readTags(params[2],&tags);
- if (DEBUG) {
- for (map<WordID,WordID>::const_iterator it=tags.begin(); it!=tags.end(); it++) {
- cerr << "tags = " << TD::Convert(it->first) << ", " << TD::Convert(it->second) << endl;
- }
- }
- generalizeDominance(&tpgdomslr,tags,true);
- }
- if (DEBUG) tpgdomslr.print();
- } else if (model == "bdoms") {
- flag_bdoms = readDominance(&tbdoms,fn,&sfw);
- if (flag_bdoms) {
- bdoms_ = FD::Convert("BorderDominanceSource");
- }
- } else if (model == "domt") {
- flag_domt = readDominance(&tdomt,fn,&tfw);
- if (flag_domt) {
- domt_ = FD::Convert("DominanceTarget");
- //domt_bo1_ = FD::Convert("DominanceTarget_BO1");
- //domt_bo2_ = FD::Convert("DominanceTarget_BO2");
- }
- if (params.size()>1) als->setAlphaDomt(atof(params[1].c_str()));
- if (params.size()>2) als->setBetaDomt(atof(params[2].c_str()));
- } else if (model== "tfw_count") {
- flag_tfw_count = readList(fn,&tfw);
- tfw_count_ = FD::Convert("TargetFunctionWordsCount");
- } else {
- cerr << "DWARF doesn't understand this model: " << model << endl;
- }
- } else {
- if (w=="tfw_count") {
- flag_tfw_count = true;
- tfw_count_ = FD::Convert("TargetFunctionWordsCount");
- } else if (w=="oris_backward") {
- flag_oris_backward = true;
- oris_backward_ = FD::Convert("OrientationSourceBackward");
- } else if (w=="orit_backward") {
- flag_orit_backward = true;
- orit_backward_ = FD::Convert("OrientationTargetBackward");
- } else if (w=="explicit_soseos") {
- explicit_soseos=true;
- } else {
- cerr << "DWARF doesn't need this param: " << param << endl;
- }
- }
- }
- for (map<WordID,int>::const_iterator it=sfw.begin(); it!=sfw.end() && DEBUG; it++) {
- cerr << " FW:" << TD::Convert(it->first) << endl;
- }
-}
-
-// Scores one hypergraph edge with every enabled DWARF model and writes the
-// edge's state into *context*.
-//   smeta               sentence metadata (id, source lattice, optional reference)
-//   edge                edge being scored; edge.i_/edge.j_ are its span
-//   ant_contexts        antecedent states, handed to als->prepare
-//   features            receives the cost of each enabled model
-//   estimated_features  receives the bonus (estimate) of each enabled model
-//   context             output state, written as an array of ints; slots 50, 51
-//                       and STATE_SIZE-1 are filled here, the rest by als->simplify*
-// For the source-side models, when the edge is the GOAL edge spanning the whole
-// source and explicit_soseos is off, the bonus is folded into the cost.
-void Dwarf::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                  const Hypergraph::Edge& edge,
-                                  const std::vector<const void*>& ant_contexts,
-                                  SparseVector<double>* features,
-                                  SparseVector<double>* estimated_features,
-                                  void* context) const {
-  if (DEBUG) cerr << "TraversalFeaturesImpl" << endl;
-  // Everything is zero-initialised: some branches below only reset cost/bonus
-  // before passing the bo* slots to the scorers.
-  double cost = 0, bonus = 0, bo1 = 0, bo2 = 0, bo1_bonus = 0, bo2_bonus = 0;
-  double bdoms_state_mono = 0;
-  double bdoms_state_nonmono = 0;
-  // Bound by reference so that no rule or lattice copy is made per edge.
-  const TRule& r = *edge.rule_;
-  const Lattice& src = smeta.GetSourceLattice();
-  const int src_len = static_cast<int>(src.size());
-  if (DEBUG) cerr << " sent_id=" << *_sent_id << ", " << smeta.GetSentenceID() << endl;
-  if (DEBUG) cerr << "rule = " << r.AsString() << endl;
-  if (DEBUG) cerr << "rule[i,j] = " << edge.i_ << "," << edge.j_ << endl;
-  if (*_sent_id != smeta.GetSentenceID()) {  // new sentence: recount its source function words
-    *_sent_id = smeta.GetSentenceID();
-    *_fwcount = 0;
-    for (int i = 0; i < smeta.GetSourceLength(); i++) {
-      if (sfw.find(src[i][0].label) != sfw.end()) {
-        *_fwcount += 1;
-      }
-    }
-    if (DEBUG) cerr << "new sentence[" << *_sent_id << "]="<<*_fwcount<<endl;
-  }
-  // prepare returns *nofw*: whether the resulting alignment contains no function
-  // words. If so, the models do not have to be calculated and the state is
-  // produced by the simpler simplify_nofw.
-  bool nofw = als->prepare(*edge.rule_, ant_contexts, sfw, tfw, smeta.GetSourceLattice(), edge.i_, edge.j_);
-  bool isFinal = (edge.i_==0 && edge.j_==smeta.GetSourceLength() && r.GetLHS()==kGOAL);
-  const bool foldBonus = isFinal && !explicit_soseos;
-  if (DEBUG) cerr << "nofw = " << nofw << endl;
-  if (flag_tfw_count) {
-    // number of target-side rule symbols that are target function words
-    double count = 0;
-    for (int i = 0; i < static_cast<int>(r.e_.size()); i++) {
-      if (tfw.find(r.e_[i]) != tfw.end()) count++;
-    }
-    features->set_value(tfw_count_, count);
-  }
-  if (flag_oris) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeOrientationSource(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(oris_,cost);
-    //features->set_value(oris_bo1_,bo1);
-    //features->set_value(oris_bo2_,bo2);
-    estimated_features->set_value(oris_,bonus);
-    //estimated_features->set_value(oris_bo1_,bo1_bonus);
-    //estimated_features->set_value(oris_bo2_,bo2_bonus);
-  }
-  if (flag_porislr) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw)
-      als->computeOrientationSourcePos(tporislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,poris_nlr,0);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(porislr_,cost);
-    estimated_features->set_value(porislr_,bonus);
-  }
-  if (flag_porisrl) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw)
-      als->computeOrientationSourcePos(tporisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,poris_nrl);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(porisrl_,cost);
-    estimated_features->set_value(porisrl_,bonus);
-  }
-  if (flag_pgorislr) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw)
-      als->computeOrientationSourcePos(tpgorislr,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgoris_nlr,0);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(pgorislr_,cost);
-    estimated_features->set_value(pgorislr_,bonus);
-  }
-  if (flag_pgorisrl) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw)
-      als->computeOrientationSourcePos(tpgorisrl,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgoris_nrl);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(pgorisrl_,cost);
-    estimated_features->set_value(pgorisrl_,bonus);
-  }
-  if (flag_goris) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeOrientationSource(tgoris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(goris_,cost);
-    estimated_features->set_value(goris_,bonus);
-  }
-  if (flag_oris_backward) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw)
-      als->computeOrientationSourceBackward(toris,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(oris_backward_,cost);
-    estimated_features->set_value(oris_backward_,bonus);
-  }
-  // Nearest source function word on each side of the span, used by the
-  // dominance models; defaults are the sentence boundary markers.
-  WordID _lfw = kSOS;
-  WordID _rfw = kEOS;
-  if (flag_doms || flag_pdomslr || flag_pdomsrl || flag_pgdomslr || flag_pgdomsrl) {
-    if (DEBUG) cerr << " seeking lfw and rfw" << endl;
-    int start = edge.i_;
-    int end = edge.j_;
-    if (DEBUG) cerr << " start=" << start << ", end=" << end << endl;
-    for (int idx = start-1; idx >= 0; idx--) {
-      if (DEBUG) cerr << " checking idx=" << idx << ", label=" << src[idx][0].label << "-" << TD::Convert(src[idx][0].label) << endl;
-      if (sfw.find(src[idx][0].label) != sfw.end()) {
-        if (DEBUG) cerr << "+";
-        _lfw = src[idx][0].label; break;
-      }
-    }
-    for (int idx = end; idx < src_len; idx++) {  // end or end+1
-      if (DEBUG) cerr << " checking idx=" << idx << ", label=" << src[idx][0].label << "-" << TD::Convert(src[idx][0].label) << endl;
-      if (sfw.find(src[idx][0].label) != sfw.end()) {
-        if (DEBUG) cerr << ".";
-        _rfw = src[idx][0].label; break;
-      }
-    }
-    if (foldBonus) {
-      _lfw = kSOS; _rfw = kEOS;
-    }
-  }
-  if (flag_doms) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeDominanceSource(tdoms,_lfw,_rfw,&cost,&bonus,
-                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus);
-    if (DEBUG) cerr << " COST=" << cost << ", BONUS=" << bonus << endl;
-    if (foldBonus) {
-      cost += bonus;
-      if (DEBUG) cerr << " final and !explicit_soseos, thus cost = " << cost << endl;
-      bonus = 0;
-    }
-    features->set_value(doms_,cost);
-    estimated_features->set_value(doms_,bonus);
-  }
-  if (flag_pdomslr) {
-    if (DEBUG) cerr << " flag_pdomslr true, nofw=" << nofw << endl;
-    if (DEBUG) cerr << " lfw=" << _lfw << ", rfw=" << _rfw << endl;
-    if (DEBUG) cerr << " kSOS=" << kSOS << ", kEOS=" << kEOS << endl;
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeDominanceSourcePos(tpdomslr,_lfw,_rfw,&cost,&bonus,
-                                              &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pdoms_nlr,0);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(pdomslr_,cost);
-    estimated_features->set_value(pdomslr_,bonus);
-  }
-  if (flag_pdomsrl) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeDominanceSourcePos(tpdomsrl,_lfw,_rfw,&cost,&bonus,
-                                              &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pdoms_nrl);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(pdomsrl_,cost);
-    estimated_features->set_value(pdomsrl_,bonus);
-  }
-  if (flag_pgdomslr) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeDominanceSourcePos(tpgdomslr,_lfw,_rfw,&cost,&bonus,
-                                              &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,pgdoms_nlr,0);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(pgdomslr_,cost);
-    estimated_features->set_value(pgdomslr_,bonus);
-  }
-  if (flag_pgdomsrl) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeDominanceSourcePos(tpgdomsrl,_lfw,_rfw,&cost,&bonus,
-                                              &bo1,&bo1_bonus,&bo2,&bo2_bonus,*_fwcount,0,pgdoms_nrl);
-    if (foldBonus) {
-      cost += bonus;
-      bonus = 0;
-    }
-    features->set_value(pgdomsrl_,cost);
-    estimated_features->set_value(pgdomsrl_,bonus);
-  }
-
-  if (flag_bdoms) {
-    cost=0; bonus=0; bdoms_state_mono=0; bdoms_state_nonmono=0;
-    if (!nofw)
-      als->computeBorderDominanceSource(tbdoms,&cost,&bonus,
-                                        &bdoms_state_mono, &bdoms_state_nonmono,*edge.rule_, ant_contexts, sfw);
-    features->set_value(bdoms_,cost);
-    estimated_features->set_value(bdoms_,bonus);
-  }
-  if (flag_orit) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeOrientationTarget(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus);
-    if (DEBUG) cerr << "cost=" << cost << ", bonus=" << bonus << ", bo1=" << bo1 << ", bo1_bonus=" << bo1_bonus << ", bo2=" << bo2 << ", bo2_bonus=" << bo2_bonus << endl;
-    features->set_value(orit_,cost);
-    //features->set_value(orit_bo1_,bo1);
-    //features->set_value(orit_bo2_,bo2);
-    estimated_features->set_value(orit_,bonus);
-    //estimated_features->set_value(orit_bo1_,bo1_bonus);
-    //estimated_features->set_value(orit_bo2_,bo2_bonus);
-  }
-  if (flag_orit_backward) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    if (!nofw) als->computeOrientationTargetBackward(torit,&cost,&bonus,&bo1,&bo1_bonus,&bo2,&bo2_bonus);
-    features->set_value(orit_backward_,cost);
-    estimated_features->set_value(orit_backward_,bonus);
-  }
-  if (flag_domt) {
-    cost=0; bonus=0; bo1=0; bo2=0; bo1_bonus=0; bo2_bonus=0;
-    // Nearest target function words around the span, looked up in the reference
-    // when one is available; -1 means "none found".
-    WordID ref_lfw = -1;
-    WordID ref_rfw = -1;
-    if (smeta.HasReference()) {
-      const Lattice& ref = smeta.GetReference();
-      const int ref_len = static_cast<int>(ref.size());
-      // edge.i_ may lie beyond the end of the reference, so clamp the start of
-      // the leftward scan to the last reference column.
-      int idx = edge.i_ - 1;
-      if (idx >= ref_len) idx = ref_len - 1;
-      for (; idx >= 0; idx--) {
-        // guard the column being read (not the whole lattice) against emptiness
-        if (ref[idx].size() > 0 && tfw.find(ref[idx][0].label) != tfw.end()) {
-          ref_lfw = ref[idx][0].label; break;
-        }
-      }
-      for (idx = edge.j_; idx < ref_len; idx++) {  // end or end+1
-        if (idx >= 0 && ref[idx].size() > 0 && tfw.find(ref[idx][0].label) != tfw.end()) {
-          ref_rfw = ref[idx][0].label; break;
-        }
-      }
-    }
-    //neighboringFWs(smeta.GetReference(),edge.i_,edge.j_,tfw,&_lfw,&_rfw);
-    if (!nofw) als->computeDominanceTarget(tdomt,ref_lfw,ref_rfw,&cost,&bonus,
-                                           &bo1,&bo1_bonus,&bo2,&bo2_bonus);
-    features->set_value(domt_,cost);
-    //features->set_value(domt_bo1_,bo1);
-    //features->set_value(domt_bo2_,bo2);
-    estimated_features->set_value(domt_,bonus);
-    //estimated_features->set_value(domt_bo1_,bo1_bonus);
-    //estimated_features->set_value(domt_bo2_,bo2_bonus);
-  }
-  // Write the state of this edge.
-  int* vcontext = reinterpret_cast<int *>(context);
-  if (!nofw) {
-    als->BorderingSFWsOnly();
-    als->BorderingTFWsOnly();
-    als->simplify(vcontext);
-  } else {
-    als->simplify_nofw(vcontext);
-  }
-  vcontext[50] = DoubleToInteger(bdoms_state_mono);
-  vcontext[51] = DoubleToInteger(bdoms_state_nonmono);
-  vcontext[STATE_SIZE-1] = Alignment::link(edge.i_,edge.j_);
-  if (DEBUG) {
-    cerr << "state@traverse = ";
-    for (int idx=0; idx<STATE_SIZE; idx++) cerr << idx << "." << vcontext[idx] << " ";
-    cerr << endl;
-    cerr << "bdoms_state_mono=" << bdoms_state_mono << ", state[50]=" << IntegerToDouble(vcontext[50]) << endl;
-    cerr << "bdoms_state_nonmono=" << bdoms_state_nonmono << ", state[51]=" << IntegerToDouble(vcontext[51]) << endl;
-  }
-}
-
-// Narrows *val* to float and returns the float's bytes packed into an int, so
-// the value can be stored in an int-typed state slot.
-// Assumes int is at least as wide as float.
-int Dwarf::DoubleToInteger(double val) {
-  const float narrowed = static_cast<float>(val);
-  int packed = 0;
-  // Copy byte by byte: reading a float object through an int* violates the
-  // strict-aliasing rule, whereas access through unsigned char is allowed.
-  const unsigned char* from = reinterpret_cast<const unsigned char*>(&narrowed);
-  unsigned char* to = reinterpret_cast<unsigned char*>(&packed);
-  for (unsigned b = 0; b < sizeof(float) && b < sizeof(int); ++b) to[b] = from[b];
-  return packed;
-}
-
-// Inverse of DoubleToInteger: reinterprets the bytes of *val* as a float and
-// widens it to double. Assumes int is at least as wide as float.
-double Dwarf::IntegerToDouble(int val) {
-  float unpacked = 0;
-  // Byte-wise copy instead of dereferencing a float* that points at an int,
-  // which would violate the strict-aliasing rule.
-  const unsigned char* from = reinterpret_cast<const unsigned char*>(&val);
-  unsigned char* to = reinterpret_cast<unsigned char*>(&unpacked);
-  for (unsigned b = 0; b < sizeof(float) && b < sizeof(int); ++b) to[b] = from[b];
-  return static_cast<double>(unpacked);
-}
-
-// Length of the first arc leaving column p of l. Falls back to 1 when p is out
-// of range, the column is empty, or the stored distance is not positive, so a
-// walk using this step always advances.
-static int dwarfFirstArcStep(const Lattice& l, int p) {
-  if (p < 0 || p >= static_cast<int>(l.size()) || l[p].size() == 0) return 1;
-  const int step = l[p][0].dist2next;
-  return step > 0 ? step : 1;
-}
-
-// Counts the entries of fw_hash met while walking the lattice leftwards from i
-// and rightwards from j along each column's first arc.
-//   *lfw  receives the count on the left,  *rfw the count on the right.
-void Dwarf::neighboringFWs(const Lattice& l, const int& i, const int& j, const map<WordID,int>& fw_hash, int* lfw, int* rfw) {
-  *lfw=0; *rfw=0;
-  const int len = static_cast<int>(l.size());
-  int idx = i - dwarfFirstArcStep(l, i);
-  if (idx >= len) idx = len - 1;  // never index past the last column
-  while (idx >= 0) {
-    if (l[idx].size() > 0 && fw_hash.find(l[idx][0].label) != fw_hash.end()) {
-      ++*lfw;  // bump the counter, not the pointer
-    }
-    idx -= dwarfFirstArcStep(l, idx);
-  }
-  idx = j + dwarfFirstArcStep(l, j);  // j may equal l.size(); the helper guards that
-  while (idx >= 0 && idx < len) {
-    if (l[idx].size() > 0 && fw_hash.find(l[idx][0].label) != fw_hash.end()) {
-      ++*rfw;
-    }
-    idx += dwarfFirstArcStep(l, idx);
-  }
-}
-
-// Adds counts[0..n) to the row stored in *model* under *key*; when the key is
-// new, the row is created as a private copy of counts.
-static void dwarfAddOrientationRow(std::map<WordID,int*>& model, const std::string& key, const int* counts, int n) {
-  const WordID id = TD::Convert(key);
-  std::map<WordID,int*>::iterator hit = model.find(id);
-  if (hit != model.end()) {
-    for (int k = 0; k < n; k++) hit->second[k] += counts[k];
-    return;
-  }
-  int* row = new int[n];
-  for (int k = 0; k < n; k++) row[k] = counts[k];
-  model.insert(pair<WordID,int*>(id, row));
-}
-
-// Loads an orientation count table from *filename*.
-// Each line is "source target" followed by up to 20 integer counts, read as
-// four groups of five (0 -> MA, 1 -> RA, 2 -> MG, 3 -> RG, 4 -> NO_NEIGHBOR).
-// Every group occupies six slots of a 24-int row: five counts plus their sum.
-// Missing counts are taken as 0. Reading stops at the first empty line.
-// The row is accumulated under the keys "source target" and "source", and, when
-// *pos* is set (source tokens then carry a "/index" suffix), also under the
-// indexed "source/index target". Every source word is registered in *fw*.
-// Always returns true.
-// TODO: check whether the file exists or not, return false if not
-bool Dwarf::readOrientation(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos) {
-  if (DEBUG) cerr << " readOrientation(" << filename << ", pos=" << pos << ")" << endl;
-  ReadFile rf(filename);
-  istream& in = *rf.stream();
-  table->setup(24,pos);
-  table->ultimate = new int[24];
-  for (int k = 0; k < 24; k++) table->ultimate[k] = 0;
-  string line;
-  while (getline(in, line) && !line.empty()) {
-    istringstream tokenizer(line);
-    string source, target, sourceidx, word;
-    tokenizer >> source >> target;
-    if (pos) {
-      // keep the indexed form and strip the "/index" suffix from the word
-      sourceidx = source;
-      source = sourceidx.substr(0, sourceidx.find_last_of("/"));
-    }
-    fw->insert(pair<WordID,int>(TD::Convert(source), 1));  // no-op if already known
-
-    int counts[24];
-    for (int group = 0; group < 4; group++) {
-      const int first = group * 6;
-      const int total = first + 5;
-      counts[total] = 0;
-      for (int k = first; k < total; k++) {
-        counts[k] = 0;
-        if (tokenizer >> word) counts[k] = atoi(word.c_str());
-        counts[total] += counts[k];
-      }
-    }
-    for (int k = 0; k < 24; k++) table->ultimate[k] += counts[k];
-
-    dwarfAddOrientationRow(table->model, source + " " + target, counts, 24);
-    dwarfAddOrientationRow(table->model, source, counts, 24);
-    if (pos) dwarfAddOrientationRow(table->model, sourceidx + " " + target, counts, 24);
-  }
-  return true;
-}
-
-// Reads one word per line from *filename* and registers each word in *fw*
-// with value 1. Always returns true.
-bool Dwarf::readList(const std::string& filename, std::map<WordID,int>* fw) {
-  ReadFile rf(filename);
-  istream& in = *rf.stream();
-  string word;
-  // Loop on the result of getline itself: testing the stream before reading
-  // lets the failed read at end of file register the empty string as a word.
-  while (getline(in, word)) {
-    if (word.empty()) continue;  // blank lines carry no word
-    fw->insert(pair<WordID,int>(TD::Convert(word), 1));  // no-op if already present
-  }
-  return true;
-}
-
-// Adds counts[0..n) to the row stored in *model* under *key*; when the key is
-// new, the row is created as a private copy of counts.
-static void dwarfAddDominanceRow(std::map<WordID,int*>& model, const std::string& key, const int* counts, int n) {
-  const WordID id = TD::Convert(key);
-  std::map<WordID,int*>::iterator hit = model.find(id);
-  if (hit != model.end()) {
-    for (int k = 0; k < n; k++) hit->second[k] += counts[k];
-    return;
-  }
-  int* row = new int[n];
-  for (int k = 0; k < n; k++) row[k] = counts[k];
-  model.insert(pair<WordID,int*>(id, row));
-}
-
-// Loads a dominance count table from *filename*.
-// Each line is "source1 source2 target1 target2" followed by up to four
-// integer counts (0 -> dontcare, 1 -> leftfirst, 2 -> rightfirst, 3 -> neither);
-// missing counts are taken as 0 and a fifth slot holds their sum.
-// Reading stops at the first empty line.
-// The row is accumulated under "source1 source2 target1 target2" and
-// "source1 source2", and, when *pos* is set (source tokens then carry a
-// "/index" suffix), also under the indexed four-token key.
-// Both source words are registered in *fw*. Always returns true.
-bool Dwarf::readDominance(CountTable* table, const std::string& filename, std::map<WordID,int>* fw, bool pos) {
-  if (DEBUG) cerr << "readDominance(" << filename << ",pos="<< pos << ")" << endl;
-  ReadFile rf(filename);
-  istream& in = *rf.stream();
-  table->ultimate = new int[5];
-  table->setup(5,pos);
-  for (int k = 0; k < 5; k++) table->ultimate[k] = 0;
-  string line;
-  while (getline(in, line) && !line.empty()) {
-    string source1, source2, target1, target2, source1idx, source2idx, word;
-    istringstream tokenizer(line);
-    tokenizer >> source1 >> source2 >> target1 >> target2;
-    if (pos) {
-      // keep the indexed forms and strip the "/index" suffix from the words
-      source1idx = source1;
-      source2idx = source2;
-      source1 = source1idx.substr(0, source1idx.find_last_of("/"));
-      source2 = source2idx.substr(0, source2idx.find_last_of("/"));
-    }
-    fw->insert(pair<WordID,int>(TD::Convert(source1), 1));  // no-op if already known
-    fw->insert(pair<WordID,int>(TD::Convert(source2), 1));
-
-    // Scratch row lives on the stack; the previous heap buffer was obtained
-    // with new[] but released with scalar delete.
-    int counts[5];
-    counts[4] = 0;
-    for (int k = 0; k < 4; k++) {
-      counts[k] = 0;
-      if (tokenizer >> word) counts[k] = atoi(word.c_str());
-      counts[4] += counts[k];
-    }
-    for (int k = 0; k < 5; k++) table->ultimate[k] += counts[k];
-
-    dwarfAddDominanceRow(table->model, source1 + " " + source2 + " " + target1 + " " + target2, counts, 5);
-    dwarfAddDominanceRow(table->model, source1 + " " + source2, counts, 5);
-    if (pos) {
-      dwarfAddDominanceRow(table->model, source1idx + " " + source2idx + " " + target1 + " " + target2, counts, 5);
-    }
-  }
-
-  return true;
-}
-
-// Reads "tag word" pairs, one per line, from *filename* and records
-// word -> tag in *tags*; a word already present keeps its first tag.
-// Reading stops at the first empty line. Always returns true.
-bool Dwarf::readTags(const std::string& filename, std::map<WordID,WordID>* tags) {
-  ReadFile rf(filename);
-  istream& in = *rf.stream();
-  string line;
-  while (getline(in, line) && !line.empty()) {
-    string tag, word;
-    istringstream fields(line);
-    fields >> tag >> word;
-    tags->insert(pair<WordID,WordID>(TD::Convert(word),TD::Convert(tag)));
-  }
-  return true;
-}
-
-// Builds the tag-generalised form of an orientation model key.
-// *key* is "source [target]"; with *pos* set the source may carry a "/idx"
-// suffix. The source word is replaced by its tag from *tags*, giving
-// "tag[/idx][ target]" in *out*.
-// Returns false (leaving *out* untouched) when the source word has no tag.
-static bool dwarfOrientationGenKey(const std::string& key, const std::map<WordID,WordID>& tags, bool pos, std::string* out) {
-  string source, target, idx;
-  istringstream tokenizer(key);
-  tokenizer >> source >> target;
-  if (pos) {
-    const string::size_type slash = source.find_last_of("/");
-    if (slash != string::npos && slash > 0) {
-      idx = source.substr(slash+1);
-      source = source.substr(0, slash);
-    }
-  }
-  map<WordID,WordID>::const_iterator tag = tags.find(TD::Convert(source));
-  if (tag == tags.end()) return false;
-  ostringstream genkey;
-  genkey << TD::Convert(tag->second);
-  if (idx != "") genkey << "/" << idx;
-  if (target != "") genkey << " " << target;
-  *out = genkey.str();
-  return true;
-}
-
-// Pools the orientation counts of all words that share a tag.
-// Pass 1 sums the 24-int rows of table->model per generalised key; pass 2
-// replaces the row of every entry whose source word has a tag by the pooled
-// row, so such entries end up pointing at one shared array.
-// Entries whose source word is not in *tags* are left untouched.
-// The return value is always false.
-bool Dwarf::generalizeOrientation(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) {
-  map<string,int*> generalized;
-  string genkey;
-  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) {
-    if (!dwarfOrientationGenKey(TD::Convert(it->first), tags, pos, &genkey)) continue;
-    map<string,int*>::iterator pooled = generalized.find(genkey);
-    if (pooled != generalized.end()) {
-      for (int i=0; i<24; i++) pooled->second[i] += it->second[i];
-    } else {
-      int* el = new int[24];
-      for (int i=0; i<24; i++) el[i] = it->second[i];
-      generalized.insert(pair<string,int*>(genkey,el));
-    }
-  }
-  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) {
-    if (!dwarfOrientationGenKey(TD::Convert(it->first), tags, pos, &genkey)) continue;
-    map<string,int*>::iterator pooled = generalized.find(genkey);
-    if (pooled == generalized.end()) continue;
-    delete[] it->second;  // rows are arrays created with new int[24]
-    it->second = pooled->second;
-  }
-  return false; // no idea if this is right
-}
-
-
-
-// Builds the tag-generalised form of a dominance model key.
-// *key* is "source1 source2 [target1 target2]"; with *pos* set both sources may
-// carry a "/idx" suffix, which is split off only when both have one.
-// Each source word that has an entry in *tags* is replaced by its tag; the
-// result is "s1[/idx1] s2[/idx2][ target1 target2]".
-// With *trace* set, the intermediate fields are written to cerr.
-static std::string dwarfDominanceGenKey(const std::string& key, const std::map<WordID,WordID>& tags, bool pos, bool trace) {
-  string source1, source2, target1, target2, idx1, idx2;
-  istringstream tokenizer(key);
-  tokenizer >> source1 >> source2 >> target1 >> target2;
-  if (trace) cerr << "source1=|" << source1 << "|, source2=|" << source2 << "|, target1=|" << target1 << "|, target2=|" << target2 << "|" << endl;
-  if (pos) {
-    const string::size_type found1 = source1.find_last_of("/");
-    const string::size_type found2 = source2.find_last_of("/");
-    if (found1!=string::npos && found2!=string::npos && found1>0 && found2>0) {
-      idx1 = source1.substr(found1+1);
-      source1 = source1.substr(0,found1);
-      idx2 = source2.substr(found2+1);
-      source2 = source2.substr(0,found2);
-    }
-  }
-  if (trace)
-    cerr << "[U]source1='" << source1 << "', idx1='"<< idx1 << "', source2='" << source2 << "', idx2='"<< idx2 << "', target1='" << target1 << "', target2='" << target2 << "'" << endl;
-  map<WordID,WordID>::const_iterator tags_iter1 = tags.find(TD::Convert(source1));
-  map<WordID,WordID>::const_iterator tags_iter2 = tags.find(TD::Convert(source2));
-  ostringstream oss;
-  if (tags_iter1!=tags.end())
-    source1 = TD::Convert(tags_iter1->second);
-  oss << source1;
-  if (idx1!="") oss << "/" << idx1;
-  if (tags_iter2!=tags.end())
-    source2 = TD::Convert(tags_iter2->second);
-  oss << " " << source2;
-  if (idx2!="") oss << "/" << idx2;
-  if (target1!="" && target2!="") oss << " " << target1 << " " << target2;
-  return oss.str();
-}
-
-// Pools the dominance counts of all word pairs that generalise to the same key.
-// Pass 1 sums the 5-int rows of table->model per generalised key; pass 2
-// replaces every entry's row by the pooled row, so entries that share a
-// generalised key end up pointing at one shared array.
-// Returns false, mirroring generalizeOrientation.
-bool Dwarf::generalizeDominance(CountTable* table, const std::map<WordID,WordID>& tags, bool pos) {
-  map<string,int*> generalized;
-  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) {
-    const string genkey = dwarfDominanceGenKey(TD::Convert(it->first), tags, pos, DEBUG);
-    if (DEBUG) cerr << "generalized key = '" << genkey << "'" << endl;
-    map<string,int*>::iterator pooled = generalized.find(genkey);
-    if (pooled != generalized.end()) {
-      for (int i=0; i<5; i++) pooled->second[i] += it->second[i];
-    } else {
-      int* model = new int[5];
-      for (int i=0; i<5; i++) model[i] = it->second[i];
-      generalized.insert(pair<string,int*>(genkey,model));
-    }
-  }
-
-  if (DEBUG) {
-    for (map<string,int*>::const_iterator it=generalized.begin(); it!=generalized.end(); it++) {
-      cerr << "GENERALIZED = " << it->first << ", ";
-      for (int i=0; i<5; i++) cerr << it->second[i] << " ";
-      cerr << endl;
-    }
-  }
-
-  for (map<WordID,int*>::iterator it=table->model.begin(); it!=table->model.end(); it++) {
-    const string genkey = dwarfDominanceGenKey(TD::Convert(it->first), tags, pos, false);
-    map<string,int*>::iterator pooled = generalized.find(genkey);
-    if (pooled == generalized.end()) continue;
-    if (DEBUG) cerr << " generalizing "<< TD::Convert(it->first) << " into " << genkey << endl;
-    if (DEBUG) {
-      cerr << " model from ";
-      for (int i=0; i<5; i++) cerr << it->second[i] << " ";
-      cerr << endl;
-    }
-    delete[] it->second;  // rows are arrays created with new int[5]
-    it->second = pooled->second;
-    if (DEBUG) {
-      cerr << " into ";
-      for (int i=0; i<5; i++) cerr << it->second[i] << " ";
-      cerr << endl;
-    }
-  }
-
-  // A bool function must not run off its end; the value matches what
-  // generalizeOrientation returns.
-  return false;
-}
diff --git a/decoder/ff_dwarf.h b/decoder/ff_dwarf.h
deleted file mode 100644
index 3d6a7da6..00000000
--- a/decoder/ff_dwarf.h
+++ /dev/null
@@ -1,100 +0,0 @@
-#include <vector>
-#include <map>
-#include <string>
-#include "ff.h"
-#include "dwarf.h"
-#include "lattice.h"
-
-using namespace std;
-
-/* Feature function that scores hypergraph edges with the DWARF orientation and
-   dominance models over function-word alignments. Which models are active is
-   decided by the flag_* members. */
-class Dwarf : public FeatureFunction {
- public:
-  Dwarf(const std::string& param);
-  /* State-related param
-     STATE_SIZE: the number of ints
-     MAXIMUM_ALIGNMENTS: the maximum number of alignments in the states,
-     each alignment point is encoded in one int
-     (the first two bytes for source, and the remaining one for target)
-  */
-  static const int STATE_SIZE=53;
-  static const int IMPOSSIBLY_LARGE_POS = 9999999;
-  static const int MAXIMUM_ALIGNMENTS=37;
-  /* Read from file the Orientation(Source|Target) model parameter. */
-  static bool readOrientation(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos=false);
-  /* Read from file the Dominance(Source|Target) model parameter. */
-  static bool readDominance(CountTable* table, const std::string& filename, std::map<WordID,int> *fw, bool pos=false);
-  /* Read a list of words, one per line, into fw. */
-  static bool readList(const std::string& filename, std::map<WordID,int>* fw);
-  /* Pack a double (narrowed to float) into an int state slot, and back. */
-  static double IntegerToDouble(int val);
-  static int DoubleToInteger(double val);
-  /* Read "tag word" lines into a word -> tag map. */
-  bool readTags(const std::string& filename, std::map<WordID,WordID>* tags);
-  /* Pool the counts of a table over words sharing a tag. */
-  bool generalizeOrientation(CountTable* table, const std::map<WordID,WordID>& tags, bool pos=false);
-  bool generalizeDominance(CountTable* table, const std::map<WordID,WordID>& tags, bool pos=false);
-  /* Split "word/index" at its last '/': *pkey receives the part before it and
-     *pidx the part after it. Without a '/', *pkey is the whole string and
-     *pidx is empty. */
-  static void stripIndex(const string& source, string* pkey, string* pidx) {
-    if (DEBUG) cerr << " stripIndex(" << source << ")" << endl;
-    const string::size_type found = source.find_last_of("/");
-    string key = source;
-    string idx;
-    if (found != string::npos) {
-      key = source.substr(0,found);
-      idx = source.substr(found+1);
-    }
-    if (DEBUG) cerr << " found=" << found << "," << key << "," << idx << endl;
-    // Write through the out-pointers; assigning to the pointers themselves
-    // would leave the caller's strings unchanged.
-    *pkey = key;
-    *pidx = idx;
-  }
-
-
- protected:
-  /* The high-level workflow is as follows:
-     1. call *als->prepare*, which constructs the full alignment of the edge while taking into account the antecedents
-        also in this call, function words are identified. Most of the work in this call is to make sure the indexes
-        of the alignments (including the function words) are consistent with the newly created alignment
-     2. call *als->computeOrientationSource*, *als->computeOrientationTarget*,
-        *als->computeDominanceSource*, or *als->computeDominanceTarget*
-        and pass the resulting score to either *features* or to *estimated_features*
-     3. call *als->BorderingSFWsOnly()* and *als->BorderingTFWsOnly()*, which removes records of all function word
-        alignments except those at the borders. Note that fw alignments kept may be more than two on each side
-        for example if there are a number of unaligned fw alignments before the leftmost alignment or the rightmost one
-     4. call *als->simplify()*, which assigns the state of this edge (*context*). It simplifies the alignment space to
-        its most compact representation, enough to compute the unscored models. This is done by observing the surviving
-        function word alignments set by 3.
-  */
-  void TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                             const HG::Edge& edge,
-                             const std::vector<const void*>& ant_contexts,
-                             SparseVector<double>* features,
-                             SparseVector<double>* estimated_features,
-                             void* context) const;
- private:
-  Alignment* als;
-  /* Feature IDs set by calling FD::Convert(model's string) */
-  int oris_, oris_bo1_, oris_bo2_, orit_, orit_bo1_, orit_bo2_;
-  int oris_backward_, orit_backward_, porislr_, porisrl_, goris_, pgorislr_, pgorisrl_;
-  int pdomslr_, pdomsrl_, pgdomslr_, pgdomsrl_;
-  int doms_, doms_bo1_, doms_bo2_, domt_, domt_bo1_, domt_bo2_;
-  int tfw_count_;
-  int bdoms_;
-  int poris_count;
-  int pgoris_count;
-  int poris_nlr, poris_nrl; // maximum depth (1->from the beginning of the sentence, 2-> from the end of the sentence)
-  int pgoris_nlr, pgoris_nrl;
-  int pdoms_nlr, pdoms_nrl;
-  int pgdoms_nlr, pgdoms_nrl;
-  /* id of the sentence last seen and its number of source function words;
-     held through pointers so the const TraversalFeaturesImpl can update them */
-  int* _sent_id;
-  int* _fwcount;
-  WordID kSOS;
-  WordID kEOS;
-  string sSOS;
-  string sEOS;
-  WordID kGOAL;
-  /* model's flag, if set true will invoke the model scoring */
-  bool flag_oris, flag_orit, flag_doms, flag_domt, flag_tfw_count, flag_oris_backward, flag_orit_backward, flag_bdoms;
-  bool flag_porislr, flag_porisrl, flag_goris, flag_pgorislr, flag_pgorisrl;
-  bool explicit_soseos;
-  bool flag_pdomslr, flag_pdomsrl, flag_pgdomslr, flag_pgdomsrl, flag_gdoms;
-  /* a collection of Source function words (sfw) and Target function words (tfw) */
-  std::map<WordID,int> sfw;
-  std::map<WordID,int> tfw;
-  std::map<WordID,WordID> tags;
-  /* a collection of model's parameter */
-  CountTable toris, torit, tdoms, tbdoms, tdomt, tporislr, tporisrl, tgoris, tpgorislr, tpgorisrl;
-  CountTable tpdomslr, tpdomsrl, tpgdomslr, tpgdomsrl;
-  void neighboringFWs(const Lattice& l, const int& i, const int& j, const map<WordID,int>& fw_hash, int* lfw, int* rfw);
-};
-
diff --git a/decoder/ff_external.cc b/decoder/ff_external.cc
index dea0e20f..6ee4b2cf 100644
--- a/decoder/ff_external.cc
+++ b/decoder/ff_external.cc
@@ -19,7 +19,7 @@ ExternalFeature::ExternalFeature(const string& param) {
cerr << "External requires a path to a dynamic library!\n";
abort();
}
- lib_handle = dlopen(file.c_str(), RTLD_LAZY);
+ lib_handle = dlopen(file.c_str(), RTLD_LAZY | RTLD_GLOBAL);
if (!lib_handle) {
cerr << "dlopen reports: " << dlerror() << endl;
cerr << "Did you provide a full path to the dynamic library?\n";
diff --git a/decoder/ff_lm.cc b/decoder/ff_lm.cc
index 6ec7b4f3..bc51076f 100644
--- a/decoder/ff_lm.cc
+++ b/decoder/ff_lm.cc
@@ -61,11 +61,6 @@ char const* usage_verbose="-n determines the name of the feature (and its weight
#include "hg.h"
#include "stringlib.h"
-#ifdef HAVE_RANDLM
-// http://randlm.sourceforge.net/
-#include "RandLM.h"
-#endif
-
using namespace std;
string LanguageModel::usage(bool param,bool verbose) {
@@ -542,99 +537,3 @@ void LanguageModel::FinalTraversalFeatures(const void* ant_state,
features->set_value(fid_, imp().FinalTraversalCost(ant_state));
}
-#ifdef HAVE_RANDLM
-// LanguageModelImpl backed by a RandLM model. Keeps a dense table translating
-// cdec word ids into RandLM word ids; ids outside the table map to the
-// model's OOV id.
-struct RandLMImpl : public LanguageModelImpl {
-  RandLMImpl(int order, randlm::RandLM* rlm) :
-      LanguageModelImpl(order),
-      rlm_(rlm),
-      oov_(rlm->getWordID(rlm->getOOV())),
-      rb_(1000, oov_) {
-    typedef map<randlm::Word, randlm::WordID>::const_iterator VocabIter;
-    typedef map<int, randlm::WordID> IdMap;
-    // First collect cdec id -> RandLM id and the largest cdec id seen ...
-    IdMap cdec_to_rlm;
-    int highest = 0;
-    for (VocabIter v = rlm->vocabStart(); v != rlm->vocabEnd(); ++v) {
-      const int cdec_id = TD::Convert(v->first);
-      cdec_to_rlm[cdec_id] = v->second;
-      if (highest < cdec_id) highest = cdec_id;
-    }
-    // ... then lay the mapping out densely, with OOV in every gap.
-    cdec2randlm_.resize(highest + 1, oov_);
-    for (IdMap::const_iterator m = cdec_to_rlm.begin(); m != cdec_to_rlm.end(); ++m)
-      cdec2randlm_[m->first] = m->second;
-  }
-
-  // RandLM id of cdec word w, or the OOV id when w is outside the table.
-  inline randlm::WordID Convert2RandLM(int w) {
-    if (w < cdec2randlm_.size()) return cdec2randlm_[w];
-    return oov_;
-  }
-
-  // Queries the model for *word* given *context*. The n-gram is assembled
-  // backwards in rb_, ending at index order_, consuming context entries while
-  // they are positive and room remains.
-  virtual double WordProb(int word, int* context) {
-    int slot = order_;
-    int len = 1;
-    rb_[slot] = Convert2RandLM(word);
-    for (; slot > 1 && *context > 0; ++context, ++len) {
-      --slot;
-      rb_[slot] = Convert2RandLM(*context);
-    }
-    const void* finalState = 0;
-    int found;
-    //cerr << "I = " << slot << endl;
-    return rlm_->getProb(&rb_[slot], len, &found, &finalState);
-  }
- private:
-  boost::shared_ptr<randlm::RandLM> rlm_;
-  randlm::WordID oov_;
-  vector<randlm::WordID> cdec2randlm_;
-  vector<randlm::WordID> rb_;
-};
-
-// Builds the RandLM feature from *param*, which is either "filename",
-// "-o N filename" or "filename -o N" (N = n-gram order, default 3).
-// Aborts when the arguments cannot be understood or the model fails to load.
-LanguageModelRandLM::LanguageModelRandLM(const string& param) :
-    fid_(FD::Convert("RandLM")) {
-  vector<string> argv;
-  int argc = SplitOnWhitespace(param, &argv);
-  int order = 3;
-  // TODO add support for -n FeatureName
-  string filename;
-  if (argc < 1) { cerr << "RandLM requires a filename, minimally!\n"; abort(); }
-  if (argc == 1) {
-    filename = argv[0];
-  } else if (argc == 3 && argv[0] == "-o") {
-    order = atoi(argv[1].c_str());
-    filename = argv[2];
-  } else if (argc == 3 && argv[1] == "-o") {
-    order = atoi(argv[2].c_str());
-    filename = argv[0];
-  }
-  // Any other shape leaves the filename empty; stop here rather than asking
-  // RandLM to load a model from an empty path.
-  if (filename.empty()) {
-    cerr << "Don't understand 'RandLM " << param << "'\n";
-    abort();
-  }
-// set_order(order);
-  int cache_MB = 200; // increase cache size
-  randlm::RandLM* rlm = randlm::RandLM::initRandLM(filename, order, cache_MB);
-  // Checked explicitly so the failure is also caught when assertions are disabled.
-  if (rlm == NULL) {
-    cerr << "RandLM could not load '" << filename << "'\n";
-    abort();
-  }
-  pimpl_ = new RandLMImpl(order, rlm);
-}
-
-LanguageModelRandLM::~LanguageModelRandLM() { // destructor: releases the implementation object
- delete pimpl_; // pimpl_ is the RandLMImpl created with new in the constructor
-}
-
-// Scores one edge with the RandLM model: the value from LookupWords becomes
-// the feature, the value from EstimateProb (computed from the state that
-// LookupWords was given) becomes the estimated feature.
-void LanguageModelRandLM::TraversalFeaturesImpl(const SentenceMetadata& smeta,
-                                                const Hypergraph::Edge& edge,
-                                                const vector<const void*>& ant_states,
-                                                SparseVector<double>* features,
-                                                SparseVector<double>* estimated_features,
-                                                void* state) const {
-  (void) smeta;  // unused
-  const double inside = imp().LookupWords(*edge.rule_, ant_states, state);
-  features->set_value(fid_, inside);
-  const double outside = imp().EstimateProb(state);
-  estimated_features->set_value(fid_, outside);
-}
-
-// Sets the RandLM feature to the final traversal cost of *ant_state*.
-void LanguageModelRandLM::FinalTraversalFeatures(const void* ant_state,
-                                                 SparseVector<double>* features) const {
-  const double final_cost = imp().FinalTraversalCost(ant_state);
-  features->set_value(fid_, final_cost);
-}
-
-#endif
-
diff --git a/decoder/ff_lm.h b/decoder/ff_lm.h
index 94e18f00..85e79704 100644
--- a/decoder/ff_lm.h
+++ b/decoder/ff_lm.h
@@ -69,26 +69,4 @@ class LanguageModel : public FeatureFunction {
/* mutable */ LanguageModelInterface* pimpl_;
};
-#ifdef HAVE_RANDLM
-class LanguageModelRandLM : public FeatureFunction { // feature function scoring with a RandLM language model
- public:
- // param = "filename.lm [-o n]"  (n is the n-gram order)
- LanguageModelRandLM(const std::string& param);
- ~LanguageModelRandLM(); // deletes pimpl_
- virtual void FinalTraversalFeatures(const void* context,
- SparseVector<double>* features) const; // sets the feature from the final traversal cost of the state
- std::string DebugStateToString(const void* state) const;
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const HG::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* out_context) const; // per-edge scoring; fills features, estimated_features and the output state
- private:
- const int fid_; // feature id, FD::Convert("RandLM")
- mutable LanguageModelImpl* pimpl_; // owned implementation object, allocated in the constructor
-};
-#endif
-
#endif
diff --git a/decoder/ff_parse_match.cc b/decoder/ff_parse_match.cc
index ed556b91..58026975 100644
--- a/decoder/ff_parse_match.cc
+++ b/decoder/ff_parse_match.cc
@@ -42,10 +42,8 @@ struct ParseMatchFeaturesImpl {
void InitializeGrids(const string& tree, unsigned src_len) {
assert(tree.size() > 0);
- //fids_cat.clear();
fids_ef.clear();
src_tree.clear();
- //fids_cat.resize(src_len, src_len + 1);
fids_ef.resize(src_len, src_len + 1);
src_tree.resize(src_len, src_len + 1, TD::Convert("XX"));
ParseTreeString(tree, src_len);
@@ -112,7 +110,7 @@ struct ParseMatchFeaturesImpl {
int fid_ef = FD::Convert("PM");
int min_dist; // minimal distance to next syntactic constituent of this rule's LHS
int summed_min_dists; // minimal distances of LHS and NTs summed up
- if (TD::Convert(lhs).compare("XX") != 0)
+ if (TD::Convert(lhs).compare("XX") != 0)
min_dist= 0;
// compute the distance to the next syntactical constituent
else {
@@ -131,7 +129,7 @@ struct ParseMatchFeaturesImpl {
ok = 1;
break;
}
- // check if removing k words from the rule span will
+ // check if removing k words from the rule span will
// lead to a syntactical constituent
else {
//cerr << "Hilfe...!" << endl;
@@ -144,7 +142,7 @@ struct ParseMatchFeaturesImpl {
ok = 1;
break;
}
- }
+ }
}
if (ok) break;
}
@@ -183,9 +181,9 @@ struct ParseMatchFeaturesImpl {
return min_dist;
}
- Array2D<WordID> src_tree; // src_tree(i,j) NT = type
+ Array2D<WordID> src_tree; // src_tree(i,j) NT = type
unsigned int src_sent_len;
- mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
+ mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
int scoring_method;
};
@@ -214,5 +212,9 @@ void ParseMatchFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
void ParseMatchFeatures::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength());
+ ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree"));
+ string tree;
+ f.ReadAll(tree);
+ impl->InitializeGrids(tree, smeta.GetSourceLength());
}
+
diff --git a/decoder/ff_parse_match.h b/decoder/ff_parse_match.h
index fa73481a..7820b418 100644
--- a/decoder/ff_parse_match.h
+++ b/decoder/ff_parse_match.h
@@ -23,3 +23,4 @@ class ParseMatchFeatures : public FeatureFunction {
};
#endif
+
diff --git a/decoder/ff_soft_syntax.cc b/decoder/ff_soft_syntax.cc
index 9981fa45..23fe87bd 100644
--- a/decoder/ff_soft_syntax.cc
+++ b/decoder/ff_soft_syntax.cc
@@ -13,16 +13,15 @@
using namespace std;
-// Implements the soft syntactic features described in
+// Implements the soft syntactic features described in
// Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation".
// Source trees must be represented in Penn Treebank format,
// e.g. (S (NP John) (VP (V left))).
-struct SoftSyntacticFeaturesImpl {
- SoftSyntacticFeaturesImpl(const string& param) {
+struct SoftSyntaxFeaturesImpl {
+ SoftSyntaxFeaturesImpl(const string& param) {
vector<string> labels = SplitOnWhitespace(param);
- for (unsigned int i = 0; i < labels.size(); i++)
- //cerr << "Labels: " << labels.at(i) << endl;
+ //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; }
for (unsigned int i = 0; i < labels.size(); i++) {
string label = labels.at(i);
pair<string, string> feat_label;
@@ -34,10 +33,8 @@ struct SoftSyntacticFeaturesImpl {
void InitializeGrids(const string& tree, unsigned src_len) {
assert(tree.size() > 0);
- //fids_cat.clear();
fids_ef.clear();
src_tree.clear();
- //fids_cat.resize(src_len, src_len + 1);
fids_ef.resize(src_len, src_len + 1);
src_tree.resize(src_len, src_len + 1, TD::Convert("XX"));
ParseTreeString(tree, src_len);
@@ -99,7 +96,7 @@ struct SoftSyntacticFeaturesImpl {
const WordID lhs = src_tree(i,j);
string lhs_str = TD::Convert(lhs);
//cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl;
- //cerr << "RULE :"<< rule << endl;
+ //cerr << "RULE :"<< rule << endl;
int& fid_ef = fids_ef(i,j)[&rule];
for (unsigned int i = 0; i < feat_labels.size(); i++) {
ostringstream os;
@@ -110,10 +107,10 @@ struct SoftSyntacticFeaturesImpl {
switch(feat_type) {
case '2':
if (lhs_str.compare(label) == 0) {
- os << "SYN:" << label << "_conform";
+ os << "SOFT:" << label << "_conform";
}
else {
- os << "SYN:" << label << "_cross";
+ os << "SOFT:" << label << "_cross";
}
fid_ef = FD::Convert(os.str());
if (fid_ef > 0) {
@@ -122,11 +119,11 @@ struct SoftSyntacticFeaturesImpl {
}
break;
case '_':
- os << "SYN:" << label;
+ os << "SOFT:" << label;
fid_ef = FD::Convert(os.str());
if (lhs_str.compare(label) == 0) {
if (fid_ef > 0) {
- //cerr << "Feature: " << os.str() << endl;
+ //cerr << "Feature: " << os.str() << endl;
feats->set_value(fid_ef, 1.0);
}
}
@@ -139,7 +136,7 @@ struct SoftSyntacticFeaturesImpl {
break;
case '+':
if (lhs_str.compare(label) == 0) {
- os << "SYN:" << label << "_conform";
+ os << "SOFT:" << label << "_conform";
fid_ef = FD::Convert(os.str());
if (fid_ef > 0) {
//cerr << "Feature: " << os.str() << endl;
@@ -147,10 +144,10 @@ struct SoftSyntacticFeaturesImpl {
}
}
break;
- case '-':
- //cerr << "-" << endl;
+ case '-':
+ //cerr << "-" << endl;
if (lhs_str.compare(label) != 0) {
- os << "SYN:" << label << "_cross";
+ os << "SOFT:" << label << "_cross";
fid_ef = FD::Convert(os.str());
if (fid_ef > 0) {
//cerr << "Feature :" << os.str() << endl;
@@ -167,22 +164,22 @@ struct SoftSyntacticFeaturesImpl {
return lhs;
}
- Array2D<WordID> src_tree; // src_tree(i,j) NT = type
- mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
+ Array2D<WordID> src_tree; // src_tree(i,j) NT = type
+ mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
vector<pair<string, string> > feat_labels;
};
-SoftSyntacticFeatures::SoftSyntacticFeatures(const string& param) :
+SoftSyntaxFeatures::SoftSyntaxFeatures(const string& param) :
FeatureFunction(sizeof(WordID)) {
- impl = new SoftSyntacticFeaturesImpl(param);
+ impl = new SoftSyntaxFeaturesImpl(param);
}
-SoftSyntacticFeatures::~SoftSyntacticFeatures() {
+SoftSyntaxFeatures::~SoftSyntaxFeatures() {
delete impl;
impl = NULL;
}
-void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+void SoftSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
const Hypergraph::Edge& edge,
const vector<const void*>& ant_contexts,
SparseVector<double>* features,
@@ -196,6 +193,10 @@ void SoftSyntacticFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features);
}
-void SoftSyntacticFeatures::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength());
+void SoftSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) {
+ ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree"));
+ string tree;
+ f.ReadAll(tree);
+ impl->InitializeGrids(tree, smeta.GetSourceLength());
}
+
diff --git a/decoder/ff_soft_syntax.h b/decoder/ff_soft_syntax.h
index 79352f49..e71825d5 100644
--- a/decoder/ff_soft_syntax.h
+++ b/decoder/ff_soft_syntax.h
@@ -1,15 +1,15 @@
-#ifndef _FF_SOFTSYNTAX_H_
-#define _FF_SOFTSYNTAX_H_
+#ifndef _FF_SOFT_SYNTAX_H_
+#define _FF_SOFT_SYNTAX_H_
#include "ff.h"
#include "hg.h"
-struct SoftSyntacticFeaturesImpl;
+struct SoftSyntaxFeaturesImpl;
-class SoftSyntacticFeatures : public FeatureFunction {
+class SoftSyntaxFeatures : public FeatureFunction {
public:
- SoftSyntacticFeatures(const std::string& param);
- ~SoftSyntacticFeatures();
+ SoftSyntaxFeatures(const std::string& param);
+ ~SoftSyntaxFeatures();
protected:
virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
const Hypergraph::Edge& edge,
@@ -19,9 +19,9 @@ class SoftSyntacticFeatures : public FeatureFunction {
void* context) const;
virtual void PrepareForInput(const SentenceMetadata& smeta);
private:
- SoftSyntacticFeaturesImpl* impl;
+ SoftSyntaxFeaturesImpl* impl;
};
-
#endif
+
diff --git a/decoder/ff_soft_syntax2.cc b/decoder/ff_soft_syntax_mindist.cc
index 121bc39b..a23f70f8 100644
--- a/decoder/ff_soft_syntax2.cc
+++ b/decoder/ff_soft_syntax_mindist.cc
@@ -1,4 +1,4 @@
-#include "ff_soft_syntax2.h"
+#include "ff_soft_syntax_mindist.h"
#include <cstdio>
#include <sstream>
@@ -13,16 +13,18 @@
using namespace std;
-// Implements the soft syntactic features described in
+// Implements the soft syntactic features described in
// Marton and Resnik (2008): "Soft Syntacitc Constraints for Hierarchical Phrase-Based Translation".
// Source trees must be represented in Penn Treebank format,
// e.g. (S (NP John) (VP (V left))).
+//
+// This variant accepts fuzzy matches, choosing the constituent with
+// minimum distance.
-struct SoftSyntacticFeatures2Impl {
- SoftSyntacticFeatures2Impl(const string& param) {
+struct SoftSyntaxFeaturesMindistImpl {
+ SoftSyntaxFeaturesMindistImpl(const string& param) {
vector<string> labels = SplitOnWhitespace(param);
- //for (unsigned int i = 0; i < labels.size(); i++)
- //cerr << "Labels: " << labels.at(i) << endl;
+ //for (unsigned int i = 0; i < labels.size(); i++) { cerr << "Labels: " << labels.at(i) << endl; }
for (unsigned int i = 0; i < labels.size(); i++) {
string label = labels.at(i);
pair<string, string> feat_label;
@@ -30,14 +32,12 @@ struct SoftSyntacticFeatures2Impl {
feat_label.second = label.at(label.size() - 1);
feat_labels.push_back(feat_label);
}
- }
+ }
void InitializeGrids(const string& tree, unsigned src_len) {
assert(tree.size() > 0);
- //fids_cat.clear();
fids_ef.clear();
src_tree.clear();
- //fids_cat.resize(src_len, src_len + 1);
fids_ef.resize(src_len, src_len + 1);
src_tree.resize(src_len, src_len + 1, TD::Convert("XX"));
ParseTreeString(tree, src_len);
@@ -99,14 +99,14 @@ struct SoftSyntacticFeatures2Impl {
const WordID lhs = src_tree(i,j);
string lhs_str = TD::Convert(lhs);
//cerr << "LHS: " << lhs_str << " from " << i << " to " << j << endl;
- //cerr << "RULE :"<< rule << endl;
+ //cerr << "RULE :"<< rule << endl;
int& fid_ef = fids_ef(i,j)[&rule];
string lhs_to_str = TD::Convert(lhs);
int min_dist;
string min_dist_label;
if (lhs_to_str.compare("XX") != 0) {
min_dist = 0;
- min_dist_label = lhs_to_str;
+ min_dist_label = lhs_to_str;
}
else {
int ok = 0;
@@ -128,7 +128,7 @@ struct SoftSyntacticFeatures2Impl {
min_dist_label = (TD::Convert(src_tree(l_rem, r_rem)));
break;
}
- }
+ }
}
if (ok) break;
}
@@ -146,10 +146,10 @@ struct SoftSyntacticFeatures2Impl {
case '2':
if (min_dist_label.compare(label) == 0) {
if (min_dist == 0) {
- os << "SYN:" << label << "_conform";
+ os << "SOFTM:" << label << "_conform";
}
else {
- os << "SYN:" << label << "_cross";
+ os << "SOFTM:" << label << "_cross";
}
fid_ef = FD::Convert(os.str());
//cerr << "Feature :" << os.str() << endl;
@@ -157,7 +157,7 @@ struct SoftSyntacticFeatures2Impl {
}
break;
case '_':
- os << "SYN:" << label;
+ os << "SOFTM:" << label;
fid_ef = FD::Convert(os.str());
if (min_dist_label.compare(label) == 0) {
//cerr << "Feature: " << os.str() << endl;
@@ -172,7 +172,7 @@ struct SoftSyntacticFeatures2Impl {
break;
case '+':
if (min_dist_label.compare(label) == 0) {
- os << "SYN:" << label << "_conform";
+ os << "SOFTM:" << label << "_conform";
fid_ef = FD::Convert(os.str());
if (min_dist == 0) {
//cerr << "Feature: " << os.str() << endl;
@@ -180,10 +180,10 @@ struct SoftSyntacticFeatures2Impl {
}
}
break;
- case '-':
- //cerr << "-" << endl;
+ case '-':
+ //cerr << "-" << endl;
if (min_dist_label.compare(label) != 0) {
- os << "SYN:" << label << "_cross";
+ os << "SOFTM:" << label << "_cross";
fid_ef = FD::Convert(os.str());
if (min_dist > 0) {
//cerr << "Feature :" << os.str() << endl;
@@ -200,22 +200,22 @@ struct SoftSyntacticFeatures2Impl {
return lhs;
}
- Array2D<WordID> src_tree; // src_tree(i,j) NT = type
- mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
+ Array2D<WordID> src_tree; // src_tree(i,j) NT = type
+ mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
vector<pair<string, string> > feat_labels;
};
-SoftSyntacticFeatures2::SoftSyntacticFeatures2(const string& param) :
+SoftSyntaxFeaturesMindist::SoftSyntaxFeaturesMindist(const string& param) :
FeatureFunction(sizeof(WordID)) {
- impl = new SoftSyntacticFeatures2Impl(param);
+ impl = new SoftSyntaxFeaturesMindistImpl(param);
}
-SoftSyntacticFeatures2::~SoftSyntacticFeatures2() {
+SoftSyntaxFeaturesMindist::~SoftSyntaxFeaturesMindist() {
delete impl;
impl = NULL;
}
-void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta,
+void SoftSyntaxFeaturesMindist::TraversalFeaturesImpl(const SentenceMetadata& smeta,
const Hypergraph::Edge& edge,
const vector<const void*>& ant_contexts,
SparseVector<double>* features,
@@ -229,6 +229,10 @@ void SoftSyntacticFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta
impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features);
}
-void SoftSyntacticFeatures2::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength());
+void SoftSyntaxFeaturesMindist::PrepareForInput(const SentenceMetadata& smeta) {
+ ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree"));
+ string tree;
+ f.ReadAll(tree);
+ impl->InitializeGrids(tree, smeta.GetSourceLength());
}
+
diff --git a/decoder/ff_soft_syntax2.h b/decoder/ff_soft_syntax_mindist.h
index 4de91d86..bf938b38 100644
--- a/decoder/ff_soft_syntax2.h
+++ b/decoder/ff_soft_syntax_mindist.h
@@ -1,15 +1,15 @@
-#ifndef _FF_SOFTSYNTAX2_H_
-#define _FF_SOFTSYNTAX2_H_
+#ifndef _FF_SOFT_SYNTAX_MINDIST_H_
+#define _FF_SOFT_SYNTAX_MINDIST_H_
#include "ff.h"
#include "hg.h"
-struct SoftSyntacticFeatures2Impl;
+struct SoftSyntaxFeaturesMindistImpl;
-class SoftSyntacticFeatures2 : public FeatureFunction {
+class SoftSyntaxFeaturesMindist : public FeatureFunction {
public:
- SoftSyntacticFeatures2(const std::string& param);
- ~SoftSyntacticFeatures2();
+ SoftSyntaxFeaturesMindist(const std::string& param);
+ ~SoftSyntaxFeaturesMindist();
protected:
virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
const Hypergraph::Edge& edge,
@@ -19,9 +19,9 @@ class SoftSyntacticFeatures2 : public FeatureFunction {
void* context) const;
virtual void PrepareForInput(const SentenceMetadata& smeta);
private:
- SoftSyntacticFeatures2Impl* impl;
+ SoftSyntaxFeaturesMindistImpl* impl;
};
-
#endif
+
diff --git a/decoder/ff_source_syntax.cc b/decoder/ff_source_syntax.cc
index a1997695..6b183863 100644
--- a/decoder/ff_source_syntax.cc
+++ b/decoder/ff_source_syntax.cc
@@ -2,8 +2,13 @@
#include <sstream>
#include <stack>
+#ifndef HAVE_OLD_CPP
+# include <unordered_set>
+#else
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_set; }
+#endif
-#include "hg.h"
#include "sentence_metadata.h"
#include "array2d.h"
#include "filelib.h"
@@ -24,6 +29,17 @@ inline int SpanSizeTransform(unsigned span_size) {
struct SourceSyntaxFeaturesImpl {
SourceSyntaxFeaturesImpl() {}
+ SourceSyntaxFeaturesImpl(const string& param) {
+ if (!(param.compare("") == 0)) {
+ string triggered_features_fn = param;
+ ReadFile triggered_features(triggered_features_fn);
+ string in;
+ while(getline(*triggered_features, in)) {
+ feature_filter.insert(FD::Convert(in));
+ }
+ }
+ }
+
void InitializeGrids(const string& tree, unsigned src_len) {
assert(tree.size() > 0);
//fids_cat.clear();
@@ -93,7 +109,7 @@ struct SourceSyntaxFeaturesImpl {
if (fid_ef <= 0) {
ostringstream os;
//ostringstream os2;
- os << "SYN:" << TD::Convert(lhs);
+ os << "SSYN:" << TD::Convert(lhs);
//os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i);
//fid_cat = FD::Convert(os2.str());
os << ':';
@@ -118,21 +134,28 @@ struct SourceSyntaxFeaturesImpl {
}
fid_ef = FD::Convert(os.str());
}
- //if (fid_cat > 0)
- // feats->set_value(fid_cat, 1.0);
- if (fid_ef > 0)
- feats->set_value(fid_ef, 1.0);
+ if (fid_ef > 0) {
+ if (feature_filter.size()>0) {
+ if (feature_filter.find(fid_ef) != feature_filter.end()) {
+ feats->set_value(fid_ef, 1.0);
+ }
+ } else {
+ feats->set_value(fid_ef, 1.0);
+ }
+ }
+ cerr << FD::Convert(fid_ef) << endl;
return lhs;
}
- Array2D<WordID> src_tree; // src_tree(i,j) NT = type
- // mutable Array2D<int> fids_cat; // this tends to overfit baddly
- mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
+ Array2D<WordID> src_tree; // src_tree(i,j) NT = type
+ // mutable Array2D<int> fids_cat; // this tends to overfit baddly
+ mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
+ unordered_set<int> feature_filter;
};
SourceSyntaxFeatures::SourceSyntaxFeatures(const string& param) :
FeatureFunction(sizeof(WordID)) {
- impl = new SourceSyntaxFeaturesImpl;
+ impl = new SourceSyntaxFeaturesImpl(param);
}
SourceSyntaxFeatures::~SourceSyntaxFeatures() {
@@ -155,7 +178,10 @@ void SourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
void SourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength());
+ ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree"));
+ string tree;
+ f.ReadAll(tree);
+ impl->InitializeGrids(tree, smeta.GetSourceLength());
}
struct SourceSpanSizeFeaturesImpl {
@@ -230,4 +256,3 @@ void SourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) {
impl->InitializeGrids(smeta.GetSourceLength());
}
-
diff --git a/decoder/ff_source_syntax.h b/decoder/ff_source_syntax.h
index a8c7150a..bdd638c1 100644
--- a/decoder/ff_source_syntax.h
+++ b/decoder/ff_source_syntax.h
@@ -1,7 +1,8 @@
-#ifndef _FF_SOURCE_TOOLS_H_
-#define _FF_SOURCE_TOOLS_H_
+#ifndef _FF_SOURCE_SYNTAX_H_
+#define _FF_SOURCE_SYNTAX_H_
#include "ff.h"
+#include "hg.h"
struct SourceSyntaxFeaturesImpl;
@@ -11,7 +12,7 @@ class SourceSyntaxFeatures : public FeatureFunction {
~SourceSyntaxFeatures();
protected:
virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const HG::Edge& edge,
+ const Hypergraph::Edge& edge,
const std::vector<const void*>& ant_contexts,
SparseVector<double>* features,
SparseVector<double>* estimated_features,
@@ -28,7 +29,7 @@ class SourceSpanSizeFeatures : public FeatureFunction {
~SourceSpanSizeFeatures();
protected:
virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const HG::Edge& edge,
+ const Hypergraph::Edge& edge,
const std::vector<const void*>& ant_contexts,
SparseVector<double>* features,
SparseVector<double>* estimated_features,
@@ -39,3 +40,4 @@ class SourceSpanSizeFeatures : public FeatureFunction {
};
#endif
+
diff --git a/decoder/ff_source_syntax2.cc b/decoder/ff_source_syntax2.cc
index 08ece917..a97e31d8 100644
--- a/decoder/ff_source_syntax2.cc
+++ b/decoder/ff_source_syntax2.cc
@@ -3,7 +3,6 @@
#include <sstream>
#include <stack>
#include <string>
-#include <tr1/unordered_set>
#include "sentence_metadata.h"
#include "array2d.h"
@@ -17,7 +16,7 @@ using namespace std;
struct SourceSyntaxFeatures2Impl {
SourceSyntaxFeatures2Impl(const string& param) {
- if (!(param.compare("") == 0)) {
+ if (param.compare("") != 0) {
string triggered_features_fn = param;
ReadFile triggered_features(triggered_features_fn);
string in;
@@ -29,10 +28,8 @@ struct SourceSyntaxFeatures2Impl {
void InitializeGrids(const string& tree, unsigned src_len) {
assert(tree.size() > 0);
- //fids_cat.clear();
fids_ef.clear();
src_tree.clear();
- //fids_cat.resize(src_len, src_len + 1);
fids_ef.resize(src_len, src_len + 1);
src_tree.resize(src_len, src_len + 1, TD::Convert("XX"));
ParseTreeString(tree, src_len);
@@ -40,7 +37,7 @@ struct SourceSyntaxFeatures2Impl {
void ParseTreeString(const string& tree, unsigned src_len) {
//cerr << "TREE: " << tree << endl;
- stack<pair<int, WordID> > stk; // first = i, second = category
+ stack<pair<int, WordID> > stk; // first = i, second = category
pair<int, WordID> cur_cat; cur_cat.first = -1;
unsigned i = 0;
unsigned p = 0;
@@ -92,7 +89,7 @@ struct SourceSyntaxFeatures2Impl {
const WordID lhs = src_tree(i,j);
int& fid_ef = fids_ef(i,j)[&rule];
ostringstream os;
- os << "SYN:" << TD::Convert(lhs);
+ os << "SSYN2:" << TD::Convert(lhs);
os << ':';
unsigned ntc = 0;
for (unsigned k = 0; k < rule.f_.size(); ++k) {
@@ -100,7 +97,7 @@ struct SourceSyntaxFeatures2Impl {
if (k > 0 && fj <= 0) os << '_';
if (fj <= 0) {
os << '[' << TD::Convert(ants[ntc++]) << ']';
- } /*else {
+ }/*else {
os << TD::Convert(fj);
}*/
}
@@ -116,18 +113,23 @@ struct SourceSyntaxFeatures2Impl {
fid_ef = FD::Convert(os.str());
//cerr << "FEATURE: " << os.str() << endl;
//cerr << "FID_EF: " << fid_ef << endl;
- if (feature_filter.find(fid_ef) != feature_filter.end()) {
- cerr << "SYN-Feature was trigger more than once on training set." << endl;
+ if (feature_filter.size() > 0) {
+ if (feature_filter.find(fid_ef) != feature_filter.end()) {
+ //cerr << "SYN-Feature was trigger more than once on training set." << endl;
+ feats->set_value(fid_ef, 1.0);
+ }
+ //else cerr << "SYN-Feature was triggered less than once on training set." << endli;
+ }
+ else {
feats->set_value(fid_ef, 1.0);
}
- else cerr << "SYN-Feature was triggered less than once on training set." << endl;
+ cerr << FD::Convert(fid_ef) << endl;
return lhs;
}
- Array2D<WordID> src_tree; // src_tree(i,j) NT = type
- mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
- tr1::unordered_set<int> feature_filter;
-
+ Array2D<WordID> src_tree; // src_tree(i,j) NT = type
+ mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
+ unordered_set<int> feature_filter;
};
SourceSyntaxFeatures2::SourceSyntaxFeatures2(const string& param) :
@@ -155,5 +157,9 @@ void SourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta,
}
void SourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength());
+ ReadFile f = ReadFile(smeta.GetSGMLValue("src_tree"));
+ string tree;
+ f.ReadAll(tree);
+ impl->InitializeGrids(tree, smeta.GetSourceLength());
}
+
diff --git a/decoder/ff_source_syntax2.h b/decoder/ff_source_syntax2.h
index b6b7dc3d..f606c2bf 100644
--- a/decoder/ff_source_syntax2.h
+++ b/decoder/ff_source_syntax2.h
@@ -1,5 +1,5 @@
-#ifndef _FF_SOURCE_TOOLS2_H_
-#define _FF_SOURCE_TOOLS2_H_
+#ifndef _FF_SOURCE_SYNTAX2_H_
+#define _FF_SOURCE_SYNTAX2_H_
#include "ff.h"
#include "hg.h"
@@ -23,3 +23,4 @@ class SourceSyntaxFeatures2 : public FeatureFunction {
};
#endif
+
diff --git a/decoder/ff_source_syntax2_p.cc b/decoder/ff_source_syntax2_p.cc
deleted file mode 100644
index dfa791ea..00000000
--- a/decoder/ff_source_syntax2_p.cc
+++ /dev/null
@@ -1,166 +0,0 @@
-#include "ff_source_syntax2_p.h"
-
-#include <sstream>
-#include <stack>
-#include <string>
-#include <tr1/unordered_set>
-
-#include "sentence_metadata.h"
-#include "array2d.h"
-#include "filelib.h"
-
-using namespace std;
-
-// implements the source side syntax features described in Blunsom et al. (EMNLP 2008)
-// source trees must be represented in Penn Treebank format, e.g.
-// (S (NP John) (VP (V left)))
-
-struct PSourceSyntaxFeatures2Impl {
- PSourceSyntaxFeatures2Impl(const string& param) {
- if (param.compare("") != 0) {
- string triggered_features_fn = param;
- ReadFile triggered_features(triggered_features_fn);
- string in;
- while(getline(*triggered_features, in)) {
- feature_filter.insert(FD::Convert(in));
- }
- }
- /*cerr << "find(\"One\") == " << boolalpha << (table.find("One") != table.end()) << endl;
- cerr << "find(\"Three\") == " << boolalpha << (table.find("Three") != table.end()) << endl;*/
- }
-
- void InitializeGrids(const string& tree, unsigned src_len) {
- assert(tree.size() > 0);
- //fids_cat.clear();
- fids_ef.clear();
- src_tree.clear();
- //fids_cat.resize(src_len, src_len + 1);
- fids_ef.resize(src_len, src_len + 1);
- src_tree.resize(src_len, src_len + 1, TD::Convert("XX"));
- ParseTreeString(tree, src_len);
- }
-
- void ParseTreeString(const string& tree, unsigned src_len) {
- //cerr << "TREE: " << tree << endl;
- stack<pair<int, WordID> > stk; // first = i, second = category
- pair<int, WordID> cur_cat; cur_cat.first = -1;
- unsigned i = 0;
- unsigned p = 0;
- while(p < tree.size()) {
- const char cur = tree[p];
- if (cur == '(') {
- stk.push(cur_cat);
- ++p;
- unsigned k = p + 1;
- while (k < tree.size() && tree[k] != ' ') { ++k; }
- cur_cat.first = i;
- cur_cat.second = TD::Convert(tree.substr(p, k - p));
- // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n";
- p = k + 1;
- } else if (cur == ')') {
- unsigned k = p;
- while (k < tree.size() && tree[k] == ')') { ++k; }
- const unsigned num_closes = k - p;
- for (unsigned ci = 0; ci < num_closes; ++ci) {
- src_tree(cur_cat.first, i) = cur_cat.second;
- cur_cat = stk.top();
- stk.pop();
- }
- p = k;
- while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; }
- } else if (cur == ' ' || cur == '\t') {
- cerr << "Unexpected whitespace in: " << tree << endl;
- abort();
- } else { // terminal symbol
- unsigned k = p + 1;
- do {
- while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; }
- // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n";
- ++i;
- assert(i <= src_len);
- while (k < tree.size() && tree[k] == ' ') { ++k; }
- p = k;
- } while (p < tree.size() && tree[p] != ')');
- }
- //cerr << "i=" << i << " src_len=" << src_len << endl;
- }
- //cerr << "i=" << i << " src_len=" << src_len << endl;
- assert(i == src_len); // make sure tree specified in src_tree is
- // the same length as the source sentence
- }
-
- WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) {
- //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl;
- const WordID lhs = src_tree(i,j);
- int& fid_ef = fids_ef(i,j)[&rule];
- ostringstream os;
- os << "SYN:" << TD::Convert(lhs);
- os << ':';
- unsigned ntc = 0;
- for (unsigned k = 0; k < rule.f_.size(); ++k) {
- int fj = rule.f_[k];
- if (k > 0 && fj <= 0) os << '_';
- if (fj <= 0) {
- os << '[' << TD::Convert(ants[ntc++]) << ']';
- } /*else {
- os << TD::Convert(fj);
- }*/
- }
- os << ':';
- for (unsigned k = 0; k < rule.e_.size(); ++k) {
- const int ei = rule.e_[k];
- if (k > 0) os << '_';
- if (ei <= 0)
- os << '[' << (1-ei) << ']';
- else
- os << TD::Convert(ei);
- }
- fid_ef = FD::Convert(os.str());
- //cerr << "FEATURE: " << os.str() << endl;
- //cerr << "FID_EF: " << fid_ef << endl;
- if (feature_filter.size() > 0) {
- if (feature_filter.find(fid_ef) != feature_filter.end()) {
- //cerr << "SYN-Feature was trigger more than once on training set." << endl;
- feats->set_value(fid_ef, 1.0);
- }
- //else cerr << "SYN-Feature was triggered less than once on training set." << endli;
- }
- else {
- feats->set_value(fid_ef, 1.0);
- }
- return lhs;
- }
-
- Array2D<WordID> src_tree; // src_tree(i,j) NT = type
- mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
- tr1::unordered_set<int> feature_filter;
-
-};
-
-PSourceSyntaxFeatures2::PSourceSyntaxFeatures2(const string& param) :
- FeatureFunction(sizeof(WordID)) {
- impl = new PSourceSyntaxFeatures2Impl(param);
-}
-
-PSourceSyntaxFeatures2::~PSourceSyntaxFeatures2() {
- delete impl;
- impl = NULL;
-}
-
-void PSourceSyntaxFeatures2::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const {
- WordID ants[8];
- for (unsigned i = 0; i < ant_contexts.size(); ++i)
- ants[i] = *static_cast<const WordID*>(ant_contexts[i]);
-
- *static_cast<WordID*>(context) =
- impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features);
-}
-
-void PSourceSyntaxFeatures2::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength());
-}
diff --git a/decoder/ff_source_syntax2_p.h b/decoder/ff_source_syntax2_p.h
deleted file mode 100644
index d56ecab0..00000000
--- a/decoder/ff_source_syntax2_p.h
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef _FF_SOURCE_TOOLS2_H_
-#define _FF_SOURCE_TOOLS2_H_
-
-#include "ff.h"
-#include "hg.h"
-
-struct PSourceSyntaxFeatures2Impl;
-
-class PSourceSyntaxFeatures2 : public FeatureFunction {
- public:
- PSourceSyntaxFeatures2(const std::string& param);
- ~PSourceSyntaxFeatures2();
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const;
- virtual void PrepareForInput(const SentenceMetadata& smeta);
- private:
- PSourceSyntaxFeatures2Impl* impl;
-};
-
-#endif
diff --git a/decoder/ff_source_syntax_p.cc b/decoder/ff_source_syntax_p.cc
deleted file mode 100644
index cd081544..00000000
--- a/decoder/ff_source_syntax_p.cc
+++ /dev/null
@@ -1,245 +0,0 @@
-#include "ff_source_syntax_p.h"
-
-#include <sstream>
-#include <stack>
-#include <tr1/unordered_set>
-
-#include "sentence_metadata.h"
-#include "array2d.h"
-#include "filelib.h"
-
-using namespace std;
-
-// implements the source side syntax features described in Blunsom et al. (EMNLP 2008)
-// source trees must be represented in Penn Treebank format, e.g.
-// (S (NP John) (VP (V left)))
-
-// log transform to make long spans cluster together
-// but preserve differences
-inline int SpanSizeTransform(unsigned span_size) {
- if (!span_size) return 0;
- return static_cast<int>(log(span_size+1) / log(1.39)) - 1;
-}
-
-struct PSourceSyntaxFeaturesImpl {
- PSourceSyntaxFeaturesImpl() {}
-
- PSourceSyntaxFeaturesImpl(const string& param) {
- if (!(param.compare("") == 0)) {
- string triggered_features_fn = param;
- ReadFile triggered_features(triggered_features_fn);
- string in;
- while(getline(*triggered_features, in)) {
- feature_filter.insert(FD::Convert(in));
- }
- }
- }
-
- void InitializeGrids(const string& tree, unsigned src_len) {
- assert(tree.size() > 0);
- //fids_cat.clear();
- fids_ef.clear();
- src_tree.clear();
- //fids_cat.resize(src_len, src_len + 1);
- fids_ef.resize(src_len, src_len + 1);
- src_tree.resize(src_len, src_len + 1, TD::Convert("XX"));
- ParseTreeString(tree, src_len);
- }
-
- void ParseTreeString(const string& tree, unsigned src_len) {
- stack<pair<int, WordID> > stk; // first = i, second = category
- pair<int, WordID> cur_cat; cur_cat.first = -1;
- unsigned i = 0;
- unsigned p = 0;
- while(p < tree.size()) {
- const char cur = tree[p];
- if (cur == '(') {
- stk.push(cur_cat);
- ++p;
- unsigned k = p + 1;
- while (k < tree.size() && tree[k] != ' ') { ++k; }
- cur_cat.first = i;
- cur_cat.second = TD::Convert(tree.substr(p, k - p));
- // cerr << "NT: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n";
- p = k + 1;
- } else if (cur == ')') {
- unsigned k = p;
- while (k < tree.size() && tree[k] == ')') { ++k; }
- const unsigned num_closes = k - p;
- for (unsigned ci = 0; ci < num_closes; ++ci) {
- // cur_cat.second spans from cur_cat.first to i
- // cerr << TD::Convert(cur_cat.second) << " from " << cur_cat.first << " to " << i << endl;
- // NOTE: unary rule chains end up being labeled with the top-most category
- src_tree(cur_cat.first, i) = cur_cat.second;
- cur_cat = stk.top();
- stk.pop();
- }
- p = k;
- while (p < tree.size() && (tree[p] == ' ' || tree[p] == '\t')) { ++p; }
- } else if (cur == ' ' || cur == '\t') {
- cerr << "Unexpected whitespace in: " << tree << endl;
- abort();
- } else { // terminal symbol
- unsigned k = p + 1;
- do {
- while (k < tree.size() && tree[k] != ')' && tree[k] != ' ') { ++k; }
- // cerr << "TERM: '" << tree.substr(p, k-p) << "' (i=" << i << ")\n";
- ++i;
- assert(i <= src_len);
- while (k < tree.size() && tree[k] == ' ') { ++k; }
- p = k;
- } while (p < tree.size() && tree[p] != ')');
- }
- }
- // cerr << "i=" << i << " src_len=" << src_len << endl;
- assert(i == src_len); // make sure tree specified in src_tree is
- // the same length as the source sentence
- }
-
- WordID FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) {
- //cerr << "fire features: " << rule.AsString() << " for " << i << "," << j << endl;
- const WordID lhs = src_tree(i,j);
- //int& fid_cat = fids_cat(i,j);
- int& fid_ef = fids_ef(i,j)[&rule];
- if (fid_ef <= 0) {
- ostringstream os;
- //ostringstream os2;
- os << "SYN:" << TD::Convert(lhs);
- //os2 << "SYN:" << TD::Convert(lhs) << '_' << SpanSizeTransform(j - i);
- //fid_cat = FD::Convert(os2.str());
- os << ':';
- unsigned ntc = 0;
- for (unsigned k = 0; k < rule.f_.size(); ++k) {
- if (k > 0) os << '_';
- int fj = rule.f_[k];
- if (fj <= 0) {
- os << '[' << TD::Convert(ants[ntc++]) << ']';
- } else {
- os << TD::Convert(fj);
- }
- }
- os << ':';
- for (unsigned k = 0; k < rule.e_.size(); ++k) {
- const int ei = rule.e_[k];
- if (k > 0) os << '_';
- if (ei <= 0)
- os << '[' << (1-ei) << ']';
- else
- os << TD::Convert(ei);
- }
- fid_ef = FD::Convert(os.str());
- }
- //if (fid_cat > 0)
- // feats->set_value(fid_cat, 1.0);
- if (fid_ef > 0 && (feature_filter.find(fid_ef) != feature_filter.end()))
- feats->set_value(fid_ef, 1.0);
- return lhs;
- }
-
- Array2D<WordID> src_tree; // src_tree(i,j) NT = type
- // mutable Array2D<int> fids_cat; // this tends to overfit baddly
- mutable Array2D<map<const TRule*, int> > fids_ef; // fires for fully lexicalized
- tr1::unordered_set<int> feature_filter;
-};
-
-PSourceSyntaxFeatures::PSourceSyntaxFeatures(const string& param) :
- FeatureFunction(sizeof(WordID)) {
- impl = new PSourceSyntaxFeaturesImpl(param);
-}
-
-PSourceSyntaxFeatures::~PSourceSyntaxFeatures() {
- delete impl;
- impl = NULL;
-}
-
-void PSourceSyntaxFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const {
- WordID ants[8];
- for (unsigned i = 0; i < ant_contexts.size(); ++i)
- ants[i] = *static_cast<const WordID*>(ant_contexts[i]);
-
- *static_cast<WordID*>(context) =
- impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features);
-}
-
-void PSourceSyntaxFeatures::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSGMLValue("src_tree"), smeta.GetSourceLength());
-}
-
-struct PSourceSpanSizeFeaturesImpl {
- PSourceSpanSizeFeaturesImpl() {}
-
- void InitializeGrids(unsigned src_len) {
- fids.clear();
- fids.resize(src_len, src_len + 1);
- }
-
- int FireFeatures(const TRule& rule, const int i, const int j, const WordID* ants, SparseVector<double>* feats) {
- if (rule.Arity() > 0) {
- int& fid = fids(i,j)[&rule];
- if (fid <= 0) {
- ostringstream os;
- os << "SSS:";
- unsigned ntc = 0;
- for (unsigned k = 0; k < rule.f_.size(); ++k) {
- if (k > 0) os << '_';
- int fj = rule.f_[k];
- if (fj <= 0) {
- os << '[' << TD::Convert(-fj) << ants[ntc++] << ']';
- } else {
- os << TD::Convert(fj);
- }
- }
- os << ':';
- for (unsigned k = 0; k < rule.e_.size(); ++k) {
- const int ei = rule.e_[k];
- if (k > 0) os << '_';
- if (ei <= 0)
- os << '[' << (1-ei) << ']';
- else
- os << TD::Convert(ei);
- }
- fid = FD::Convert(os.str());
- }
- if (fid > 0)
- feats->set_value(fid, 1.0);
- }
- return SpanSizeTransform(j - i);
- }
-
- mutable Array2D<map<const TRule*, int> > fids;
-};
-
-PSourceSpanSizeFeatures::PSourceSpanSizeFeatures(const string& param) :
- FeatureFunction(sizeof(char)) {
- impl = new PSourceSpanSizeFeaturesImpl;
-}
-
-PSourceSpanSizeFeatures::~PSourceSpanSizeFeatures() {
- delete impl;
- impl = NULL;
-}
-
-void PSourceSpanSizeFeatures::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const {
- int ants[8];
- for (unsigned i = 0; i < ant_contexts.size(); ++i)
- ants[i] = *static_cast<const char*>(ant_contexts[i]);
-
- *static_cast<char*>(context) =
- impl->FireFeatures(*edge.rule_, edge.i_, edge.j_, ants, features);
-}
-
-void PSourceSpanSizeFeatures::PrepareForInput(const SentenceMetadata& smeta) {
- impl->InitializeGrids(smeta.GetSourceLength());
-}
-
-
diff --git a/decoder/ff_source_syntax_p.h b/decoder/ff_source_syntax_p.h
deleted file mode 100644
index 2dd9094a..00000000
--- a/decoder/ff_source_syntax_p.h
+++ /dev/null
@@ -1,42 +0,0 @@
-#ifndef _FF_SOURCE_TOOLS_H_
-#define _FF_SOURCE_TOOLS_H_
-
-#include "ff.h"
-#include "hg.h"
-
-struct PSourceSyntaxFeaturesImpl;
-
-class PSourceSyntaxFeatures : public FeatureFunction {
- public:
- PSourceSyntaxFeatures(const std::string& param);
- ~PSourceSyntaxFeatures();
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const;
- virtual void PrepareForInput(const SentenceMetadata& smeta);
- private:
- PSourceSyntaxFeaturesImpl* impl;
-};
-
-struct PSourceSpanSizeFeaturesImpl;
-class PSourceSpanSizeFeatures : public FeatureFunction {
- public:
- PSourceSpanSizeFeatures(const std::string& param);
- ~PSourceSpanSizeFeatures();
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* context) const;
- virtual void PrepareForInput(const SentenceMetadata& smeta);
- private:
- PSourceSpanSizeFeaturesImpl* impl;
-};
-
-#endif
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index 1491819d..dcb80110 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -7,7 +7,12 @@
#include <string>
#include <cmath>
#include <bitset>
-#include <tr1/unordered_map>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+#else
+# include <tr1/unordered_map>
+namespace std { using std::tr1::unordered_map; }
+#endif
#include <boost/tuple/tuple.hpp>
#include "boost/tuple/tuple_comparison.hpp"
@@ -249,7 +254,7 @@ void NewJump::FireFeature(const SentenceMetadata& smeta,
if (fp1_) get<6>(key) = GetSourceWord(id, cur_src_index + 1);
if (fprev_) get<7>(key) = GetSourceWord(id, prev_src_index);
- static std::tr1::unordered_map<NewJumpFeatureKey, int, KeyHash> fids;
+ static std::unordered_map<NewJumpFeatureKey, int, KeyHash> fids;
int& fid = fids[key];
if (!fid) {
ostringstream os;
diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h
index ba3d0b9b..0161f603 100644
--- a/decoder/ff_wordalign.h
+++ b/decoder/ff_wordalign.h
@@ -5,8 +5,16 @@
#include "array2d.h"
#include "factored_lexicon_helper.h"
+#include <boost/functional/hash.hpp>
+#include <cassert>
#include <boost/scoped_ptr.hpp>
#include <boost/multi_array.hpp>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+#else
+# include <tr1/unordered_map>
+namespace std { using std::tr1::unordered_map; }
+#endif
class RelativeSentencePosition : public FeatureFunction {
public:
@@ -124,9 +132,6 @@ class LexicalTranslationTrigger : public FeatureFunction {
std::vector<std::vector<WordID> > triggers_;
};
-#include <tr1/unordered_map>
-#include <boost/functional/hash.hpp>
-#include <cassert>
class BlunsomSynchronousParseHack : public FeatureFunction {
public:
BlunsomSynchronousParseHack(const std::string& param);
@@ -196,7 +201,7 @@ class BlunsomSynchronousParseHack : public FeatureFunction {
const int fid_;
mutable int cur_sent_;
- typedef std::tr1::unordered_map<std::vector<WordID>, int, boost::hash<std::vector<WordID> > > Vec2Int;
+ typedef std::unordered_map<std::vector<WordID>, int, boost::hash<std::vector<WordID> > > Vec2Int;
mutable Vec2Int cur_map_;
const std::vector<WordID> mutable * cur_ref_;
mutable std::vector<std::vector<WordID> > refs_;
diff --git a/decoder/ff_wordset.cc b/decoder/ff_wordset.cc
index 70cea7de..9be6f2e0 100644
--- a/decoder/ff_wordset.cc
+++ b/decoder/ff_wordset.cc
@@ -2,21 +2,67 @@
#include "hg.h"
#include "fdict.h"
+#include "filelib.h"
+#include <boost/algorithm/string.hpp>
#include <sstream>
#include <iostream>
using namespace std;
+void WordSet::parseArgs(const string& args, string* featName, string* vocabFile, bool* oovMode) {
+ vector<string> toks(10);
+ boost::split(toks, args, boost::is_any_of(" "));
+
+ *oovMode = false;
+
+ // skip initial feature name
+ for(vector<string>::const_iterator it = toks.begin(); it != toks.end(); ++it) {
+ if(*it == "-v") {
+ *vocabFile = *++it; // copy
+
+ } else if(*it == "-N") {
+ *featName = *++it;
+ } else if(*it == "--oov") {
+ *oovMode = true;
+ } else {
+ cerr << "Unrecognized argument: " << *it << endl;
+ exit(1);
+ }
+ }
+
+ if(*featName == "") {
+ cerr << "featName (-N) not specified for WordSet" << endl;
+ exit(1);
+ }
+ if(*vocabFile == "") {
+ cerr << "vocabFile (-v) not specified for WordSet" << endl;
+ exit(1);
+ }
+}
+
+void WordSet::loadVocab(const string& vocabFile, unordered_set<WordID>* vocab) {
+ ReadFile rf(vocabFile);
+ if (!rf) {
+ cerr << "Unable to open file: " << vocabFile;
+ abort();
+ }
+ string line;
+ while (getline(*rf.stream(), line)) {
+ boost::trim(line);
+ if(line.empty()) continue;
+ WordID vocabId = TD::Convert(line);
+ vocab->insert(vocabId);
+ }
+}
+
void WordSet::TraversalFeaturesImpl(const SentenceMetadata& /*smeta*/ ,
const Hypergraph::Edge& edge,
const vector<const void*>& /* ant_contexts */,
SparseVector<double>* features,
SparseVector<double>* /* estimated_features */,
void* /* context */) const {
-
double addScore = 0.0;
- for(std::vector<WordID>::const_iterator it = edge.rule_->e_.begin(); it != edge.rule_->e_.end(); ++it) {
-
+ for(vector<WordID>::const_iterator it = edge.rule_->e_.begin(); it != edge.rule_->e_.end(); ++it) {
bool inVocab = (vocab_.find(*it) != vocab_.end());
if(oovMode_ && !inVocab) {
addScore += 1.0;
diff --git a/decoder/ff_wordset.h b/decoder/ff_wordset.h
index 639e1514..e78cd2fb 100644
--- a/decoder/ff_wordset.h
+++ b/decoder/ff_wordset.h
@@ -4,14 +4,18 @@
#include "ff.h"
#include "tdict.h"
-#include <tr1/unordered_set>
-#include <boost/algorithm/string.hpp>
-
#include <vector>
#include <string>
#include <iostream>
#include <fstream>
+#ifndef HAVE_OLD_CPP
+# include <unordered_set>
+#else
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_set; }
+#endif
+
class WordSet : public FeatureFunction {
public:
// we depend on the order of the initializer list
@@ -42,69 +46,12 @@ class WordSet : public FeatureFunction {
void* context) const;
private:
- static void loadVocab(const std::string& vocabFile, std::tr1::unordered_set<WordID>* vocab) {
-
- std::ifstream file;
- std::string line;
-
- file.open(vocabFile.c_str(), std::fstream::in);
- if (file.is_open()) {
- unsigned lineNum = 0;
- while (!file.eof()) {
- ++lineNum;
- getline(file, line);
- boost::trim(line);
- if(line.empty()) {
- continue;
- }
-
- WordID vocabId = TD::Convert(line);
- vocab->insert(vocabId);
- }
- file.close();
- } else {
- std::cerr << "Unable to open file: " << vocabFile;
- exit(1);
- }
- }
-
- static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode) {
-
- std::vector<std::string> toks(10);
- boost::split(toks, args, boost::is_any_of(" "));
-
- *oovMode = false;
-
- // skip initial feature name
- for(std::vector<std::string>::const_iterator it = toks.begin(); it != toks.end(); ++it) {
- if(*it == "-v") {
- *vocabFile = *++it; // copy
-
- } else if(*it == "-N") {
- *featName = *++it;
-
- } else if(*it == "--oov") {
- *oovMode = true;
-
- } else {
- std::cerr << "Unrecognized argument: " << *it << std::endl;
- exit(1);
- }
- }
-
- if(*featName == "") {
- std::cerr << "featName (-N) not specified for WordSet" << std::endl;
- exit(1);
- }
- if(*vocabFile == "") {
- std::cerr << "vocabFile (-v) not specified for WordSet" << std::endl;
- exit(1);
- }
- }
+ static void parseArgs(const std::string& args, std::string* featName, std::string* vocabFile, bool* oovMode);
+ static void loadVocab(const std::string& vocabFile, std::unordered_set<WordID>* vocab);
int fid_;
bool oovMode_;
- std::tr1::unordered_set<WordID> vocab_;
+ std::unordered_set<WordID> vocab_;
};
#endif
diff --git a/decoder/grammar.cc b/decoder/grammar.cc
index ee43f537..160d00e6 100644
--- a/decoder/grammar.cc
+++ b/decoder/grammar.cc
@@ -3,15 +3,20 @@
#include <algorithm>
#include <utility>
#include <map>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+# include <unordered_set>
+#else
+# include <tr1/unordered_map>
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
+#endif
#include "rule_lexer.h"
#include "filelib.h"
#include "tdict.h"
using namespace std;
-using namespace std::tr1;
const vector<TRulePtr> Grammar::NO_RULES;
diff --git a/decoder/hg_intersect.cc b/decoder/hg_intersect.cc
index ad5b701a..31a9a1ce 100644
--- a/decoder/hg_intersect.cc
+++ b/decoder/hg_intersect.cc
@@ -1,7 +1,12 @@
#include "hg_intersect.h"
#include <vector>
-#include <tr1/unordered_map>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+#else
+# include <tr1/unordered_map>
+namespace std { using std::tr1::unordered_map; }
+#endif
#include "fast_lexical_cast.hpp"
#include <boost/functional/hash.hpp>
@@ -13,7 +18,6 @@
#include "bottom_up_parser.h"
using boost::lexical_cast;
-using namespace std::tr1;
using namespace std;
struct RuleFilter {
diff --git a/decoder/kbest.h b/decoder/kbest.h
index 44c23151..c7194c7e 100644
--- a/decoder/kbest.h
+++ b/decoder/kbest.h
@@ -3,7 +3,12 @@
#include <vector>
#include <utility>
-#include <tr1/unordered_set>
+#ifndef HAVE_OLD_CPP
+# include <unordered_set>
+#else
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_set; }
+#endif
#include <boost/shared_ptr.hpp>
#include <boost/type_traits.hpp>
@@ -22,7 +27,7 @@ namespace KBest {
// optional, filter unique yield strings
struct FilterUnique {
- std::tr1::unordered_set<std::vector<WordID>, boost::hash<std::vector<WordID> > > unique;
+ std::unordered_set<std::vector<WordID>, boost::hash<std::vector<WordID> > > unique;
bool operator()(const std::vector<WordID>& yield) {
return !unique.insert(yield).second;
@@ -111,7 +116,7 @@ namespace KBest {
};
typedef std::vector<Derivation*> CandidateHeap;
typedef std::vector<Derivation*> DerivationList;
- typedef std::tr1::unordered_set<
+ typedef std::unordered_set<
const Derivation*, DerivationUniquenessHash, DerivationUniquenessEquals> UniqueDerivationSet;
struct NodeDerivationState {
diff --git a/decoder/maxtrans_blunsom.cc b/decoder/maxtrans_blunsom.cc
index 774e4170..a9f65fab 100644
--- a/decoder/maxtrans_blunsom.cc
+++ b/decoder/maxtrans_blunsom.cc
@@ -2,8 +2,14 @@
#include <vector>
#include <algorithm>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+# include <unordered_set>
+#else
+# include <tr1/unordered_map>
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
+#endif
#include <boost/tuple/tuple.hpp>
#include <boost/functional/hash.hpp>
@@ -14,7 +20,6 @@
using boost::tuple;
using namespace std;
-using namespace std::tr1;
namespace Hack {
diff --git a/decoder/phrasebased_translator.cc b/decoder/phrasebased_translator.cc
index d65e44d1..8048248e 100644
--- a/decoder/phrasebased_translator.cc
+++ b/decoder/phrasebased_translator.cc
@@ -2,8 +2,14 @@
#include <queue>
#include <iostream>
-#include <tr1/unordered_map>
-#include <tr1/unordered_set>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+# include <unordered_set>
+#else
+# include <tr1/unordered_map>
+# include <tr1/unordered_set>
+namespace std { using std::tr1::unordered_map; using std::tr1::unordered_set; }
+#endif
#include <boost/tuple/tuple.hpp>
#include <boost/functional/hash.hpp>
@@ -17,7 +23,6 @@
#include "array2d.h"
using namespace std;
-using namespace std::tr1;
using namespace boost::tuples;
struct Coverage : public vector<bool> {
@@ -49,10 +54,13 @@ struct Coverage : public vector<bool> {
};
struct CoverageHash {
size_t operator()(const Coverage& cov) const {
- return hasher_(static_cast<const vector<bool>&>(cov));
+ int seed = 131;
+ size_t res = 0;
+ for (vector<bool>::const_iterator it = cov.begin(); it != cov.end(); ++it) {
+ res = (res * seed) + (*it + 1);
+ }
+ return res;
}
- private:
- boost::hash<vector<bool> > hasher_;
};
ostream& operator<<(ostream& os, const Coverage& cov) {
os << '[';
diff --git a/decoder/scfg_translator.cc b/decoder/scfg_translator.cc
index 6f0b003b..a506c591 100644
--- a/decoder/scfg_translator.cc
+++ b/decoder/scfg_translator.cc
@@ -1,13 +1,9 @@
-//TODO: bottom-up pruning, with actual final models' (appropriately weighted) heuristics and local scores.
-
-//TODO: grammar heuristic (min cost of reachable rule set) for binarizations (active edges) if we wish to prune those also
-
-#include "hash.h"
-#include "translator.h"
#include <algorithm>
#include <vector>
#include <boost/foreach.hpp>
#include <boost/functional/hash.hpp>
+#include "hash.h"
+#include "translator.h"
#include "hg.h"
#include "grammar.h"
#include "bottom_up_parser.h"
@@ -16,13 +12,11 @@
#include "tdict.h"
#include "viterbi.h"
#include "verbose.h"
-#include <tr1/unordered_map>
#define foreach BOOST_FOREACH
#define reverse_foreach BOOST_REVERSE_FOREACH
using namespace std;
-using namespace std::tr1;
static bool printGrammarsUsed = false;
struct GlueGrammar : public TextGrammar {
diff --git a/decoder/sentence_metadata.h b/decoder/sentence_metadata.h
index eab9f15d..52586331 100644
--- a/decoder/sentence_metadata.h
+++ b/decoder/sentence_metadata.h
@@ -5,7 +5,9 @@
#include <map>
#include <cassert>
#include "lattice.h"
-#include "scorer.h"
+
+struct DocScorer; // deprecated, will be removed
+struct Score; // deprecated, will be removed
struct SentenceMetadata {
friend class DecoderImpl;
diff --git a/decoder/tromble_loss.cc b/decoder/tromble_loss.cc
deleted file mode 100644
index 24cfef5f..00000000
--- a/decoder/tromble_loss.cc
+++ /dev/null
@@ -1,309 +0,0 @@
-#include "tromble_loss.h"
-#include "fast_lexical_cast.hpp"
-
-#include <boost/algorithm/string/predicate.hpp>
-#include <boost/circular_buffer.hpp>
-#include <boost/functional/hash.hpp>
-#include <boost/range/iterator_range.hpp>
-#include <boost/tokenizer.hpp>
-#include <boost/unordered_map.hpp>
-
-#include <cmath>
-#include <fstream>
-#include <vector>
-
-#include "sentence_metadata.h"
-#include "trule.h"
-#include "tdict.h"
-
-using namespace std;
-
-namespace {
-
-typedef unsigned char GramCount;
-
-struct RefCounts {
- GramCount max;
- std::vector<GramCount> refs;
- size_t length;
-};
-
-typedef boost::unordered_map<std::vector<WordID>, size_t, boost::hash<std::vector<WordID> > > NGramMap;
-
-// Take all the n-grams in the references and stuff them into ngrams.
-void MakeNGramMapFromReferences(const vector<vector<WordID> > &references,
- int n,
- vector<RefCounts> *counts,
- NGramMap *ngrams) {
- ngrams->clear();
- std::pair<vector<WordID>, size_t> insert_me;
- vector<WordID> &ngram = insert_me.first;
- ngram.reserve(n);
- size_t &id = insert_me.second;
- id = 0;
- for (int refi = 0; refi < references.size(); ++refi) {
- const vector<WordID>& ref = references[refi];
- const int s = ref.size();
- for (int j=0; j<s; ++j) {
- const int remaining = s-j;
- const int k = (n < remaining ? n : remaining);
- ngram.clear();
- for (unsigned int i = 0; i < k; ++i) {
- ngram.push_back(ref[j + i]);
- std::pair<NGramMap::iterator, bool> ret(ngrams->insert(insert_me));
- if (ret.second) {
- counts->resize(id + 1);
- RefCounts &ref_counts = counts->back();
- ref_counts.max = 1;
- ref_counts.refs.resize(references.size());
- ref_counts.refs[refi] = 1;
- ref_counts.length = ngram.size();
- ++id;
- } else {
- RefCounts &ref_counts = (*counts)[ret.first->second];
- ref_counts.max = std::max(ref_counts.max, ++ref_counts.refs[refi]);
- }
- }
- }
- }
-}
-
-struct MutableState {
- MutableState(void *from, size_t n) : length(reinterpret_cast<size_t*>(from)), left(reinterpret_cast<WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<GramCount *>(right + n - 1)) {}
- size_t *length;
- WordID *left, *right;
- GramCount *counts;
- static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
-};
-
-struct ConstState {
- ConstState(const void *from, size_t n) : length(reinterpret_cast<const size_t*>(from)), left(reinterpret_cast<const WordID *>(length + 1)), right(left + n - 1), counts(reinterpret_cast<const GramCount *>(right + n - 1)) {}
- const size_t *length;
- const WordID *left, *right;
- const GramCount *counts;
- static size_t Size(size_t n, size_t bound_ngram_id) { return sizeof(size_t) + (n - 1) * 2 * sizeof(WordID) + bound_ngram_id * sizeof(GramCount); }
-};
-
-template <class T> struct CompatibleHashRange : public std::unary_function<const boost::iterator_range<T> &, size_t> {
- size_t operator()(const boost::iterator_range<T> &range) const {
- return boost::hash_range(range.begin(), range.end());
- }
-};
-
-template <class T> struct CompatibleEqualsRange : public std::binary_function<const boost::iterator_range<T> &, const std::vector<WordID> &, size_t> {
- size_t operator()(const boost::iterator_range<T> &range, const std::vector<WordID> &vec) const {
- return boost::algorithm::equals(range, vec);
- }
- size_t operator()(const std::vector<WordID> &vec, const boost::iterator_range<T> &range) const {
- return boost::algorithm::equals(range, vec);
- }
-};
-
-void AddWord(const boost::circular_buffer<WordID> &segment, size_t min_length, const NGramMap &ref_grams, GramCount *counters) {
- typedef boost::circular_buffer<WordID>::const_iterator BufferIt;
- typedef boost::iterator_range<BufferIt> SegmentRange;
- if (segment.size() < min_length) return;
-#if 0
- CompatibleHashRange<BufferIt> hasher;
- CompatibleEqualsRange<BufferIt> equals;
- for (BufferIt seg_start(segment.end() - min_length); ; --seg_start) {
- NGramMap::const_iterator found = ref_grams.find(SegmentRange(seg_start, segment.end()));
- if (found == ref_grams.end()) break;
- ++counters[found->second];
- if (seg_start == segment.begin()) break;
- }
-#endif
-}
-
-} // namespace
-
-class TrombleLossComputerImpl {
- public:
- explicit TrombleLossComputerImpl(const std::string &params) : star_(TD::Convert("<{STAR}>")) {
- typedef boost::tokenizer<boost::char_separator<char> > Tokenizer;
- // Argument parsing
- std::string ref_file_name;
- Tokenizer tok(params, boost::char_separator<char>(" "));
- Tokenizer::iterator i = tok.begin();
- if (i == tok.end()) {
- std::cerr << "TrombleLossComputer needs a reference file name." << std::endl;
- exit(1);
- }
- ref_file_name = *i++;
- if (i == tok.end()) {
- std::cerr << "TrombleLossComputer needs to know how many references." << std::endl;
- exit(1);
- }
- num_refs_ = boost::lexical_cast<unsigned int>(*i++);
- for (; i != tok.end(); ++i) {
- thetas_.push_back(boost::lexical_cast<double>(*i));
- }
- if (thetas_.empty()) {
- std::cerr << "TrombleLossComputer is pointless with no weight on n-grams." << std::endl;
- exit(1);
- }
-
- // Read references file.
- std::ifstream ref_file(ref_file_name.c_str());
- if (!ref_file) {
- std::cerr << "Could not open TrombleLossComputer file " << ref_file_name << std::endl;
- exit(1);
- }
- std::string ref;
- vector<vector<WordID> > references(num_refs_);
- bound_ngram_id_ = 0;
- for (unsigned int sentence = 0; ref_file; ++sentence) {
- for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
- if (!getline(ref_file, ref)) {
- if (refidx == 0) break;
- std::cerr << "Short read of " << refidx << " references for sentence " << sentence << std::endl;
- exit(1);
- }
- TD::ConvertSentence(ref, &references[refidx]);
- }
- ref_ids_.resize(sentence + 1);
- ref_counts_.resize(sentence + 1);
- MakeNGramMapFromReferences(references, thetas_.size(), &ref_counts_.back(), &ref_ids_.back());
- bound_ngram_id_ = std::max(bound_ngram_id_, ref_ids_.back().size());
- }
- }
-
- size_t StateSize() const {
- // n-1 boundary words plus counts for n-grams currently rendered as bytes even though most would fit in bits.
- // Also, this is cached by higher up classes so no need to cache here.
- return MutableState::Size(thetas_.size(), bound_ngram_id_);
- }
-
- double Traversal(
- const SentenceMetadata &smeta,
- const TRule &rule,
- const vector<const void*> &ant_contexts,
- void *out_context) const {
- // TODO: get refs from sentence metadata.
- // This will require resizable features.
- if (smeta.GetSentenceID() >= ref_ids_.size()) {
- std::cerr << "Sentence ID " << smeta.GetSentenceID() << " doesn't have references; there are only " << ref_ids_.size() << " references." << std::endl;
- exit(1);
- }
- const NGramMap &ngrams = ref_ids_[smeta.GetSentenceID()];
- MutableState out_state(out_context, thetas_.size());
- memset(out_state.counts, 0, bound_ngram_id_ * sizeof(GramCount));
- boost::circular_buffer<WordID> history(thetas_.size());
- std::vector<const void*>::const_iterator ant_context = ant_contexts.begin();
- *out_state.length = 0;
- size_t pushed = 0;
- const size_t keep = thetas_.size() - 1;
- for (vector<WordID>::const_iterator rhs = rule.e().begin(); rhs != rule.e().end(); ++rhs) {
- if (*rhs < 1) {
- assert(ant_context != ant_contexts.end());
- // Constituent
- ConstState rhs_state(*ant_context, thetas_.size());
- *out_state.length += *rhs_state.length;
- {
- GramCount *accum = out_state.counts;
- for (const GramCount *c = rhs_state.counts; c != rhs_state.counts + ngrams.size(); ++c, ++accum) {
- *accum += *c;
- }
- }
- const WordID *w = rhs_state.left;
- bool long_constit = true;
- for (size_t i = 1; i <= keep; ++i, ++w) {
- if (*w == star_) {
- long_constit = false;
- break;
- }
- history.push_back(*w);
- if (++pushed == keep) {
- std::copy(history.begin(), history.end(), out_state.left);
- }
- // Now i is the length of the history coming from this constituent. So it needs at least i+1 words to have a cross-child add.
- AddWord(history, i + 1, ngrams, out_state.counts);
- }
- // If the consituent is shorter than thetas_.size(), then the
- // constituent's left is the entire constituent, so history is already
- // correct. Otherwise, the entire right hand side is the entire
- // history.
- if (long_constit) {
- history.assign(thetas_.size(), rhs_state.right, rhs_state.right + keep);
- }
- ++ant_context;
- } else {
- // Word
- ++*out_state.length;
- history.push_back(*rhs);
- if (++pushed == keep) {
- std::copy(history.begin(), history.end(), out_state.left);
- }
- AddWord(history, 1, ngrams, out_state.counts);
- }
- }
- // Fill in left and right constituents.
- if (pushed < keep) {
- std::copy(history.begin(), history.end(), out_state.left);
- for (WordID *i = out_state.left + pushed; i != out_state.left + keep; ++i) {
- *i = star_;
- }
- std::copy(out_state.left, out_state.left + keep, out_state.right);
- } else if(pushed == keep) {
- std::copy(history.begin(), history.end(), out_state.right);
- } else if ((pushed > keep) && !history.empty()) {
- std::copy(history.begin() + 1, history.end(), out_state.right);
- }
- std::vector<RefCounts>::const_iterator ref_info = ref_counts_[smeta.GetSentenceID()].begin();
- // Clip the counts and count matches.
- // Indexed by reference then by length.
- std::vector<std::vector<unsigned int> > matches(num_refs_, std::vector<unsigned int>(thetas_.size()));
- for (GramCount *c = out_state.counts; c != out_state.counts + ngrams.size(); ++c, ++ref_info) {
- *c = std::min(*c, ref_info->max);
- if (*c) {
- for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
- assert(ref_info->length >= 1);
- assert(ref_info->length - 1 < thetas_.size());
- matches[refidx][ref_info->length - 1] += std::min(*c, ref_info->refs[refidx]);
- }
- }
- }
- double best_score = 0.0;
- for (unsigned int refidx = 0; refidx < num_refs_; ++refidx) {
- double score = 0.0;
- for (unsigned int j = 0; j < std::min(*out_state.length, thetas_.size()); ++j) {
- score += thetas_[j] * static_cast<double>(matches[refidx][j]) / static_cast<double>(*out_state.length - j);
- }
- best_score = std::max(best_score, score);
- }
- return best_score;
- }
-
- private:
- unsigned int num_refs_;
- // Indexed by sentence id.
- std::vector<NGramMap> ref_ids_;
- // Then by id from ref_ids_.
- std::vector<std::vector<RefCounts> > ref_counts_;
-
- // thetas_[0] is the weight for 1-grams
- std::vector<double> thetas_;
-
- // All ngram ids in ref_ids_ are < this value.
- size_t bound_ngram_id_;
-
- const WordID star_;
-};
-
-TrombleLossComputer::TrombleLossComputer(const std::string &params) :
- boost::base_from_member<PImpl>(new TrombleLossComputerImpl(params)),
- FeatureFunction(boost::base_from_member<PImpl>::member->StateSize()),
- fid_(FD::Convert("TrombleLossComputer")) {}
-
-TrombleLossComputer::~TrombleLossComputer() {}
-
-void TrombleLossComputer::TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const Hypergraph::Edge& edge,
- const vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* out_context) const {
- (void) estimated_features;
- const double loss = boost::base_from_member<PImpl>::member->Traversal(smeta, *edge.rule_, ant_contexts, out_context);
- features->set_value(fid_, loss);
-}
diff --git a/decoder/tromble_loss.h b/decoder/tromble_loss.h
deleted file mode 100644
index fde33100..00000000
--- a/decoder/tromble_loss.h
+++ /dev/null
@@ -1,40 +0,0 @@
-#ifndef _TROMBLE_LOSS_H_
-#define _TROMBLE_LOSS_H_
-
-#include <vector>
-#include <boost/scoped_ptr.hpp>
-#include <boost/utility/base_from_member.hpp>
-
-#include "ff.h"
-#include "wordid.h"
-
-// this may not be the most elegant way to implement this computation, but since we
-// may need cube pruning and state splitting, we reuse the feature detector framework.
-// the loss is then stored in a feature #0 (which is guaranteed to have weight 0 and
-// never be a "real" feature).
-class TrombleLossComputerImpl;
-class TrombleLossComputer : private boost::base_from_member<boost::scoped_ptr<TrombleLossComputerImpl> >, public FeatureFunction {
- private:
- typedef boost::scoped_ptr<TrombleLossComputerImpl> PImpl;
- typedef FeatureFunction Base;
-
- public:
- // String parameters are ref.txt num_ref weight1 weight2 ... weightn
- // where ref.txt contains references on per line, with num_ref references per sentence
- // The weights are the weight on each length n-gram.
- explicit TrombleLossComputer(const std::string &params);
-
- ~TrombleLossComputer();
-
- protected:
- virtual void TraversalFeaturesImpl(const SentenceMetadata& smeta,
- const HG::Edge& edge,
- const std::vector<const void*>& ant_contexts,
- SparseVector<double>* features,
- SparseVector<double>* estimated_features,
- void* out_context) const;
- private:
- const int fid_;
-};
-
-#endif