summary | refs | log | tree | commit | diff
path: root/word-aligner
diff options
context:
space:
mode:
Diffstat (limited to 'word-aligner')
-rwxr-xr-x word-aligner/aligner.pl 7
-rw-r--r-- word-aligner/fast_align.cc 8
-rw-r--r-- word-aligner/makefiles/makefile.grammars 12
-rwxr-xr-x word-aligner/support/generate_word_pair_features.pl 63
-rw-r--r-- word-aligner/ttables.cc 1
-rw-r--r-- word-aligner/ttables.h 11
6 files changed, 90 insertions, 12 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index cbccb94a..08d95162 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -86,10 +86,17 @@ PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
#MPIRUN = mpirun -np $(MPIJOBS)
MPIRUN=
+USE_AFFIXES = 0
+
WALLTIME=90
export
+generate-wordpair-features:
+ \@failcom='exit 1'; \\
+ (cd grammars && make USE_AFFIXES=\$(USE_AFFIXES) ) || eval \$\$failcom;
+ cd ..
+
all:
\@failcom='exit 1'; \\
list='\$(TARGETS)'; for subdir in \$\$list; do \\
diff --git a/word-aligner/fast_align.cc b/word-aligner/fast_align.cc
index fddcba9c..f54233eb 100644
--- a/word-aligner/fast_align.cc
+++ b/word-aligner/fast_align.cc
@@ -1,7 +1,12 @@
#include <iostream>
#include <cmath>
#include <utility>
-#include <tr1/unordered_map>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+#else
+# include <tr1/unordered_map>
+namespace std { using std::tr1::unordered_map; }
+#endif
#include <boost/functional/hash.hpp>
#include <boost/program_options.hpp>
@@ -17,7 +22,6 @@
namespace po = boost::program_options;
using namespace std;
-using namespace std::tr1;
bool InitCommandLine(int argc, char** argv, po::variables_map* conf) {
po::options_description opts("Configuration options");
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index 8d3ea8cb..1db516f1 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -19,6 +19,8 @@ MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl
MODEL1 = $(SCRIPT_DIR)/fast_align
MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl
+USE_AFFIXES = 0
+
e.voc: corpus.e
$(EXTRACT_VOCAB) < corpus.e > $@
@@ -66,20 +68,20 @@ corpus.e-f: corpus.f corpus.e
$(MERGE_CORPUS) corpus.e corpus.f > $@
corpus.f-e.model1: corpus.f-e
- $(MODEL1) -p -v -i corpus.f-e > $@
+ $(MODEL1) -p corpus.f-e.model1 -v -i corpus.f-e > $@
corpus.e-f.model1: corpus.e-f
- $(MODEL1) -p -v -V -i corpus.e-f > $@
+ $(MODEL1) -p corpus.e-f.model1 -v -V -i corpus.e-f > $@
corpus.f-e.full-model1: corpus.f-e
- $(MODEL1) -p -t -999999 -v -V -i corpus.f-e > $@
+ $(MODEL1) -p corpus.f-e.full-model1 -t -999999 -v -V -i corpus.f-e > $@
corpus.e-f.full-model1: corpus.e-f
- $(MODEL1) -p -t -999999 -v -V -i corpus.e-f > $@
+ $(MODEL1) -p corpus.e-f.full-model1 -t -999999 -v -V -i corpus.e-f > $@
corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1
$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 | $(GZIP) -9 > $@
wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1
- $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@
+ $(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 $(USE_AFFIXES) $(USE_AFFIXES) | $(GZIP) -9 > $@
diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl
index 54b89ce1..f3fdf149 100755
--- a/word-aligner/support/generate_word_pair_features.pl
+++ b/word-aligner/support/generate_word_pair_features.pl
@@ -2,7 +2,7 @@
use utf8;
use strict;
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1) = @ARGV;
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1, $use_prefixes, $use_suffixes) = @ARGV;
die "Usage: $0 corpus.fr-en corpus.f-e.full-model1 corpus.e-f.full-model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f corpus.f-e.model1\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && $sparse_m1 && -f $sparse_m1;
my %eclass = ();
@@ -253,10 +253,71 @@ for my $f (sort keys %fdict) {
push @feats, "PuncMiss=1";
}
}
+ if ($use_prefixes) {
+ my $prefix1 = prefix_to_type($f, $e, 1);
+ if (length $prefix1 > 0 && !$is_null) { push @feats, $prefix1."=1";}
+ my $prefix2 = prefix_to_type($f, $e, 2);
+ if (length $prefix2 > 0 && !$is_null) { push @feats, $prefix2."=1";}
+ my $prefix3 = prefix_to_type($f, $e, 3);
+ if (length $prefix3 > 0 && !$is_null) { push @feats, $prefix3."=1";}
+ my $prefix1_reverse = prefix_to_type($e, $f, 1);
+ if (length $prefix1_reverse > 0 && !$is_null) { push @feats, $prefix1_reverse."=1";}
+ my $prefix2_reverse = prefix_to_type($e, $f, 2);
+ if (length $prefix2_reverse > 0 && !$is_null) { push @feats, $prefix2_reverse."=1";}
+ my $prefix3_reverse = prefix_to_type($e, $f, 3);
+ if (length $prefix3_reverse > 0 && !$is_null) { push @feats, $prefix3_reverse."=1";}
+ }
+ if ($use_suffixes) {
+ my $suffix1 = suffix_to_type($f, $e, 1);
+ if (length $suffix1 > 0 && !$is_null) { push @feats, $suffix1."=1";}
+ my $suffix2 = suffix_to_type($f, $e, 2);
+ if (length $suffix2 > 0 && !$is_null) { push @feats, $suffix2."=1";}
+ my $suffix3 = suffix_to_type($f, $e, 3);
+ if (length $suffix3 > 0 && !$is_null) { push @feats, $suffix3."=1";}
+ my $suffix1_reverse = suffix_to_type($e, $f, 1);
+ if (length $suffix1_reverse > 0 && !$is_null) { push @feats, $suffix1_reverse."=1";}
+ my $suffix2_reverse = suffix_to_type($e, $f, 2);
+ if (length $suffix2_reverse > 0 && !$is_null) { push @feats, $suffix2_reverse."=1";}
+ my $suffix3_reverse = suffix_to_type($e, $f, 3);
+ if (length $suffix3_reverse > 0 && !$is_null) { push @feats, $suffix3_reverse."=1";}
+ }
print "$f ||| $e ||| @feats\n";
}
}
+# returns a feature string instantiating the pattern "(source_prefix,target)"
+sub prefix_to_type
+{
+ # $f => src token; only its first $len_prefix characters are used, and only when $f is strictly longer than $len_prefix
+ # $e => tgt token, appended whole after a "-" separator; "" is returned instead if either token contains "="
+ my ($f, $e, $len_prefix) = @_;
+
+ if (length $f > $len_prefix && index($e.$f, '=') < 0)
+ {
+ return substr($f, 0, $len_prefix)."-".$e;
+ }
+ else
+ {
+ return "";
+ }
+}
+
+# returns a feature string instantiating the pattern "(source_suffix,target)"
+sub suffix_to_type
+{
+ # $f => src token; only its last $len_prefix characters are used ($len_prefix is a suffix length here, despite its name), and only when $f is strictly longer than that
+ # $e => tgt token, appended whole after a "_" separator; "" is returned instead if either token contains "="
+ my ($f, $e, $len_prefix) = @_;
+
+ if ( (length $f) > $len_prefix && index($e.$f, '=') < 0)
+ {
+ return substr($f, (length $f)-$len_prefix, $len_prefix)."_".$e;
+ }
+ else
+ {
+ return "";
+ }
+}
sub levenshtein
{
diff --git a/word-aligner/ttables.cc b/word-aligner/ttables.cc
index c177aa30..a56bbcef 100644
--- a/word-aligner/ttables.cc
+++ b/word-aligner/ttables.cc
@@ -5,7 +5,6 @@
#include "dict.h"
using namespace std;
-using namespace std::tr1;
void TTable::DeserializeProbsFromText(std::istream* in) {
int c = 0;
diff --git a/word-aligner/ttables.h b/word-aligner/ttables.h
index 507f591a..d82aff72 100644
--- a/word-aligner/ttables.h
+++ b/word-aligner/ttables.h
@@ -2,7 +2,12 @@
#define _TTABLES_H_
#include <iostream>
-#include <tr1/unordered_map>
+#ifndef HAVE_OLD_CPP
+# include <unordered_map>
+#else
+# include <tr1/unordered_map>
+namespace std { using std::tr1::unordered_map; }
+#endif
#include "sparse_vector.h"
#include "m.h"
@@ -12,8 +17,8 @@
class TTable {
public:
TTable() {}
- typedef std::tr1::unordered_map<WordID, double> Word2Double;
- typedef std::tr1::unordered_map<WordID, Word2Double> Word2Word2Double;
+ typedef std::unordered_map<WordID, double> Word2Double;
+ typedef std::unordered_map<WordID, Word2Double> Word2Word2Double;
inline double prob(const int& e, const int& f) const {
const Word2Word2Double::const_iterator cit = ttable.find(e);
if (cit != ttable.end()) {