diff options
author | Chris Dyer <redpony@gmail.com> | 2013-11-13 18:08:03 -0800 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2013-11-13 18:08:03 -0800 |
commit | f2fb69b10a897e8beb4e6e6d6cbb4327096235ef (patch) | |
tree | bb14849f101a9e96939ec73c8d82ef40c128a435 | |
parent | 9972fc6db6bd816f464dff90741c36f6be137f96 (diff) | |
parent | 8bdea2036a924302a569722359106a2d8a51fb17 (diff) |
Merge pull request #29 from wammar/wordpairfeatures2
1) fix the call to ibm model 1 aligner, 2) create a makefile target for ...
-rwxr-xr-x | word-aligner/aligner.pl | 7 | ||||
-rw-r--r-- | word-aligner/makefiles/makefile.grammars | 12 | ||||
-rwxr-xr-x | word-aligner/support/generate_word_pair_features.pl | 63 |
3 files changed, 76 insertions, 6 deletions
```diff
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index cbccb94a..08d95162 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -86,10 +86,17 @@ PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
 #MPIRUN = mpirun -np $(MPIJOBS)
 MPIRUN=
 
+USE_AFFIXES = 0
+
 WALLTIME=90
 
 export
 
+generate-wordpair-features:
+	\@failcom='exit 1'; \\
+	(cd grammars && make USE_AFFIXES=\$(USE_AFFIXES) ) || eval \$\$failcom;
+	cd ..
+
 all:
 	\@failcom='exit 1'; \\
 	list='\$(TARGETS)'; for subdir in \$\$list; do \\
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index 8d3ea8cb..1db516f1 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -19,6 +19,8 @@ MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl
 MODEL1 = $(SCRIPT_DIR)/fast_align
 MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl
 
+USE_AFFIXES = 0
+
 e.voc: corpus.e
 	$(EXTRACT_VOCAB) < corpus.e > $@
 
@@ -66,20 +68,20 @@ corpus.e-f: corpus.f corpus.e
 	$(MERGE_CORPUS) corpus.e corpus.f > $@
 
 corpus.f-e.model1: corpus.f-e
-	$(MODEL1) -p -v -i corpus.f-e > $@
+	$(MODEL1) -p corpus.f-e.model1 -v -i corpus.f-e > $@
 
 corpus.e-f.model1: corpus.e-f
-	$(MODEL1) -p -v -V -i corpus.e-f > $@
+	$(MODEL1) -p corpus.e-f.model1 -v -V -i corpus.e-f > $@
 
 corpus.f-e.full-model1: corpus.f-e
-	$(MODEL1) -p -t -999999 -v -V -i corpus.f-e > $@
+	$(MODEL1) -p corpus.f-e.full-model1 -t -999999 -v -V -i corpus.f-e > $@
 
 corpus.e-f.full-model1: corpus.e-f
-	$(MODEL1) -p -t -999999 -v -V -i corpus.e-f > $@
+	$(MODEL1) -p corpus.e-f.full-model1 -t -999999 -v -V -i corpus.e-f > $@
 
 corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1
 	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 | $(GZIP) -9 > $@
 
 wordpairs.f-e.features.gz: corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1
-	$(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 | $(GZIP) -9 > $@
+	$(GENERATE_WORDPAIR_FEATURES) corpus.f-e corpus.f-e.full-model1 corpus.e-f.full-model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f corpus.f-e.model1 $(USE_AFFIXES) $(USE_AFFIXES) | $(GZIP) -9 > $@
 
diff --git a/word-aligner/support/generate_word_pair_features.pl b/word-aligner/support/generate_word_pair_features.pl
index 54b89ce1..f3fdf149 100755
--- a/word-aligner/support/generate_word_pair_features.pl
+++ b/word-aligner/support/generate_word_pair_features.pl
@@ -2,7 +2,7 @@
 use utf8;
 use strict;
 
-my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1) = @ARGV;
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $sparse_m1, $use_prefixes, $use_suffixes) = @ARGV;
 die "Usage: $0 corpus.fr-en corpus.f-e.full-model1 corpus.e-f.full-model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f corpus.f-e.model1\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && $sparse_m1 && -f $sparse_m1;
 
 my %eclass = ();
@@ -253,10 +253,71 @@ for my $f (sort keys %fdict) {
           push @feats, "PuncMiss=1";
         }
       }
+    if ($use_prefixes) {
+      my $prefix1 = prefix_to_type($f, $e, 1);
+      if (length $prefix1 > 0 && !$is_null) { push @feats, $prefix1."=1";}
+      my $prefix2 = prefix_to_type($f, $e, 2);
+      if (length $prefix2 > 0 && !$is_null) { push @feats, $prefix2."=1";}
+      my $prefix3 = prefix_to_type($f, $e, 3);
+      if (length $prefix3 > 0 && !$is_null) { push @feats, $prefix3."=1";}
+      my $prefix1_reverse = prefix_to_type($e, $f, 1);
+      if (length $prefix1_reverse > 0 && !$is_null) { push @feats, $prefix1_reverse."=1";}
+      my $prefix2_reverse = prefix_to_type($e, $f, 2);
+      if (length $prefix2_reverse > 0 && !$is_null) { push @feats, $prefix2_reverse."=1";}
+      my $prefix3_reverse = prefix_to_type($e, $f, 3);
+      if (length $prefix3_reverse > 0 && !$is_null) { push @feats, $prefix3_reverse."=1";}
+    }
+    if ($use_suffixes) {
+      my $suffix1 = suffix_to_type($f, $e, 1);
+      if (length $suffix1 > 0 && !$is_null) { push @feats, $suffix1."=1";}
+      my $suffix2 = suffix_to_type($f, $e, 2);
+      if (length $suffix2 > 0 && !$is_null) { push @feats, $suffix2."=1";}
+      my $suffix3 = suffix_to_type($f, $e, 3);
+      if (length $suffix3 > 0 && !$is_null) { push @feats, $suffix3."=1";}
+      my $suffix1_reverse = suffix_to_type($e, $f, 1);
+      if (length $suffix1_reverse > 0 && !$is_null) { push @feats, $suffix1_reverse."=1";}
+      my $suffix2_reverse = suffix_to_type($e, $f, 2);
+      if (length $suffix2_reverse > 0 && !$is_null) { push @feats, $suffix2_reverse."=1";}
+      my $suffix3_reverse = suffix_to_type($e, $f, 3);
+      if (length $suffix3_reverse > 0 && !$is_null) { push @feats, $suffix3_reverse."=1";}
+    }
     print "$f ||| $e ||| @feats\n";
   }
 }
 
+# returns a feature string instantiating the pattern "(source_prefix,target)"
+sub prefix_to_type
+{
+  # $f => src token
+  # $e => tgt token
+  my ($f, $e, $len_prefix) = @_;
+
+  if (length $f > $len_prefix && index($e.$f, '=') < 0)
+  {
+    return substr($f, 0, $len_prefix)."-".$e;
+  }
+  else
+  {
+    return "";
+  }
+}
+
+# returns a feature string instantiating the pattern "(source_prefix,target)"
+sub suffix_to_type
+{
+  # $f => src token
+  # $e => tgt token
+  my ($f, $e, $len_prefix) = @_;
+
+  if ( (length $f) > $len_prefix && index($e.$f, '=') < 0)
+  {
+    return substr($f, (length $f)-$len_prefix, $len_prefix)."_".$e;
+  }
+  else
+  {
+    return "";
+  }
+}
 
 sub levenshtein
 {
```