summary | refs | log | tree | commit | diff
path: root/word-aligner/support
diff options
context:
space:
mode:
author redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 22:19:32 +0000
committer redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-28 22:19:32 +0000
commit b01eeb03ebd34a737f698b647976f8f6dc1b3775 (patch)
tree cba610484518a2a6d034d8094f0f63c63fb76bff /word-aligner/support
parent 14b4d7dff699259bc5e606fa0d5beb77001e32fb (diff)
small fixes
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@700 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner/support')
-rwxr-xr-x word-aligner/support/extract_vocab.pl 5
-rwxr-xr-x word-aligner/support/generate_per_sentence_grammars.pl 1
-rwxr-xr-x word-aligner/support/make_lex_grammar.pl 9
3 files changed, 11 insertions, 4 deletions
diff --git a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl
index 070d4202..ef94b7d6 100755
--- a/word-aligner/support/extract_vocab.pl
+++ b/word-aligner/support/extract_vocab.pl
@@ -7,7 +7,10 @@ my $wc = 0;
while(<>) {
chomp;
my @words = split /\s+/;
- for my $word (@words) { $wc++; $dict{$word}++; }
+ for my $word (@words) {
+ die if $word eq '';
+ $wc++; $dict{$word}++;
+ }
}
my $tc = 0;
diff --git a/word-aligner/support/generate_per_sentence_grammars.pl b/word-aligner/support/generate_per_sentence_grammars.pl
index 8779ac9c..6a02bb0a 100755
--- a/word-aligner/support/generate_per_sentence_grammars.pl
+++ b/word-aligner/support/generate_per_sentence_grammars.pl
@@ -14,6 +14,7 @@ open FILT,"|gzip -c > $ARGV[3]" or die "Can't write $ARGV[3]: $!";
open PSG,">$ARGV[4]" or die "Can't write $ARGV[4]: $!";
open OTRAIN,">$ARGV[5]" or die "Can't write $ARGV[5]: $!";
+binmode OTRAIN, ":utf8";
binmode FILT, ":utf8";
binmode PSG, ":utf8";
binmode STDOUT, ":utf8";
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index 3e243125..e4cbf7ba 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -35,7 +35,7 @@ my %sizes = ();
while(<M1>) {
chomp;
my ($f, $e, $lp) = split /\s+/;
- $model1{$f}->{$e} = 1;
+ $model1{$f}->{$e} = 1e-12 + exp($lp);
$sizes{$f}++;
}
close M1;
@@ -50,7 +50,7 @@ while(<IM1>) {
$invm1{$e}->{$f} = 1;
$esizes{$e}++;
if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) {
- $model1{$f}->{$e} = 1;
+ $model1{$f}->{$e} = 1e-12;
$sizes{$f}++;
$inv_add++;
}
@@ -66,7 +66,7 @@ while(<M1>) {
chomp;
my ($f, $e, $lp) = split /\s+/;
if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) {
- $invm1{$e}->{$f} = 1;
+ $invm1{$e}->{$f} = 1e-12;
$esizes{$e}++;
$dir_add++;
}
@@ -106,6 +106,7 @@ my $ADD_111 = 1;
my $ADD_ID = 1;
my $ADD_PUNC = 1;
my $ADD_NULL = 1;
+my $ADD_MODEL1 = 1;
my $ADD_STEM_ID = 0;
my $ADD_SYM = 0;
my $BEAM_RATIO = 50;
@@ -184,6 +185,8 @@ for my $f (sort keys %fdict) {
my $total_eandf = $ecounts{$e} + $fcounts{$f};
my $dice = 2 * $efcount / $total_eandf;
my @feats;
+ if (defined $m1 && $ADD_MODEL1) { push @feats, "Model1=$m1"; my $m1d = $m1 * $dice; push @feats, "M1Dice=$m1d"; }
+ if ($ADD_MODEL1 && !defined $m1) { push @feats, "NoModel1=1"; }
if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
$fc++;
push @feats, "F$fc=1";