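more wa (word-aligner) work: add per-sentence-grammar corpus.*.sgml targets; round Model1/M1Dice feature values to 5 significant digits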

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@701 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-29 14:07:45 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-29 14:07:45 +0000
commit: 78f50e0c3c63de2149045c5afb307e9a3cacff82 (patch)
tree: 5aa0a441e06b36397070281d383f1dc2d4bf9e91
parent: 0c6c6e1e72b13ab0bf6ea2da3ac83ba5a74e5cff (diff)
2 files changed, 16 insertions, 6 deletions
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index 8a10cb19..c113688c 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,14 +1,14 @@
-all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map
+all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml corpus.e-f.sgml
 
 clean:
-	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* 
+	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml corpus.e-f.sgml freq* psg*
+
 SUPPORT_DIR = $(SCRIPT_DIR)/support
 GZIP = /usr/bin/gzip
 ZCAT = zcat
-EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl
 EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
-SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl
 EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
+GENERATE_PSG = $(SUPPORT_DIR)/generate_per_sentence_grammars.pl
 ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
 ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
 STEM_F = $(SCRIPT_DIR)/stemmers/$(F_LANG).pl
@@ -81,3 +81,9 @@ corpus.f-e.lex-grammar.gz: bidir.grammars
 corpus.e-f.lex-grammar.gz: bidir.grammars
 	$(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz
 
+corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e
+	$(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@
+
+corpus.e-f.sgml: e.voc corpus.e-f.lex-grammar.gz corpus.e-f
+	$(GENERATE_PSG) e.voc corpus.e-f corpus.e-f.lex-grammar.gz freq_grammar.e-f.gz psg.e-f $@
+
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index e4cbf7ba..8d38abda 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -35,7 +35,7 @@ my %sizes = ();
 while(<M1>) {
   chomp;
   my ($f, $e, $lp) = split /\s+/;
-  $model1{$f}->{$e} = 1e-12 + exp($lp);
+  $model1{$f}->{$e} = sprintf("%.5g", 1e-12 + exp($lp));
   $sizes{$f}++;
 }
 close M1;
@@ -185,7 +185,11 @@ for my $f (sort keys %fdict) {
     my $total_eandf = $ecounts{$e} + $fcounts{$f};
     my $dice = 2 * $efcount / $total_eandf;
     my @feats;
-    if (defined $m1 && $ADD_MODEL1) { push @feats, "Model1=$m1"; my $m1d = $m1 * $dice; push @feats, "M1Dice=$m1d"; }
+    if (defined $m1 && $ADD_MODEL1) {
+      push @feats, "Model1=$m1";
+      my $m1d = sprintf("%.5g", $m1 * $dice);
+      push @feats, "M1Dice=$m1d";
+    }
     if ($ADD_MODEL1 && !defined $m1) { push @feats, "NoModel1=1"; }
     if ($ADD_FIDENT && $efcount > $MIN_FEATURE_COUNT) {
       $fc++;
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-29 14:07:45 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-29 14:07:45 +0000
commit	78f50e0c3c63de2149045c5afb307e9a3cacff82 (patch)
tree	5aa0a441e06b36397070281d383f1dc2d4bf9e91
parent	0c6c6e1e72b13ab0bf6ea2da3ac83ba5a74e5cff (diff)