more wa (word-aligner) changes

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@702 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-29 17:14:57 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-29 17:14:57 +0000
commit: 1305f38dba367f31754044cabc7b2bea9b8a2074 (patch)
tree: 0ccfd08b62d4c1f22df2fcace03d0f2335995e15 /word-aligner
parent: 78f50e0c3c63de2149045c5afb307e9a3cacff82 (diff)
3 files changed, 22 insertions, 28 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 508dbd8d..f0733449 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -33,8 +33,7 @@ print STDERR " Using mkcls in: $mkcls\n\n";
 die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
 die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
 
-my @stages = qw(markov);
-my @directions = qw(f-e e-f);
+my @directions = qw(f-e);
 
 my $corpus = 'c';
 
@@ -112,8 +111,13 @@ sub make_stage {
   open CDEC, ">$stage_dir/cdec.ini" or die "Can't write $stage_dir/cdec.ini: $!";
   print CDEC <<EOT;
 formalism=lextrans
+lextrans_use_null=true
 intersection_strategy=full
+
 grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
+# grammar=$align_dir/grammars/freq_grammar.$direction.gz
+# per_sentence_grammar_file=$align_dir/grammars/psg.$direction
+
 feature_function=LexicalPairIdentity
 feature_function=LexicalPairIdentity C $align_dir/grammars/corpus.class.$first $align_dir/grammars/voc2class.$second
 feature_function=LexicalPairIdentity S $align_dir/grammars/corpus.stemmed.$first $align_dir/grammars/${second}stem.map
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index c113688c..21f39ac1 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -1,4 +1,4 @@
-all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml corpus.e-f.sgml
+all: corpus.f-e.lex-grammar.gz corpus.class.e corpus.class.f corpus.stemmed.f fstem.map corpus.stemmed.e estem.map corpus.f-e.sgml
 
 clean:
 	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* *stem* corpus.f-e.sgml freq* psg*
@@ -66,24 +66,14 @@ corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS)
 	$(MERGE_CORPUS) corpus.e corpus.f > $@
 
 corpus.f-e.model1: corpus.f-e $(MODEL1)
-	$(MODEL1) corpus.f-e > corpus.f-e.model1
+	$(MODEL1) corpus.f-e > $@
 
 corpus.e-f.model1: corpus.e-f $(MODEL1)
-	$(MODEL1) corpus.e-f > corpus.e-f.model1
+	$(MODEL1) corpus.e-f > $@
 
-bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN)
-	$(RM) $@
-	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) > bidir.grammars
-
-corpus.f-e.lex-grammar.gz: bidir.grammars
-	$(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz
-
-corpus.e-f.lex-grammar.gz: bidir.grammars
-	$(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz
+corpus.f-e.lex-grammar.gz: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN)
+	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f $(GIZAALIGN) $(INVGIZAALIGN) | $(GZIP) -9 > $@
 
 corpus.f-e.sgml: f.voc corpus.f-e.lex-grammar.gz corpus.f-e
 	$(GENERATE_PSG) f.voc corpus.f-e corpus.f-e.lex-grammar.gz freq_grammar.f-e.gz psg.f-e $@
 
-corpus.e-f.sgml: e.voc corpus.e-f.lex-grammar.gz corpus.e-f
-	$(GENERATE_PSG) e.voc corpus.e-f corpus.e-f.lex-grammar.gz freq_grammar.e-f.gz psg.e-f $@
-
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index 8d38abda..2cc1be52 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -182,9 +182,18 @@ for my $f (sort keys %fdict) {
     my $im1 = $invm1{$e}->{$f};
     my $is_good_pair = (defined $m1 || defined $m4);
     my $is_inv_good_pair = (defined $im1);
+    my $ident = ($e eq $f);
+    if ($ident) { $is_good_pair = 1; }
     my $total_eandf = $ecounts{$e} + $fcounts{$f};
     my $dice = 2 * $efcount / $total_eandf;
     my @feats;
+    if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
+      $is_good_pair = 1;
+      if ($ADD_111) {
+        push @feats, "OneOneOne=1";
+      }
+    }
+    next unless $is_good_pair;
     if (defined $m1 && $ADD_MODEL1) {
       push @feats, "Model1=$m1";
       my $m1d = sprintf("%.5g", $m1 * $dice);
@@ -259,8 +268,6 @@ for my $f (sort keys %fdict) {
         push @feats, "OrthoSim=$ld";
       }
     }
-    my $ident = ($e eq $f);
-    if ($ident) { $is_good_pair = 1; }
     if ($ident && $ADD_ID) {
       if ($e =~ /\d/ && $len_e > 2) { push @feats, "IdentNumber=1"; }
       if ($total_eandf < 8) { push @feats, "IdentRare=1"; }
@@ -277,22 +284,15 @@ for my $f (sort keys %fdict) {
         if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
       }
     }
-    if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
-      $is_good_pair = 1;
-      if ($ADD_111) {
-        push @feats, "OneOneOne=1";
-      }
-    }
     if ($ADD_PUNC) {
-      if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
+      if ($f =~ /^[!,\-\/"'`:;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
         push @feats, "PuncMiss=1";
       }
     }
     my $is_special = ($is_good_pair && !(defined $m1));
     $specials++ if $is_special;
     print STDERR "$f -> $e\n" if $is_special;
-    print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair;
-    print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair;
+    print "$f ||| $e ||| @feats\n" if $is_good_pair;
   }
 }
 print STDERR "Added $specials special rules that were not in the M1 set\n";
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-29 17:14:57 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-29 17:14:57 +0000
commit	1305f38dba367f31754044cabc7b2bea9b8a2074 (patch)
tree	0ccfd08b62d4c1f22df2fcace03d0f2335995e15 /word-aligner
parent	78f50e0c3c63de2149045c5afb307e9a3cacff82 (diff)