handle translation from the null word

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@689 ec762483-ff6d-05da-a07a-a48fb63a330f
author: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-22 23:29:11 +0000
committer: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-10-22 23:29:11 +0000
commit: dd886ca6da84970ccb96b2f0155ff672e03f5b58 (patch)
tree: 78b5627347f3953539852cdd6b92053e844e87d4
parent: 550019457302ecaaec6f72e912013a6fa9f2da67 (diff)
6 files changed, 99 insertions, 86 deletions
diff --git a/decoder/aligner.cc b/decoder/aligner.cc
index 92431be4..3f0c7347 100644
--- a/decoder/aligner.cc
+++ b/decoder/aligner.cc
@@ -24,8 +24,10 @@ void SourceEdgeCoveragesUsingParseIndices(const Hypergraph& g,
     if (edge.rule_->EWords() == 0 || edge.rule_->FWords() == 0)
       continue;
     // aligned to NULL (crf ibm variant only)
-    if (edge.prev_i_ == -1 || edge.i_ == -1)
+    if (edge.prev_i_ == -1 || edge.i_ == -1) {
+      cov.insert(-1);
       continue;
+    }
     assert(edge.j_ >= 0);
     assert(edge.prev_j_ >= 0);
     if (edge.Arity() == 0) {
@@ -211,7 +213,7 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice,
   // figure out the src and reference size;
   int src_size = src_sent.size();
   int ref_size = trg_sent.size();
-  Array2D<prob_t> align(src_size, ref_size, prob_t::Zero());
+  Array2D<prob_t> align(src_size + 1, ref_size, prob_t::Zero());
   for (int c = 0; c < g->edges_.size(); ++c) {
     const prob_t& p = edge_posteriors[c];
     const set<int>& srcs = src_cov[c];
@@ -220,7 +222,7 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice,
          si != srcs.end(); ++si) {
       for (set<int>::const_iterator ti = trgs.begin();
            ti != trgs.end(); ++ti) {
-        align(*si, *ti) += p;
+        align(*si + 1, *ti) += p;
       }
     }
   }
@@ -234,12 +236,12 @@ void AlignerTools::WriteAlignment(const Lattice& src_lattice,
   for (int j = 0; j < ref_size; ++j) {
     if (use_soft_threshold) {
       threshold = prob_t::Zero();
-      for (int i = 0; i < src_size; ++i)
+      for (int i = 0; i <= src_size; ++i)
         if (align(i, j) > threshold) threshold = align(i, j);
       //threshold *= prob_t(0.99);
     }
     for (int i = 0; i < src_size; ++i)
-      grid(i, j) = align(i, j) >= threshold;
+      grid(i, j) = align(i+1, j) >= threshold;
   }
   if (out == &cout) {
     // TODO need to do some sort of verbose flag
diff --git a/decoder/decoder.cc b/decoder/decoder.cc
index b975a5fc..eb983419 100644
--- a/decoder/decoder.cc
+++ b/decoder/decoder.cc
@@ -364,7 +364,7 @@ DecoderImpl::DecoderImpl(po::variables_map& conf, int argc, char** argv, istream
         ("beam_prune", po::value<double>(), "Prune paths from +LM forest, keep paths within exp(alpha>=0)")
     ("scale_prune_srclen", "scale beams by the input length (in # of tokens; may not be what you want for lattices")
     ("promise_power",po::value<double>()->default_value(0), "Give more beam budget to more promising previous-pass nodes when pruning - but allocate the same average beams.  0 means off, 1 means beam proportional to inside*outside prob, n means nth power (affects just --cubepruning_pop_limit).  note: for the same pop_limit, this gives more search error unless very close to 0 (recommend disabled; even 0.01 is slightly worse than 0) which is a bad sign and suggests this isn't doing a good job; further it's slightly slower to LM cube rescore with 0.01 compared to 0, as well as giving (very insignificantly) lower BLEU.  TODO: test under more conditions, or try idea with different formula, or prob. cube beams.")
-        ("lexalign_use_null", "Support source-side null words in lexical translation")
+        ("lextrans_use_null", "Support source-side null words in lexical translation")
         ("tagger_tagset,t", po::value<string>(), "(Tagger) file containing tag set")
         ("csplit_output_plf", "(Compound splitter) Output lattice in PLF format")
         ("csplit_preserve_full_word", "(Compound splitter) Always include the unsegmented form in the output lattice")
diff --git a/decoder/ff_wordalign.cc b/decoder/ff_wordalign.cc
index f2f07033..5f42b438 100644
--- a/decoder/ff_wordalign.cc
+++ b/decoder/ff_wordalign.cc
@@ -16,6 +16,8 @@
 
 static const int MAX_SENTENCE_SIZE = 100;
 
+static const int kNULL_i = 255;  // -1 as an unsigned char
+
 using namespace std;
 
 Model2BinaryFeatures::Model2BinaryFeatures(const string& ) :
@@ -149,7 +151,11 @@ void MarkovJumpFClass::FireFeature(const SentenceMetadata& smeta,
                                    int prev_src_pos,
                                    int cur_src_pos,
                                    SparseVector<double>* features) const {
+  if (prev_src_pos == kNULL_i || cur_src_pos == kNULL_i)
+    return;
+
   const int jumpsize = cur_src_pos - prev_src_pos;
+
   assert(smeta.GetSentenceID() < pos_.size());
   const WordID cur_fclass = pos_[smeta.GetSentenceID()][cur_src_pos];
   const int fid = fids_[smeta.GetSourceLength()].find(cur_fclass)->second.find(jumpsize)->second;
@@ -189,10 +195,13 @@ void MarkovJumpFClass::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   }
 }
 
-//  std::vector<std::map<int, int> > flen2jump2fid_;
 MarkovJump::MarkovJump(const string& param) :
     FeatureFunction(1),
     fid_(FD::Convert("MarkovJump")),
+    fid_lex_null_(FD::Convert("JumpLexNull")),
+    fid_null_lex_(FD::Convert("JumpNullLex")),
+    fid_null_null_(FD::Convert("JumpNullNull")),
+    fid_lex_lex_(FD::Convert("JumpLexLex")),
     binary_params_(false) {
   cerr << "    MarkovJump";
   vector<string> argv;
@@ -218,7 +227,7 @@ MarkovJump::MarkovJump(const string& param) :
   cerr << endl;
 }
 
-// TODO handle NULLs according to Och 2000
+// TODO handle NULLs according to Och 2000?
 void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
                                        const Hypergraph::Edge& edge,
                                        const vector<const void*>& ant_states,
@@ -229,19 +238,20 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   const int flen = smeta.GetSourceLength();
   if (edge.Arity() == 0) {
     dpstate = static_cast<unsigned int>(edge.i_);
-    if (edge.prev_i_ == 0) {
-      if (binary_params_) {
-        // NULL will be tricky
-        // TODO initial state distribution, not normal jumps
+    if (edge.prev_i_ == 0) {     // first word in sentence
+      if (edge.i_ >= 0 && binary_params_) {
         const int fid = flen2jump2fid_[flen].find(edge.i_ + 1)->second;
         features->set_value(fid, 1.0);
+      } else if (edge.i_ < 0 && binary_params_) {
+        // handled by bigram features
       }
     } else if (edge.prev_i_ == smeta.GetTargetLength() - 1) {
-        // NULL will be tricky
-      if (binary_params_) {
+      if (edge.i_ >= 0 && binary_params_) {
         int jumpsize = flen - edge.i_;
         const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
         features->set_value(fid, 1.0);
+      } else if (edge.i_ < 0 && binary_params_) {
+        // handled by bigram features
       }
     }
   } else if (edge.Arity() == 1) {
@@ -253,13 +263,24 @@ void MarkovJump::TraversalFeaturesImpl(const SentenceMetadata& smeta,
       dpstate = static_cast<unsigned int>(left_index);
     else
       dpstate = static_cast<unsigned int>(right_index);
-    const int jumpsize = right_index - left_index;
+    if (left_index == kNULL_i || right_index == kNULL_i) {
+      if (left_index == kNULL_i && right_index == kNULL_i)
+        features->set_value(fid_null_null_, 1.0);
+      else if (left_index == kNULL_i)
+        features->set_value(fid_null_lex_, 1.0);
+      else
+        features->set_value(fid_lex_null_, 1.0);
 
-    if (binary_params_) {
-      const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
-      features->set_value(fid, 1.0);
     } else {
-      features->set_value(fid_, fabs(jumpsize - 1));  // Blunsom and Cohn def
+      features->set_value(fid_lex_lex_, 1.0); // TODO should only use if NULL is enabled
+      const int jumpsize = right_index - left_index;
+
+      if (binary_params_) {
+        const int fid = flen2jump2fid_[flen].find(jumpsize)->second;
+        features->set_value(fid, 1.0);
+      } else {
+        features->set_value(fid_, fabs(jumpsize - 1));  // Blunsom and Cohn def
+      }
     }
   } else {
     assert(!"something really unexpected is happening");
@@ -294,15 +315,6 @@ void SourceBigram::FireFeature(WordID left,
     if (fid == 0) fid = -1;
   }
   if (fid > 0) features->set_value(fid, 1.0);
-  int& ufid = ufmap_[left];
-  if (!ufid) {
-    ostringstream os;
-    os << "SU:";
-    if (left < 0) { os << "BOS"; } else { os << TD::Convert(left); }
-    ufid = FD::Convert(os.str());
-    if (ufid == 0) fid = -1;
-  }
-  if (ufid > 0) features->set_value(ufid, 1.0);
 }
 
 void SourceBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
@@ -386,8 +398,14 @@ void SourcePOSBigram::TraversalFeaturesImpl(const SentenceMetadata& smeta,
   if (arity == 0) {
     assert(smeta.GetSentenceID() < pos_.size());
     const vector<WordID>& pos_sent = pos_[smeta.GetSentenceID()];
-    assert(edge.i_ < pos_sent.size());
-    out_context = pos_sent[edge.i_];
+    if (edge.i_ >= 0) {  // non-NULL source
+      assert(edge.i_ < pos_sent.size());
+      out_context = pos_sent[edge.i_];
+    } else { // NULL source
+      // should assert that source is kNULL?
+      static const WordID kNULL = TD::Convert("<eps>");
+      out_context = kNULL;
+    }
     out_word_count = edge.rule_->EWords();
     assert(out_word_count == 1); // this is only defined for lex translation!
     // revisit this if you want to translate into null words
diff --git a/decoder/ff_wordalign.h b/decoder/ff_wordalign.h
index 30ddf7a1..0714229c 100644
--- a/decoder/ff_wordalign.h
+++ b/decoder/ff_wordalign.h
@@ -49,6 +49,11 @@ class MarkovJump : public FeatureFunction {
                                      void* out_context) const;
  private:
   const int fid_;
+  const int fid_lex_null_;
+  const int fid_null_lex_;
+  const int fid_null_null_;
+  const int fid_lex_lex_;
+
   bool binary_params_;
   std::vector<std::map<int, int> > flen2jump2fid_;
 };
@@ -96,7 +101,6 @@ class SourceBigram : public FeatureFunction {
                    WordID trg,
                    SparseVector<double>* features) const;
   mutable Class2Class2FID fmap_;
-  mutable Class2FID ufmap_;
 };
 
 class SourcePOSBigram : public FeatureFunction {
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index 7821560f..e23c2beb 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -6,9 +6,10 @@ use Getopt::Long;
 my $training_dir = "$SCRIPT_DIR/../training";
 die "Can't find training dir: $training_dir" unless -d $training_dir;
 
-my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
+my $mkcls = '/Users/cdyer/software/giza-pp/mkcls-v2/mkcls';
 my $num_classes = 50;
 my $nodes = 40;
+my $TRAINING_ITERATIONS = 2000;
 my $pmem = "2500mb";
 my $DECODER = "cdec";
 GetOptions("cdec=s" => \$DECODER,
@@ -16,15 +17,11 @@ GetOptions("cdec=s" => \$DECODER,
            "pmem=s" => \$pmem,
            "mkcls=s" => \$mkcls,
           ) or usage();
-usage() unless (scalar @ARGV == 3);
+usage() unless (scalar @ARGV == 1);
 die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
 die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
 
 my $in_file = shift @ARGV;
-my $m4 = shift @ARGV;
-my $im4 = shift @ARGV;
-die "Can't find model4: $m4" unless -f $m4;
-die "Can't find inverse model4: $im4" unless -f $im4;
 
 die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
 my $f_lang = $1;
@@ -32,13 +29,11 @@ my $e_lang = $2;
 
 print STDERR "Source language: $f_lang\n";
 print STDERR "Target language: $e_lang\n";
-print STDERR "  Model 4 align: $m4\n";
-print STDERR "InModel 4 align: $im4\n";
 print STDERR " Using mkcls in: $mkcls\n\n";
 die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
 die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
 
-my @stages = qw(nopos relpos markov);
+my @stages = qw(markov);
 my @directions = qw(f-e e-f);
 
 my $corpus = 'c';
@@ -67,12 +62,8 @@ die unless $? == 0;
 my @targets = qw(grammars);
 
 for my $direction (@directions) {
-  my $prev_stage = undef;
-  for my $stage (@stages) {
-    push @targets, "$stage-$direction";
-    make_stage($stage, $direction, $prev_stage);
-    $prev_stage = $stage;
-  }
+  push @targets, "model-$direction";
+  make_stage($direction);
 }
 
 open TOPLEVEL, ">$align_dir/Makefile" or die "Can't write $align_dir/Makefile: $!";
@@ -84,8 +75,6 @@ SCRIPT_DIR = $SCRIPT_DIR
 TRAINING_DIR = $training_dir
 MKCLS = $mkcls
 NCLASSES = $num_classes
-GIZAALIGN = $m4
-INVGIZAALIGN = $im4
 
 TARGETS = @targets
 PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
@@ -113,13 +102,12 @@ print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
 exit 0;
 
 sub make_stage {
-  my ($stage, $direction, $prev_stage) = @_;
+  my ($direction) = @_;
   my $stage_dir = "$align_dir/model-$direction";
   my $first = $direction;
   $first =~ s/^(.+)-.*$/$1/;
   mkdir $stage_dir;
-  my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
-  open CDEC, ">$stage_dir/cdec.$stage.ini" or die;
+  open CDEC, ">$stage_dir/cdec.ini" or die "Can't write $stage_dir/cdec.ini: $!";
   print CDEC <<EOT;
 formalism=lextrans
 intersection_strategy=full
@@ -127,23 +115,22 @@ grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
 feature_function=LexicalPairIdentity
 feature_function=InputIdentity
 feature_function=OutputIdentity
+feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first
+feature_function=MarkovJump +b
+feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first
+feature_function=SourceBigram
+feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first
 EOT
-  if ($stage =~ /relpos/) {
-    print CDEC "$RELPOS\n";
-  } elsif ($stage =~ /markov/) {
-    print CDEC "$RELPOS\n";
-    print CDEC "feature_function=MarkovJump\n";
-    print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
-    print CDEC "feature_function=SourceBigram\n";
-    print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
-  }
   close CDEC;
+  open AGENDA, ">$stage_dir/agenda.txt" or die "Can't write $stage_dir/agenda.txt: $!";
+  print AGENDA "cdec.ini $TRAINING_ITERATIONS\n";
+  close AGENDA;
 }
 
 sub usage {
   die <<EOT;
 
-Usage: $0 [OPTIONS] training_corpus.fr-en giza.en-fr.A3 giza.fr-en.A3
+Usage: $0 [OPTIONS] training_corpus.fr-en
 
 EOT
 }
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index 3926fd8d..fb9d0214 100755
--- a/word-aligner/support/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -5,7 +5,8 @@ use strict;
 my $LIMIT_SIZE=30;
 
 my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f, $gizaf2e, $gizae2f) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f && -f $gizaf2e && -f $gizae2f;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+
 
 my %eclass = ();
 my %fclass = ();
@@ -20,12 +21,12 @@ our %cache;
 open EF, "<$effile" or die;
 open M1, "<$model1" or die;
 open IM1, "<$imodel1" or die;
-open M4, "<$gizaf2e" or die;
-open IM4, "<$gizae2f" or die;
+#open M4, "<$gizaf2e" or die;
+#open IM4, "<$gizae2f" or die;
+#binmode(M4,":utf8");
+#binmode(IM4,":utf8");
 binmode(EF,":utf8");
 binmode(M1,":utf8");
-binmode(M4,":utf8");
-binmode(IM4,":utf8");
 binmode(IM1,":utf8");
 binmode(STDOUT,":utf8");
 my %model1;
@@ -105,7 +106,7 @@ my $ADD_DICE = 1;
 my $ADD_111 = 1;
 my $ADD_ID = 1;
 my $ADD_PUNC = 1;
-my $ADD_NULL = 0;
+my $ADD_NULL = 1;
 my $ADD_STEM_ID = 0;
 my $ADD_SYM = 0;
 my $BEAM_RATIO = 50;
@@ -115,6 +116,8 @@ my $BIN_IDENT = 1;
 my $BIN_DICE = 1;
 my $ADD_FIDENT = 0;
 
+if ($ADD_NULL) { $fclass{'<eps>'}='NUL'; $eclass{'<eps>'} ='NUL'; }
+
 my %fdict;
 my %fcounts;
 my %ecounts;
@@ -146,24 +149,24 @@ while(<EF>) {
 
 print STDERR "Loading Giza output...\n";
 my %model4;
-while(<M4>) {
-  my $en = <M4>; chomp $en;
-  my $zh = <M4>; chomp $zh;
-  die unless $zh =~ /^NULL \({/;
-  my @ewords = split /\s+/, $en;
-  my @chunks = split /\}\) ?/, $zh;
-
-  for my $c (@chunks) {
-    my ($zh, $taps) = split / \(\{ /, $c;
-    if ($zh eq 'NULL') { $zh = '<eps>'; }
-    my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
-    #print "$zh -> @aps\n";
-    for my $ap (@aps) {
-      $model4{$zh}->{$ap} += 1;
-    }
-  }
-}
-close M4;
+#while(<M4>) {
+#  my $en = <M4>; chomp $en;
+#  my $zh = <M4>; chomp $zh;
+#  die unless $zh =~ /^NULL \({/;
+#  my @ewords = split /\s+/, $en;
+#  my @chunks = split /\}\) ?/, $zh;
+#
+#  for my $c (@chunks) {
+#    my ($zh, $taps) = split / \(\{ /, $c;
+#    if ($zh eq 'NULL') { $zh = '<eps>'; }
+#    my @aps = map { $ewords[$_ - 1]; } (split / /, $taps);
+#    #print "$zh -> @aps\n";
+#    for my $ap (@aps) {
+#      $model4{$zh}->{$ap} += 1;
+#    }
+#  }
+#}
+#close M4;
 
 my $specials = 0;
 my $fc = 1000000;
@@ -207,7 +210,6 @@ for my $f (sort keys %fdict) {
     }
     my $is_null = undef;
     if ($ADD_NULL && $f eq '<eps>') {
-      push @feats, "IsNull=1";
       $is_null = 1;
     }
     if ($ADD_LEN) {
author	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-22 23:29:11 +0000
committer	redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>	2010-10-22 23:29:11 +0000
commit	dd886ca6da84970ccb96b2f0155ff672e03f5b58 (patch)
tree	78b5627347f3953539852cdd6b92053e844e87d4
parent	550019457302ecaaec6f72e912013a6fa9f2da67 (diff)