word aligner cleanup, new features

author: Chris Dyer <redpony@gmail.com> 2010-02-01 17:38:39 -0500
committer: Chris Dyer <redpony@gmail.com> 2010-02-01 17:38:39 -0500
commit: c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch)
tree: 3bc1b02c39927a810862136534d5a0e35d7ed4fc /word-aligner/aligner.pl
parent: da222df300e4f87ad185a7decbf119ad56aa34e0 (diff)
1 file changed, 15 insertions, 5 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index d203fc53..7eec0e42 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -6,15 +6,20 @@ use Getopt::Long;
 my $training_dir = "$SCRIPT_DIR/../training";
 die "Can't find training dir: $training_dir" unless -d $training_dir;
 
+my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
 my $num_classes = 50;
 my $nodes = 40;
 my $pmem = "2500mb";
 my $DECODER = "cdec";
 GetOptions("cdec=s" => \$DECODER,
            "jobs=i" => \$nodes,
-           "pmem=s" => \$pmem
+           "pmem=s" => \$pmem,
+           "mkcls=s" => \$mkcls,
           ) or usage();
 usage() unless (scalar @ARGV == 1);
+die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
+die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
+
 my $in_file = shift @ARGV;
 die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
 my $f_lang = $1;
@@ -22,13 +27,13 @@ my $e_lang = $2;
 
 print STDERR "Source language: $f_lang\n";
 print STDERR "Target language: $e_lang\n";
+print STDERR " Using mkcls in: $mkcls\n\n";
 die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
 die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
 
 my @stages = qw(nopos relpos markov);
 my @directions = qw(f-e e-f);
 
-my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
 my $corpus = 'c';
 
 my $cwd = getcwd();
@@ -75,7 +80,7 @@ NCLASSES = $num_classes
 
 TARGETS = @targets
 PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
-PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5
+PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
 
 export
 
@@ -95,12 +100,16 @@ clean:
 EOT
 close TOPLEVEL;
 
+print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
+exit 0;
+
 sub make_stage {
   my ($stage, $direction, $prev_stage) = @_;
   my $stage_dir = "$align_dir/$stage-$direction";
   my $first = $direction;
   $first =~ s/^(.+)-.*$/$1/;
   mkdir $stage_dir;
+  my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
   open CDEC, ">$stage_dir/cdec.ini" or die;
   print CDEC <<EOT;
 formalism=lexcrf
@@ -108,10 +117,11 @@ intersection_strategy=full
 grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
 EOT
   if ($stage =~ /relpos/) {
-    print CDEC "feature_function=RelativeSentencePosition\n";
+    print CDEC "$RELPOS\n";
   } elsif ($stage =~ /markov/) {
-    print CDEC "feature_function=RelativeSentencePosition\n";
+    print CDEC "$RELPOS\n";
     print CDEC "feature_function=MarkovJump\n";
+    print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
     print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
   }
   close CDEC;
author	Chris Dyer <redpony@gmail.com>	2010-02-01 17:38:39 -0500
committer	Chris Dyer <redpony@gmail.com>	2010-02-01 17:38:39 -0500
commit	c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch)
tree	3bc1b02c39927a810862136534d5a0e35d7ed4fc /word-aligner/aligner.pl
parent	da222df300e4f87ad185a7decbf119ad56aa34e0 (diff)