summary | refs | log | tree | commit | diff
path: root/word-aligner/aligner.pl
diff options
context:
space:
mode:
author: Chris Dyer <redpony@gmail.com> 2010-02-01 17:38:39 -0500
committer: Chris Dyer <redpony@gmail.com> 2010-02-01 17:38:39 -0500
commit: c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch)
tree: 3bc1b02c39927a810862136534d5a0e35d7ed4fc /word-aligner/aligner.pl
parent: da222df300e4f87ad185a7decbf119ad56aa34e0 (diff)
word aligner cleanup, new features
Diffstat (limited to 'word-aligner/aligner.pl')
-rwxr-xr-x word-aligner/aligner.pl 20
1 files changed, 15 insertions, 5 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index d203fc53..7eec0e42 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -6,15 +6,20 @@ use Getopt::Long;
my $training_dir = "$SCRIPT_DIR/../training";
die "Can't find training dir: $training_dir" unless -d $training_dir;
+my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
my $num_classes = 50;
my $nodes = 40;
my $pmem = "2500mb";
my $DECODER = "cdec";
GetOptions("cdec=s" => \$DECODER,
"jobs=i" => \$nodes,
- "pmem=s" => \$pmem
+ "pmem=s" => \$pmem,
+ "mkcls=s" => \$mkcls,
) or usage();
usage() unless (scalar @ARGV == 1);
+die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
+die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
+
my $in_file = shift @ARGV;
die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
my $f_lang = $1;
@@ -22,13 +27,13 @@ my $e_lang = $2;
print STDERR "Source language: $f_lang\n";
print STDERR "Target language: $e_lang\n";
+print STDERR " Using mkcls in: $mkcls\n\n";
die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
my @stages = qw(nopos relpos markov);
my @directions = qw(f-e e-f);
-my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
my $corpus = 'c';
my $cwd = getcwd();
@@ -75,7 +80,7 @@ NCLASSES = $num_classes
TARGETS = @targets
PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
-PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5
+PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
export
@@ -95,12 +100,16 @@ clean:
EOT
close TOPLEVEL;
+print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
+exit 0;
+
sub make_stage {
my ($stage, $direction, $prev_stage) = @_;
my $stage_dir = "$align_dir/$stage-$direction";
my $first = $direction;
$first =~ s/^(.+)-.*$/$1/;
mkdir $stage_dir;
+ my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
open CDEC, ">$stage_dir/cdec.ini" or die;
print CDEC <<EOT;
formalism=lexcrf
@@ -108,10 +117,11 @@ intersection_strategy=full
grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
EOT
if ($stage =~ /relpos/) {
- print CDEC "feature_function=RelativeSentencePosition\n";
+ print CDEC "$RELPOS\n";
} elsif ($stage =~ /markov/) {
- print CDEC "feature_function=RelativeSentencePosition\n";
+ print CDEC "$RELPOS\n";
print CDEC "feature_function=MarkovJump\n";
+ print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
}
close CDEC;