summary | refs | log | tree | commit | diff
path: root/word-aligner
diff options
context:
space:
mode:
author Chris Dyer <redpony@gmail.com> 2010-02-01 17:38:39 -0500
committer Chris Dyer <redpony@gmail.com> 2010-02-01 17:38:39 -0500
commit c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch)
tree 3bc1b02c39927a810862136534d5a0e35d7ed4fc /word-aligner
parent da222df300e4f87ad185a7decbf119ad56aa34e0 (diff)
word aligner cleanup, new features
Diffstat (limited to 'word-aligner')
-rwxr-xr-x word-aligner/aligner.pl 20
-rw-r--r-- word-aligner/makefiles/makefile.grammars 15
-rwxr-xr-x word-aligner/supplement_weights_file.pl 37
-rwxr-xr-x word-aligner/support/classify.pl (renamed from word-aligner/classify.pl) 0
-rwxr-xr-x word-aligner/support/extract_grammar.pl (renamed from word-aligner/extract_grammar.pl) 0
-rwxr-xr-x word-aligner/support/extract_vocab.pl (renamed from word-aligner/extract_vocab.pl) 0
-rwxr-xr-x word-aligner/support/extract_weights.pl (renamed from word-aligner/extract_weights.pl) 0
-rwxr-xr-x word-aligner/support/invert_grammar.pl (renamed from word-aligner/invert_grammar.pl) 0
-rwxr-xr-x word-aligner/support/make_lex_grammar.pl (renamed from word-aligner/make_lex_grammar.pl) 0
-rwxr-xr-x word-aligner/support/merge_corpus.pl (renamed from word-aligner/merge_corpus.pl) 0
-rwxr-xr-x word-aligner/support/supplement_weights_file.pl 73
11 files changed, 96 insertions, 49 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
index d203fc53..7eec0e42 100755
--- a/word-aligner/aligner.pl
+++ b/word-aligner/aligner.pl
@@ -6,15 +6,20 @@ use Getopt::Long;
my $training_dir = "$SCRIPT_DIR/../training";
die "Can't find training dir: $training_dir" unless -d $training_dir;
+my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
my $num_classes = 50;
my $nodes = 40;
my $pmem = "2500mb";
my $DECODER = "cdec";
GetOptions("cdec=s" => \$DECODER,
"jobs=i" => \$nodes,
- "pmem=s" => \$pmem
+ "pmem=s" => \$pmem,
+ "mkcls=s" => \$mkcls,
) or usage();
usage() unless (scalar @ARGV == 1);
+die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
+die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
+
my $in_file = shift @ARGV;
die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
my $f_lang = $1;
@@ -22,13 +27,13 @@ my $e_lang = $2;
print STDERR "Source language: $f_lang\n";
print STDERR "Target language: $e_lang\n";
+print STDERR " Using mkcls in: $mkcls\n\n";
die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
my @stages = qw(nopos relpos markov);
my @directions = qw(f-e e-f);
-my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
my $corpus = 'c';
my $cwd = getcwd();
@@ -75,7 +80,7 @@ NCLASSES = $num_classes
TARGETS = @targets
PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
-PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5
+PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
export
@@ -95,12 +100,16 @@ clean:
EOT
close TOPLEVEL;
+print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
+exit 0;
+
sub make_stage {
my ($stage, $direction, $prev_stage) = @_;
my $stage_dir = "$align_dir/$stage-$direction";
my $first = $direction;
$first =~ s/^(.+)-.*$/$1/;
mkdir $stage_dir;
+ my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
open CDEC, ">$stage_dir/cdec.ini" or die;
print CDEC <<EOT;
formalism=lexcrf
@@ -108,10 +117,11 @@ intersection_strategy=full
grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
EOT
if ($stage =~ /relpos/) {
- print CDEC "feature_function=RelativeSentencePosition\n";
+ print CDEC "$RELPOS\n";
} elsif ($stage =~ /markov/) {
- print CDEC "feature_function=RelativeSentencePosition\n";
+ print CDEC "$RELPOS\n";
print CDEC "feature_function=MarkovJump\n";
+ print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
}
close CDEC;
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
index a6167010..b89937c1 100644
--- a/word-aligner/makefiles/makefile.grammars
+++ b/word-aligner/makefiles/makefile.grammars
@@ -3,18 +3,19 @@ all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.c
clean:
$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*
+SUPPORT_DIR = $(SCRIPT_DIR)/support
GZIP = /usr/bin/gzip
ZCAT = zcat
-EXTRACT_WEIGHTS = $(SCRIPT_DIR)/extract_weights.pl
-EXTRACT_GRAMMAR = $(SCRIPT_DIR)/extract_grammar.pl
-SUPPLEMENT_WEIGHTS = $(SCRIPT_DIR)/supplement_weights_file.pl
-EXTRACT_VOCAB = $(SCRIPT_DIR)/extract_vocab.pl
+EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl
+EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
+SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl
+EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
-CLASSIFY = $(SCRIPT_DIR)/classify.pl
-MAKE_LEX_GRAMMAR = $(SCRIPT_DIR)/make_lex_grammar.pl
+CLASSIFY = $(SUPPORT_DIR)/classify.pl
+MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl
MODEL1 = $(TRAINING_DIR)/model1
-MERGE_CORPUS = $(SCRIPT_DIR)/merge_corpus.pl
+MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl
orthonorm-dict.e: corpus.e
$(EXTRACT_VOCAB) corpus.e > e.voc
diff --git a/word-aligner/supplement_weights_file.pl b/word-aligner/supplement_weights_file.pl
deleted file mode 100755
index 76f668e2..00000000
--- a/word-aligner/supplement_weights_file.pl
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/perl -w
-use strict;
-
-my ($f_classes) = @ARGV;
-
-die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes;
-
-print <<EOT;
-MarkovJump 0
-RelativeSentencePosition 0
-EOT
-
-# ! 8
-# " 11
-# 's 18
-
-my %dcats = ();
-$dcats{'BOS'} = 1;
-$dcats{'EOS'} = 1;
-
-open FC, "<$f_classes" or die;
-while(<FC>) {
- chomp;
- my ($x, $cat) = split /\s+/;
- $dcats{$cat} = 1;
-}
-
-my @cats = sort keys %dcats;
-
-for (my $i=0; $i < scalar @cats; $i++) {
- my $c1 = $cats[$i];
- for (my $j=0; $j < scalar @cats; $j++) {
- my $c2 = $cats[$j];
- print "SP:${c1}_${c2} 0\n";
- }
-}
-
diff --git a/word-aligner/classify.pl b/word-aligner/support/classify.pl
index 893c7b22..893c7b22 100755
--- a/word-aligner/classify.pl
+++ b/word-aligner/support/classify.pl
diff --git a/word-aligner/extract_grammar.pl b/word-aligner/support/extract_grammar.pl
index d7275ef5..d7275ef5 100755
--- a/word-aligner/extract_grammar.pl
+++ b/word-aligner/support/extract_grammar.pl
diff --git a/word-aligner/extract_vocab.pl b/word-aligner/support/extract_vocab.pl
index 070d4202..070d4202 100755
--- a/word-aligner/extract_vocab.pl
+++ b/word-aligner/support/extract_vocab.pl
diff --git a/word-aligner/extract_weights.pl b/word-aligner/support/extract_weights.pl
index dfedd12e..dfedd12e 100755
--- a/word-aligner/extract_weights.pl
+++ b/word-aligner/support/extract_weights.pl
diff --git a/word-aligner/invert_grammar.pl b/word-aligner/support/invert_grammar.pl
index 3988388d..3988388d 100755
--- a/word-aligner/invert_grammar.pl
+++ b/word-aligner/support/invert_grammar.pl
diff --git a/word-aligner/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
index bdb2752c..bdb2752c 100755
--- a/word-aligner/make_lex_grammar.pl
+++ b/word-aligner/support/make_lex_grammar.pl
diff --git a/word-aligner/merge_corpus.pl b/word-aligner/support/merge_corpus.pl
index 02827903..02827903 100755
--- a/word-aligner/merge_corpus.pl
+++ b/word-aligner/support/merge_corpus.pl
diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl
new file mode 100755
index 00000000..7f804b90
--- /dev/null
+++ b/word-aligner/support/supplement_weights_file.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $ADD_FCLASS_JUMP = 1;
+my $ADD_MODEL2_BINARY = 0;
+my $ADD_FC_RELPOS = 1;
+
+my ($f_classes) = @ARGV;
+
+die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes;
+
+print <<EOT;
+MarkovJump 0
+RelativeSentencePosition 0
+EOT
+
+# ! 8
+# " 11
+# 's 18
+
+my %dcats = ();
+$dcats{'BOS'} = 1;
+$dcats{'EOS'} = 1;
+
+open FC, "<$f_classes" or die;
+while(<FC>) {
+ chomp;
+ my ($x, $cat) = split /\s+/;
+ $dcats{$cat} = 1;
+}
+
+my @cats = sort keys %dcats;
+
+my $added = 0;
+for (my $i=0; $i < scalar @cats; $i++) {
+ my $c1 = $cats[$i];
+ for (my $j=0; $j < scalar @cats; $j++) {
+ my $c2 = $cats[$j];
+ print "SP:${c1}_${c2} 0\n";
+ $added++;
+ }
+}
+
+for (my $ss=1; $ss < 100; $ss++) {
+ if ($ADD_FCLASS_JUMP) {
+ for (my $i=0; $i < scalar @cats; $i++) {
+ my $cat = $cats[$i];
+ for (my $j = -$ss; $j <= $ss; $j++) {
+ print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n";
+ $added++;
+ }
+ }
+ }
+ if ($ADD_MODEL2_BINARY) {
+ # M2_FL:8_SI:3_TI:2=1
+ for (my $i = 0; $i < $ss; $i++) {
+ for (my $j = 0; $j < 100; $j++) {
+ print "M2_FL:${ss}_SI:${i}_TI:${j} 0\n";
+ $added++;
+ }
+ }
+ }
+}
+if ($ADD_FC_RELPOS) {
+ #RelPos_FC:11
+ for (my $i=0; $i < scalar @cats; $i++) {
+ my $cat = $cats[$i];
+ print "RelPos_FC:$cat 0\n";
+ $added++;
+ }
+}
+
+print STDERR "Added $added weights\n";