diff options
author | Chris Dyer <redpony@gmail.com> | 2010-02-01 17:38:39 -0500 |
---|---|---|
committer | Chris Dyer <redpony@gmail.com> | 2010-02-01 17:38:39 -0500 |
commit | c97b8a8b58f7385fb48b74e2cf1ea9610cd1202f (patch) | |
tree | 3bc1b02c39927a810862136534d5a0e35d7ed4fc /word-aligner | |
parent | da222df300e4f87ad185a7decbf119ad56aa34e0 (diff) |
word aligner cleanup, new features
Diffstat (limited to 'word-aligner')
-rwxr-xr-x | word-aligner/aligner.pl | 20 | ||||
-rw-r--r-- | word-aligner/makefiles/makefile.grammars | 15 | ||||
-rwxr-xr-x | word-aligner/supplement_weights_file.pl | 37 | ||||
-rwxr-xr-x | word-aligner/support/classify.pl (renamed from word-aligner/classify.pl) | 0 | ||||
-rwxr-xr-x | word-aligner/support/extract_grammar.pl (renamed from word-aligner/extract_grammar.pl) | 0 | ||||
-rwxr-xr-x | word-aligner/support/extract_vocab.pl (renamed from word-aligner/extract_vocab.pl) | 0 | ||||
-rwxr-xr-x | word-aligner/support/extract_weights.pl (renamed from word-aligner/extract_weights.pl) | 0 | ||||
-rwxr-xr-x | word-aligner/support/invert_grammar.pl (renamed from word-aligner/invert_grammar.pl) | 0 | ||||
-rwxr-xr-x | word-aligner/support/make_lex_grammar.pl (renamed from word-aligner/make_lex_grammar.pl) | 0 | ||||
-rwxr-xr-x | word-aligner/support/merge_corpus.pl (renamed from word-aligner/merge_corpus.pl) | 0 | ||||
-rwxr-xr-x | word-aligner/support/supplement_weights_file.pl | 73 |
11 files changed, 96 insertions, 49 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl index d203fc53..7eec0e42 100755 --- a/word-aligner/aligner.pl +++ b/word-aligner/aligner.pl @@ -6,15 +6,20 @@ use Getopt::Long; my $training_dir = "$SCRIPT_DIR/../training"; die "Can't find training dir: $training_dir" unless -d $training_dir; +my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $num_classes = 50; my $nodes = 40; my $pmem = "2500mb"; my $DECODER = "cdec"; GetOptions("cdec=s" => \$DECODER, "jobs=i" => \$nodes, - "pmem=s" => \$pmem + "pmem=s" => \$pmem, + "mkcls=s" => \$mkcls, ) or usage(); usage() unless (scalar @ARGV == 1); +die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; +die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; + my $in_file = shift @ARGV; die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); my $f_lang = $1; @@ -22,13 +27,13 @@ my $e_lang = $2; print STDERR "Source language: $f_lang\n"; print STDERR "Target language: $e_lang\n"; +print STDERR " Using mkcls in: $mkcls\n\n"; die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; my @stages = qw(nopos relpos markov); my @directions = qw(f-e e-f); -my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; my $corpus = 'c'; my $cwd = getcwd(); @@ -75,7 +80,7 @@ NCLASSES = $num_classes TARGETS = @targets PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary -PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 5 +PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15 export @@ -95,12 +100,16 @@ clean: EOT close TOPLEVEL; +print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; +exit 0; + sub make_stage { my ($stage, $direction, $prev_stage) = @_; my $stage_dir = "$align_dir/$stage-$direction"; my $first = $direction; $first =~ s/^(.+)-.*$/$1/; mkdir $stage_dir; + my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; open CDEC, ">$stage_dir/cdec.ini" or die; print CDEC <<EOT; formalism=lexcrf @@ -108,10 +117,11 @@ intersection_strategy=full grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz EOT if ($stage =~ /relpos/) { - print CDEC "feature_function=RelativeSentencePosition\n"; + print CDEC "$RELPOS\n"; } elsif ($stage =~ /markov/) { - print CDEC "feature_function=RelativeSentencePosition\n"; + print CDEC "$RELPOS\n"; print CDEC "feature_function=MarkovJump\n"; + print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n"; print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n"; } close CDEC; diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars index a6167010..b89937c1 100644 --- a/word-aligner/makefiles/makefile.grammars +++ b/word-aligner/makefiles/makefile.grammars @@ -3,18 +3,19 @@ all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.c clean: $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* +SUPPORT_DIR = $(SCRIPT_DIR)/support GZIP = /usr/bin/gzip ZCAT = zcat -EXTRACT_WEIGHTS = $(SCRIPT_DIR)/extract_weights.pl -EXTRACT_GRAMMAR = $(SCRIPT_DIR)/extract_grammar.pl -SUPPLEMENT_WEIGHTS = $(SCRIPT_DIR)/supplement_weights_file.pl -EXTRACT_VOCAB = $(SCRIPT_DIR)/extract_vocab.pl +EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl +EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl +SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl +EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl -CLASSIFY = $(SCRIPT_DIR)/classify.pl -MAKE_LEX_GRAMMAR = $(SCRIPT_DIR)/make_lex_grammar.pl +CLASSIFY = $(SUPPORT_DIR)/classify.pl +MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl MODEL1 = $(TRAINING_DIR)/model1 -MERGE_CORPUS = $(SCRIPT_DIR)/merge_corpus.pl +MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl orthonorm-dict.e: corpus.e $(EXTRACT_VOCAB) corpus.e > e.voc diff --git a/word-aligner/supplement_weights_file.pl b/word-aligner/supplement_weights_file.pl deleted file mode 100755 index 76f668e2..00000000 --- a/word-aligner/supplement_weights_file.pl +++ /dev/null @@ -1,37 +0,0 @@ -#!/usr/bin/perl -w -use strict; - -my ($f_classes) = @ARGV; - -die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; - -print <<EOT; -MarkovJump 0 -RelativeSentencePosition 0 -EOT - -# ! 8 -# " 11 -# 's 18 - -my %dcats = (); -$dcats{'BOS'} = 1; -$dcats{'EOS'} = 1; - -open FC, "<$f_classes" or die; -while(<FC>) { - chomp; - my ($x, $cat) = split /\s+/; - $dcats{$cat} = 1; -} - -my @cats = sort keys %dcats; - -for (my $i=0; $i < scalar @cats; $i++) { - my $c1 = $cats[$i]; - for (my $j=0; $j < scalar @cats; $j++) { - my $c2 = $cats[$j]; - print "SP:${c1}_${c2} 0\n"; - } -} - diff --git a/word-aligner/classify.pl b/word-aligner/support/classify.pl index 893c7b22..893c7b22 100755 --- a/word-aligner/classify.pl +++ b/word-aligner/support/classify.pl diff --git a/word-aligner/extract_grammar.pl b/word-aligner/support/extract_grammar.pl index d7275ef5..d7275ef5 100755 --- a/word-aligner/extract_grammar.pl +++ b/word-aligner/support/extract_grammar.pl diff --git a/word-aligner/extract_vocab.pl b/word-aligner/support/extract_vocab.pl index 070d4202..070d4202 100755 --- a/word-aligner/extract_vocab.pl +++ b/word-aligner/support/extract_vocab.pl diff --git a/word-aligner/extract_weights.pl b/word-aligner/support/extract_weights.pl index dfedd12e..dfedd12e 100755 --- a/word-aligner/extract_weights.pl +++ b/word-aligner/support/extract_weights.pl diff --git a/word-aligner/invert_grammar.pl b/word-aligner/support/invert_grammar.pl index 3988388d..3988388d 100755 --- a/word-aligner/invert_grammar.pl +++ b/word-aligner/support/invert_grammar.pl diff --git a/word-aligner/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl index bdb2752c..bdb2752c 100755 --- a/word-aligner/make_lex_grammar.pl +++ b/word-aligner/support/make_lex_grammar.pl diff --git a/word-aligner/merge_corpus.pl b/word-aligner/support/merge_corpus.pl index 02827903..02827903 100755 --- a/word-aligner/merge_corpus.pl +++ b/word-aligner/support/merge_corpus.pl diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl new file mode 100755 index 00000000..7f804b90 --- /dev/null +++ b/word-aligner/support/supplement_weights_file.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w +use strict; + +my $ADD_FCLASS_JUMP = 1; +my $ADD_MODEL2_BINARY = 0; +my $ADD_FC_RELPOS = 1; + +my ($f_classes) = @ARGV; + +die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; + +print <<EOT; +MarkovJump 0 +RelativeSentencePosition 0 +EOT + +# ! 8 +# " 11 +# 's 18 + +my %dcats = (); +$dcats{'BOS'} = 1; +$dcats{'EOS'} = 1; + +open FC, "<$f_classes" or die; +while(<FC>) { + chomp; + my ($x, $cat) = split /\s+/; + $dcats{$cat} = 1; +} + +my @cats = sort keys %dcats; + +my $added = 0; +for (my $i=0; $i < scalar @cats; $i++) { + my $c1 = $cats[$i]; + for (my $j=0; $j < scalar @cats; $j++) { + my $c2 = $cats[$j]; + print "SP:${c1}_${c2} 0\n"; + $added++; + } +} + +for (my $ss=1; $ss < 100; $ss++) { + if ($ADD_FCLASS_JUMP) { + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + for (my $j = -$ss; $j <= $ss; $j++) { + print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n"; + $added++; + } + } + } + if ($ADD_MODEL2_BINARY) { + # M2_FL:8_SI:3_TI:2=1 + for (my $i = 0; $i < $ss; $i++) { + for (my $j = 0; $j < 100; $j++) { + print "M2_FL:${ss}_SI:${i}_TI:${j} 0\n"; + $added++; + } + } + } +} +if ($ADD_FC_RELPOS) { + #RelPos_FC:11 + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + print "RelPos_FC:$cat 0\n"; + $added++; + } +} + +print STDERR "Added $added weights\n"; |