diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
commit | 0172721855098ca02b207231a654dffa5e4eb1c9 (patch) | |
tree | 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /word-aligner | |
parent | 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff) |
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'word-aligner')
-rwxr-xr-x | word-aligner/aligner.pl | 157 | ||||
-rw-r--r-- | word-aligner/makefiles/makefile.grammars | 70 | ||||
-rw-r--r-- | word-aligner/ortho-norm/README | 2 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/ar.pl | 32 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/en.pl | 11 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/fr.pl | 22 | ||||
-rwxr-xr-x | word-aligner/ortho-norm/ur.pl | 34 | ||||
-rwxr-xr-x | word-aligner/support/classify.pl | 27 | ||||
-rwxr-xr-x | word-aligner/support/extract_grammar.pl | 11 | ||||
-rwxr-xr-x | word-aligner/support/extract_vocab.pl | 20 | ||||
-rwxr-xr-x | word-aligner/support/extract_weights.pl | 17 | ||||
-rwxr-xr-x | word-aligner/support/invert_grammar.pl | 8 | ||||
-rwxr-xr-x | word-aligner/support/make_lex_grammar.pl | 435 | ||||
-rwxr-xr-x | word-aligner/support/merge_corpus.pl | 18 | ||||
-rwxr-xr-x | word-aligner/support/supplement_weights_file.pl | 73 |
15 files changed, 937 insertions, 0 deletions
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl new file mode 100755 index 00000000..7eec0e42 --- /dev/null +++ b/word-aligner/aligner.pl @@ -0,0 +1,157 @@ +#!/usr/bin/perl -w +use strict; + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +use Getopt::Long; +my $training_dir = "$SCRIPT_DIR/../training"; +die "Can't find training dir: $training_dir" unless -d $training_dir; + +my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; +my $num_classes = 50; +my $nodes = 40; +my $pmem = "2500mb"; +my $DECODER = "cdec"; +GetOptions("cdec=s" => \$DECODER, + "jobs=i" => \$nodes, + "pmem=s" => \$pmem, + "mkcls=s" => \$mkcls, + ) or usage(); +usage() unless (scalar @ARGV == 1); +die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; +die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; + +my $in_file = shift @ARGV; +die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); +my $f_lang = $1; +my $e_lang = $2; + +print STDERR "Source language: $f_lang\n"; +print STDERR "Target language: $e_lang\n"; +print STDERR " Using mkcls in: $mkcls\n\n"; +die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; +die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; + +my @stages = qw(nopos relpos markov); +my @directions = qw(f-e e-f); + +my $corpus = 'c'; + +my $cwd = getcwd(); +my $align_dir = "$cwd/talign"; + +mkdir $align_dir; +mkdir "$align_dir/grammars"; +open IN, "<$in_file" or die "Can't read $in_file: $!"; +open E, ">$align_dir/grammars/corpus.e" or die "Can't write: $!"; +open F, ">$align_dir/grammars/corpus.f" or die "Can't write: $!"; +while(<IN>) { + chomp; + my ($f, $e) = split / \|\|\| /; + die "Bad format, excepted ||| separated line" unless defined $f && defined $e; + print F "$f\n"; + print E "$e\n"; +} +close F; +close E; +close IN; +`cp $SCRIPT_DIR/makefiles/makefile.grammars $align_dir/grammars/Makefile`; +die unless $? == 0; + +my @targets = qw(grammars); + +for my $direction (@directions) { + my $prev_stage = undef; + for my $stage (@stages) { + push @targets, "$stage-$direction"; + make_stage($stage, $direction, $prev_stage); + $prev_stage = $stage; + } +} + +open TOPLEVEL, ">$align_dir/Makefile" or die "Can't write $align_dir/Makefile: $!"; + +print TOPLEVEL <<EOT; +E_LANG = $e_lang +F_LANG = $f_lang +SCRIPT_DIR = $SCRIPT_DIR +TRAINING_DIR = $training_dir +MKCLS = $mkcls +NCLASSES = $num_classes + +TARGETS = @targets +PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary +PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15 + +export + +all: + \@failcom='exit 1'; \\ + list='\$(TARGETS)'; for subdir in \$\$list; do \\ + echo "Making \$\$subdir ..."; \\ + (cd \$\$subdir && \$(MAKE)) || eval \$\$failcom; \\ + done + +clean: + \@failcom='exit 1'; \\ + list='\$(TARGETS)'; for subdir in \$\$list; do \\ + echo "Making \$\$subdir ..."; \\ + (cd \$\$subdir && \$(MAKE) clean) || eval \$\$failcom; \\ + done +EOT +close TOPLEVEL; + +print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; +exit 0; + +sub make_stage { + my ($stage, $direction, $prev_stage) = @_; + my $stage_dir = "$align_dir/$stage-$direction"; + my $first = $direction; + $first =~ s/^(.+)-.*$/$1/; + mkdir $stage_dir; + my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; + open CDEC, ">$stage_dir/cdec.ini" or die; + print CDEC <<EOT; +formalism=lexcrf +intersection_strategy=full +grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz +EOT + if ($stage =~ /relpos/) { + print CDEC "$RELPOS\n"; + } elsif ($stage =~ /markov/) { + print CDEC "$RELPOS\n"; + print CDEC "feature_function=MarkovJump\n"; + print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n"; + print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n"; + } + close CDEC; + + my $init_weights = "weights.init.gz: ../grammars/weights.init.gz\n\tcp \$< \$\@\n"; + if ($prev_stage) { + $init_weights = "weights.init.gz: ../$prev_stage-$direction/weights.final.gz\n\tcp \$< \$\@\n"; + } + + open MAKE, ">$stage_dir/Makefile" or die; + print MAKE <<EOT; +all: weights.final.gz + +clean: + \$(RM) -r ptrain weights.init.gz weights.final.gz + +$init_weights + +weights.final.gz: weights.init.gz cdec.ini + \$(PTRAIN) \$(PTRAIN_PARAMS) cdec.ini ../grammars/corpus.$direction weights.init.gz + cp ptrain/weights.final.gz weights.final.gz + \$(RM) -r ptrain +EOT + close MAKE; +} + +sub usage { + die <<EOT; + +Usage: $0 [OPTIONS] training_corpus.fr-en + +EOT +} diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars new file mode 100644 index 00000000..b89937c1 --- /dev/null +++ b/word-aligner/makefiles/makefile.grammars @@ -0,0 +1,70 @@ +all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f weights.init.gz + +clean: + $(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* + +SUPPORT_DIR = $(SCRIPT_DIR)/support +GZIP = /usr/bin/gzip +ZCAT = zcat +EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl +EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl +SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl +EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl +ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl +ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl +CLASSIFY = $(SUPPORT_DIR)/classify.pl +MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl +MODEL1 = $(TRAINING_DIR)/model1 +MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl + +orthonorm-dict.e: corpus.e + $(EXTRACT_VOCAB) corpus.e > e.voc + $(ORTHONORM_E) < e.voc > e.ortho-voc + $(MERGE_CORPUS) e.voc e.ortho-voc > $@ + +orthonorm-dict.f: corpus.f + $(EXTRACT_VOCAB) corpus.f > f.voc + $(ORTHONORM_F) < f.voc > f.ortho-voc + $(MERGE_CORPUS) f.voc f.ortho-voc > $@ + +voc2class.e: corpus.e $(MKCLS) + $(MKCLS) -c$(NCLASSES) -n10 -pcorpus.e -Vvoc2class.e opt + +voc2class.f: corpus.f $(MKCLS) + $(MKCLS) -c$(NCLASSES) -n10 -pcorpus.f -Vvoc2class.f opt + +corpus.class.e: corpus.e voc2class.e $(CLASSIFY) + $(CLASSIFY) voc2class.e corpus.e > $@ + +corpus.class.f: corpus.f voc2class.f $(CLASSIFY) + $(CLASSIFY) voc2class.f corpus.f > $@ + +corpus.f-e: corpus.f corpus.e $(MERGE_CORPUS) + $(MERGE_CORPUS) corpus.f corpus.e > $@ + +corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS) + $(MERGE_CORPUS) corpus.e corpus.f > $@ + +corpus.f-e.model1: corpus.f-e $(MODEL1) + $(MODEL1) corpus.f-e > corpus.f-e.model1 + +corpus.e-f.model1: corpus.e-f $(MODEL1) + $(MODEL1) corpus.e-f > corpus.e-f.model1 + +bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f + $(RM) $@ + $(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars + +corpus.f-e.lex-grammar.gz: bidir.grammars + $(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz + +corpus.e-f.lex-grammar.gz: bidir.grammars + $(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz + +weights.init.gz: bidir.grammars voc2class.f voc2class.e + $(EXTRACT_WEIGHTS) bidir.grammars > weights.init + $(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup + $(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup + sort -u weights.dup >> weights.init + $(GZIP) -9 weights.init + diff --git a/word-aligner/ortho-norm/README b/word-aligner/ortho-norm/README new file mode 100644 index 00000000..7071798a --- /dev/null +++ b/word-aligner/ortho-norm/README @@ -0,0 +1,2 @@ +The normalizations in this directory are supposed to create forms +that are roughly comparable by string comparison between languages. diff --git a/word-aligner/ortho-norm/ar.pl b/word-aligner/ortho-norm/ar.pl new file mode 100755 index 00000000..e8bd521a --- /dev/null +++ b/word-aligner/ortho-norm/ar.pl @@ -0,0 +1,32 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +while(<STDIN>) { + chomp; + my $len = length($_); + if ($len > 1 && !($_ =~ /\d/)) { + s/\$/sh/g; + } + s/([a-z])\~/$1$1/g; + s/E/'/g; + s/^Aw/o/g; + s/\|/a/g; + s/@/h/g; + s/c/ch/g; + s/x/kh/g; + s/\*/dh/g; + s/w/o/g; + s/v/th/g; + if ($len > 1) { s/}/'/g; } + s/Z/dh/g; + s/y/i/g; + s/Y/a/g; + if ($len > 1) { s/p$//; } + $_ = lc $_; + print "$_\n"; +} + diff --git a/word-aligner/ortho-norm/en.pl b/word-aligner/ortho-norm/en.pl new file mode 100755 index 00000000..b167803e --- /dev/null +++ b/word-aligner/ortho-norm/en.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +while(<STDIN>) { + $_ = lc $_; + s/ al-/ al/g; + s/^al-/al/; + print; +} + diff --git a/word-aligner/ortho-norm/fr.pl b/word-aligner/ortho-norm/fr.pl new file mode 100755 index 00000000..5592ab05 --- /dev/null +++ b/word-aligner/ortho-norm/fr.pl @@ -0,0 +1,22 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +while(<STDIN>) { + $_ = lc $_; + # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French + s/â/as/g; + s/ê/es/g; + s/î/is/g; + s/ô/os/g; + s/û/us/g; + + s/ç/c/g; + s/é|è/e/g; + s/á/a/g; + print; +} + diff --git a/word-aligner/ortho-norm/ur.pl b/word-aligner/ortho-norm/ur.pl new file mode 100755 index 00000000..34953dd3 --- /dev/null +++ b/word-aligner/ortho-norm/ur.pl @@ -0,0 +1,34 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +while(<STDIN>) { + chomp; + my @out = (); + my @words = split /\s+/; + for my $of (@words) { + if (length($of) > 1 && !($of =~ /\d/)) { + $of =~ s/\$/sh/g; + } + $of =~ s/([a-z])\~/$1$1/g; + $of =~ s/E/'/g; + $of =~ s/^Aw/o/; + $of =~ s/\|/a/g; + $of =~ s/@/h/g; + $of =~ s/c/ch/g; + $of =~ s/x/kh/g; + $of =~ s/\*/dh/g; + $of =~ s/p$/a/; + $of =~ s/w/o/g; + $of =~ s/Z/dh/g; + $of =~ s/y/i/g; + $of =~ s/Y/a/g; + $of = lc $of; + push @out, $of; + } + print "@out\n"; +} + diff --git a/word-aligner/support/classify.pl b/word-aligner/support/classify.pl new file mode 100755 index 00000000..893c7b22 --- /dev/null +++ b/word-aligner/support/classify.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 classes.txt corpus.txt" unless scalar @ARGV == 2; + +my ($class, $text) = @ARGV; +open C, "<$class" or die "Can't read $class: $!"; +open T, "<$text" or die "Can't read $text: $!"; + +my %dict = (); +my $cc = 0; +while(<C>) { + chomp; + my ($word, $cat) = split /\s+/; + die "'$word' '$cat'" unless (defined $word && defined $cat); + $dict{$word} = $cat; + $cc++; +} +close C; +print STDERR "Loaded classes for $cc words\n"; + +while(<T>) { + chomp; + my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/; + print "@cats\n"; +} + diff --git a/word-aligner/support/extract_grammar.pl b/word-aligner/support/extract_grammar.pl new file mode 100755 index 00000000..d7275ef5 --- /dev/null +++ b/word-aligner/support/extract_grammar.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; + +my $key = shift @ARGV; +die "Usage: $0 KEY\n" unless defined $key; + +while(<>) { + my ($k, @rest) = split / \|\|\| /; + print join(' ||| ', @rest) if ($k eq $key); +} + diff --git a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl new file mode 100755 index 00000000..070d4202 --- /dev/null +++ b/word-aligner/support/extract_vocab.pl @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w +use strict; + +print STDERR "Extracting vocabulary...\n"; +my %dict = (); +my $wc = 0; +while(<>) { + chomp; + my @words = split /\s+/; + for my $word (@words) { $wc++; $dict{$word}++; } +} + +my $tc = 0; +for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) { + print "$word\n"; + $tc++; +} + +print STDERR "$tc types / $wc tokens\n"; + diff --git a/word-aligner/support/extract_weights.pl b/word-aligner/support/extract_weights.pl new file mode 100755 index 00000000..dfedd12e --- /dev/null +++ b/word-aligner/support/extract_weights.pl @@ -0,0 +1,17 @@ +#!/usr/bin/perl -w +use strict; +my %dict=(); +while(<>) { + chomp; + my ($dummy, $a, $b, $wts) = split / \|\|\| /; + my @weights = split /\s+/, $wts; + for my $w (@weights) { + my ($name, $val) = split /=/, $w; + unless ($dict{$name}) { + my $r = (0.5 - rand) / 5; + $r = sprintf ("%0.4f", $r); + print "$name $r\n"; + $dict{$name}= 1; + } + } +} diff --git a/word-aligner/support/invert_grammar.pl b/word-aligner/support/invert_grammar.pl new file mode 100755 index 00000000..3988388d --- /dev/null +++ b/word-aligner/support/invert_grammar.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { + my ($f, $e, $scores) = split / \|\|\| /; + print "$e ||| $f ||| $scores"; +} + diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl new file mode 100755 index 00000000..c3e29540 --- /dev/null +++ b/word-aligner/support/make_lex_grammar.pl @@ -0,0 +1,435 @@ +#!/usr/bin/perl -w +use utf8; +use strict; + +my $LIMIT_SIZE=30; + +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; + +my %eclass = (); +my %fclass = (); +load_classes($class_e, \%eclass); +load_classes($class_f, \%fclass); + +our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14); +die unless scalar @IDENT_BINS == 15; +our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS; + +our %cache; +open EF, "<$effile" or die; +open M1, "<$model1" or die; +open IM1, "<$imodel1" or die; +binmode(EF,":utf8"); +binmode(M1,":utf8"); +binmode(IM1,":utf8"); +binmode(STDOUT,":utf8"); +my %model1; +print STDERR "Reading model1...\n"; +my %sizes = (); +while(<M1>) { + chomp; + my ($f, $e, $lp) = split /\s+/; + $model1{$f}->{$e} = 1; + $sizes{$f}++; +} +close M1; + +my $inv_add = 0; +my %invm1; +print STDERR "Reading inverse model1...\n"; +my %esizes=(); +while(<IM1>) { + chomp; + my ($e, $f, $lp) = split /\s+/; + $invm1{$e}->{$f} = 1; + $esizes{$e}++; + if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) { + $model1{$f}->{$e} = 1; + $sizes{$f}++; + $inv_add++; + } +} +close IM1; +print STDERR "Added $inv_add from inverse model1\n"; + +open M1, "<$model1" or die; +binmode(M1,":utf8"); +my $dir_add = 0; +print STDERR "Reading model1 (again) for extra inverse translations...\n"; +while(<M1>) { + chomp; + my ($f, $e, $lp) = split /\s+/; + if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) { + $invm1{$e}->{$f} = 1; + $esizes{$e}++; + $dir_add++; + } +} +close M1; +print STDERR "Added $dir_add from model 1\n"; +print STDERR "Generating grammars...\n"; +open OE, "<$orthoe" or die; +binmode(OE,":utf8"); +my %oe_dict; +while(<OE>) { + chomp; + my ($a, $b) = split / \|\|\| /, $_; + die "BAD: $_" unless defined $a && defined $b; + $oe_dict{$a} = $b; +} +close OE; +open OF, "<$orthof" or die; +binmode(OF,":utf8"); +my %of_dict; +while(<OF>) { + chomp; + my ($a, $b) = split / \|\|\| /, $_; + die "BAD: $_" unless defined $a && defined $b; + $of_dict{$a} = $b; +} +close OF; +$of_dict{'<eps>'} = '<eps>'; +$oe_dict{'<eps>'} = '<eps>'; + +my $MIN_FEATURE_COUNT = 0; +my $ADD_PREFIX_ID = 0; +my $ADD_CLASS_CLASS = 1; +my $ADD_LEN = 1; +my $ADD_SIM = 1; +my $ADD_DICE = 1; +my $ADD_111 = 1; +my $ADD_ID = 1; +my $ADD_PUNC = 1; +my $ADD_NULL = 0; +my $ADD_STEM_ID = 1; +my $ADD_SYM = 0; +my $BEAM_RATIO = 50; +my $BIN_ORTHO = 1; +my $BIN_DLEN = 1; +my $BIN_IDENT = 1; +my $BIN_DICE = 1; + +my %fdict; +my %fcounts; +my %ecounts; + +my %sdict; + +while(<EF>) { + chomp; + my ($f, $e) = split /\s*\|\|\|\s*/; + my @es = split /\s+/, $e; + my @fs = split /\s+/, $f; + for my $ew (@es){ + die "E: Empty word" if $ew eq ''; + $ecounts{$ew}++; + } + push @fs, '<eps>' if $ADD_NULL; + for my $fw (@fs){ + die "F: Empty word" if $fw eq ''; + $fcounts{$fw}++; + } + for my $fw (@fs){ + for my $ew (@es){ + $fdict{$fw}->{$ew}++; + } + } +} + +my $specials = 0; +my $fc = 1000000; +my $sids = 1000000; +for my $f (sort keys %fdict) { + my $re = $fdict{$f}; + my $max; + for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { + my $efcount = $re->{$e}; + unless (defined $max) { $max = $efcount; } + my $m1 = $model1{$f}->{$e}; + my $im1 = $invm1{$e}->{$f}; + my $is_good_pair = (defined $m1); + my $is_inv_good_pair = (defined $im1); + my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); + my @feats; + if ($efcount > $MIN_FEATURE_COUNT) { + $fc++; + push @feats, "F$fc=1"; + } + if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; } + my $oe = $oe_dict{$e}; + die "Can't find orthonorm form for $e" unless defined $oe; + my $of = $of_dict{$f}; + die "Can't find orthonorm form for $f" unless defined $of; + my $len_e = length($oe); + my $len_f = length($of); + if ($ADD_DICE) { + if ($BIN_DICE) { + push @feats, dicebin($dice) . '=1'; + } else { + push @feats, "Dice=$dice"; + } + } + if ($ADD_CLASS_CLASS) { + my $ce = $eclass{$e} or die "E- no class for: '$e'"; + my $cf = $fclass{$f} or die "F- no class for: '$f'"; + push @feats, "C${cf}_${ce}=1"; + } + my $is_null = undef; + if ($ADD_NULL && $f eq '<eps>') { + push @feats, "IsNull=1"; + $is_null = 1; + } + if ($ADD_LEN) { + if (!$is_null) { + my $dlen = abs($len_e - $len_f); + if ($BIN_DLEN) { + push @feats, dlenbin($dlen) . '=1'; + } else { + push @feats, "DLen=$dlen"; + } + } + } + my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); + my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); + my $both_non_numeric = (!$e_num && !$f_num); + if ($ADD_STEM_ID) { + my $el = 4; + my $fl = 4; + if ($oe =~ /^al|re|co/) { $el++; } + if ($of =~ /^al|re|co/) { $fl++; } + if ($oe =~ /^trans|inter/) { $el+=2; } + if ($of =~ /^trans|inter/) { $fl+=2; } + if ($fl > length($of)) { $fl = length($of); } + if ($el > length($oe)) { $el = length($oe); } + my $sf = substr $of, 0, $fl; + my $se = substr $oe, 0, $el; + my $id = $sdict{$sf}->{$se}; + if (!$id) { + $sids++; + $sdict{$sf}->{$se} = $sids; + $id = $sids; + } + push @feats, "S$id=1"; + } + if ($ADD_PREFIX_ID) { + if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { + my $pe = substr $oe, 0, 3; + my $pf = substr $of, 0, 3; + if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } + } + } + if ($ADD_SIM) { + my $ld = 0; + my $eff = $len_e; + if ($eff < $len_f) { $eff = $len_f; } + if (!$is_null) { + $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); + } + if ($ld > 1.5) { $is_good_pair = 1; } + if ($BIN_ORTHO) { + push @feats, orthobin($ld) . '=1'; + } else { + push @feats, "OrthoSim=$ld"; + } + } + my $ident = ($e eq $f); + if ($ident) { $is_good_pair = 1; } + if ($ident && $ADD_ID) { + if ($BIN_IDENT) { + push @feats, identbin($len_e) . '=1'; + } else { + push @feats, "Identical=$len_e"; + } + } + if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { + $is_good_pair = 1; + if ($ADD_111) { + push @feats, "OneOneOne=1"; + } + } + if ($ADD_PUNC) { + if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { + push @feats, "PuncMiss=1"; + } + } + my $is_special = ($is_good_pair && !(defined $m1)); + $specials++ if $is_special; + print STDERR "$f -> $e\n" if $is_special; + print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair; + print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair; + } +} +print STDERR "Added $specials special rules that were not in the M1 set\n"; + + +sub levenshtein +{ + # $s1 and $s2 are the two strings + # $len1 and $len2 are their respective lengths + # + my ($s1, $s2) = @_; + my ($len1, $len2) = (length $s1, length $s2); + + # If one of the strings is empty, the distance is the length + # of the other string + # + return $len2 if ($len1 == 0); + return $len1 if ($len2 == 0); + + my %mat; + + # Init the distance matrix + # + # The first row to 0..$len1 + # The first column to 0..$len2 + # The rest to 0 + # + # The first row and column are initialized so to denote distance + # from the empty string + # + for (my $i = 0; $i <= $len1; ++$i) + { + for (my $j = 0; $j <= $len2; ++$j) + { + $mat{$i}{$j} = 0; + $mat{0}{$j} = $j; + } + + $mat{$i}{0} = $i; + } + + # Some char-by-char processing is ahead, so prepare + # array of chars from the strings + # + my @ar1 = split(//, $s1); + my @ar2 = split(//, $s2); + + for (my $i = 1; $i <= $len1; ++$i) + { + for (my $j = 1; $j <= $len2; ++$j) + { + # Set the cost to 1 iff the ith char of $s1 + # equals the jth of $s2 + # + # Denotes a substitution cost. When the char are equal + # there is no need to substitute, so the cost is 0 + # + my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; + + # Cell $mat{$i}{$j} equals the minimum of: + # + # - The cell immediately above plus 1 + # - The cell immediately to the left plus 1 + # - The cell diagonally above and to the left plus the cost + # + # We can either insert a new char, delete a char or + # substitute an existing char (with an associated cost) + # + $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, + $mat{$i}{$j-1} + 1, + $mat{$i-1}{$j-1} + $cost]); + } + } + + # Finally, the Levenshtein distance equals the rightmost bottom cell + # of the matrix + # + # Note that $mat{$x}{$y} denotes the distance between the substrings + # 1..$x and 1..$y + # + return $mat{$len1}{$len2}; +} + + +# minimal element of a list +# +sub min +{ + my @list = @{$_[0]}; + my $min = $list[0]; + + foreach my $i (@list) + { + $min = $i if ($i < $min); + } + + return $min; +} + +sub load_classes { + my ($file, $ref) = @_; + print STDERR "Reading classes from $file...\n"; + open F, "<$file" or die "Can't read $file: $!"; + binmode(F, ":utf8") or die; + while(<F>) { + chomp; + my ($word, $class) = split /\s+/; +# print STDERR "'$word' -> $class\n"; + $ref->{$word} = $class; + } + close F; +} + +sub dicebin { + my $x = shift; + if ($x < 0.05) { return 'DiceLT005'; } + elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; } + elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; } + elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; } + elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; } + elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; } + elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; } + elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; } + elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; } + elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; } + elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; } + elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; } + elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; } + elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; } + elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; } + elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; } + elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; } + elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; } + elsif ($x >= 2.3) { return 'DiceGT23'; } +} + +sub orthobin { + my $x = shift; + if ($x < 0.9) { return 'OrthoLT09'; } + elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; } + elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; } + elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; } + elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; } + elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; } + elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; } + elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; } + elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; } + elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; } + elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; } + elsif ($x >= 2.9) { return 'OrthoGT29'; } +} + +sub dlenbin { + my $x = shift; + if ($x == 0) { return 'DLen0'; } + elsif ($x == 1) { return 'DLen1'; } + elsif ($x == 2) { return 'DLen2'; } + elsif ($x == 3) { return 'DLen3'; } + elsif ($x == 4) { return 'DLen4'; } + elsif ($x == 5) { return 'DLen5'; } + elsif ($x == 6) { return 'DLen6'; } + elsif ($x == 7) { return 'DLen7'; } + elsif ($x == 8) { return 'DLen8'; } + elsif ($x == 9) { return 'DLen9'; } + elsif ($x >= 10) { return 'DLenGT10'; } +} + +sub identbin { + my $x = shift; + if ($x == 0) { die; } + if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; } + return $IDENT_BINS[$x]; +} + + diff --git a/word-aligner/support/merge_corpus.pl b/word-aligner/support/merge_corpus.pl new file mode 100755 index 00000000..02827903 --- /dev/null +++ b/word-aligner/support/merge_corpus.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; +die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2; + +my ($a, $b) = @ARGV; +open A, "<$a" or die "Can't read $a: $!"; +open B, "<$b" or die "Can't read $a: $!"; + +while(<A>) { + chomp; + my $e = <B>; + die "Mismatched lines in $a and $b!" unless defined $e; + print "$_ ||| $e"; +} + +my $e = <B>; +die "Mismatched lines in $a and $b!" unless !defined $e; + diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl new file mode 100755 index 00000000..06876043 --- /dev/null +++ b/word-aligner/support/supplement_weights_file.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w +use strict; + +my $ADD_FCLASS_JUMP = 1; +my $ADD_MODEL2_BINARY = 0; +my $ADD_FC_RELPOS = 1; + +my ($f_classes) = @ARGV; + +die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; + +print <<EOT; +MarkovJump 0 +RelativeSentencePosition 0 +EOT + +# ! 8 +# " 11 +# 's 18 + +my %dcats = (); +$dcats{'BOS'} = 1; +$dcats{'EOS'} = 1; + +open FC, "<$f_classes" or die; +while(<FC>) { + chomp; + my ($x, $cat) = split /\s+/; + $dcats{$cat} = 1; +} + +my @cats = sort keys %dcats; + +my $added = 0; +for (my $i=0; $i < scalar @cats; $i++) { + my $c1 = $cats[$i]; + for (my $j=0; $j < scalar @cats; $j++) { + my $c2 = $cats[$j]; + print "SP:${c1}_${c2} 0\n"; + $added++; + } +} + +for (my $ss=1; $ss < 100; $ss++) { + if ($ADD_FCLASS_JUMP) { + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + for (my $j = -$ss; $j <= $ss; $j++) { + print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n"; + $added++; + } + } + } + if ($ADD_MODEL2_BINARY) { + # M2_FL:8_SI:3_TI:2=1 + for (my $i = 0; $i < $ss; $i++) { + for (my $j = 0; $j < 100; $j++) { + print "M2FL:${ss}:TI:${j}_SI:${i} 0\n"; + $added++; + } + } + } +} +if ($ADD_FC_RELPOS) { + #RelPos_FC:11 + for (my $i=0; $i < scalar @cats; $i++) { + my $cat = $cats[$i]; + print "RelPos_FC:$cat 0\n"; + $added++; + } +} + +print STDERR "Added $added weights\n"; |