diff options
Diffstat (limited to 'word-aligner')
| -rwxr-xr-x | word-aligner/aligner.pl | 157 | ||||
| -rw-r--r-- | word-aligner/makefiles/makefile.grammars | 70 | ||||
| -rw-r--r-- | word-aligner/ortho-norm/README | 2 | ||||
| -rwxr-xr-x | word-aligner/ortho-norm/ar.pl | 32 | ||||
| -rwxr-xr-x | word-aligner/ortho-norm/en.pl | 11 | ||||
| -rwxr-xr-x | word-aligner/ortho-norm/fr.pl | 22 | ||||
| -rwxr-xr-x | word-aligner/ortho-norm/ur.pl | 34 | ||||
| -rwxr-xr-x | word-aligner/support/classify.pl | 27 | ||||
| -rwxr-xr-x | word-aligner/support/extract_grammar.pl | 11 | ||||
| -rwxr-xr-x | word-aligner/support/extract_vocab.pl | 20 | ||||
| -rwxr-xr-x | word-aligner/support/extract_weights.pl | 17 | ||||
| -rwxr-xr-x | word-aligner/support/invert_grammar.pl | 8 | ||||
| -rwxr-xr-x | word-aligner/support/make_lex_grammar.pl | 435 | ||||
| -rwxr-xr-x | word-aligner/support/merge_corpus.pl | 18 | ||||
| -rwxr-xr-x | word-aligner/support/supplement_weights_file.pl | 73 | 
15 files changed, 937 insertions, 0 deletions
| diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl new file mode 100755 index 00000000..7eec0e42 --- /dev/null +++ b/word-aligner/aligner.pl @@ -0,0 +1,157 @@ +#!/usr/bin/perl -w +use strict; + +my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; } +use Getopt::Long; +my $training_dir = "$SCRIPT_DIR/../training"; +die "Can't find training dir: $training_dir" unless -d $training_dir; + +my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls'; +my $num_classes = 50; +my $nodes = 40; +my $pmem = "2500mb"; +my $DECODER = "cdec"; +GetOptions("cdec=s" => \$DECODER, +           "jobs=i" => \$nodes, +           "pmem=s" => \$pmem, +           "mkcls=s" => \$mkcls, +          ) or usage(); +usage() unless (scalar @ARGV == 1); +die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls; +die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls; + +my $in_file = shift @ARGV; +die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/); +my $f_lang = $1; +my $e_lang = $2; + +print STDERR "Source language: $f_lang\n"; +print STDERR "Target language: $e_lang\n"; +print STDERR " Using mkcls in: $mkcls\n\n"; +die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl"; +die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl"; + +my @stages = qw(nopos relpos markov); +my @directions = qw(f-e e-f); + +my $corpus = 'c'; + +my $cwd = getcwd(); +my $align_dir = "$cwd/talign"; + +mkdir $align_dir; +mkdir "$align_dir/grammars"; +open IN, "<$in_file" or die "Can't read $in_file: $!"; +open E, ">$align_dir/grammars/corpus.e" or die "Can't write: $!"; +open F, ">$align_dir/grammars/corpus.f" or die "Can't write: $!"; +while(<IN>) { +  chomp; +  my ($f, $e) = split / \|\|\| /; +  die "Bad format, excepted ||| separated line" unless defined $f && defined $e; +  print F "$f\n"; +  print E "$e\n"; +} +close F; +close E; +close IN; +`cp $SCRIPT_DIR/makefiles/makefile.grammars $align_dir/grammars/Makefile`; +die unless $? == 0; + +my @targets = qw(grammars); + +for my $direction (@directions) { +  my $prev_stage = undef; +  for my $stage (@stages) { +    push @targets, "$stage-$direction"; +    make_stage($stage, $direction, $prev_stage); +    $prev_stage = $stage; +  } +} + +open TOPLEVEL, ">$align_dir/Makefile" or die "Can't write $align_dir/Makefile: $!"; + +print TOPLEVEL <<EOT; +E_LANG = $e_lang +F_LANG = $f_lang +SCRIPT_DIR = $SCRIPT_DIR +TRAINING_DIR = $training_dir +MKCLS = $mkcls +NCLASSES = $num_classes + +TARGETS = @targets +PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary +PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15 + +export + +all: +	\@failcom='exit 1'; \\ +	list='\$(TARGETS)'; for subdir in \$\$list; do \\ +	echo "Making \$\$subdir ..."; \\ +	(cd \$\$subdir && \$(MAKE)) || eval \$\$failcom; \\ +	done + +clean: +	\@failcom='exit 1'; \\ +	list='\$(TARGETS)'; for subdir in \$\$list; do \\ +	echo "Making \$\$subdir ..."; \\ +	(cd \$\$subdir && \$(MAKE) clean) || eval \$\$failcom; \\ +	done +EOT +close TOPLEVEL; + +print STDERR "Created alignment task. chdir to talign/ then type make.\n\n"; +exit 0; + +sub make_stage { +  my ($stage, $direction, $prev_stage) = @_; +  my $stage_dir = "$align_dir/$stage-$direction"; +  my $first = $direction; +  $first =~ s/^(.+)-.*$/$1/; +  mkdir $stage_dir; +  my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n"; +  open CDEC, ">$stage_dir/cdec.ini" or die; +  print CDEC <<EOT; +formalism=lexcrf +intersection_strategy=full +grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz +EOT +  if ($stage =~ /relpos/) { +    print CDEC "$RELPOS\n"; +  } elsif ($stage =~ /markov/) { +    print CDEC "$RELPOS\n"; +    print CDEC "feature_function=MarkovJump\n"; +    print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n"; +    print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n"; +  } +  close CDEC; + +  my $init_weights = "weights.init.gz: ../grammars/weights.init.gz\n\tcp \$< \$\@\n"; +  if ($prev_stage) { +    $init_weights = "weights.init.gz: ../$prev_stage-$direction/weights.final.gz\n\tcp \$< \$\@\n"; +  } + +  open MAKE, ">$stage_dir/Makefile" or die; +  print MAKE <<EOT; +all: weights.final.gz + +clean: +	\$(RM) -r ptrain weights.init.gz weights.final.gz + +$init_weights + +weights.final.gz: weights.init.gz cdec.ini +	\$(PTRAIN) \$(PTRAIN_PARAMS) cdec.ini ../grammars/corpus.$direction weights.init.gz +	cp ptrain/weights.final.gz weights.final.gz +	\$(RM) -r ptrain +EOT +  close MAKE; +} + +sub usage { +  die <<EOT; + +Usage: $0 [OPTIONS] training_corpus.fr-en + +EOT +} diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars new file mode 100644 index 00000000..b89937c1 --- /dev/null +++ b/word-aligner/makefiles/makefile.grammars @@ -0,0 +1,70 @@ +all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f weights.init.gz + +clean: +	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar* + +SUPPORT_DIR = $(SCRIPT_DIR)/support +GZIP = /usr/bin/gzip +ZCAT = zcat +EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl +EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl +SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl +EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl +ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl +ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl +CLASSIFY = $(SUPPORT_DIR)/classify.pl +MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl +MODEL1 = $(TRAINING_DIR)/model1 +MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl + +orthonorm-dict.e: corpus.e +	$(EXTRACT_VOCAB) corpus.e > e.voc +	$(ORTHONORM_E) < e.voc > e.ortho-voc +	$(MERGE_CORPUS) e.voc e.ortho-voc > $@ + +orthonorm-dict.f: corpus.f +	$(EXTRACT_VOCAB) corpus.f > f.voc +	$(ORTHONORM_F) < f.voc > f.ortho-voc +	$(MERGE_CORPUS) f.voc f.ortho-voc > $@ + +voc2class.e: corpus.e $(MKCLS) +	$(MKCLS) -c$(NCLASSES) -n10 -pcorpus.e -Vvoc2class.e opt + +voc2class.f: corpus.f $(MKCLS) +	$(MKCLS) -c$(NCLASSES) -n10 -pcorpus.f -Vvoc2class.f opt + +corpus.class.e: corpus.e voc2class.e $(CLASSIFY) +	$(CLASSIFY) voc2class.e corpus.e > $@ + +corpus.class.f: corpus.f voc2class.f $(CLASSIFY) +	$(CLASSIFY) voc2class.f corpus.f > $@ + +corpus.f-e: corpus.f corpus.e $(MERGE_CORPUS) +	$(MERGE_CORPUS) corpus.f corpus.e > $@ + +corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS) +	$(MERGE_CORPUS) corpus.e corpus.f > $@ + +corpus.f-e.model1: corpus.f-e $(MODEL1) +	$(MODEL1) corpus.f-e > corpus.f-e.model1 + +corpus.e-f.model1: corpus.e-f $(MODEL1) +	$(MODEL1) corpus.e-f > corpus.e-f.model1 + +bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f +	$(RM) $@ +	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars + +corpus.f-e.lex-grammar.gz: bidir.grammars +	$(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz + +corpus.e-f.lex-grammar.gz: bidir.grammars +	$(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz + +weights.init.gz: bidir.grammars voc2class.f voc2class.e +	$(EXTRACT_WEIGHTS) bidir.grammars > weights.init +	$(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup +	$(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup +	sort -u weights.dup >> weights.init +	$(GZIP) -9 weights.init + diff --git a/word-aligner/ortho-norm/README b/word-aligner/ortho-norm/README new file mode 100644 index 00000000..7071798a --- /dev/null +++ b/word-aligner/ortho-norm/README @@ -0,0 +1,2 @@ +The normalizations in this directory are supposed to create forms +that are roughly comparable by string comparison between languages. diff --git a/word-aligner/ortho-norm/ar.pl b/word-aligner/ortho-norm/ar.pl new file mode 100755 index 00000000..e8bd521a --- /dev/null +++ b/word-aligner/ortho-norm/ar.pl @@ -0,0 +1,32 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +while(<STDIN>) { +  chomp; +  my $len = length($_); +  if ($len > 1 && !($_ =~ /\d/)) { +    s/\$/sh/g; +  } +  s/([a-z])\~/$1$1/g; +  s/E/'/g; +  s/^Aw/o/g; +  s/\|/a/g; +  s/@/h/g; +  s/c/ch/g; +  s/x/kh/g; +  s/\*/dh/g; +  s/w/o/g; +  s/v/th/g; +  if ($len > 1) { s/}/'/g; } +  s/Z/dh/g; +  s/y/i/g; +  s/Y/a/g; +  if ($len > 1) { s/p$//; } +  $_ = lc $_; +  print "$_\n"; +} + diff --git a/word-aligner/ortho-norm/en.pl b/word-aligner/ortho-norm/en.pl new file mode 100755 index 00000000..b167803e --- /dev/null +++ b/word-aligner/ortho-norm/en.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +while(<STDIN>) { +  $_ = lc $_; +  s/ al-/ al/g; +  s/^al-/al/; +  print; +} + diff --git a/word-aligner/ortho-norm/fr.pl b/word-aligner/ortho-norm/fr.pl new file mode 100755 index 00000000..5592ab05 --- /dev/null +++ b/word-aligner/ortho-norm/fr.pl @@ -0,0 +1,22 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +while(<STDIN>) { +  $_ = lc $_; +  # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French +  s/â/as/g; +  s/ê/es/g; +  s/î/is/g; +  s/ô/os/g; +  s/û/us/g; + +  s/ç/c/g; +  s/é|è/e/g; +  s/á/a/g; +  print; +} + diff --git a/word-aligner/ortho-norm/ur.pl b/word-aligner/ortho-norm/ur.pl new file mode 100755 index 00000000..34953dd3 --- /dev/null +++ b/word-aligner/ortho-norm/ur.pl @@ -0,0 +1,34 @@ +#!/usr/bin/perl -w +use strict; +use utf8; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +while(<STDIN>) { +  chomp; +  my @out = (); +  my @words = split /\s+/; +  for my $of (@words) { +    if (length($of) > 1 && !($of =~ /\d/)) { +      $of =~ s/\$/sh/g; +    } +    $of =~ s/([a-z])\~/$1$1/g; +    $of =~ s/E/'/g; +    $of =~ s/^Aw/o/; +    $of =~ s/\|/a/g; +    $of =~ s/@/h/g; +    $of =~ s/c/ch/g; +    $of =~ s/x/kh/g; +    $of =~ s/\*/dh/g; +    $of =~ s/p$/a/; +    $of =~ s/w/o/g; +    $of =~ s/Z/dh/g; +    $of =~ s/y/i/g; +    $of =~ s/Y/a/g; +    $of = lc $of; +    push @out, $of; +  } +  print "@out\n"; +} + diff --git a/word-aligner/support/classify.pl b/word-aligner/support/classify.pl new file mode 100755 index 00000000..893c7b22 --- /dev/null +++ b/word-aligner/support/classify.pl @@ -0,0 +1,27 @@ +#!/usr/bin/perl -w +use strict; + +die "Usage: $0 classes.txt corpus.txt" unless scalar @ARGV == 2; + +my ($class, $text) = @ARGV; +open C, "<$class" or die "Can't read $class: $!"; +open T, "<$text" or die "Can't read $text: $!"; + +my %dict = (); +my $cc = 0; +while(<C>) { +  chomp; +  my ($word, $cat) = split /\s+/; +  die "'$word' '$cat'" unless (defined $word && defined $cat); +  $dict{$word} = $cat; +  $cc++; +} +close C; +print STDERR "Loaded classes for $cc words\n"; + +while(<T>) { +  chomp; +  my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/; +  print "@cats\n"; +} + diff --git a/word-aligner/support/extract_grammar.pl b/word-aligner/support/extract_grammar.pl new file mode 100755 index 00000000..d7275ef5 --- /dev/null +++ b/word-aligner/support/extract_grammar.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; + +my $key = shift @ARGV; +die "Usage: $0 KEY\n" unless defined $key; + +while(<>) { +  my ($k, @rest) = split / \|\|\| /; +  print join(' ||| ', @rest) if ($k eq $key); +} + diff --git a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl new file mode 100755 index 00000000..070d4202 --- /dev/null +++ b/word-aligner/support/extract_vocab.pl @@ -0,0 +1,20 @@ +#!/usr/bin/perl -w +use strict; + +print STDERR "Extracting vocabulary...\n"; +my %dict = (); +my $wc = 0; +while(<>) { +  chomp; +  my @words = split /\s+/; +  for my $word (@words) { $wc++; $dict{$word}++; } +} + +my $tc = 0; +for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) { +  print "$word\n"; +  $tc++; +} + +print STDERR "$tc types / $wc tokens\n"; + diff --git a/word-aligner/support/extract_weights.pl b/word-aligner/support/extract_weights.pl new file mode 100755 index 00000000..dfedd12e --- /dev/null +++ b/word-aligner/support/extract_weights.pl @@ -0,0 +1,17 @@ +#!/usr/bin/perl -w +use strict; +my %dict=(); +while(<>) { +  chomp; +  my ($dummy, $a, $b, $wts) = split / \|\|\| /; +  my @weights = split /\s+/, $wts; +  for my $w (@weights) { +    my ($name, $val) = split /=/, $w; +    unless ($dict{$name}) { +      my $r = (0.5 - rand) / 5; +      $r = sprintf ("%0.4f", $r); +      print "$name $r\n"; +      $dict{$name}= 1; +    } +  } +} diff --git a/word-aligner/support/invert_grammar.pl b/word-aligner/support/invert_grammar.pl new file mode 100755 index 00000000..3988388d --- /dev/null +++ b/word-aligner/support/invert_grammar.pl @@ -0,0 +1,8 @@ +#!/usr/bin/perl -w +use strict; + +while(<>) { +  my ($f, $e, $scores) = split / \|\|\| /; +  print "$e ||| $f ||| $scores"; +} + diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl new file mode 100755 index 00000000..c3e29540 --- /dev/null +++ b/word-aligner/support/make_lex_grammar.pl @@ -0,0 +1,435 @@ +#!/usr/bin/perl -w +use utf8; +use strict; + +my $LIMIT_SIZE=30; + +my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV; +die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f; + +my %eclass = (); +my %fclass = (); +load_classes($class_e, \%eclass); +load_classes($class_f, \%fclass); + +our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14); +die unless scalar @IDENT_BINS == 15; +our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS; + +our %cache; +open EF, "<$effile" or die; +open M1, "<$model1" or die; +open IM1, "<$imodel1" or die; +binmode(EF,":utf8"); +binmode(M1,":utf8"); +binmode(IM1,":utf8"); +binmode(STDOUT,":utf8"); +my %model1; +print STDERR "Reading model1...\n"; +my %sizes = (); +while(<M1>) { +  chomp; +  my ($f, $e, $lp) = split /\s+/; +  $model1{$f}->{$e} = 1; +  $sizes{$f}++; +} +close M1; + +my $inv_add = 0; +my %invm1; +print STDERR "Reading inverse model1...\n"; +my %esizes=(); +while(<IM1>) { +  chomp; +  my ($e, $f, $lp) = split /\s+/; +  $invm1{$e}->{$f} = 1; +  $esizes{$e}++; +  if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) { +    $model1{$f}->{$e} = 1; +    $sizes{$f}++; +    $inv_add++; +  } +} +close IM1; +print STDERR "Added $inv_add from inverse model1\n"; + +open M1, "<$model1" or die; +binmode(M1,":utf8"); +my $dir_add = 0; +print STDERR "Reading model1 (again) for extra inverse translations...\n"; +while(<M1>) { +  chomp; +  my ($f, $e, $lp) = split /\s+/; +  if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) { +    $invm1{$e}->{$f} = 1; +    $esizes{$e}++; +    $dir_add++; +  } +} +close M1; +print STDERR "Added $dir_add from model 1\n"; +print STDERR "Generating grammars...\n"; +open OE, "<$orthoe" or die; +binmode(OE,":utf8"); +my %oe_dict; +while(<OE>) { +  chomp; +  my ($a, $b) = split / \|\|\| /, $_; +  die "BAD: $_" unless defined $a && defined $b; +  $oe_dict{$a} = $b; +} +close OE; +open OF, "<$orthof" or die; +binmode(OF,":utf8"); +my %of_dict; +while(<OF>) { +  chomp; +  my ($a, $b) = split / \|\|\| /, $_; +  die "BAD: $_" unless defined $a && defined $b; +  $of_dict{$a} = $b; +} +close OF; +$of_dict{'<eps>'} = '<eps>'; +$oe_dict{'<eps>'} = '<eps>'; + +my $MIN_FEATURE_COUNT = 0; +my $ADD_PREFIX_ID = 0; +my $ADD_CLASS_CLASS = 1; +my $ADD_LEN = 1; +my $ADD_SIM = 1; +my $ADD_DICE = 1; +my $ADD_111 = 1; +my $ADD_ID = 1; +my $ADD_PUNC = 1; +my $ADD_NULL = 0; +my $ADD_STEM_ID = 1; +my $ADD_SYM = 0; +my $BEAM_RATIO = 50; +my $BIN_ORTHO = 1; +my $BIN_DLEN = 1; +my $BIN_IDENT = 1; +my $BIN_DICE = 1; + +my %fdict; +my %fcounts; +my %ecounts; + +my %sdict; + +while(<EF>) { +  chomp; +  my ($f, $e) = split /\s*\|\|\|\s*/; +  my @es = split /\s+/, $e; +  my @fs = split /\s+/, $f; +  for my $ew (@es){ +    die "E: Empty word" if $ew eq ''; +    $ecounts{$ew}++; +  } +  push @fs, '<eps>' if $ADD_NULL; +  for my $fw (@fs){ +    die "F: Empty word" if $fw eq ''; +    $fcounts{$fw}++; +  } +  for my $fw (@fs){ +    for my $ew (@es){ +      $fdict{$fw}->{$ew}++; +    } +  } +} + +my $specials = 0; +my $fc = 1000000; +my $sids = 1000000; +for my $f (sort keys %fdict) { +  my $re = $fdict{$f}; +  my $max; +  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) { +    my $efcount = $re->{$e}; +    unless (defined $max) { $max = $efcount; } +    my $m1 = $model1{$f}->{$e}; +    my $im1 = $invm1{$e}->{$f}; +    my $is_good_pair = (defined $m1); +    my $is_inv_good_pair = (defined $im1); +    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f}); +    my @feats; +    if ($efcount > $MIN_FEATURE_COUNT) { +      $fc++; +      push @feats, "F$fc=1"; +    } +    if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; } +    my $oe = $oe_dict{$e}; +    die "Can't find orthonorm form for $e" unless defined $oe; +    my $of = $of_dict{$f}; +    die "Can't find orthonorm form for $f" unless defined $of; +    my $len_e = length($oe); +    my $len_f = length($of); +    if ($ADD_DICE) { +      if ($BIN_DICE) { +        push @feats, dicebin($dice) . '=1'; +      } else { +        push @feats, "Dice=$dice"; +      } +    } +    if ($ADD_CLASS_CLASS) { +      my $ce = $eclass{$e} or die "E- no class for: '$e'"; +      my $cf = $fclass{$f} or die "F- no class for: '$f'"; +      push @feats, "C${cf}_${ce}=1"; +    } +    my $is_null = undef; +    if ($ADD_NULL && $f eq '<eps>') { +      push @feats, "IsNull=1"; +      $is_null = 1; +    } +    if ($ADD_LEN) { +      if (!$is_null) { +        my $dlen = abs($len_e - $len_f); +        if ($BIN_DLEN) { +          push @feats, dlenbin($dlen) . '=1'; +        } else { +          push @feats, "DLen=$dlen"; +        } +      } +    } +    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3)); +    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3)); +    my $both_non_numeric = (!$e_num && !$f_num); +    if ($ADD_STEM_ID) { +      my $el = 4; +      my $fl = 4; +      if ($oe =~ /^al|re|co/) { $el++; } +      if ($of =~ /^al|re|co/) { $fl++; } +      if ($oe =~ /^trans|inter/) { $el+=2; } +      if ($of =~ /^trans|inter/) { $fl+=2; } +      if ($fl > length($of)) { $fl = length($of); } +      if ($el > length($oe)) { $el = length($oe); } +      my $sf = substr $of, 0, $fl; +      my $se = substr $oe, 0, $el; +      my $id = $sdict{$sf}->{$se}; +      if (!$id) { +        $sids++; +	$sdict{$sf}->{$se} = $sids; +	$id = $sids; +      } +      push @feats, "S$id=1"; +    } +    if ($ADD_PREFIX_ID) { +      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) {  +        my $pe = substr $oe, 0, 3; +        my $pf = substr $of, 0, 3; +        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; } +      } +    } +    if ($ADD_SIM) { +      my $ld = 0; +      my $eff = $len_e; +      if ($eff < $len_f) { $eff = $len_f; } +      if (!$is_null) { +        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff); +      } +      if ($ld > 1.5) { $is_good_pair = 1; } +      if ($BIN_ORTHO) { +        push @feats, orthobin($ld) . '=1'; +      } else { +        push @feats, "OrthoSim=$ld"; +      } +    } +    my $ident = ($e eq $f); +    if ($ident) { $is_good_pair = 1; } +    if ($ident && $ADD_ID) { +      if ($BIN_IDENT) { +        push @feats, identbin($len_e) . '=1'; +      } else { +        push @feats, "Identical=$len_e"; +      } +    } +    if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) { +      $is_good_pair = 1; +      if ($ADD_111) { +        push @feats, "OneOneOne=1"; +      } +    } +    if ($ADD_PUNC) { +      if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) { +        push @feats, "PuncMiss=1"; +      } +    } +    my $is_special = ($is_good_pair && !(defined $m1)); +    $specials++ if $is_special; +    print STDERR "$f -> $e\n" if $is_special; +    print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair; +    print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair; +  } +} +print STDERR "Added $specials special rules that were not in the M1 set\n"; + + +sub levenshtein +{ +    # $s1 and $s2 are the two strings +    # $len1 and $len2 are their respective lengths +    # +    my ($s1, $s2) = @_; +    my ($len1, $len2) = (length $s1, length $s2); + +    # If one of the strings is empty, the distance is the length +    # of the other string +    # +    return $len2 if ($len1 == 0); +    return $len1 if ($len2 == 0); + +    my %mat; + +    # Init the distance matrix +    # +    # The first row to 0..$len1 +    # The first column to 0..$len2 +    # The rest to 0 +    # +    # The first row and column are initialized so to denote distance +    # from the empty string +    # +    for (my $i = 0; $i <= $len1; ++$i) +    { +        for (my $j = 0; $j <= $len2; ++$j) +        { +            $mat{$i}{$j} = 0; +            $mat{0}{$j} = $j; +        } + +        $mat{$i}{0} = $i; +    } + +    # Some char-by-char processing is ahead, so prepare +    # array of chars from the strings +    # +    my @ar1 = split(//, $s1); +    my @ar2 = split(//, $s2); + +    for (my $i = 1; $i <= $len1; ++$i) +    { +        for (my $j = 1; $j <= $len2; ++$j) +        { +            # Set the cost to 1 iff the ith char of $s1 +            # equals the jth of $s2 +            #  +            # Denotes a substitution cost. When the char are equal +            # there is no need to substitute, so the cost is 0 +            # +            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1; + +            # Cell $mat{$i}{$j} equals the minimum of: +            # +            # - The cell immediately above plus 1 +            # - The cell immediately to the left plus 1 +            # - The cell diagonally above and to the left plus the cost +            # +            # We can either insert a new char, delete a char or +            # substitute an existing char (with an associated cost) +            # +            $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1, +                                $mat{$i}{$j-1} + 1, +                                $mat{$i-1}{$j-1} + $cost]); +        } +    } + +    # Finally, the Levenshtein distance equals the rightmost bottom cell +    # of the matrix +    # +    # Note that $mat{$x}{$y} denotes the distance between the substrings +    # 1..$x and 1..$y +    # +    return $mat{$len1}{$len2}; +} + + +# minimal element of a list +# +sub min +{ +    my @list = @{$_[0]}; +    my $min = $list[0]; + +    foreach my $i (@list) +    { +        $min = $i if ($i < $min); +    } + +    return $min; +} + +sub load_classes { +  my ($file, $ref) = @_; +  print STDERR "Reading classes from $file...\n"; +  open F, "<$file" or die "Can't read $file: $!"; +  binmode(F, ":utf8") or die; +  while(<F>) { +    chomp; +    my ($word, $class) = split /\s+/; +#    print STDERR "'$word' -> $class\n"; +    $ref->{$word} = $class; +  } +  close F; +} + +sub dicebin { +  my $x = shift; +  if ($x < 0.05) { return 'DiceLT005'; } +  elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; } +  elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; } +  elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; } +  elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; } +  elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; } +  elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; } +  elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; } +  elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; } +  elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; } +  elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; } +  elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; } +  elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; } +  elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; } +  elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; } +  elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; } +  elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; } +  elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; } +  elsif ($x >= 2.3) { return 'DiceGT23'; } +} + +sub orthobin { +  my $x = shift; +  if ($x < 0.9) { return 'OrthoLT09'; } +  elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; } +  elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; } +  elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; } +  elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; } +  elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; } +  elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; } +  elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; } +  elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; } +  elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; } +  elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; } +  elsif ($x >= 2.9) { return 'OrthoGT29'; } +} + +sub dlenbin { +  my $x = shift; +  if ($x == 0) { return 'DLen0'; } +  elsif ($x == 1) { return 'DLen1'; } +  elsif ($x == 2) { return 'DLen2'; } +  elsif ($x == 3) { return 'DLen3'; } +  elsif ($x == 4) { return 'DLen4'; } +  elsif ($x == 5) { return 'DLen5'; } +  elsif ($x == 6) { return 'DLen6'; } +  elsif ($x == 7) { return 'DLen7'; } +  elsif ($x == 8) { return 'DLen8'; } +  elsif ($x == 9) { return 'DLen9'; } +  elsif ($x >= 10) { return 'DLenGT10'; } +} + +sub identbin { +  my $x = shift; +  if ($x == 0) { die; } +  if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; } +  return $IDENT_BINS[$x]; +} + + diff --git a/word-aligner/support/merge_corpus.pl b/word-aligner/support/merge_corpus.pl new file mode 100755 index 00000000..02827903 --- /dev/null +++ b/word-aligner/support/merge_corpus.pl @@ -0,0 +1,18 @@ +#!/usr/bin/perl -w +use strict; +die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2; + +my ($a, $b) = @ARGV; +open A, "<$a" or die "Can't read $a: $!"; +open B, "<$b" or die "Can't read $a: $!"; + +while(<A>) { +  chomp; +  my $e = <B>; +  die "Mismatched lines in $a and $b!" unless defined $e; +  print "$_ ||| $e"; +} + +my $e = <B>; +die "Mismatched lines in $a and $b!" unless !defined $e; + diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl new file mode 100755 index 00000000..06876043 --- /dev/null +++ b/word-aligner/support/supplement_weights_file.pl @@ -0,0 +1,73 @@ +#!/usr/bin/perl -w +use strict; + +my $ADD_FCLASS_JUMP = 1; +my $ADD_MODEL2_BINARY = 0; +my $ADD_FC_RELPOS = 1; + +my ($f_classes) = @ARGV; + +die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes; + +print <<EOT; +MarkovJump 0 +RelativeSentencePosition 0 +EOT + +# !	8 +# "	11 +# 's	18 + +my %dcats = (); +$dcats{'BOS'} = 1; +$dcats{'EOS'} = 1; + +open FC, "<$f_classes" or die; +while(<FC>) { +  chomp; +  my ($x, $cat) = split /\s+/; +  $dcats{$cat} = 1; +} + +my @cats = sort keys %dcats; + +my $added = 0; +for (my $i=0; $i < scalar @cats; $i++) { +  my $c1 = $cats[$i]; +  for (my $j=0; $j < scalar @cats; $j++) { +    my $c2 = $cats[$j]; +    print "SP:${c1}_${c2} 0\n"; +    $added++; +  } +} + +for (my $ss=1; $ss < 100; $ss++) { +  if ($ADD_FCLASS_JUMP) { +    for (my $i=0; $i < scalar @cats; $i++) { +      my $cat = $cats[$i]; +      for (my $j = -$ss; $j <= $ss; $j++) { +        print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n"; +        $added++; +      } +    } +  } +  if ($ADD_MODEL2_BINARY) { +    # M2_FL:8_SI:3_TI:2=1 +    for (my $i = 0; $i < $ss; $i++) { +      for (my $j = 0; $j < 100; $j++) { +        print "M2FL:${ss}:TI:${j}_SI:${i} 0\n"; +        $added++; +      } +    } +  } +} +if ($ADD_FC_RELPOS) { +  #RelPos_FC:11 +  for (my $i=0; $i < scalar @cats; $i++) { +    my $cat = $cats[$i]; +    print "RelPos_FC:$cat 0\n"; +    $added++; +  } +} + +print STDERR "Added $added weights\n"; | 
