From 0172721855098ca02b207231a654dffa5e4eb1c9 Mon Sep 17 00:00:00 2001
From: redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>
Date: Tue, 22 Jun 2010 05:12:27 +0000
Subject: initial checkin

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 word-aligner/aligner.pl                         | 157 +++++++++
 word-aligner/makefiles/makefile.grammars        |  70 ++++
 word-aligner/ortho-norm/README                  |   2 +
 word-aligner/ortho-norm/ar.pl                   |  32 ++
 word-aligner/ortho-norm/en.pl                   |  11 +
 word-aligner/ortho-norm/fr.pl                   |  22 ++
 word-aligner/ortho-norm/ur.pl                   |  34 ++
 word-aligner/support/classify.pl                |  27 ++
 word-aligner/support/extract_grammar.pl         |  11 +
 word-aligner/support/extract_vocab.pl           |  20 ++
 word-aligner/support/extract_weights.pl         |  17 +
 word-aligner/support/invert_grammar.pl          |   8 +
 word-aligner/support/make_lex_grammar.pl        | 435 ++++++++++++++++++++++++
 word-aligner/support/merge_corpus.pl            |  18 +
 word-aligner/support/supplement_weights_file.pl |  73 ++++
 15 files changed, 937 insertions(+)
 create mode 100755 word-aligner/aligner.pl
 create mode 100644 word-aligner/makefiles/makefile.grammars
 create mode 100644 word-aligner/ortho-norm/README
 create mode 100755 word-aligner/ortho-norm/ar.pl
 create mode 100755 word-aligner/ortho-norm/en.pl
 create mode 100755 word-aligner/ortho-norm/fr.pl
 create mode 100755 word-aligner/ortho-norm/ur.pl
 create mode 100755 word-aligner/support/classify.pl
 create mode 100755 word-aligner/support/extract_grammar.pl
 create mode 100755 word-aligner/support/extract_vocab.pl
 create mode 100755 word-aligner/support/extract_weights.pl
 create mode 100755 word-aligner/support/invert_grammar.pl
 create mode 100755 word-aligner/support/make_lex_grammar.pl
 create mode 100755 word-aligner/support/merge_corpus.pl
 create mode 100755 word-aligner/support/supplement_weights_file.pl

(limited to 'word-aligner')
diff --git a/word-aligner/aligner.pl b/word-aligner/aligner.pl
new file mode 100755
index 00000000..7eec0e42
--- /dev/null
+++ b/word-aligner/aligner.pl
@@ -0,0 +1,157 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path getcwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+use Getopt::Long;
+my $training_dir = "$SCRIPT_DIR/../training";
+die "Can't find training dir: $training_dir" unless -d $training_dir;
+
+my $mkcls = '/Users/redpony/software/giza/giza-pp/mkcls-v2/mkcls';
+my $num_classes = 50;
+my $nodes = 40;
+my $pmem = "2500mb";
+my $DECODER = "cdec";
+GetOptions("cdec=s" => \$DECODER,
+           "jobs=i" => \$nodes,
+           "pmem=s" => \$pmem,
+           "mkcls=s" => \$mkcls,
+          ) or usage();
+usage() unless (scalar @ARGV == 1);
+die "Cannot find mkcls (specify with --mkcls=/path/to/mkcls) at $mkcls\n" unless -f $mkcls;
+die "Cannot execute mkcls at $mkcls\n" unless -x $mkcls;
+
+my $in_file = shift @ARGV;
+die "Expected format corpus.l1-l2 where l1 & l2 are two-letter abbreviations\nfor the source and target language respectively\n" unless ($in_file =~ /^.+\.([a-z][a-z])-([a-z][a-z])$/);
+my $f_lang = $1;
+my $e_lang = $2;
+
+print STDERR "Source language: $f_lang\n";
+print STDERR "Target language: $e_lang\n";
+print STDERR " Using mkcls in: $mkcls\n\n";
+die "Don't have an orthographic normalizer for $f_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$f_lang.pl";
+die "Don't have an orthographic normalizer for $e_lang\n" unless -f "$SCRIPT_DIR/ortho-norm/$e_lang.pl";
+
+my @stages = qw(nopos relpos markov);
+my @directions = qw(f-e e-f);
+
+my $corpus = 'c';
+
+my $cwd = getcwd();
+my $align_dir = "$cwd/talign";
+
+mkdir $align_dir;
+mkdir "$align_dir/grammars";
+open IN, "<$in_file" or die "Can't read $in_file: $!";
+open E, ">$align_dir/grammars/corpus.e" or die "Can't write: $!";
+open F, ">$align_dir/grammars/corpus.f" or die "Can't write: $!";
+while(<IN>) {
+  chomp;
+  my ($f, $e) = split / \|\|\| /;
+  die "Bad format, excepted ||| separated line" unless defined $f && defined $e;
+  print F "$f\n";
+  print E "$e\n";
+}
+close F;
+close E;
+close IN;
+`cp $SCRIPT_DIR/makefiles/makefile.grammars $align_dir/grammars/Makefile`;
+die unless $? == 0;
+
+my @targets = qw(grammars);
+
+for my $direction (@directions) {
+  my $prev_stage = undef;
+  for my $stage (@stages) {
+    push @targets, "$stage-$direction";
+    make_stage($stage, $direction, $prev_stage);
+    $prev_stage = $stage;
+  }
+}
+
+open TOPLEVEL, ">$align_dir/Makefile" or die "Can't write $align_dir/Makefile: $!";
+
+print TOPLEVEL <<EOT;
+E_LANG = $e_lang
+F_LANG = $f_lang
+SCRIPT_DIR = $SCRIPT_DIR
+TRAINING_DIR = $training_dir
+MKCLS = $mkcls
+NCLASSES = $num_classes
+
+TARGETS = @targets
+PTRAIN = \$(TRAINING_DIR)/cluster-ptrain.pl --restart_if_necessary
+PTRAIN_PARAMS = --gaussian_prior --sigma_squared 1.0 --max_iteration 15
+
+export
+
+all:
+	\@failcom='exit 1'; \\
+	list='\$(TARGETS)'; for subdir in \$\$list; do \\
+	echo "Making \$\$subdir ..."; \\
+	(cd \$\$subdir && \$(MAKE)) || eval \$\$failcom; \\
+	done
+
+clean:
+	\@failcom='exit 1'; \\
+	list='\$(TARGETS)'; for subdir in \$\$list; do \\
+	echo "Making \$\$subdir ..."; \\
+	(cd \$\$subdir && \$(MAKE) clean) || eval \$\$failcom; \\
+	done
+EOT
+close TOPLEVEL;
+
+print STDERR "Created alignment task. chdir to talign/ then type make.\n\n";
+exit 0;
+
+sub make_stage {
+  my ($stage, $direction, $prev_stage) = @_;
+  my $stage_dir = "$align_dir/$stage-$direction";
+  my $first = $direction;
+  $first =~ s/^(.+)-.*$/$1/;
+  mkdir $stage_dir;
+  my $RELPOS = "feature_function=RelativeSentencePosition $align_dir/grammars/corpus.class.$first\n";
+  open CDEC, ">$stage_dir/cdec.ini" or die;
+  print CDEC <<EOT;
+formalism=lexcrf
+intersection_strategy=full
+grammar=$align_dir/grammars/corpus.$direction.lex-grammar.gz
+EOT
+  if ($stage =~ /relpos/) {
+    print CDEC "$RELPOS\n";
+  } elsif ($stage =~ /markov/) {
+    print CDEC "$RELPOS\n";
+    print CDEC "feature_function=MarkovJump\n";
+    print CDEC "feature_function=MarkovJumpFClass $align_dir/grammars/corpus.class.$first\n";
+    print CDEC "feature_function=SourcePOSBigram $align_dir/grammars/corpus.class.$first\n";
+  }
+  close CDEC;
+
+  my $init_weights = "weights.init.gz: ../grammars/weights.init.gz\n\tcp \$< \$\@\n";
+  if ($prev_stage) {
+    $init_weights = "weights.init.gz: ../$prev_stage-$direction/weights.final.gz\n\tcp \$< \$\@\n";
+  }
+
+  open MAKE, ">$stage_dir/Makefile" or die;
+  print MAKE <<EOT;
+all: weights.final.gz
+
+clean:
+	\$(RM) -r ptrain weights.init.gz weights.final.gz
+
+$init_weights
+
+weights.final.gz: weights.init.gz cdec.ini
+	\$(PTRAIN) \$(PTRAIN_PARAMS) cdec.ini ../grammars/corpus.$direction weights.init.gz
+	cp ptrain/weights.final.gz weights.final.gz
+	\$(RM) -r ptrain
+EOT
+  close MAKE;
+}
+
+sub usage {
+  die <<EOT;
+
+Usage: $0 [OPTIONS] training_corpus.fr-en
+
+EOT
+}
diff --git a/word-aligner/makefiles/makefile.grammars b/word-aligner/makefiles/makefile.grammars
new file mode 100644
index 00000000..b89937c1
--- /dev/null
+++ b/word-aligner/makefiles/makefile.grammars
@@ -0,0 +1,70 @@
+all: corpus.f-e.lex-grammar.gz corpus.e-f.lex-grammar.gz corpus.class.e corpus.class.f weights.init.gz
+
+clean:
+	$(RM) orthonorm-dict.* voc2class* corpus.class.* corpus.e-f corpus.f-e weights* corpus.f-e.lex-grammar* *.model1 *voc corpus.e-f.lex-grammar*
+
+SUPPORT_DIR = $(SCRIPT_DIR)/support
+GZIP = /usr/bin/gzip
+ZCAT = zcat
+EXTRACT_WEIGHTS = $(SUPPORT_DIR)/extract_weights.pl
+EXTRACT_GRAMMAR = $(SUPPORT_DIR)/extract_grammar.pl
+SUPPLEMENT_WEIGHTS = $(SUPPORT_DIR)/supplement_weights_file.pl
+EXTRACT_VOCAB = $(SUPPORT_DIR)/extract_vocab.pl
+ORTHONORM_E = $(SCRIPT_DIR)/ortho-norm/$(E_LANG).pl
+ORTHONORM_F = $(SCRIPT_DIR)/ortho-norm/$(F_LANG).pl
+CLASSIFY = $(SUPPORT_DIR)/classify.pl
+MAKE_LEX_GRAMMAR = $(SUPPORT_DIR)/make_lex_grammar.pl
+MODEL1 = $(TRAINING_DIR)/model1
+MERGE_CORPUS = $(SUPPORT_DIR)/merge_corpus.pl
+
+orthonorm-dict.e: corpus.e
+	$(EXTRACT_VOCAB) corpus.e > e.voc
+	$(ORTHONORM_E) < e.voc > e.ortho-voc
+	$(MERGE_CORPUS) e.voc e.ortho-voc > $@
+
+orthonorm-dict.f: corpus.f
+	$(EXTRACT_VOCAB) corpus.f > f.voc
+	$(ORTHONORM_F) < f.voc > f.ortho-voc
+	$(MERGE_CORPUS) f.voc f.ortho-voc > $@
+
+voc2class.e: corpus.e $(MKCLS)
+	$(MKCLS) -c$(NCLASSES) -n10 -pcorpus.e -Vvoc2class.e opt
+
+voc2class.f: corpus.f $(MKCLS)
+	$(MKCLS) -c$(NCLASSES) -n10 -pcorpus.f -Vvoc2class.f opt
+
+corpus.class.e: corpus.e voc2class.e $(CLASSIFY)
+	$(CLASSIFY) voc2class.e corpus.e > $@
+
+corpus.class.f: corpus.f voc2class.f $(CLASSIFY)
+	$(CLASSIFY) voc2class.f corpus.f > $@
+
+corpus.f-e: corpus.f corpus.e $(MERGE_CORPUS)
+	$(MERGE_CORPUS) corpus.f corpus.e > $@
+
+corpus.e-f: corpus.f corpus.e $(MERGE_CORPUS)
+	$(MERGE_CORPUS) corpus.e corpus.f > $@
+
+corpus.f-e.model1: corpus.f-e $(MODEL1)
+	$(MODEL1) corpus.f-e > corpus.f-e.model1
+
+corpus.e-f.model1: corpus.e-f $(MODEL1)
+	$(MODEL1) corpus.e-f > corpus.e-f.model1
+
+bidir.grammars: corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f
+	$(RM) $@
+	$(MAKE_LEX_GRAMMAR) corpus.f-e corpus.f-e.model1 corpus.e-f.model1 orthonorm-dict.f orthonorm-dict.e voc2class.e voc2class.f > bidir.grammars
+
+corpus.f-e.lex-grammar.gz: bidir.grammars
+	$(EXTRACT_GRAMMAR) 1 bidir.grammars | $(GZIP) -9 > corpus.f-e.lex-grammar.gz
+
+corpus.e-f.lex-grammar.gz: bidir.grammars
+	$(EXTRACT_GRAMMAR) 2 bidir.grammars | $(GZIP) -9 > corpus.e-f.lex-grammar.gz
+
+weights.init.gz: bidir.grammars voc2class.f voc2class.e
+	$(EXTRACT_WEIGHTS) bidir.grammars > weights.init
+	$(SUPPLEMENT_WEIGHTS) voc2class.f > weights.dup
+	$(SUPPLEMENT_WEIGHTS) voc2class.e >> weights.dup
+	sort -u weights.dup >> weights.init
+	$(GZIP) -9 weights.init
+
diff --git a/word-aligner/ortho-norm/README b/word-aligner/ortho-norm/README
new file mode 100644
index 00000000..7071798a
--- /dev/null
+++ b/word-aligner/ortho-norm/README
@@ -0,0 +1,2 @@
+The normalizations in this directory are supposed to create forms
+that are roughly comparable by string comparison between languages.
diff --git a/word-aligner/ortho-norm/ar.pl b/word-aligner/ortho-norm/ar.pl
new file mode 100755
index 00000000..e8bd521a
--- /dev/null
+++ b/word-aligner/ortho-norm/ar.pl
@@ -0,0 +1,32 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+while(<STDIN>) {
+  chomp;
+  my $len = length($_);
+  if ($len > 1 && !($_ =~ /\d/)) {
+    s/\$/sh/g;
+  }
+  s/([a-z])\~/$1$1/g;
+  s/E/'/g;
+  s/^Aw/o/g;
+  s/\|/a/g;
+  s/@/h/g;
+  s/c/ch/g;
+  s/x/kh/g;
+  s/\*/dh/g;
+  s/w/o/g;
+  s/v/th/g;
+  if ($len > 1) { s/}/'/g; }
+  s/Z/dh/g;
+  s/y/i/g;
+  s/Y/a/g;
+  if ($len > 1) { s/p$//; }
+  $_ = lc $_;
+  print "$_\n";
+}
+
diff --git a/word-aligner/ortho-norm/en.pl b/word-aligner/ortho-norm/en.pl
new file mode 100755
index 00000000..b167803e
--- /dev/null
+++ b/word-aligner/ortho-norm/en.pl
@@ -0,0 +1,11 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+while(<STDIN>) {
+  $_ = lc $_;
+  s/ al-/ al/g;
+  s/^al-/al/;
+  print;
+}
+
diff --git a/word-aligner/ortho-norm/fr.pl b/word-aligner/ortho-norm/fr.pl
new file mode 100755
index 00000000..5592ab05
--- /dev/null
+++ b/word-aligner/ortho-norm/fr.pl
@@ -0,0 +1,22 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+while(<STDIN>) {
+  $_ = lc $_;
+  # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
+  s/â/as/g;
+  s/ê/es/g;
+  s/î/is/g;
+  s/ô/os/g;
+  s/û/us/g;
+
+  s/ç/c/g;
+  s/é|è/e/g;
+  s/á/a/g;
+  print;
+}
+
diff --git a/word-aligner/ortho-norm/ur.pl b/word-aligner/ortho-norm/ur.pl
new file mode 100755
index 00000000..34953dd3
--- /dev/null
+++ b/word-aligner/ortho-norm/ur.pl
@@ -0,0 +1,34 @@
+#!/usr/bin/perl -w
+use strict;
+use utf8;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+while(<STDIN>) {
+  chomp;
+  my @out = ();
+  my @words = split /\s+/;
+  for my $of (@words) {
+    if (length($of) > 1 && !($of =~ /\d/)) {
+      $of =~ s/\$/sh/g;
+    }
+    $of =~ s/([a-z])\~/$1$1/g;
+    $of =~ s/E/'/g;
+    $of =~ s/^Aw/o/;
+    $of =~ s/\|/a/g;
+    $of =~ s/@/h/g;
+    $of =~ s/c/ch/g;
+    $of =~ s/x/kh/g;
+    $of =~ s/\*/dh/g;
+    $of =~ s/p$/a/;
+    $of =~ s/w/o/g;
+    $of =~ s/Z/dh/g;
+    $of =~ s/y/i/g;
+    $of =~ s/Y/a/g;
+    $of = lc $of;
+    push @out, $of;
+  }
+  print "@out\n";
+}
+
diff --git a/word-aligner/support/classify.pl b/word-aligner/support/classify.pl
new file mode 100755
index 00000000..893c7b22
--- /dev/null
+++ b/word-aligner/support/classify.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 classes.txt corpus.txt" unless scalar @ARGV == 2;
+
+my ($class, $text) = @ARGV;
+open C, "<$class" or die "Can't read $class: $!";
+open T, "<$text" or die "Can't read $text: $!";
+
+my %dict = ();
+my $cc = 0;
+while(<C>) {
+  chomp;
+  my ($word, $cat) = split /\s+/;
+  die "'$word' '$cat'" unless (defined $word && defined $cat);
+  $dict{$word} = $cat;
+  $cc++;
+}
+close C;
+print STDERR "Loaded classes for $cc words\n";
+
+while(<T>) {
+  chomp;
+  my @cats = map { $dict{$_} or die "Undefined class for $_"; } split /\s+/;
+  print "@cats\n";
+}
+
diff --git a/word-aligner/support/extract_grammar.pl b/word-aligner/support/extract_grammar.pl
new file mode 100755
index 00000000..d7275ef5
--- /dev/null
+++ b/word-aligner/support/extract_grammar.pl
@@ -0,0 +1,11 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $key = shift @ARGV;
+die "Usage: $0 KEY\n" unless defined $key;
+
+while(<>) {
+  my ($k, @rest) = split / \|\|\| /;
+  print join(' ||| ', @rest) if ($k eq $key);
+}
+
diff --git a/word-aligner/support/extract_vocab.pl b/word-aligner/support/extract_vocab.pl
new file mode 100755
index 00000000..070d4202
--- /dev/null
+++ b/word-aligner/support/extract_vocab.pl
@@ -0,0 +1,20 @@
+#!/usr/bin/perl -w
+use strict;
+
+print STDERR "Extracting vocabulary...\n";
+my %dict = ();
+my $wc = 0;
+while(<>) {
+  chomp;
+  my @words = split /\s+/;
+  for my $word (@words) { $wc++; $dict{$word}++; }
+}
+
+my $tc = 0;
+for my $word (sort {$dict{$b} <=> $dict{$a}} keys %dict) {
+  print "$word\n";
+  $tc++;
+}
+
+print STDERR "$tc types / $wc tokens\n";
+
diff --git a/word-aligner/support/extract_weights.pl b/word-aligner/support/extract_weights.pl
new file mode 100755
index 00000000..dfedd12e
--- /dev/null
+++ b/word-aligner/support/extract_weights.pl
@@ -0,0 +1,17 @@
+#!/usr/bin/perl -w
+use strict;
+my %dict=();
+while(<>) {
+  chomp;
+  my ($dummy, $a, $b, $wts) = split / \|\|\| /;
+  my @weights = split /\s+/, $wts;
+  for my $w (@weights) {
+    my ($name, $val) = split /=/, $w;
+    unless ($dict{$name}) {
+      my $r = (0.5 - rand) / 5;
+      $r = sprintf ("%0.4f", $r);
+      print "$name $r\n";
+      $dict{$name}= 1;
+    }
+  }
+}
diff --git a/word-aligner/support/invert_grammar.pl b/word-aligner/support/invert_grammar.pl
new file mode 100755
index 00000000..3988388d
--- /dev/null
+++ b/word-aligner/support/invert_grammar.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl -w
+use strict;
+
+while(<>) {
+  my ($f, $e, $scores) = split / \|\|\| /;
+  print "$e ||| $f ||| $scores";
+}
+
diff --git a/word-aligner/support/make_lex_grammar.pl b/word-aligner/support/make_lex_grammar.pl
new file mode 100755
index 00000000..c3e29540
--- /dev/null
+++ b/word-aligner/support/make_lex_grammar.pl
@@ -0,0 +1,435 @@
+#!/usr/bin/perl -w
+use utf8;
+use strict;
+
+my $LIMIT_SIZE=30;
+
+my ($effile, $model1, $imodel1, $orthof, $orthoe, $class_e, $class_f) = @ARGV;
+die "Usage: $0 corpus.fr-en corpus.f-e.model1 corpus.e-f.model1 corpus.orthonorm-dict.f corpus.orthnorm-dict.e class.e class.f\n" unless $effile && -f $effile && $model1 && -f $model1 && $imodel1 && -f $imodel1 && $orthof && -f $orthof && $orthoe && -f $orthoe && -f $class_e && -f $class_f;
+
+my %eclass = ();
+my %fclass = ();
+load_classes($class_e, \%eclass);
+load_classes($class_f, \%fclass);
+
+our @IDENT_BINS = qw (Ident0 Ident1 Ident2 Ident3 Ident4 Ident5 Ident6 Ident7 Ident8_9 Ident8_9 Ident10_11 Ident10_11 Ident12_14 Ident12_14 Ident12_14);
+die unless scalar @IDENT_BINS == 15;
+our $MAX_IDENT_BIN = 'IdentGT' . scalar @IDENT_BINS;
+
+our %cache;
+open EF, "<$effile" or die;
+open M1, "<$model1" or die;
+open IM1, "<$imodel1" or die;
+binmode(EF,":utf8");
+binmode(M1,":utf8");
+binmode(IM1,":utf8");
+binmode(STDOUT,":utf8");
+my %model1;
+print STDERR "Reading model1...\n";
+my %sizes = ();
+while(<M1>) {
+  chomp;
+  my ($f, $e, $lp) = split /\s+/;
+  $model1{$f}->{$e} = 1;
+  $sizes{$f}++;
+}
+close M1;
+
+my $inv_add = 0;
+my %invm1;
+print STDERR "Reading inverse model1...\n";
+my %esizes=();
+while(<IM1>) {
+  chomp;
+  my ($e, $f, $lp) = split /\s+/;
+  $invm1{$e}->{$f} = 1;
+  $esizes{$e}++;
+  if (($sizes{$f} or 0) < $LIMIT_SIZE && !(defined $model1{$f}->{$e})) {
+    $model1{$f}->{$e} = 1;
+    $sizes{$f}++;
+    $inv_add++;
+  }
+}
+close IM1;
+print STDERR "Added $inv_add from inverse model1\n";
+
+open M1, "<$model1" or die;
+binmode(M1,":utf8");
+my $dir_add = 0;
+print STDERR "Reading model1 (again) for extra inverse translations...\n";
+while(<M1>) {
+  chomp;
+  my ($f, $e, $lp) = split /\s+/;
+  if (($esizes{$e} or 0) < $LIMIT_SIZE && !(defined $invm1{$e}->{$f})) {
+    $invm1{$e}->{$f} = 1;
+    $esizes{$e}++;
+    $dir_add++;
+  }
+}
+close M1;
+print STDERR "Added $dir_add from model 1\n";
+print STDERR "Generating grammars...\n";
+open OE, "<$orthoe" or die;
+binmode(OE,":utf8");
+my %oe_dict;
+while(<OE>) {
+  chomp;
+  my ($a, $b) = split / \|\|\| /, $_;
+  die "BAD: $_" unless defined $a && defined $b;
+  $oe_dict{$a} = $b;
+}
+close OE;
+open OF, "<$orthof" or die;
+binmode(OF,":utf8");
+my %of_dict;
+while(<OF>) {
+  chomp;
+  my ($a, $b) = split / \|\|\| /, $_;
+  die "BAD: $_" unless defined $a && defined $b;
+  $of_dict{$a} = $b;
+}
+close OF;
+$of_dict{'<eps>'} = '<eps>';
+$oe_dict{'<eps>'} = '<eps>';
+
+my $MIN_FEATURE_COUNT = 0;
+my $ADD_PREFIX_ID = 0;
+my $ADD_CLASS_CLASS = 1;
+my $ADD_LEN = 1;
+my $ADD_SIM = 1;
+my $ADD_DICE = 1;
+my $ADD_111 = 1;
+my $ADD_ID = 1;
+my $ADD_PUNC = 1;
+my $ADD_NULL = 0;
+my $ADD_STEM_ID = 1;
+my $ADD_SYM = 0;
+my $BEAM_RATIO = 50;
+my $BIN_ORTHO = 1;
+my $BIN_DLEN = 1;
+my $BIN_IDENT = 1;
+my $BIN_DICE = 1;
+
+my %fdict;
+my %fcounts;
+my %ecounts;
+
+my %sdict;
+
+while(<EF>) {
+  chomp;
+  my ($f, $e) = split /\s*\|\|\|\s*/;
+  my @es = split /\s+/, $e;
+  my @fs = split /\s+/, $f;
+  for my $ew (@es){
+    die "E: Empty word" if $ew eq '';
+    $ecounts{$ew}++;
+  }
+  push @fs, '<eps>' if $ADD_NULL;
+  for my $fw (@fs){
+    die "F: Empty word" if $fw eq '';
+    $fcounts{$fw}++;
+  }
+  for my $fw (@fs){
+    for my $ew (@es){
+      $fdict{$fw}->{$ew}++;
+    }
+  }
+}
+
+my $specials = 0;
+my $fc = 1000000;
+my $sids = 1000000;
+for my $f (sort keys %fdict) {
+  my $re = $fdict{$f};
+  my $max;
+  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
+    my $efcount = $re->{$e};
+    unless (defined $max) { $max = $efcount; }
+    my $m1 = $model1{$f}->{$e};
+    my $im1 = $invm1{$e}->{$f};
+    my $is_good_pair = (defined $m1);
+    my $is_inv_good_pair = (defined $im1);
+    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
+    my @feats;
+    if ($efcount > $MIN_FEATURE_COUNT) {
+      $fc++;
+      push @feats, "F$fc=1";
+    }
+    if ($ADD_SYM && $is_good_pair && $is_inv_good_pair) { push @feats, 'Sym=1'; }
+    my $oe = $oe_dict{$e};
+    die "Can't find orthonorm form for $e" unless defined $oe;
+    my $of = $of_dict{$f};
+    die "Can't find orthonorm form for $f" unless defined $of;
+    my $len_e = length($oe);
+    my $len_f = length($of);
+    if ($ADD_DICE) {
+      if ($BIN_DICE) {
+        push @feats, dicebin($dice) . '=1';
+      } else {
+        push @feats, "Dice=$dice";
+      }
+    }
+    if ($ADD_CLASS_CLASS) {
+      my $ce = $eclass{$e} or die "E- no class for: '$e'";
+      my $cf = $fclass{$f} or die "F- no class for: '$f'";
+      push @feats, "C${cf}_${ce}=1";
+    }
+    my $is_null = undef;
+    if ($ADD_NULL && $f eq '<eps>') {
+      push @feats, "IsNull=1";
+      $is_null = 1;
+    }
+    if ($ADD_LEN) {
+      if (!$is_null) {
+        my $dlen = abs($len_e - $len_f);
+        if ($BIN_DLEN) {
+          push @feats, dlenbin($dlen) . '=1';
+        } else {
+          push @feats, "DLen=$dlen";
+        }
+      }
+    }
+    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
+    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
+    my $both_non_numeric = (!$e_num && !$f_num);
+    if ($ADD_STEM_ID) {
+      my $el = 4;
+      my $fl = 4;
+      if ($oe =~ /^al|re|co/) { $el++; }
+      if ($of =~ /^al|re|co/) { $fl++; }
+      if ($oe =~ /^trans|inter/) { $el+=2; }
+      if ($of =~ /^trans|inter/) { $fl+=2; }
+      if ($fl > length($of)) { $fl = length($of); }
+      if ($el > length($oe)) { $el = length($oe); }
+      my $sf = substr $of, 0, $fl;
+      my $se = substr $oe, 0, $el;
+      my $id = $sdict{$sf}->{$se};
+      if (!$id) {
+        $sids++;
+	$sdict{$sf}->{$se} = $sids;
+	$id = $sids;
+      }
+      push @feats, "S$id=1";
+    }
+    if ($ADD_PREFIX_ID) {
+      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
+        my $pe = substr $oe, 0, 3;
+        my $pf = substr $of, 0, 3;
+        if ($pe eq $pf) { push @feats, "PfxIdentical=1"; }
+      }
+    }
+    if ($ADD_SIM) {
+      my $ld = 0;
+      my $eff = $len_e;
+      if ($eff < $len_f) { $eff = $len_f; }
+      if (!$is_null) {
+        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
+      }
+      if ($ld > 1.5) { $is_good_pair = 1; }
+      if ($BIN_ORTHO) {
+        push @feats, orthobin($ld) . '=1';
+      } else {
+        push @feats, "OrthoSim=$ld";
+      }
+    }
+    my $ident = ($e eq $f);
+    if ($ident) { $is_good_pair = 1; }
+    if ($ident && $ADD_ID) {
+      if ($BIN_IDENT) {
+        push @feats, identbin($len_e) . '=1';
+      } else {
+        push @feats, "Identical=$len_e";
+      }
+    }
+    if ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1) {
+      $is_good_pair = 1;
+      if ($ADD_111) {
+        push @feats, "OneOneOne=1";
+      }
+    }
+    if ($ADD_PUNC) {
+      if ($f =~ /^[!,\-\/"':;=+?.()\[\]«»]+$/ && $e =~ /[a-z]+/) {
+        push @feats, "PuncMiss=1";
+      }
+    }
+    my $is_special = ($is_good_pair && !(defined $m1));
+    $specials++ if $is_special;
+    print STDERR "$f -> $e\n" if $is_special;
+    print "1 ||| $f ||| $e ||| @feats\n" if $is_good_pair;
+    print "2 ||| $e ||| $f ||| @feats\n" if $is_inv_good_pair;
+  }
+}
+print STDERR "Added $specials special rules that were not in the M1 set\n";
+
+
+sub levenshtein
+{
+    # $s1 and $s2 are the two strings
+    # $len1 and $len2 are their respective lengths
+    #
+    my ($s1, $s2) = @_;
+    my ($len1, $len2) = (length $s1, length $s2);
+
+    # If one of the strings is empty, the distance is the length
+    # of the other string
+    #
+    return $len2 if ($len1 == 0);
+    return $len1 if ($len2 == 0);
+
+    my %mat;
+
+    # Init the distance matrix
+    #
+    # The first row to 0..$len1
+    # The first column to 0..$len2
+    # The rest to 0
+    #
+    # The first row and column are initialized so to denote distance
+    # from the empty string
+    #
+    for (my $i = 0; $i <= $len1; ++$i)
+    {
+        for (my $j = 0; $j <= $len2; ++$j)
+        {
+            $mat{$i}{$j} = 0;
+            $mat{0}{$j} = $j;
+        }
+
+        $mat{$i}{0} = $i;
+    }
+
+    # Some char-by-char processing is ahead, so prepare
+    # array of chars from the strings
+    #
+    my @ar1 = split(//, $s1);
+    my @ar2 = split(//, $s2);
+
+    for (my $i = 1; $i <= $len1; ++$i)
+    {
+        for (my $j = 1; $j <= $len2; ++$j)
+        {
+            # Set the cost to 1 iff the ith char of $s1
+            # equals the jth of $s2
+            # 
+            # Denotes a substitution cost. When the char are equal
+            # there is no need to substitute, so the cost is 0
+            #
+            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
+
+            # Cell $mat{$i}{$j} equals the minimum of:
+            #
+            # - The cell immediately above plus 1
+            # - The cell immediately to the left plus 1
+            # - The cell diagonally above and to the left plus the cost
+            #
+            # We can either insert a new char, delete a char or
+            # substitute an existing char (with an associated cost)
+            #
+            $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
+                                $mat{$i}{$j-1} + 1,
+                                $mat{$i-1}{$j-1} + $cost]);
+        }
+    }
+
+    # Finally, the Levenshtein distance equals the rightmost bottom cell
+    # of the matrix
+    #
+    # Note that $mat{$x}{$y} denotes the distance between the substrings
+    # 1..$x and 1..$y
+    #
+    return $mat{$len1}{$len2};
+}
+
+
+# minimal element of a list
+#
+sub min
+{
+    my @list = @{$_[0]};
+    my $min = $list[0];
+
+    foreach my $i (@list)
+    {
+        $min = $i if ($i < $min);
+    }
+
+    return $min;
+}
+
+sub load_classes {
+  my ($file, $ref) = @_;
+  print STDERR "Reading classes from $file...\n";
+  open F, "<$file" or die "Can't read $file: $!";
+  binmode(F, ":utf8") or die;
+  while(<F>) {
+    chomp;
+    my ($word, $class) = split /\s+/;
+#    print STDERR "'$word' -> $class\n";
+    $ref->{$word} = $class;
+  }
+  close F;
+}
+
+sub dicebin {
+  my $x = shift;
+  if ($x < 0.05) { return 'DiceLT005'; }
+  elsif ($x >= 0.05 && $x < 0.1) { return 'Dice005_01'; }
+  elsif ($x >= 0.1 && $x < 0.2) { return 'Dice01_02'; }
+  elsif ($x >= 0.2 && $x < 0.3) { return 'Dice02_03'; }
+  elsif ($x >= 0.3 && $x < 0.4) { return 'Dice03_04'; }
+  elsif ($x >= 0.4 && $x < 0.5) { return 'Dice04_05'; }
+  elsif ($x >= 0.5 && $x < 0.6) { return 'Dice05_06'; }
+  elsif ($x >= 0.6 && $x < 0.7) { return 'Dice06_07'; }
+  elsif ($x >= 0.7 && $x < 0.8) { return 'Dice07_08'; }
+  elsif ($x >= 0.8 && $x < 0.9) { return 'Dice08_09'; }
+  elsif ($x >= 0.9 && $x < 1.0) { return 'Dice09_10'; }
+  elsif ($x >= 1.0 && $x < 1.1) { return 'Dice10_11'; }
+  elsif ($x >= 1.1 && $x < 1.2) { return 'Dice11_12'; }
+  elsif ($x >= 1.2 && $x < 1.4) { return 'Dice12_14'; }
+  elsif ($x >= 1.4 && $x < 1.6) { return 'Dice14_16'; }
+  elsif ($x >= 1.6 && $x < 1.8) { return 'Dice16_18'; }
+  elsif ($x >= 1.8 && $x < 2.0) { return 'Dice18_20'; }
+  elsif ($x >= 2.0 && $x < 2.3) { return 'Dice20_23'; }
+  elsif ($x >= 2.3) { return 'DiceGT23'; }
+}
+
+sub orthobin {
+  my $x = shift;
+  if ($x < 0.9) { return 'OrthoLT09'; }
+  elsif ($x >= 0.9 && $x < 1.1) { return 'Ortho09_11'; }
+  elsif ($x >= 1.1 && $x < 1.3) { return 'Ortho11_13'; }
+  elsif ($x >= 1.3 && $x < 1.5) { return 'Ortho13_15'; }
+  elsif ($x >= 1.5 && $x < 1.7) { return 'Ortho15_17'; }
+  elsif ($x >= 1.7 && $x < 1.9) { return 'Ortho17_19'; }
+  elsif ($x >= 1.9 && $x < 2.1) { return 'Ortho19_21'; }
+  elsif ($x >= 2.1 && $x < 2.3) { return 'Ortho21_23'; }
+  elsif ($x >= 2.3 && $x < 2.5) { return 'Ortho23_25'; }
+  elsif ($x >= 2.5 && $x < 2.7) { return 'Ortho25_27'; }
+  elsif ($x >= 2.7 && $x < 2.9) { return 'Ortho27_29'; }
+  elsif ($x >= 2.9) { return 'OrthoGT29'; }
+}
+
+sub dlenbin {
+  my $x = shift;
+  if ($x == 0) { return 'DLen0'; }
+  elsif ($x == 1) { return 'DLen1'; }
+  elsif ($x == 2) { return 'DLen2'; }
+  elsif ($x == 3) { return 'DLen3'; }
+  elsif ($x == 4) { return 'DLen4'; }
+  elsif ($x == 5) { return 'DLen5'; }
+  elsif ($x == 6) { return 'DLen6'; }
+  elsif ($x == 7) { return 'DLen7'; }
+  elsif ($x == 8) { return 'DLen8'; }
+  elsif ($x == 9) { return 'DLen9'; }
+  elsif ($x >= 10) { return 'DLenGT10'; }
+}
+
+sub identbin {
+  my $x = shift;
+  if ($x == 0) { die; }
+  if ($x > scalar @IDENT_BINS) { return $MAX_IDENT_BIN; }
+  return $IDENT_BINS[$x];
+}
+
+
diff --git a/word-aligner/support/merge_corpus.pl b/word-aligner/support/merge_corpus.pl
new file mode 100755
index 00000000..02827903
--- /dev/null
+++ b/word-aligner/support/merge_corpus.pl
@@ -0,0 +1,18 @@
+#!/usr/bin/perl -w
+use strict;
+die "Usage: $0 corpus.e|f corpus.f|e" unless scalar @ARGV == 2;
+
+my ($a, $b) = @ARGV;
+open A, "<$a" or die "Can't read $a: $!";
+open B, "<$b" or die "Can't read $a: $!";
+
+while(<A>) {
+  chomp;
+  my $e = <B>;
+  die "Mismatched lines in $a and $b!" unless defined $e;
+  print "$_ ||| $e";
+}
+
+my $e = <B>;
+die "Mismatched lines in $a and $b!" unless !defined $e;
+
diff --git a/word-aligner/support/supplement_weights_file.pl b/word-aligner/support/supplement_weights_file.pl
new file mode 100755
index 00000000..06876043
--- /dev/null
+++ b/word-aligner/support/supplement_weights_file.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $ADD_FCLASS_JUMP = 1;
+my $ADD_MODEL2_BINARY = 0;
+my $ADD_FC_RELPOS = 1;
+
+my ($f_classes) = @ARGV;
+
+die "Usage: $0 f-classes.file" unless $f_classes && -f $f_classes;
+
+print <<EOT;
+MarkovJump 0
+RelativeSentencePosition 0
+EOT
+
+# !	8
+# "	11
+# 's	18
+
+my %dcats = ();
+$dcats{'BOS'} = 1;
+$dcats{'EOS'} = 1;
+
+open FC, "<$f_classes" or die;
+while(<FC>) {
+  chomp;
+  my ($x, $cat) = split /\s+/;
+  $dcats{$cat} = 1;
+}
+
+my @cats = sort keys %dcats;
+
+my $added = 0;
+for (my $i=0; $i < scalar @cats; $i++) {
+  my $c1 = $cats[$i];
+  for (my $j=0; $j < scalar @cats; $j++) {
+    my $c2 = $cats[$j];
+    print "SP:${c1}_${c2} 0\n";
+    $added++;
+  }
+}
+
+for (my $ss=1; $ss < 100; $ss++) {
+  if ($ADD_FCLASS_JUMP) {
+    for (my $i=0; $i < scalar @cats; $i++) {
+      my $cat = $cats[$i];
+      for (my $j = -$ss; $j <= $ss; $j++) {
+        print "Jump_FL:${ss}_FC:${cat}_J:$j 0\n";
+        $added++;
+      }
+    }
+  }
+  if ($ADD_MODEL2_BINARY) {
+    # M2_FL:8_SI:3_TI:2=1
+    for (my $i = 0; $i < $ss; $i++) {
+      for (my $j = 0; $j < 100; $j++) {
+        print "M2FL:${ss}:TI:${j}_SI:${i} 0\n";
+        $added++;
+      }
+    }
+  }
+}
+if ($ADD_FC_RELPOS) {
+  #RelPos_FC:11
+  for (my $i=0; $i < scalar @cats; $i++) {
+    my $cat = $cats[$i];
+    print "RelPos_FC:$cat 0\n";
+    $added++;
+  }
+}
+
+print STDERR "Added $added weights\n";
-- 
cgit v1.2.3