add dep

author: Guest_account Guest_account prguest11 <prguest11@taipan.cs> 2011-09-17 01:39:07 +0100
committer: Guest_account Guest_account prguest11 <prguest11@taipan.cs> 2011-09-17 01:39:07 +0100
commit: 1afbff874473c79619ce74cdf90f3c312185e4e1 (patch)
tree: 997d2cc819e07c83c545bc0d17a3d3ffeacee47a /training/make-lexcrf-grammar.pl
parent: b01f2499efa155a227f13e34f1aed9f54c39ab91 (diff)
1 files changed, 0 insertions, 285 deletions
diff --git a/training/make-lexcrf-grammar.pl b/training/make-lexcrf-grammar.pl
deleted file mode 100755
index 8cdf7718..00000000
--- a/training/make-lexcrf-grammar.pl
+++ /dev/null
@@ -1,285 +0,0 @@
-#!/usr/bin/perl -w
-use utf8;
-use strict;
-my ($effile, $model1) = @ARGV;
-die "Usage: $0 corpus.fr-en corpus.model1\n" unless $effile && -f $effile && $model1 && -f $model1;
-
-open EF, "<$effile" or die;
-open M1, "<$model1" or die;
-binmode(EF,":utf8");
-binmode(M1,":utf8");
-binmode(STDOUT,":utf8");
-my %model1;
-while(<M1>) {
-  chomp;
-  my ($f, $e, $lp) = split /\s+/;
-  $model1{$f}->{$e} = $lp;
-}
-
-my $ADD_MODEL1 = 0;      # found that model1 hurts performance
-my $IS_FRENCH_F = 1;     # indicates that the f language is french
-my $IS_ARABIC_F = 0;     # indicates that the f language is arabic
-my $IS_URDU_F = 0;     # indicates that the f language is arabic
-my $ADD_PREFIX_ID = 0;
-my $ADD_LEN = 1;
-my $ADD_SIM = 1;
-my $ADD_DICE = 1;
-my $ADD_111 = 1;
-my $ADD_ID = 1;
-my $ADD_PUNC = 1;
-my $ADD_NUM_MM = 1;
-my $ADD_NULL = 1;
-my $ADD_STEM_ID = 1;
-my $BEAM_RATIO = 50;
-
-my %fdict;
-my %fcounts;
-my %ecounts;
-
-my %sdict;
-
-while(<EF>) {
-  chomp;
-  my ($f, $e) = split /\s*\|\|\|\s*/;
-  my @es = split /\s+/, $e;
-  my @fs = split /\s+/, $f;
-  for my $ew (@es){ $ecounts{$ew}++; }
-  push @fs, '<eps>' if $ADD_NULL;
-  for my $fw (@fs){ $fcounts{$fw}++; }
-  for my $fw (@fs){
-    for my $ew (@es){
-      $fdict{$fw}->{$ew}++;
-    }
-  }
-}
-
-print STDERR "Dice 0\n" if $ADD_DICE;
-print STDERR "OneOneOne 0\nId_OneOneOne 0\n" if $ADD_111;
-print STDERR "Identical 0\n" if $ADD_ID;
-print STDERR "PuncMiss 0\n" if $ADD_PUNC;
-print STDERR "IsNull 0\n" if $ADD_NULL;
-print STDERR "Model1 0\n" if $ADD_MODEL1;
-print STDERR "DLen 0\n" if $ADD_LEN;
-print STDERR "NumMM 0\nNumMatch 0\n" if $ADD_NUM_MM;
-print STDERR "OrthoSim 0\n" if $ADD_SIM;
-print STDERR "PfxIdentical 0\n" if ($ADD_PREFIX_ID);
-my $fc = 1000000;
-my $sids = 1000000;
-for my $f (sort keys %fdict) {
-  my $re = $fdict{$f};
-  my $max;
-  for my $e (sort {$re->{$b} <=> $re->{$a}} keys %$re) {
-    my $efcount = $re->{$e};
-    unless (defined $max) { $max = $efcount; }
-    my $m1 = $model1{$f}->{$e};
-    unless (defined $m1) { next; }
-    $fc++;
-    my $dice = 2 * $efcount / ($ecounts{$e} + $fcounts{$f});
-    my $feats = "F$fc=1";
-    my $oe = $e;
-    my $of = $f;   # normalized form
-    if ($IS_FRENCH_F) {
-      # see http://en.wikipedia.org/wiki/Use_of_the_circumflex_in_French
-      $of =~ s/â/as/g;
-      $of =~ s/ê/es/g;
-      $of =~ s/î/is/g;
-      $of =~ s/ô/os/g;
-      $of =~ s/û/us/g;
-    } elsif ($IS_ARABIC_F) {
-      if (length($of) > 1 && !($of =~ /\d/)) {
-        $of =~ s/\$/sh/g;
-      }
-    } elsif ($IS_URDU_F) {
-      if (length($of) > 1 && !($of =~ /\d/)) {
-        $of =~ s/\$/sh/g;
-      }
-      $oe =~ s/^-e-//;
-      $oe =~ s/^al-/al/;
-      $of =~ s/([a-z])\~/$1$1/g;
-      $of =~ s/E/'/g;
-      $of =~ s/^Aw/o/g;
-      $of =~ s/\|/a/g;
-      $of =~ s/@/h/g;
-      $of =~ s/c/ch/g;
-      $of =~ s/x/kh/g;
-      $of =~ s/\*/dh/g;
-      $of =~ s/w/o/g;
-      $of =~ s/Z/dh/g;
-      $of =~ s/y/i/g;
-      $of =~ s/Y/a/g;
-      $of = lc $of;
-    }
-    my $len_e = length($oe);
-    my $len_f = length($of);
-    $feats .= " Model1=$m1" if ($ADD_MODEL1);
-    $feats .= " Dice=$dice" if $ADD_DICE;
-    my $is_null = undef;
-    if ($ADD_NULL && $f eq '<eps>') {
-      $feats .= " IsNull=1";
-      $is_null = 1;
-    }
-    if ($ADD_LEN) {
-      if (!$is_null) {
-        my $dlen = abs($len_e - $len_f);
-        $feats .= " DLen=$dlen";
-      }
-    }
-    my $f_num = ($of =~ /^-?\d[0-9\.\,]+%?$/ && (length($of) > 3));
-    my $e_num = ($oe =~ /^-?\d[0-9\.\,]+%?$/ && (length($oe) > 3));
-    my $both_non_numeric = (!$e_num && !$f_num);
-    if ($ADD_NUM_MM && (($f_num && !$e_num) || ($e_num && !$f_num))) {
-      $feats .= " NumMM=1";
-    }
-    if ($ADD_NUM_MM && ($f_num && $e_num) && ($oe eq $of)) {
-      $feats .= " NumMatch=1";
-    }
-    if ($ADD_STEM_ID) {
-      my $el = 4;
-      my $fl = 4;
-      if ($oe =~ /^al|re|co/) { $el++; }
-      if ($of =~ /^al|re|co/) { $fl++; }
-      if ($oe =~ /^trans|inter/) { $el+=2; }
-      if ($of =~ /^trans|inter/) { $fl+=2; }
-      if ($fl > length($of)) { $fl = length($of); }
-      if ($el > length($oe)) { $el = length($oe); }
-      my $sf = substr $of, 0, $fl;
-      my $se = substr $oe, 0, $el;
-      my $id = $sdict{$sf}->{$se};
-      if (!$id) {
-        $sids++;
-	$sdict{$sf}->{$se} = $sids;
-	$id = $sids;
-	print STDERR "S$sids 0\n"
-      }
-      $feats .= " S$id=1";
-    }
-    if ($ADD_PREFIX_ID) {
-      if ($len_e > 3 && $len_f > 3 && $both_non_numeric) { 
-        my $pe = substr $oe, 0, 3;
-        my $pf = substr $of, 0, 3;
-        if ($pe eq $pf) { $feats .= " PfxIdentical=1"; }
-      }
-    }
-    if ($ADD_SIM) {
-      my $ld = 0;
-      my $eff = $len_e;
-      if ($eff < $len_f) { $eff = $len_f; }
-      if (!$is_null) {
-        $ld = ($eff - levenshtein($oe, $of)) / sqrt($eff);
-      }
-      $feats .= " OrthoSim=$ld";
-    }
-    my $ident = ($e eq $f);
-    if ($ident && $ADD_ID) { $feats .= " Identical=1"; }
-    if ($ADD_111 && ($efcount == 1 && $ecounts{$e} == 1 && $fcounts{$f} == 1)) {
-      if ($ident && $ADD_ID) {
-        $feats .= " Id_OneOneOne=1";
-      }
-      $feats .= " OneOneOne=1";
-    }
-    if ($ADD_PUNC) {
-      if (($f =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $e =~ /[a-z]+/) ||
-          ($e =~ /^[0-9!\$%,\-\/"':;=+?.()«»]+$/ && $f =~ /[a-z]+/)) {
-        $feats .= " PuncMiss=1";
-      }
-    }
-    my $r = (0.5 - rand)/5;
-    print STDERR "F$fc $r\n";
-    print "$f ||| $e ||| $feats\n";
-  }
-}
-
-sub levenshtein
-{
-    # $s1 and $s2 are the two strings
-    # $len1 and $len2 are their respective lengths
-    #
-    my ($s1, $s2) = @_;
-    my ($len1, $len2) = (length $s1, length $s2);
-
-    # If one of the strings is empty, the distance is the length
-    # of the other string
-    #
-    return $len2 if ($len1 == 0);
-    return $len1 if ($len2 == 0);
-
-    my %mat;
-
-    # Init the distance matrix
-    #
-    # The first row to 0..$len1
-    # The first column to 0..$len2
-    # The rest to 0
-    #
-    # The first row and column are initialized so to denote distance
-    # from the empty string
-    #
-    for (my $i = 0; $i <= $len1; ++$i)
-    {
-        for (my $j = 0; $j <= $len2; ++$j)
-        {
-            $mat{$i}{$j} = 0;
-            $mat{0}{$j} = $j;
-        }
-
-        $mat{$i}{0} = $i;
-    }
-
-    # Some char-by-char processing is ahead, so prepare
-    # array of chars from the strings
-    #
-    my @ar1 = split(//, $s1);
-    my @ar2 = split(//, $s2);
-
-    for (my $i = 1; $i <= $len1; ++$i)
-    {
-        for (my $j = 1; $j <= $len2; ++$j)
-        {
-            # Set the cost to 1 iff the ith char of $s1
-            # equals the jth of $s2
-            # 
-            # Denotes a substitution cost. When the char are equal
-            # there is no need to substitute, so the cost is 0
-            #
-            my $cost = ($ar1[$i-1] eq $ar2[$j-1]) ? 0 : 1;
-
-            # Cell $mat{$i}{$j} equals the minimum of:
-            #
-            # - The cell immediately above plus 1
-            # - The cell immediately to the left plus 1
-            # - The cell diagonally above and to the left plus the cost
-            #
-            # We can either insert a new char, delete a char or
-            # substitute an existing char (with an associated cost)
-            #
-            $mat{$i}{$j} = min([$mat{$i-1}{$j} + 1,
-                                $mat{$i}{$j-1} + 1,
-                                $mat{$i-1}{$j-1} + $cost]);
-        }
-    }
-
-    # Finally, the Levenshtein distance equals the rightmost bottom cell
-    # of the matrix
-    #
-    # Note that $mat{$x}{$y} denotes the distance between the substrings
-    # 1..$x and 1..$y
-    #
-    return $mat{$len1}{$len2};
-}
-
-
-# minimal element of a list
-#
-sub min
-{
-    my @list = @{$_[0]};
-    my $min = $list[0];
-
-    foreach my $i (@list)
-    {
-        $min = $i if ($i < $min);
-    }
-
-    return $min;
-}
-
author	Guest_account Guest_account prguest11 <prguest11@taipan.cs>	2011-09-17 01:39:07 +0100
committer	Guest_account Guest_account prguest11 <prguest11@taipan.cs>	2011-09-17 01:39:07 +0100
commit	1afbff874473c79619ce74cdf90f3c312185e4e1 (patch)
tree	997d2cc819e07c83c545bc0d17a3d3ffeacee47a /training/make-lexcrf-grammar.pl
parent	b01f2499efa155a227f13e34f1aed9f54c39ab91 (diff)