summaryrefslogtreecommitdiff
path: root/corpus/moses-scfg-to-cdec.pl
blob: 9b8e36179a4b46d3bba5bde47055c1cadf5ed2fe (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/perl -w
use strict;

while(<>) {
  my ($src, $trg, $feats, $al) = split / \|\|\| /;
  # [X][NP] von [X][NP] [X] ||| [X][NP] &apos;s [X][NP] [S] ||| 0.00110169 0.0073223 2.84566e-06 0.0027702 0.0121867 2.718 0.606531 ||| 0-0 1-1 2-2 ||| 635 245838 2

  my @srcs = split /\s+/, $src;
  my @trgs = split /\s+/, $trg;
  my $lhs = pop @trgs;
  $lhs =~ s/&amp;apos;/'/g;
  $lhs =~ s/&apos;/'/g;
  $lhs =~ s/,/COMMA/g;
  my $ntc = 0;
  my $sc = 0;
  my @of = ();
  my $x = pop @srcs;
  my %d = ();  # src index to nonterminal count
  die "Expected [X]" unless $x eq '[X]';
  my %amap = ();
  my @als = split / /, $al;
  for my $st (@als) {
    my ($s, $t) = split /-/, $st;
    $amap{$t} = $s;
  }
  for my $f (@srcs) {
    if ($f =~ /^\[X\]\[([^]]+)\]$/) {
      $ntc++;
      my $nt = $1;
      $nt =~ s/&amp;apos;/'/g;
      $nt =~ s/&apos;/'/g;
      $nt =~ s/,/COMMA/g;
      push @of, "[$nt]";
      $d{$sc} = $ntc;
    } elsif ($f =~ /^\[[^]]+\]$/) {
      die "Unexpected $f";
    } else {
      push @of, $f;
    }
    $sc++;
  }
  my @oe = ();
  my $ind = 0;
  for my $e (@trgs) {
    if ($e =~ /^\[X\]\[([^]]+)\]$/) {
      my $imap = $d{$amap{$ind}};
      push @oe, "[$imap]";
    } else {
      push @oe, $e;
    }
    $ind++;
  }
  my ($fe, $ef, $j, $lfe, $lef, $dummy, $of) = split / /, $feats;
  next if $lef eq '0';
  next if $lfe eq '0';
  next if $ef eq '0';
  next if $fe eq '0';
  next if $j eq '0';
  next if $of eq '0';
  $ef = sprintf('%.6g', log($ef));
  $fe = sprintf('%.6g',log($fe));
  $j = sprintf('%.6g',log($j));
  $lef = sprintf('%.6g',log($lef));
  $lfe = sprintf('%.6g',log($lfe));
  $of = sprintf('%.6g',log($of));
  print "$lhs ||| @of ||| @oe ||| RuleCount=1 FgivenE=$fe EgivenF=$ef Joint=$j LexEgivenF=$lef LexFgivenE=$lfe Other=$of\n";
}

# [X][ADVP] angestiegen [X] ||| rose [X][ADVP] [VP] ||| 0.0538131 0.0097508 0.00744224 0.0249653 0.000698602 2.718 0.606531 ||| 0-1 1-0 ||| 13 94 2