summaryrefslogtreecommitdiff
path: root/compound-split/compound-split.pl
blob: 0f3595047653af0df861f704b67510aceae43051 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/perl -w

use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use Getopt::Long;
use IPC::Open2;

my $CDEC = "$script_dir/../decoder/cdec";
my $LANG = 'de';

my $BEAM = 2.1;
my $OUTPUT = 'plf';
my $HELP;
my $VERBOSE;
my $PRESERVE_CASE;

GetOptions("decoder=s" => \$CDEC,
           "language=s" => \$LANG,
           "beam=f" => \$BEAM,
           "output=s" => \$OUTPUT,
           "verbose" => \$VERBOSE,
           "preserve_case" => \$PRESERVE_CASE,
           "help" => \$HELP
          ) or usage();

usage() if $HELP;

chdir $script_dir;

if ($VERBOSE) { $VERBOSE = ""; } else { $VERBOSE = " 2> /dev/null"; }
$LANG = lc $LANG;
die "Can't find $CDEC\n" unless -f $CDEC;
die "Can't execute $CDEC\n" unless -x $CDEC;
die "Don't know about language: $LANG\n" unless -d "./$LANG";
my $CONFIG="cdec-$LANG.ini";
die "Can't find $CONFIG" unless -f $CONFIG;
die "--output must be '1best' or 'plf'\n" unless ($OUTPUT =~ /^(plf|1best)$/);
print STDERR "(Run with --help for options)\n";
print STDERR "LANGUAGE: $LANG\n";
print STDERR "  OUTPUT: $OUTPUT\n";

my $CMD = "$CDEC -c $CONFIG";
my $IS_PLF;
if ($OUTPUT eq 'plf') {
  $IS_PLF = 1;
  $CMD .= " --csplit_preserve_full_word --csplit_output_plf --beam_prune $BEAM";
}
$CMD .= $VERBOSE;

print STDERR "Executing: $CMD\n";

open2(\*OUT, \*IN, $CMD) or die "Couldn't fork: $!";
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
binmode(IN,":utf8");
binmode(OUT,":utf8");

while(<STDIN>) {
  chomp;
  s/^\s+//;
  s/\s+$//;
  my @words = split /\s+/;
  my @res = ();
  my @todo = ();
  my @casings = ();
  for (my $i=0; $i < scalar @words; $i++) {
    my $word = lc $words[$i];
    if (length($word)<6 || $word =~ /^[,\-0-9\.]+$/ || $word =~ /[@.\-\/:]/) {
      push @casings, 0;
      if ($IS_PLF) {
        push @res, "(('" . escape($word) . "',0,1),),";
      } else {
        push @res, $word;
      }
    } else {
      push @casings, guess_casing($words[$i]);
      push @res, undef;
      push @todo, $word;
    }
  }
  if (scalar @todo > 0) {
    # print STDERR "TODO: @todo\n";
    my $tasks = join "\n", @todo;
    print IN "$tasks\n";
    for (my $i = 0; $i < scalar @res; $i++) {
      if (!defined $res[$i]) {
        my $seg = <OUT>;
        chomp $seg;
        unless ($IS_PLF) {
          $seg =~ s/^# //o;
        }
        if ($PRESERVE_CASE && $casings[$i]) { $seg = recase_words($seg); }
        $res[$i] = $seg;
      }
    }
  }
  if ($IS_PLF) {
    print '(';
    print join '', @res;
    print ")\n";
  } else {
    print "@res\n";
  }
}

close IN;
close OUT;

sub recase_words {
  my $word = shift;
  $word =~ s/\b(\w)/\u$1/g;
  return $word;
}

sub escape {
  $_ = shift;
  s/\\/\\\\/g;
  s/'/\\'/g;
  return $_;
}

sub guess_casing {
  my $word = shift @_;
  if (lc($word) eq $word) { return 0; } else { return 1; }
}

sub usage {
  print <<EOT;

Usage: $0 [OPTIONS] < file.txt

  Options:
    --decoder PATH        Path to cdec
    --lanugage XX         Two letter language code (de, ...)
    --beam NUM            Beam threshold, used with PLF output
                            (probably between 1.5 and 5.0)
    --output plf|1best    Output format, 1best or plf (lattice)
    --preserve_case       Preserve the casing of the input word
                            (model will be scored lowercase)
    --verbose             Show verbose decoder output

EOT
  exit(1);
}