1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
|
#!/usr/bin/perl -w
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use Getopt::Long;
use IPC::Open2;
my $CDEC = "$script_dir/../decoder/cdec";
my $LANG = 'de';
my $BEAM = 2.1;
my $OUTPUT = 'plf';
my $HELP;
my $VERBOSE;
GetOptions("decoder=s" => \$CDEC,
"language=s" => \$LANG,
"beam=f" => \$BEAM,
"output=s" => \$OUTPUT,
"verbose" => \$VERBOSE,
"help" => \$HELP
) or usage();
usage() if $HELP;
chdir $script_dir;
if ($VERBOSE) { $VERBOSE = ""; } else { $VERBOSE = " 2> /dev/null"; }
$LANG = lc $LANG;
die "Can't find $CDEC\n" unless -f $CDEC;
die "Can't execute $CDEC\n" unless -x $CDEC;
die "Don't know about language: $LANG\n" unless -d "./$LANG";
my $CONFIG="cdec-$LANG.ini";
die "Can't find $CONFIG" unless -f $CONFIG;
die "--output must be '1best' or 'plf'\n" unless ($OUTPUT =~ /^(plf|1best)$/);
print STDERR "(Run with --help for options)\n";
print STDERR "LANGUAGE: $LANG\n";
print STDERR " OUTPUT: $OUTPUT\n";
my $CMD = "$CDEC -c $CONFIG";
my $IS_PLF;
if ($OUTPUT eq 'plf') {
$IS_PLF = 1;
$CMD .= " --csplit_preserve_full_word --csplit_output_plf --beam_prune $BEAM";
}
$CMD .= $VERBOSE;
print STDERR "Executing: $CMD\n";
open2(\*OUT, \*IN, $CMD) or die "Couldn't fork: $!";
binmode(STDIN,":utf8");
binmode(STDOUT,":utf8");
binmode(IN,":utf8");
binmode(OUT,":utf8");
while(<STDIN>) {
chomp;
s/^\s+//;
s/\s+$//;
my @words = split /\s+/;
my @res = ();
my @todo = ();
for (my $i=0; $i < scalar @words; $i++) {
my $word = lc $words[$i];
if (length($word)<6 || $word =~ /^[,\-0-9\.]+$/) {
if ($IS_PLF) {
push @res, "(('" . escape($word) . "',0,1),),";
} else {
push @res, $word;
}
} else {
push @res, undef;
push @todo, $word;
}
}
if (scalar @todo > 0) {
# print STDERR "TODO: @todo\n";
my $tasks = join "\n", @todo;
print IN "$tasks\n";
for (my $i = 0; $i < scalar @res; $i++) {
if (!defined $res[$i]) {
my $seg = <OUT>;
chomp $seg;
unless ($IS_PLF) {
$seg =~ s/^# //o;
}
$res[$i] = $seg;
}
}
}
if ($IS_PLF) {
print '(';
print join '', @res;
print ")\n";
} else {
print "@res\n";
}
}
close IN;
close OUT;
sub escape {
$_ = shift;
s/\\/\\\\/g;
s/'/\\'/g;
return $_;
}
sub usage {
print <<EOT;
Usage: $0 [OPTIONS] < file.txt
Options:
--decoder PATH Path to cdec
--lanugage XX Two letter language code (de, ...)
--beam NUM Beam threshold, used with PLF output
(probably between 1.5 and 5.0)
--output plf|1best Output format, 1best or plf (lattice)
--verbose Show verbose decoder output
EOT
exit(1);
}
|