blob: c50b8e686b4e383efffb68251346ec6542e75d09 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
|
#!/usr/bin/perl -w
use strict;
my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
use Getopt::Long;
use File::Spec qw (rel2abs);
# Driver: trains a phrasal segmentation model from INPUT.TXT with the Gibbs
# trainer binary, then writes a cdec configuration into a fresh output
# directory so the model can be applied with the decoder.

# Default locations of the external binaries, relative to this script.
my $DECODER = "$script_dir/../decoder/cdec";
my $TRAINER = "$script_dir/gibbs_train_plm_notables";
die "Can't find $TRAINER" unless -f $TRAINER;
die "Can't execute $TRAINER" unless -x $TRAINER;

# --decoder overrides the default decoder path; validate it after parsing so
# the user-supplied value is the one that gets checked.
if (!GetOptions(
    "decoder=s" => \$DECODER,
)) { usage(); }
die "Can't find $DECODER" unless -f $DECODER;
die "Can't execute $DECODER" unless -x $DECODER;

# Exactly two positional arguments are required: input file, output directory.
if (scalar @ARGV != 2) { usage(); }
my $INFILE = shift @ARGV;
my $OUTDIR = shift @ARGV;
$OUTDIR = File::Spec->rel2abs($OUTDIR);
print STDERR " Input file: $INFILE\n";
print STDERR "Output directory: $OUTDIR\n";

# Fail early if the input cannot be read. Three-argument open keeps characters
# in the file name (leading '>', trailing '|', whitespace) from being
# interpreted as an open mode.
open my $in_fh, '<', $INFILE or die "Failed to open $INFILE for reading: $!";
close $in_fh;

# Refuse to overwrite anything that already exists at the output location.
die "Please remove existing directory $OUTDIR\n" if (-f $OUTDIR || -d $OUTDIR);

# List-form invocation bypasses the shell, so paths containing spaces or shell
# metacharacters are passed through to the command verbatim.
safesystem('mkdir', $OUTDIR) or die "Failed to create directory $OUTDIR\n$!";

my $grammar = "$OUTDIR/grammar.gz";
my $weights = "$OUTDIR/weights";
safesystem($TRAINER, '-w', $weights, '-g', $grammar, '-i', $INFILE)
    or die "Failed to train model!\n";

# Write the decoder configuration that points at the trained grammar/weights.
my $cdecini = "$OUTDIR/cdec.ini";
open my $ini_fh, '>', $cdecini or die "Failed to open $cdecini for writing: $!";
print {$ini_fh} <<EOINI;
quiet=true
formalism=scfg
grammar=$grammar
add_pass_through_rules=true
weights=$OUTDIR/weights
EOINI
# Buffered write errors only surface at close, so check it.
close $ini_fh or die "Failed to write $cdecini: $!";

print <<EOT;
Model trained successfully. Text can be decoded into phrasal units with
the following command:
$DECODER -c $OUTDIR/cdec.ini < FILE.TXT
EOT
exit(0);
# Prints the command-line synopsis, including the supported options, to
# STDOUT and terminates the script with exit status 1. Never returns.
sub usage {
    print <<EOT;
Usage: $0 [options] INPUT.TXT OUTPUT-DIRECTORY
Infers a phrasal segmentation model from the tokenized text in INPUT.TXT
and writes it to OUTPUT-DIRECTORY/ so that it can be applied to other
text or have its granularity altered.
Options:
  --decoder PATH   cdec decoder binary to use (default: ../decoder/cdec
                   relative to this script)
EOT
    exit(1);
}
# Runs an external command via system() and reports the outcome on STDERR.
#
# Arguments: the command, either as a single string or as a program name
#            followed by its arguments; passed to system() unchanged.
# Returns:   true when the command exited with status 0, false when it exited
#            with a non-zero status (which is also logged).
# Exits:     terminates the whole script with status 1 when the command could
#            not be started or was killed by a signal.
sub safesystem {
    my @command = @_;
    print STDERR "Executing: @command\n";
    system(@command);

    # Snapshot the wait status so later calls cannot clobber it.
    my $status = $?;

    # -1 means system() could not start the command at all; $! has the reason.
    if ($status == -1) {
        print STDERR "ERROR: Failed to execute: @command\n $!\n";
        exit(1);
    }

    # The low 7 bits hold the terminating signal; bit 7 flags a core dump.
    my $signal = $status & 127;
    if ($signal) {
        # The command text is passed as an argument rather than interpolated
        # into the format string, so a '%' in the command is printed literally
        # instead of being treated as a conversion directive.
        printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
            "@command", $signal, ($status & 128) ? 'with' : 'without';
        exit(1);
    }

    # The high byte is the command's own exit code.
    my $exitcode = $status >> 8;
    print STDERR "Exit code: $exitcode\n" if $exitcode;
    return !$exitcode;
}
|