diff options
Diffstat (limited to 'phrasinator/train-phrasinator.pl')
-rwxr-xr-x | phrasinator/train-phrasinator.pl | 89 |
1 files changed, 89 insertions, 0 deletions
#!/usr/bin/perl -w
#
# train-phrasinator.pl -- train a phrasal segmentation model.
#
# Usage: train-phrasinator.pl [--decoder PATH] INPUT.TXT OUTPUT-DIRECTORY
#
# Runs the gibbs_train_plm trainer found next to this script on INPUT.TXT and
# writes three files into a freshly created OUTPUT-DIRECTORY:
#   grammar.gz  -- grammar produced by the trainer (-g)
#   weights     -- weights produced by the trainer (-w)
#   cdec.ini    -- decoder configuration pointing at the two files above
# Finally prints the decoder command line that applies the model to new text.
#
# The script refuses to run if OUTPUT-DIRECTORY already exists, if the input
# file cannot be opened, or if the trainer/decoder binaries are missing or not
# executable.
use strict;
use warnings;

# Resolve the directory containing this script at compile time so that the
# default trainer/decoder paths can be expressed relative to it.
my $script_dir;
BEGIN {
    use Cwd qw/ abs_path cwd /;
    use File::Basename;
    $script_dir = dirname(abs_path($0));
    push @INC, $script_dir;
}
use Getopt::Long;
use File::Spec;    # File::Spec exports nothing; rel2abs is called as a class method

my $DECODER = "$script_dir/../decoder/cdec";
my $TRAINER = "$script_dir/gibbs_train_plm";

die "Can't find $TRAINER" unless -f $TRAINER;
die "Can't execute $TRAINER" unless -x $TRAINER;

if (!GetOptions(
    "decoder=s" => \$DECODER,
)) {
    usage();
}

# The decoder is validated only after option parsing because --decoder may
# override the default location.
die "Can't find $DECODER" unless -f $DECODER;
die "Can't execute $DECODER" unless -x $DECODER;

usage() if scalar @ARGV != 2;
my $INFILE = shift @ARGV;
my $OUTDIR = shift @ARGV;
$OUTDIR = File::Spec->rel2abs($OUTDIR);
print STDERR " Input file: $INFILE\n";
print STDERR "Output directory: $OUTDIR\n";

# Fail early if the input is unreadable, before any output is created.
# Three-argument open: the file name is never interpreted as a mode or pipe.
open my $in_fh, '<', $INFILE
    or die "Failed to open $INFILE for reading: $!";
close $in_fh;

die "Please remove existing directory $OUTDIR\n" if (-f $OUTDIR || -d $OUTDIR);

# Built-in mkdir avoids the shell (safe for paths containing spaces or
# metacharacters) and sets $! to the real failure reason.
mkdir($OUTDIR) or die "Failed to create directory $OUTDIR\n$!";

my $grammar = "$OUTDIR/grammar.gz";
my $weights = "$OUTDIR/weights";

# List form: every argument reaches the trainer verbatim, with no shell
# word-splitting or expansion of the user-supplied paths.
safesystem($TRAINER, '-w', $weights, '-g', $grammar, '-i', $INFILE)
    or die "Failed to train model!\n";

my $cdecini = "$OUTDIR/cdec.ini";
open my $ini_fh, '>', $cdecini
    or die "Failed to open $cdecini for writing: $!";

print {$ini_fh} <<EOINI;
quiet=true
formalism=scfg
grammar=$grammar
add_pass_through_rules=true
weights=$OUTDIR/weights
EOINI

# Buffered write errors (e.g. a full disk) only surface at close time.
close $ini_fh or die "Failed to write $cdecini: $!";

print <<EOT;

Model trained successfully. Text can be decoded into phrasal units with
the following command:

 $DECODER -c $OUTDIR/cdec.ini < FILE.TXT

EOT
exit(0);

# usage()
# Prints the command-line synopsis to STDOUT and exits with status 1.
# Never returns.
sub usage {
    print <<EOT;
Usage: $0 [options] INPUT.TXT OUTPUT-DIRECTORY

 Infers a phrasal segmentation model from the tokenized text in INPUT.TXT
 and writes it to OUTPUT-DIRECTORY/ so that it can be applied to other
 text or have its granularity altered.

EOT
    exit(1);
}

# safesystem(@command)
# Runs @command with system(), logging the command line to STDERR first.
# A single-element @command goes through the shell; a multi-element list is
# executed directly.
#
# Returns true if the command exited with status 0, false otherwise (the
# non-zero exit code is reported on STDERR).
# Exits the whole script with status 1 if the command could not be started
# or was killed by a signal.
sub safesystem {
    my @command = @_;
    print STDERR "Executing: @command\n";
    system(@command);
    if ($? == -1) {
        # system() itself failed (e.g. the program could not be executed).
        print STDERR "ERROR: Failed to execute: @command\n $!\n";
        exit(1);
    }
    elsif ($? & 127) {
        # Low 7 bits hold the terminating signal; bit 128 flags a core dump.
        # The command is passed as a %s argument so that a literal '%' in it
        # cannot be misread as a printf conversion.
        printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
            "@command", ($? & 127), ($? & 128) ? 'with' : 'without';
        exit(1);
    }
    else {
        my $exitcode = $? >> 8;
        print STDERR "Exit code: $exitcode\n" if $exitcode;
        return !$exitcode;
    }
}