summary refs log tree commit diff
path: root/phrasinator/train-phrasinator.pl
diff options
context:
space:
mode:
Diffstat (limited to 'phrasinator/train-phrasinator.pl')
-rwxr-xr-x phrasinator/train-phrasinator.pl 89
1 files changed, 89 insertions, 0 deletions
diff --git a/phrasinator/train-phrasinator.pl b/phrasinator/train-phrasinator.pl
new file mode 100755
index 00000000..de258caf
--- /dev/null
+++ b/phrasinator/train-phrasinator.pl
@@ -0,0 +1,89 @@
+#!/usr/bin/perl -w
+use strict;
+my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; }
+use Getopt::Long;
+use File::Spec qw (rel2abs);
+
+# Train a phrasal segmentation model: validate the tools and arguments, run
+# the trainer into a fresh output directory, then write a cdec.ini that points
+# the decoder at the resulting grammar and weights.
+
+# Default tool locations, resolved relative to this script's directory.
+# $DECODER can be overridden with --decoder; $TRAINER cannot.
+my $DECODER = "$script_dir/../decoder/cdec";
+my $TRAINER = "$script_dir/gibbs_train_plm";
+
+die "Can't find $TRAINER" unless -f $TRAINER;
+die "Can't execute $TRAINER" unless -x $TRAINER;
+
+if (!GetOptions(
+    "decoder=s" => \$DECODER,
+)) { usage(); }
+
+# The decoder is checked only after option parsing so that a --decoder
+# override is the path being validated.
+die "Can't find $DECODER" unless -f $DECODER;
+die "Can't execute $DECODER" unless -x $DECODER;
+if (scalar @ARGV != 2) { usage(); }
+my $INFILE = shift @ARGV;
+my $OUTDIR = shift @ARGV;
+$OUTDIR = File::Spec->rel2abs($OUTDIR);
+print STDERR " Input file: $INFILE\n";
+print STDERR "Output directory: $OUTDIR\n";
+
+# Fail early if the input cannot be read. Three-argument open treats the
+# name strictly as a path, so it can never be parsed as an open mode.
+open my $in_fh, '<', $INFILE or die "Failed to open $INFILE for reading: $!";
+close $in_fh;
+
+# Refuse to reuse anything that already exists at the output path.
+die "Please remove existing directory $OUTDIR\n" if -e $OUTDIR;
+
+# Built-in mkdir: no shell is involved and $! describes the real failure.
+mkdir $OUTDIR or die "Failed to create directory $OUTDIR\n$!";
+
+my $grammar = "$OUTDIR/grammar.gz";
+my $weights = "$OUTDIR/weights";
+
+# List form: each argument reaches the trainer verbatim, so paths containing
+# spaces or shell metacharacters are passed through safely.
+safesystem($TRAINER, '-w', $weights, '-g', $grammar, '-i', $INFILE)
+    or die "Failed to train model!\n";
+
+my $cdecini = "$OUTDIR/cdec.ini";
+open my $ini_fh, '>', $cdecini or die "Failed to open $cdecini for writing: $!";
+
+print {$ini_fh} <<EOINI;
+quiet=true
+formalism=scfg
+grammar=$grammar
+add_pass_through_rules=true
+weights=$weights
+EOINI
+
+# Buffered write errors only surface at close, so check it.
+close $ini_fh or die "Failed to write $cdecini: $!";
+
+print <<EOT;
+
+Model trained successfully. Text can be decoded into phrasal units with
+the following command:
+
+ $DECODER -c $OUTDIR/cdec.ini < FILE.TXT
+
+EOT
+exit(0);
+
+# Print the command-line synopsis to standard output, then terminate the
+# program with exit status 1. Never returns.
+sub usage {
+    my @help_lines = (
+        "Usage: $0 [options] INPUT.TXT OUTPUT-DIRECTORY",
+        "",
+        " Infers a phrasal segmentation model from the tokenized text in INPUT.TXT",
+        " and writes it to OUTPUT-DIRECTORY/ so that it can be applied to other",
+        " text or have its granularity altered.",
+        "",
+    );
+    # Each entry becomes one newline-terminated line of output.
+    print map { "$_\n" } @help_lines;
+    exit(1);
+}
+
+# Run an external command with system(), echoing it to STDERR first.
+#
+# Arguments: the command, handed to system() unchanged -- either a single
+#            string or a program name followed by its argument list.
+# Returns:   true if the command exited with status 0, false otherwise
+#            (a non-zero exit code is also reported on STDERR).
+# Exits the whole program with status 1 if the command could not be started
+# or was terminated by a signal.
+sub safesystem {
+    print STDERR "Executing: @_\n";
+    system(@_);
+    # Capture the wait status once so every branch inspects the same value.
+    my $status = $?;
+    if ($status == -1) {
+        print STDERR "ERROR: Failed to execute: @_\n $!\n";
+        exit(1);
+    }
+    elsif ($status & 127) {
+        # The command text is supplied as a %s argument instead of being
+        # interpolated into the format, so a '%' inside it stays literal.
+        printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
+            "@_", ($status & 127), ($status & 128) ? 'with' : 'without';
+        exit(1);
+    }
+    else {
+        my $exitcode = $status >> 8;
+        print STDERR "Exit code: $exitcode\n" if $exitcode;
+        return ! $exitcode;
+    }
+}
+