summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-x gi/pipeline/filter-for-test-set.pl 73
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 2
-rwxr-xr-x gi/pipeline/scripts/sort-by-key.sh (renamed from gi/pipeline/sort-by-key.sh) 0
3 files changed, 74 insertions, 1 deletions
diff --git a/gi/pipeline/filter-for-test-set.pl b/gi/pipeline/filter-for-test-set.pl
new file mode 100755
index 00000000..61edaf67
--- /dev/null
+++ b/gi/pipeline/filter-for-test-set.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+use strict;
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+
+use Getopt::Long "GetOptions";
+use IPC::Run3;
+use File::Temp qw ( tempdir );
+my $TEMP_DIR = tempdir( CLEANUP => 1 );    # scratch directory, removed at exit (CLEANUP => 1); not referenced again in the visible script
+
+my $GZIP = 'gzip';    # NOTE(review): not used anywhere in the visible script
+my $ZCAT = 'gunzip -c';    # command used below to decompress the grammar onto a pipe
+
+my $EXTOOLS = "$SCRIPT_DIR/../../extools";    # extools directory, located two levels above this script's directory
+die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;    # abort unless it exists and is a directory
+
+my $FILTER = "$EXTOOLS/filter_grammar";    # middle stage of the pipeline run below
+my $SCORE = "$EXTOOLS/score_grammar";    # final stage of the pipeline run below
+
+assert_exec($FILTER, $SCORE);    # check both tools up front, before any arguments are processed
+
+usage() unless scalar @ARGV == 3;    # exactly three positional arguments: corpus, grammar, test set
+my $corpus = $ARGV[0];    # handed to the scorer
+my $grammar = $ARGV[1];    # decompressed with $ZCAT and streamed into the filter
+my $testset = $ARGV[2];    # handed to the filter
+die "Can't find corpus: $corpus" unless -f $corpus;    # each input must be an existing plain file
+die "Can't find grammar: $grammar" unless -f $grammar;    # message names the file that was actually checked
+die "Can't find test set: $testset" unless -f $testset;
+print STDERR " CORPUS: $corpus\n";    # progress goes to STDERR; STDOUT is left to the pipeline's output
+print STDERR " GRAMMAR: $grammar\n";    # each label reports its own variable
+print STDERR "TEST SET: $testset\n";
+print STDERR "Extracting...\n";
+
+safesystem("$ZCAT $grammar | $FILTER $testset | $SCORE $corpus") or die "Failed";    # NOTE(review): paths are interpolated unquoted into a shell pipeline; names with spaces or shell metacharacters will break it
+
+sub usage {    # Print the command-line synopsis to STDERR, then exit with status 1. Takes no arguments; never returns.
+    print STDERR <<EOT;    # STDERR, because the synopsis below shows STDOUT being redirected into the output grammar file
+
+Usage: $0 corpus.src_trg_al grammar.gz test-set.txt > filtered-grammar.scfg.txt
+
+Filter and score a grammar for a test set.
+
+EOT
+    exit 1;    # non-zero status: usage() is only reached on a wrong argument count
+};
+
+sub assert_exec {    # Die unless every path passed in exists and is executable; returns normally otherwise.
+    my @files = @_;    # paths of the external tools to verify
+    for my $file (@files) {
+        die "Can't find $file - did you run make?\n" unless -e $file;    # missing entirely: most likely not built yet
+        die "Can't execute $file" unless -x $file;    # -x, not -e: existence is already established by the line above
+    }
+};
+
+sub safesystem {    # Run a command via system(@_), logging to STDERR. Returns true iff it exited with status 0; exits the script with status 1 if it could not be started or died from a signal.
+    print STDERR "Executing: @_\n";
+    system(@_);
+    if ($? == -1) {    # -1: the command could not be started; $! holds the reason
+        print STDERR "ERROR: Failed to execute: @_\n $!\n";
+        exit(1);
+    }
+    elsif ($? & 127) {    # low seven bits of $? hold the terminating signal; bit 128 flags a core dump
+        printf STDERR "ERROR: Execution of: %s\n died with signal %d, %s coredump\n",
+            "@_", ($? & 127), ($? & 128) ? 'with' : 'without';    # command passed as %s data so a '%' inside it is not read as a format directive
+        exit(1);
+    }
+    else {
+        my $exitcode = $? >> 8;    # high byte of $? is the command's own exit status
+        print STDERR "Exit code: $exitcode\n" if $exitcode;
+        return ! $exitcode;
+    }
+}
+
+
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index e52ad4ec..4707d5a3 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -28,7 +28,7 @@ my $S2L = "$PYPSCRIPTS/spans2labels.py";
my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train";
-my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh";
+my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $FILTER = "$EXTOOLS/filter_grammar";
my $SCORER = "$EXTOOLS/score_grammar";
diff --git a/gi/pipeline/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
index 948dd4df..948dd4df 100755
--- a/gi/pipeline/sort-by-key.sh
+++ b/gi/pipeline/scripts/sort-by-key.sh