diff options
| -rwxr-xr-x | gi/pipeline/filter-for-test-set.pl | 73 | ||||
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 2 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/sort-by-key.sh (renamed from gi/pipeline/sort-by-key.sh) | 0 | 
3 files changed, 74 insertions, 1 deletions
| diff --git a/gi/pipeline/filter-for-test-set.pl b/gi/pipeline/filter-for-test-set.pl
new file mode 100755
index 00000000..61edaf67
--- /dev/null
+++ b/gi/pipeline/filter-for-test-set.pl
@@ -0,0 +1,73 @@
+#!/usr/bin/perl -w
+use strict;
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+# Filters a gzipped grammar for a test set, then scores it against a corpus.
+use Getopt::Long "GetOptions";
+use IPC::Run3;
+use File::Temp qw ( tempdir );
+my $TEMP_DIR = tempdir( CLEANUP => 1 );
+
+my $GZIP = 'gzip';
+my $ZCAT = 'gunzip -c';
+# The filter and score binaries are expected in ../../extools relative to this script.
+my $EXTOOLS = "$SCRIPT_DIR/../../extools";
+die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
+
+my $FILTER = "$EXTOOLS/filter_grammar";
+my $SCORE = "$EXTOOLS/score_grammar";
+# Stop early if either binary is missing or not executable.
+assert_exec($FILTER, $SCORE);
+
+usage() unless scalar @ARGV == 3;
+my $corpus = $ARGV[0];
+my $grammar = $ARGV[1];
+my $testset = $ARGV[2];
+die "Can't find corpus: $corpus" unless -f $corpus;
+die "Can't find grammar: $grammar" unless -f $grammar;
+die "Can't find test set: $testset" unless -f $testset;
+print STDERR "  CORPUS: $corpus\n";
+print STDERR " GRAMMAR: $grammar\n";
+print STDERR "TEST SET: $testset\n";
+print STDERR "Extracting...\n";
+# Pipeline: decompress the grammar, run the filter with the test set, then the scorer with the corpus.
+safesystem("$ZCAT $grammar | $FILTER $testset | $SCORE $corpus") or die "Failed";
+# Print the usage message to STDERR (STDOUT is redirected to the output grammar) and exit 1.
+sub usage {
+  print STDERR <<EOT;
+
+Usage: $0 corpus.src_trg_al grammar.gz test-set.txt > filtered-grammar.scfg.txt
+
+Filter and score a grammar for a test set.
+
+EOT
+  exit 1;
+};
+# Die unless every path passed in exists and is executable.
+sub assert_exec {
+  my @files = @_;
+  for my $file (@files) {
+    die "Can't find $file - did you run make?\n" unless -e $file;
+    die "Can't execute $file" unless -x $file;
+  }
+};
+# Run a command with system(); true on exit code 0, exit(1) if it cannot start or dies on a signal.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+      print STDERR "ERROR: Failed to execute: @_\n  $!\n";
+      exit(1);
+  }
+  elsif ($? & 127) {
+      printf STDERR "ERROR: Execution of: %s\n  died with signal %d, %s coredump\n",
+          "@_", ($? & 127),  ($? & 128) ? 'with' : 'without';
+      exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
+
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index e52ad4ec..4707d5a3 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -28,7 +28,7 @@ my $S2L = "$PYPSCRIPTS/spans2labels.py";  my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train"; -my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh"; +my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";  my $EXTRACTOR = "$EXTOOLS/extractor";  my $FILTER = "$EXTOOLS/filter_grammar";  my $SCORER = "$EXTOOLS/score_grammar"; diff --git a/gi/pipeline/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh index 948dd4df..948dd4df 100755 --- a/gi/pipeline/sort-by-key.sh +++ b/gi/pipeline/scripts/sort-by-key.sh | 
