From 840d1b84a977b46180350bdb965a118150f238d9 Mon Sep 17 00:00:00 2001
From: redpony
Date: Thu, 24 Jun 2010 04:05:17 +0000
Subject: pipeline scripts

git-svn-id: https://ws10smt.googlecode.com/svn/trunk@17 ec762483-ff6d-05da-a07a-a48fb63a330f
---
 gi/pipeline/filter-for-test-set.pl | 85 ++++++++++++++++++++++++++++++++++++++
 gi/pipeline/local-gi-pipeline.pl   |  2 +-
 gi/pipeline/scripts/sort-by-key.sh |  5 +++
 gi/pipeline/sort-by-key.sh         |  5 ---
 4 files changed, 91 insertions(+), 6 deletions(-)
 create mode 100755 gi/pipeline/filter-for-test-set.pl
 create mode 100755 gi/pipeline/scripts/sort-by-key.sh
 delete mode 100755 gi/pipeline/sort-by-key.sh

diff --git a/gi/pipeline/filter-for-test-set.pl b/gi/pipeline/filter-for-test-set.pl
new file mode 100755
index 00000000..61edaf67
--- /dev/null
+++ b/gi/pipeline/filter-for-test-set.pl
@@ -0,0 +1,85 @@
+#!/usr/bin/perl -w
+# Filter a gzipped grammar down to the rules relevant to a test set and
+# rescore it against the training corpus; the result is written to STDOUT.
+use strict;
+my $SCRIPT_DIR; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $SCRIPT_DIR = dirname(abs_path($0)); push @INC, $SCRIPT_DIR; }
+
+use Getopt::Long "GetOptions";
+use IPC::Run3;
+use File::Temp qw ( tempdir );
+my $TEMP_DIR = tempdir( CLEANUP => 1 );
+
+my $GZIP = 'gzip';
+my $ZCAT = 'gunzip -c';
+
+my $EXTOOLS = "$SCRIPT_DIR/../../extools";
+die "Can't find extools: $EXTOOLS" unless -e $EXTOOLS && -d $EXTOOLS;
+
+my $FILTER = "$EXTOOLS/filter_grammar";
+my $SCORE = "$EXTOOLS/score_grammar";
+
+assert_exec($FILTER, $SCORE);
+
+usage() unless scalar @ARGV == 3;
+my $corpus = $ARGV[0];
+my $grammar = $ARGV[1];
+my $testset = $ARGV[2];
+die "Can't find corpus: $corpus" unless -f $corpus;
+die "Can't find grammar: $grammar" unless -f $grammar;
+die "Can't find test set: $testset" unless -f $testset;
+print STDERR "  CORPUS: $corpus\n";
+print STDERR " GRAMMAR: $grammar\n";
+print STDERR "TEST SET: $testset\n";
+print STDERR "Extracting...\n";
+
+# The pipeline is a single string, so it runs through the shell and the
+# status checked here is that of the last stage ($SCORE).
+safesystem("$ZCAT $grammar | $FILTER $testset | $SCORE $corpus") or die "Failed";
+
+# Print the command-line synopsis and exit with status 1.
+# Written to STDERR because STDOUT is where the filtered grammar goes.
+# NOTE(review): the argument placeholders below were reconstructed - confirm wording.
+sub usage {
+  print STDERR <<EOT;
+
+Usage: $0 corpus.txt grammar.gz test-set.txt > filtered-grammar.scfg.txt
+
+Filter and score a grammar for a test set.
+
+EOT
+  exit 1;
+};
+
+# Die unless every path in @_ exists and is executable.
+sub assert_exec {
+  my @files = @_;
+  for my $file (@files) {
+    die "Can't find $file - did you run make?\n" unless -e $file;
+    die "Can't execute $file" unless -x $file;
+  }
+};
+
+# Run a command via system(), echoing it to STDERR first.
+# Returns true when the command exits with status 0 and false otherwise;
+# exits the script with status 1 if the command cannot be started or is
+# killed by a signal.
+sub safesystem {
+  print STDERR "Executing: @_\n";
+  system(@_);
+  if ($? == -1) {
+    print STDERR "ERROR: Failed to execute: @_\n $!\n";
+    exit(1);
+  }
+  elsif ($? & 127) {
+    printf STDERR "ERROR: Execution of: @_\n died with signal %d, %s coredump\n",
+      ($? & 127), ($? & 128) ? 'with' : 'without';
+    exit(1);
+  }
+  else {
+    my $exitcode = $? >> 8;
+    print STDERR "Exit code: $exitcode\n" if $exitcode;
+    return ! $exitcode;
+  }
+}
+
+
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index e52ad4ec..4707d5a3 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -28,7 +28,7 @@ my $S2L = "$PYPSCRIPTS/spans2labels.py";
 
 my $PYP_TOPICS_TRAIN="$PYPTOOLS/pyp-topics-train";
 
-my $SORT_KEYS = "$SCRIPT_DIR/sort-by-key.sh";
+my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
 my $EXTRACTOR = "$EXTOOLS/extractor";
 my $FILTER = "$EXTOOLS/filter_grammar";
 my $SCORER = "$EXTOOLS/score_grammar";
diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
new file mode 100755
index 00000000..948dd4df
--- /dev/null
+++ b/gi/pipeline/scripts/sort-by-key.sh
@@ -0,0 +1,5 @@
+#!/bin/bash
+
+export LANG=C
+sort -t $'\t' -k 1
+
diff --git a/gi/pipeline/sort-by-key.sh b/gi/pipeline/sort-by-key.sh
deleted file mode 100755
index 948dd4df..00000000
--- a/gi/pipeline/sort-by-key.sh
+++ /dev/null
@@ -1,5 +0,0 @@
-#!/bin/bash
-
-export LANG=C
-sort -t $'\t' -k 1
-
-- 
cgit v1.2.3