diff options
| author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-22 20:52:22 +0000 | 
|---|---|---|
| committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-22 20:52:22 +0000 | 
| commit | 22b3bffd201fe43cea4c4639bcefc3c0e4746518 (patch) | |
| tree | 423ef9fbbb6c516bf06311fe188ff78c8e31a355 /gi/pipeline/scripts | |
| parent | 0732fff632e792ae2268e9ef1c9c230624098eb7 (diff) | |
add additional filtering step
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@368 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts')
| -rwxr-xr-x | gi/pipeline/scripts/refilter.pl | 37 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/rekey.pl | 8 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/sort-by-key.sh | 2 | 
3 files changed, 46 insertions, 1 deletions
```diff
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
new file mode 100755
index 00000000..11a36ebe
--- /dev/null
+++ b/gi/pipeline/scripts/refilter.pl
@@ -0,0 +1,37 @@
+#!/usr/bin/perl -w
+use strict;
+
+my $NUM_TRANSLATIONS = 30;
+my $pk = '';
+my %dict;
+while(<>) {
+  s/^(.+)\t//;
+  my $key = $1;
+  if ($key ne $pk) {
+    if ($pk) {
+      emit_dict();
+    }
+    %dict = ();
+    $pk = $key;
+  }
+  my ($lhs, $f, $e, $s) = split / \|\|\| /;
+  my $score = 0;
+  if ($s =~ /XEF=([^ ]+)/) {
+    $score += $1;
+  } else { die; }
+  if ($s =~ /GenerativeProb=([^ ]+)/) {
+    $score += ($1 / 10);
+  } else { die; }
+  $dict{"$lhs ||| $f ||| $e ||| $s"} = $score;
+}
+emit_dict();
+
+sub emit_dict {
+  my $cc = 0;
+  for my $k (sort { $dict{$a} <=> $dict{$b} } keys %dict) {
+    print "$k";
+    $cc++;
+    if ($cc >= $NUM_TRANSLATIONS) { last; }
+  }
+}
+
diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl
new file mode 100755
index 00000000..31eb86b8
--- /dev/null
+++ b/gi/pipeline/scripts/rekey.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl
+
+while(<>) {
+  my ($lhs, $f, $e, $s) = split / \|\|\| /;
+  $f =~ s/\[X[0-9]+\]/\[X\]/g;
+  print "$f\t$_";
+}
+
diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
index 948dd4df..7ae33e03 100755
--- a/gi/pipeline/scripts/sort-by-key.sh
+++ b/gi/pipeline/scripts/sort-by-key.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
 export LANG=C
-sort -t $'\t' -k 1
+sort -t $'\t' -k 1 -T /tmp -S 6000000000
```
