summaryrefslogtreecommitdiff
path: root/gi/pipeline/scripts
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-22 20:52:22 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-07-22 20:52:22 +0000
commit22b3bffd201fe43cea4c4639bcefc3c0e4746518 (patch)
tree423ef9fbbb6c516bf06311fe188ff78c8e31a355 /gi/pipeline/scripts
parent0732fff632e792ae2268e9ef1c9c230624098eb7 (diff)
add additional filtering step
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@368 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pipeline/scripts')
-rwxr-xr-xgi/pipeline/scripts/refilter.pl37
-rwxr-xr-xgi/pipeline/scripts/rekey.pl8
-rwxr-xr-xgi/pipeline/scripts/sort-by-key.sh2
3 files changed, 46 insertions, 1 deletions
diff --git a/gi/pipeline/scripts/refilter.pl b/gi/pipeline/scripts/refilter.pl
new file mode 100755
index 00000000..11a36ebe
--- /dev/null
+++ b/gi/pipeline/scripts/refilter.pl
@@ -0,0 +1,37 @@
+#!/usr/bin/perl -w
+# Reads lines "KEY<TAB>LHS ||| F ||| E ||| FEATS" in which equal keys are adjacent
+# and prints, per key, the $NUM_TRANSLATIONS rules with the lowest score.
+use strict;
+
+my $NUM_TRANSLATIONS = 30;  # maximum number of rules printed per key
+my $pk = '';                # key of the group being collected ('' = none yet)
+my %dict;                   # rule line (key removed) => score, current group only
+while (my $line = <>) {
+  # The key is the first tab-delimited field; die rather than reuse a stale $1.
+  $line =~ s/^([^\t]+)\t// or die "refilter: line $.: missing KEY<TAB> prefix\n";
+  my $key = $1;
+  if ($key ne $pk) {
+    # Test against '' (not truthiness) so a group keyed "0" is not dropped.
+    emit_dict() if $pk ne '';
+    %dict = ();
+    $pk = $key;
+  }
+  my ($lhs, $f, $e, $s) = split / \|\|\| /, $line;
+  defined $s or die "refilter: line $.: expected 4 ' ||| '-separated fields\n";
+  # score = XEF + GenerativeProb / 10; a rule lacking either feature is fatal.
+  $s =~ /XEF=([^ ]+)/ or die "refilter: line $.: no XEF feature\n";
+  my $score = 0 + $1;
+  $s =~ /GenerativeProb=([^ ]+)/ or die "refilter: line $.: no GenerativeProb feature\n";
+  $score += $1 / 10;
+  $dict{"$lhs ||| $f ||| $e ||| $s"} = $score;
+}
+emit_dict();
+
+# Prints at most $NUM_TRANSLATIONS keys of %dict in ascending score order; ties are
+# ordered by rule text so the output does not depend on hash iteration order.
+sub emit_dict {
+  my @ranked = sort { $dict{$a} <=> $dict{$b} or $a cmp $b } keys %dict;
+  splice @ranked, $NUM_TRANSLATIONS if @ranked > $NUM_TRANSLATIONS;
+  print @ranked;
+}
+
diff --git a/gi/pipeline/scripts/rekey.pl b/gi/pipeline/scripts/rekey.pl
new file mode 100755
index 00000000..31eb86b8
--- /dev/null
+++ b/gi/pipeline/scripts/rekey.pl
@@ -0,0 +1,8 @@
+#!/usr/bin/perl -w
+# Prefixes each "LHS ||| F ||| E ||| FEATS" line with F ([X<n>] rewritten to [X]) and a tab.
+use strict;
+while (my $line = <>) {
+  my (undef, $f) = split / \|\|\| /, $line;
+  $f =~ s/\[X[0-9]+\]/[X]/g;
+  print "$f\t$line";
+}
diff --git a/gi/pipeline/scripts/sort-by-key.sh b/gi/pipeline/scripts/sort-by-key.sh
index 948dd4df..7ae33e03 100755
--- a/gi/pipeline/scripts/sort-by-key.sh
+++ b/gi/pipeline/scripts/sort-by-key.sh
@@ -1,5 +1,5 @@
#!/bin/bash
export LANG=C
-sort -t $'\t' -k 1
+sort -t $'\t' -k 1 -T /tmp -S 6000000000b  # 'b' = bytes; GNU sort reads a bare -S number as KiB