summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x add_ln 8
-rw-r--r-- dense_features.txt 12
-rwxr-xr-x filter_features 16
-rwxr-xr-x split_kbest 24
-rwxr-xr-x split_lines 14
5 files changed, 74 insertions, 0 deletions
diff --git a/add_ln b/add_ln
new file mode 100755
index 0000000..35bc44d
--- /dev/null
+++ b/add_ln
@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby
+
+# Prefix each line read from STDIN with its zero-based index and a tab,
+# printing the result to STDOUT.
+STDIN.each_line.with_index do |text, idx|
+ puts "#{idx}\t#{text}"
+end
+
diff --git a/dense_features.txt b/dense_features.txt
new file mode 100644
index 0000000..daae8d1
--- /dev/null
+++ b/dense_features.txt
@@ -0,0 +1,12 @@
+CountEF
+EgivenFCoherent
+Glue
+IsSingletonF
+IsSingletonFE
+LanguageModel
+LanguageModel_OOV
+MaxLexFgivenE
+MaxLexEgivenF
+PassThrough
+SampleCountF
+WordPenalty
diff --git a/filter_features b/filter_features
new file mode 100755
index 0000000..fc21f6c
--- /dev/null
+++ b/filter_features
@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Names to keep, read from dense_features.txt next to this script
+# (readlines_strip is not defined here; presumably zipf strips each line -- confirm).
+dense_features = ReadFile.readlines_strip "#{File.dirname(__FILE__)}/dense_features.txt"
+sep = " "
+
+# For every STDIN line, print only the whitespace-separated tokens whose text
+# before the first '=' is one of the listed names, re-joined with sep.
+STDIN.each_line do |row|
+ kept = row.strip.split.select { |tok| dense_features.include?(tok.split('=').first) }
+ puts kept.join sep
+end
+
diff --git a/split_kbest b/split_kbest
new file mode 100755
index 0000000..ab425b0
--- /dev/null
+++ b/split_kbest
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Writes the strings in l, joined without a separator, to the file named fn
+# via WriteFile (not defined here; presumably from zipf), then closes it.
+def write_kbest l, fn
+ WriteFile.new(fn).tap { |out| out.write l.join("") }.close
+end
+
+# Cut STDIN into groups, starting a new one at each line whose first field is 0
+# under to_i, and write group n to "<dir>/<n>.gz" (dir is the first argument).
+dir, n, group = ARGV[0], 0, []
+STDIN.each_line do |line|
+ if !group.empty? && line.split.first.to_i.zero?
+ write_kbest group, "#{dir}/#{n}.gz"
+ group = []
+ n += 1
+ end
+ group << line
+end
+# Flush the final group; on empty input this still calls write_kbest with [].
+write_kbest group, "#{dir}/#{n}.gz"
+
diff --git a/split_lines b/split_lines
new file mode 100755
index 0000000..14b3a0f
--- /dev/null
+++ b/split_lines
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Write each line of STDIN, unchanged, to its own file "<dir>/<n>.src", where
+# dir is the first argument and n is the zero-based line index.
+# NOTE(review): the whole line is written, including any " ||| <target>" part.
+# A dead `src, tgt = line.split " ||| "` used to sit here, which suggests only
+# the source side may have been intended for the .src file -- confirm.
+dir = ARGV[0]
+STDIN.each_line.with_index do |line, n|
+ WriteFile.new("#{dir}/#{n}.src").tap { |out| out.write line }.close
+end
+