diff options
-rwxr-xr-x | add_ln | 8 | ||||
-rw-r--r-- | dense_features.txt | 12 | ||||
-rwxr-xr-x | filter_features | 16 | ||||
-rwxr-xr-x | split_kbest | 24 | ||||
-rwxr-xr-x | split_lines | 14 |
5 files changed, 74 insertions, 0 deletions
@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby
+
+i = 0
+while line = STDIN.gets
+  puts "#{i}\t#{line}"
+  i += 1
+end
+
diff --git a/dense_features.txt b/dense_features.txt
new file mode 100644
index 0000000..daae8d1
--- /dev/null
+++ b/dense_features.txt
@@ -0,0 +1,12 @@
+CountEF
+EgivenFCoherent
+Glue
+IsSingletonF
+IsSingletonFE
+LanguageModel
+LanguageModel_OOV
+MaxLexFgivenE
+MaxLexEgivenF
+PassThrough
+SampleCountF
+WordPenalty
diff --git a/filter_features b/filter_features
new file mode 100755
index 0000000..fc21f6c
--- /dev/null
+++ b/filter_features
@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+dense_features = ReadFile.readlines_strip "#{File.dirname(__FILE__)}/dense_features.txt"
+
+sep = " "
+
+while line = STDIN.gets
+  a = line.strip.split
+  a.reject! { |i|
+    !dense_features.include?(i.split('=').first)
+  }
+  puts a.join sep
+end
+
diff --git a/split_kbest b/split_kbest
new file mode 100755
index 0000000..ab425b0
--- /dev/null
+++ b/split_kbest
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+def write_kbest l, fn
+  f = WriteFile.new fn
+  f.write l.join("")
+  f.close
+end
+
+dir = ARGV[0]
+i = 0
+l = []
+while line = STDIN.gets
+  j = line.split.first.to_i
+  if j == 0 && l.size > 0
+    write_kbest l, "#{dir}/#{i}.gz"
+    l = []
+    i += 1
+  end
+  l << line
+end
+write_kbest l, "#{dir}/#{i}.gz" # last one
+
diff --git a/split_lines b/split_lines
new file mode 100755
index 0000000..14b3a0f
--- /dev/null
+++ b/split_lines
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+dir = ARGV[0]
+i = 0
+while line = STDIN.gets
+  src, tgt = line.split " ||| "
+  f = WriteFile.new "#{dir}/#{i}.src"
+  f.write line
+  f.close
+  i += 1
+end
+