From 181c0bd79b0e7db2891047305dc87d20ca04097b Mon Sep 17 00:00:00 2001
From: Patrick Simianer
Date: Fri, 29 May 2015 09:17:43 +0200
Subject: add_ln: add line numbers, filter_features: filter text reps of sparse vectors, split_*: split kbest lists and by line

---
Review notes (commentary only; this text sits between the three-dash
marker and the diffstat, so it belongs neither to the commit message nor
to any hunk):

 * Line structure was rebuilt from a whitespace-collapsed copy. The line
   breaks follow the hunk counts (8 + 12 + 16 + 24 + 14 = 74 added lines).
   The indentation width and the exact whitespace inside the string
   literals " " and " ||| " could not be recovered from that copy and
   are assumed.

 * add_ln: copies STDIN to STDOUT, prefixing each line with a counter
   that starts at 0, followed by a tab.

 * dense_features.txt: twelve feature names, one per line. filter_features
   loads it from the directory the script itself lives in.

 * filter_features: splits each STDIN line on whitespace, keeps only the
   tokens whose text before the first "=" appears in dense_features.txt,
   and prints the kept tokens joined by sep. ReadFile comes from the zipf
   gem; presumably readlines_strip returns the lines without their
   newlines (to be confirmed against zipf).

 * split_kbest: buffers STDIN lines and writes the buffer to DIR/N.gz
   (N counting up from 0) each time the first whitespace-separated field
   of a line converts to 0 via to_i while the buffer is non-empty; the
   remaining buffer is written after the loop.
   To check: to_i also yields 0 for a non-numeric or missing first field,
   and on empty input DIR/0.gz is still written with empty content.
   Whether WriteFile (zipf) compresses because of the ".gz" suffix is not
   visible in this patch.

 * split_lines: writes every STDIN line, unchanged, to DIR/N.src. The
   values src and tgt obtained by splitting on " ||| " are never used;
   confirm whether only the source side was meant to be written.

 * split_kbest and split_lines take DIR from ARGV[0] without checking it.
   When the argument is missing, nil interpolates as an empty string and
   the target path starts with "/".

 add_ln             |  8 ++++++++
 dense_features.txt | 12 ++++++++++++
 filter_features    | 16 ++++++++++++++++
 split_kbest        | 24 ++++++++++++++++++++++++
 split_lines        | 14 ++++++++++++++
 5 files changed, 74 insertions(+)
 create mode 100755 add_ln
 create mode 100644 dense_features.txt
 create mode 100755 filter_features
 create mode 100755 split_kbest
 create mode 100755 split_lines

diff --git a/add_ln b/add_ln
new file mode 100755
index 0000000..35bc44d
--- /dev/null
+++ b/add_ln
@@ -0,0 +1,8 @@
+#!/usr/bin/env ruby
+
+i = 0
+while line = STDIN.gets
+  puts "#{i}\t#{line}"
+  i += 1
+end
+
diff --git a/dense_features.txt b/dense_features.txt
new file mode 100644
index 0000000..daae8d1
--- /dev/null
+++ b/dense_features.txt
@@ -0,0 +1,12 @@
+CountEF
+EgivenFCoherent
+Glue
+IsSingletonF
+IsSingletonFE
+LanguageModel
+LanguageModel_OOV
+MaxLexFgivenE
+MaxLexEgivenF
+PassThrough
+SampleCountF
+WordPenalty
diff --git a/filter_features b/filter_features
new file mode 100755
index 0000000..fc21f6c
--- /dev/null
+++ b/filter_features
@@ -0,0 +1,16 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+dense_features = ReadFile.readlines_strip "#{File.dirname(__FILE__)}/dense_features.txt"
+
+sep = " "
+
+while line = STDIN.gets
+  a = line.strip.split
+  a.reject! { |i|
+    !dense_features.include?(i.split('=').first)
+  }
+  puts a.join sep
+end
+
diff --git a/split_kbest b/split_kbest
new file mode 100755
index 0000000..ab425b0
--- /dev/null
+++ b/split_kbest
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+def write_kbest l, fn
+  f = WriteFile.new fn
+  f.write l.join("")
+  f.close
+end
+
+dir = ARGV[0]
+i = 0
+l = []
+while line = STDIN.gets
+  j = line.split.first.to_i
+  if j == 0 && l.size > 0
+    write_kbest l, "#{dir}/#{i}.gz"
+    l = []
+    i += 1
+  end
+  l << line
+end
+write_kbest l, "#{dir}/#{i}.gz" # last one
+
diff --git a/split_lines b/split_lines
new file mode 100755
index 0000000..14b3a0f
--- /dev/null
+++ b/split_lines
@@ -0,0 +1,14 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+dir = ARGV[0]
+i = 0
+while line = STDIN.gets
+  src, tgt = line.split " ||| "
+  f = WriteFile.new "#{dir}/#{i}.src"
+  f.write line
+  f.close
+  i += 1
+end
+
-- 
cgit v1.2.3