summary refs log tree commitdiff
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-02-16 00:12:32 +0100
committerPatrick Simianer <p@simianer.de>2014-02-16 00:12:32 +0100
commit81a637ae52d2a1d0bc751b44c193765cdc1091f1 (patch)
tree19708fb523ef32cbeccc4d87133f115650e13280
parent99ae15932eae7e727b74f723107cf42aad80ba3f (diff)
nlp_ruby 0.3
-rwxr-xr-xfirstisupper9
-rwxr-xr-xfirstlower12
-rwxr-xr-xkbest_bleu_oracles2
-rwxr-xr-xkmeans7
-rwxr-xr-xlin_reg (renamed from linreg.rb)11
-rwxr-xr-xper_sentence_bleu30
-rw-r--r--test/lin_reg/x.dat (renamed from test/linreg/x.dat)0
-rw-r--r--test/lin_reg/y.dat (renamed from test/linreg/y.dat)0
-rwxr-xr-xtf-idf12
-rwxr-xr-xtraintestsplit12
10 files changed, 36 insertions, 59 deletions
diff --git a/firstisupper b/firstisupper
deleted file mode 100755
index 516dd8a..0000000
--- a/firstisupper
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'nlp_ruby'
-
-
-while line = STDIN.gets
- puts line.strip if downcase? line[0]
-end
-
diff --git a/firstlower b/firstlower
new file mode 100755
index 0000000..fb63fcd
--- /dev/null
+++ b/firstlower
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby'
+
+
+while line = STDIN.gets
+ line.strip!
+ if line && line!='' && line[0].downcase?
+ puts line
+ end
+end
+
diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles
index 1a41019..adc695e 100755
--- a/kbest_bleu_oracles
+++ b/kbest_bleu_oracles
@@ -23,7 +23,7 @@ def main
debug = cfg[:debug]
n = cfg[:n]
kbest_lists = read_kbest_lists cfg[:kbest_lists]
- references = ReadFile.new(cfg[:references]).readlines_strip
+ references = ReadFile.readlines_strip cfg[:references]
context = get_context kbest_lists, references, n
kbest_lists.each_with_index { |kbest,j|
scores = []
diff --git a/kmeans b/kmeans
index 5c49d9a..02c9c42 100755
--- a/kmeans
+++ b/kmeans
@@ -8,7 +8,8 @@ def read_data fn
data = {}
ReadFile.new(fn).readlines_strip.map{ |i|
a = i.split ' ', 2
- data[a.first] = read_feature_string a.last
+ v = SparseVector.from_kv a.last
+ data[a.first] = v
}
return data
end
@@ -22,7 +23,7 @@ def rand_means_init data, k
prng = Random.new
a = []
0.upto(k-1) do
- a << mean_sparse_vector(data.values.sample k, random:prng)
+ a << SparseVector.mean(data.values.sample k, random:prng)
end
return a
end
@@ -51,7 +52,7 @@ end
def update assignment, data
new_centroids = []
assignment.each_pair { |centroid_index,a|
- new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] })
+ new_centroids << SparseVector.mean(assignment[centroid_index].map{ |i| data[i] })
}
return new_centroids
end
diff --git a/linreg.rb b/lin_reg
index 5c3f584..3546c3e 100755
--- a/linreg.rb
+++ b/lin_reg
@@ -9,11 +9,10 @@ def read_data fn, scale
data = []
while line = f.gets
line.strip!
- v = SparseVector.new
a = []
a << 1.0
tokenize(line).each { |i| a << i.to_f }
- v.from_a(a)
+ v = SparseVector.from_a a
data << v
end
if scale
@@ -22,7 +21,7 @@ def read_data fn, scale
data.each { |i| i[k] /= max }
}
end
- return data
+ return data
end
def main
@@ -36,7 +35,7 @@ def main
end
data = read_data cfg[:input], cfg[:scale_features]
zeros = [0.0]*data[0].size
- t = ReadFile.new(cfg[:output]).readlines.map{ |i| i.to_f }
+ t = ReadFile.readlines(cfg[:output]).map{ |i| i.to_f }
model = SparseVector.new zeros
stop = 0
prev_model = nil
@@ -48,12 +47,12 @@ def main
data.each_with_index { |d,j|
loss = model.dot(d) - t[j]
overall_loss += loss**2
- u += d * loss *(1.0/t.size)
+ u += d * (loss * (1.0/t.size))
}
STDERR.write "#{i} #{overall_loss/data.size}\n" if cfg[:show_loss]
u *= cfg[:learning_rate]
model -= u
- if model.approx_eql? prev_model
+ if model.approx_eql? prev_model
stop += 1
else
stop = 0
diff --git a/per_sentence_bleu b/per_sentence_bleu
index 724b1e1..b7243df 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -4,32 +4,6 @@ require 'nlp_ruby'
require 'trollop'
-# reference-length hack as in (Nakov et al., 2012)
-def brevity_penalty hypothesis, reference, hack=0
- a = tokenize hypothesis; b = tokenize reference
- return 1.0 if a.size>=b.size
- return Math.exp(1.0 - ((b.size.to_f+hack)/a.size));
-end
-
-def per_sentence_bleu hypothesis, reference, n=4, hack=0
- h_ng = {}; r_ng = {}
- (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
- ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
- ngrams(reference, n) {|i| r_ng[i.size] << i}
- m = [n, reference.split.size].min
- weight = 1.0/m
- add = 0.0
- sum = 0
- (1).upto(m) { |i|
- counts_clipped = 0
- counts_sum = h_ng[i].size
- h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
- add = 1.0 if i >= 2
- sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
- }
- return brevity_penalty(hypothesis, reference, hack) * Math.exp(sum)
-end
-
def main
cfg = Trollop::options do
opt :input, "input", :type => :string, :default => '-'
@@ -38,7 +12,7 @@ def main
opt :n, "N", :default => 4
end
- refs = ReadFile.new(cfg[:references]).readlines_strip
+ refs = ReadFile.readlines_strip cfg[:references]
i = -1
input = ReadFile.new cfg[:input]
while line = input.gets
@@ -47,7 +21,7 @@ def main
puts 0.0
next
end
- puts per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack]
+ puts BLEU::per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack]
end
input.close
end
diff --git a/test/linreg/x.dat b/test/lin_reg/x.dat
index 3d93394..3d93394 100644
--- a/test/linreg/x.dat
+++ b/test/lin_reg/x.dat
diff --git a/test/linreg/y.dat b/test/lin_reg/y.dat
index 1f4f963..1f4f963 100644
--- a/test/linreg/y.dat
+++ b/test/lin_reg/y.dat
diff --git a/tf-idf b/tf-idf
index e1502b3..ce3400a 100755
--- a/tf-idf
+++ b/tf-idf
@@ -15,7 +15,7 @@ def main
stopwords = []
if cfg[:filter_stopwords]
- stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i|
+ stopwords = ReadFile.readlines(cfg[:filter_stopwords]).map{ |i|
i.split('|').first.strip
}.reject{ |i| i=='' }
end
@@ -23,17 +23,17 @@ def main
docs = {}
cfg[:documents].each { |i|
if cfg[:one_item_per_line]
- docs[i] = ReadFile.new(i).readlines_strip
+ docs[i] = ReadFile.readlines_strip i
else
- docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip }
+ docs[i] = ReadFile.read(i).split(/\s/).map{ |i| i.strip }
end
}
- idf_values = idf docs
+ idf_values = TFIDF::idf docs
docs.each_pair { |name, words|
- just_tf = tf words, stopwords
- just_tf = ntf(just_tf) if cfg[:ntf]
+ just_tf = TFIDF::tf words, stopwords
+ just_tf = TFIDF::ntf(just_tf) if cfg[:ntf]
tf_idf = {}; tf_idf.default = 0.0
if cfg[:idf]
just_tf.each_pair { |word,f|
diff --git a/traintestsplit b/traintestsplit
index 7cc5bcf..eb71354 100755
--- a/traintestsplit
+++ b/traintestsplit
@@ -13,10 +13,10 @@ cfg = Trollop::options do
end
fn = cfg[:foreign]
fn_ext = fn.split('.').last
-f = ReadFile.new(fn).readlines
+f = ReadFile.readlines fn
en = cfg[:english]
en_ext = en.split('.').last
-e = ReadFile(en).readlines
+e = ReadFile.readlines en
size = cfg[:size]
nlines_f = `wc -l #{fn}`.split()[0].to_i
nlines_e = `wc -l #{en}`.split()[0].to_i
@@ -32,15 +32,15 @@ cfg[:repeat].times {
b = a.sample(size)
ax = a.reject{|j| b.include? j}
`mkdir split_#{i}`
- new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}")
- new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}")
+ new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
ax.each { |j|
new_f.write f[j]
new_e.write e[j]
}
new_f.close; new_e.close
- new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}")
- new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}")
+ new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
+ new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
b.each { |j|
new_f.write f[j]
new_e.write e[j]