From 81a637ae52d2a1d0bc751b44c193765cdc1091f1 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sun, 16 Feb 2014 00:12:32 +0100 Subject: nlp_ruby 0.3 --- firstisupper | 9 ------- firstlower | 12 +++++++++ kbest_bleu_oracles | 2 +- kmeans | 7 +++--- lin_reg | 72 +++++++++++++++++++++++++++++++++++++++++++++++++++++ linreg.rb | 73 ------------------------------------------------------ per_sentence_bleu | 30 ++-------------------- test/lin_reg/x.dat | 50 +++++++++++++++++++++++++++++++++++++ test/lin_reg/y.dat | 50 +++++++++++++++++++++++++++++++++++++ test/linreg/x.dat | 50 ------------------------------------- test/linreg/y.dat | 50 ------------------------------------- tf-idf | 12 ++++----- traintestsplit | 12 ++++----- 13 files changed, 203 insertions(+), 226 deletions(-) delete mode 100755 firstisupper create mode 100755 firstlower create mode 100755 lin_reg delete mode 100755 linreg.rb create mode 100644 test/lin_reg/x.dat create mode 100644 test/lin_reg/y.dat delete mode 100644 test/linreg/x.dat delete mode 100644 test/linreg/y.dat diff --git a/firstisupper b/firstisupper deleted file mode 100755 index 516dd8a..0000000 --- a/firstisupper +++ /dev/null @@ -1,9 +0,0 @@ -#!/usr/bin/env ruby - -require 'nlp_ruby' - - -while line = STDIN.gets - puts line.strip if downcase? line[0] -end - diff --git a/firstlower b/firstlower new file mode 100755 index 0000000..fb63fcd --- /dev/null +++ b/firstlower @@ -0,0 +1,12 @@ +#!/usr/bin/env ruby + +require 'nlp_ruby' + + +while line = STDIN.gets + line.strip! + if line && line!='' && line[0].downcase? + puts line + end +end + diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles index 1a41019..adc695e 100755 --- a/kbest_bleu_oracles +++ b/kbest_bleu_oracles @@ -23,7 +23,7 @@ def main debug = cfg[:debug] n = cfg[:n] kbest_lists = read_kbest_lists cfg[:kbest_lists] - references = ReadFile.new(cfg[:references]).readlines_strip + references = ReadFile.readlines_strip cfg[:references] context = get_context kbest_lists, references, n kbest_lists.each_with_index { |kbest,j| scores = [] diff --git a/kmeans b/kmeans index 5c49d9a..02c9c42 100755 --- a/kmeans +++ b/kmeans @@ -8,7 +8,8 @@ def read_data fn data = {} ReadFile.new(fn).readlines_strip.map{ |i| a = i.split ' ', 2 - data[a.first] = read_feature_string a.last + v = SparseVector.from_kv a.last + data[a.first] = v } return data end @@ -22,7 +23,7 @@ def rand_means_init data, k prng = Random.new a = [] 0.upto(k-1) do - a << mean_sparse_vector(data.values.sample k, random:prng) + a << SparseVector.mean(data.values.sample k, random:prng) end return a end @@ -51,7 +52,7 @@ end def update assignment, data new_centroids = [] assignment.each_pair { |centroid_index,a| - new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] }) + new_centroids << SparseVector.mean(assignment[centroid_index].map{ |i| data[i] }) } return new_centroids end diff --git a/lin_reg b/lin_reg new file mode 100755 index 0000000..3546c3e --- /dev/null +++ b/lin_reg @@ -0,0 +1,72 @@ +#!/usr/bin/env ruby + +require 'nlp_ruby' +require 'trollop' + + +def read_data fn, scale + f = ReadFile.new fn + data = [] + while line = f.gets + line.strip! 
+ a = [] + a << 1.0 + tokenize(line).each { |i| a << i.to_f } + v = SparseVector.from_a a + data << v + end + if scale + data.map { |i| i.keys }.flatten.uniq.each { |k| + max = data.map { |i| i[k] }.max + data.each { |i| i[k] /= max } + } + end + return data +end + +def main + cfg = Trollop::options do + opt :input, "input data", :type => :string, :required => true + opt :output, "output data", :type => :string, :required => true + opt :learning_rate, "learning rate", :type => :float, :default => 0.07 + opt :stop, "stopping criterion", :type => :int, :default => 100 + opt :scale_features,"scale features", :type => :bool, :default => false, :short => '-t' + opt :show_loss, "show loss per iter", :type => :bool, :default => false + end + data = read_data cfg[:input], cfg[:scale_features] + zeros = [0.0]*data[0].size + t = ReadFile.readlines(cfg[:output]).map{ |i| i.to_f } + model = SparseVector.new zeros + stop = 0 + prev_model = nil + i = 0 + while true + i += 1 + u = SparseVector.new zeros + overall_loss = 0.0 + data.each_with_index { |d,j| + loss = model.dot(d) - t[j] + overall_loss += loss**2 + u += d * (loss * (1.0/t.size)) + } + STDERR.write "#{i} #{overall_loss/data.size}\n" if cfg[:show_loss] + u *= cfg[:learning_rate] + model -= u + if model.approx_eql? prev_model + stop += 1 + else + stop = 0 + end + break if stop==cfg[:stop] + prev_model = model + end + tss = t.map{ |y| (y-t.mean)**2 }.sum + j = -1 + rss = t.map{ |y| j+=1; (y-model.dot(data[j]))**2 }.sum + STDERR.write "ran for #{i} iterations\n R^2=#{1-(rss/tss)}\n" + puts model.to_s +end + + +main + diff --git a/linreg.rb b/linreg.rb deleted file mode 100755 index 5c3f584..0000000 --- a/linreg.rb +++ /dev/null @@ -1,73 +0,0 @@ -#!/usr/bin/env ruby - -require 'nlp_ruby' -require 'trollop' - - -def read_data fn, scale - f = ReadFile.new fn - data = [] - while line = f.gets - line.strip! - v = SparseVector.new - a = [] - a << 1.0 - tokenize(line).each { |i| a << i.to_f } - v.from_a(a) - data << v - end - if scale - data.map { |i| i.keys }.flatten.uniq.each { |k| - max = data.map { |i| i[k] }.max - data.each { |i| i[k] /= max } - } - end - return data -end - -def main - cfg = Trollop::options do - opt :input, "input data", :type => :string, :required => true - opt :output, "output data", :type => :string, :required => true - opt :learning_rate, "learning rate", :type => :float, :default => 0.07 - opt :stop, "stopping criterion", :type => :int, :default => 100 - opt :scale_features,"scale features", :type => :bool, :default => false, :short => '-t' - opt :show_loss, "show loss per iter", :type => :bool, :default => false - end - data = read_data cfg[:input], cfg[:scale_features] - zeros = [0.0]*data[0].size - t = ReadFile.new(cfg[:output]).readlines.map{ |i| i.to_f } - model = SparseVector.new zeros - stop = 0 - prev_model = nil - i = 0 - while true - i += 1 - u = SparseVector.new zeros - overall_loss = 0.0 - data.each_with_index { |d,j| - loss = model.dot(d) - t[j] - overall_loss += loss**2 - u += d * loss *(1.0/t.size) - } - STDERR.write "#{i} #{overall_loss/data.size}\n" if cfg[:show_loss] - u *= cfg[:learning_rate] - model -= u - if model.approx_eql? 
prev_model - stop += 1 - else - stop = 0 - end - break if stop==cfg[:stop] - prev_model = model - end - tss = t.map{ |y| (y-t.mean)**2 }.sum - j = -1 - rss = t.map{ |y| j+=1; (y-model.dot(data[j]))**2 }.sum - STDERR.write "ran for #{i} iterations\n R^2=#{1-(rss/tss)}\n" - puts model.to_s -end - - -main - diff --git a/per_sentence_bleu b/per_sentence_bleu index 724b1e1..b7243df 100755 --- a/per_sentence_bleu +++ b/per_sentence_bleu @@ -4,32 +4,6 @@ require 'nlp_ruby' require 'trollop' -# reference-length hack as in (Nakov et al., 2012) -def brevity_penalty hypothesis, reference, hack=0 - a = tokenize hypothesis; b = tokenize reference - return 1.0 if a.size>=b.size - return Math.exp(1.0 - ((b.size.to_f+hack)/a.size)); -end - -def per_sentence_bleu hypothesis, reference, n=4, hack=0 - h_ng = {}; r_ng = {} - (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []} - ngrams(hypothesis, n) {|i| h_ng[i.size] << i} - ngrams(reference, n) {|i| r_ng[i.size] << i} - m = [n, reference.split.size].min - weight = 1.0/m - add = 0.0 - sum = 0 - (1).upto(m) { |i| - counts_clipped = 0 - counts_sum = h_ng[i].size - h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)} - add = 1.0 if i >= 2 - sum += weight * Math.log((counts_clipped + add)/(counts_sum + add)); - } - return brevity_penalty(hypothesis, reference, hack) * Math.exp(sum) -end - def main cfg = Trollop::options do opt :input, "input", :type => :string, :default => '-' @@ -38,7 +12,7 @@ def main opt :n, "N", :default => 4 end - refs = ReadFile.new(cfg[:references]).readlines_strip + refs = ReadFile.readlines_strip cfg[:references] i = -1 input = ReadFile.new cfg[:input] while line = input.gets @@ -47,7 +21,7 @@ def main puts 0.0 next end - puts per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack] + puts BLEU::per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack] end input.close end diff --git a/test/lin_reg/x.dat b/test/lin_reg/x.dat new file mode 100644 index 0000000..3d93394 --- /dev/null +++ b/test/lin_reg/x.dat @@ -0,0 +1,50 @@ + 2.0658746e+00 + 2.3684087e+00 + 2.5399929e+00 + 2.5420804e+00 + 2.5490790e+00 + 2.7866882e+00 + 2.9116825e+00 + 3.0356270e+00 + 3.1146696e+00 + 3.1582389e+00 + 3.3275944e+00 + 3.3793165e+00 + 3.4122006e+00 + 3.4215823e+00 + 3.5315732e+00 + 3.6393002e+00 + 3.6732537e+00 + 3.9256462e+00 + 4.0498646e+00 + 4.2483348e+00 + 4.3440052e+00 + 4.3826531e+00 + 4.4230602e+00 + 4.6102443e+00 + 4.6881183e+00 + 4.9777333e+00 + 5.0359967e+00 + 5.0684536e+00 + 5.4161491e+00 + 5.4395623e+00 + 5.4563207e+00 + 5.5698458e+00 + 5.6015729e+00 + 5.6877617e+00 + 5.7215602e+00 + 5.8538914e+00 + 6.1978026e+00 + 6.3510941e+00 + 6.4797033e+00 + 6.7383791e+00 + 6.8637686e+00 + 7.0223387e+00 + 7.0782373e+00 + 7.1514232e+00 + 7.4664023e+00 + 7.5973874e+00 + 7.7440717e+00 + 7.7729662e+00 + 7.8264514e+00 + 7.9306356e+00 diff --git a/test/lin_reg/y.dat b/test/lin_reg/y.dat new file mode 100644 index 0000000..1f4f963 --- /dev/null +++ b/test/lin_reg/y.dat @@ -0,0 +1,50 @@ + 7.7918926e-01 + 9.1596757e-01 + 9.0538354e-01 + 9.0566138e-01 + 9.3898890e-01 + 9.6684740e-01 + 9.6436824e-01 + 9.1445939e-01 + 9.3933944e-01 + 9.6074971e-01 + 8.9837094e-01 + 9.1209739e-01 + 9.4238499e-01 + 9.6624578e-01 + 1.0526500e+00 + 1.0143791e+00 + 9.5969426e-01 + 9.6853716e-01 + 1.0766065e+00 + 1.1454978e+00 + 1.0340625e+00 + 1.0070009e+00 + 9.6683648e-01 + 1.0895919e+00 + 1.0634462e+00 + 1.1237239e+00 + 1.0323374e+00 + 1.0874452e+00 + 1.0702988e+00 + 1.1606493e+00 + 1.0778037e+00 + 1.1069758e+00 + 1.0971875e+00 + 1.1648603e+00 + 1.1411796e+00 + 1.0844156e+00 + 
1.1252493e+00 + 1.1168341e+00 + 1.1970789e+00 + 1.2069462e+00 + 1.1251046e+00 + 1.1235672e+00 + 1.2132829e+00 + 1.2522652e+00 + 1.2497065e+00 + 1.1799706e+00 + 1.1897299e+00 + 1.3029934e+00 + 1.2601134e+00 + 1.2562267e+00 diff --git a/test/linreg/x.dat b/test/linreg/x.dat deleted file mode 100644 index 3d93394..0000000 --- a/test/linreg/x.dat +++ /dev/null @@ -1,50 +0,0 @@ - 2.0658746e+00 - 2.3684087e+00 - 2.5399929e+00 - 2.5420804e+00 - 2.5490790e+00 - 2.7866882e+00 - 2.9116825e+00 - 3.0356270e+00 - 3.1146696e+00 - 3.1582389e+00 - 3.3275944e+00 - 3.3793165e+00 - 3.4122006e+00 - 3.4215823e+00 - 3.5315732e+00 - 3.6393002e+00 - 3.6732537e+00 - 3.9256462e+00 - 4.0498646e+00 - 4.2483348e+00 - 4.3440052e+00 - 4.3826531e+00 - 4.4230602e+00 - 4.6102443e+00 - 4.6881183e+00 - 4.9777333e+00 - 5.0359967e+00 - 5.0684536e+00 - 5.4161491e+00 - 5.4395623e+00 - 5.4563207e+00 - 5.5698458e+00 - 5.6015729e+00 - 5.6877617e+00 - 5.7215602e+00 - 5.8538914e+00 - 6.1978026e+00 - 6.3510941e+00 - 6.4797033e+00 - 6.7383791e+00 - 6.8637686e+00 - 7.0223387e+00 - 7.0782373e+00 - 7.1514232e+00 - 7.4664023e+00 - 7.5973874e+00 - 7.7440717e+00 - 7.7729662e+00 - 7.8264514e+00 - 7.9306356e+00 diff --git a/test/linreg/y.dat b/test/linreg/y.dat deleted file mode 100644 index 1f4f963..0000000 --- a/test/linreg/y.dat +++ /dev/null @@ -1,50 +0,0 @@ - 7.7918926e-01 - 9.1596757e-01 - 9.0538354e-01 - 9.0566138e-01 - 9.3898890e-01 - 9.6684740e-01 - 9.6436824e-01 - 9.1445939e-01 - 9.3933944e-01 - 9.6074971e-01 - 8.9837094e-01 - 9.1209739e-01 - 9.4238499e-01 - 9.6624578e-01 - 1.0526500e+00 - 1.0143791e+00 - 9.5969426e-01 - 9.6853716e-01 - 1.0766065e+00 - 1.1454978e+00 - 1.0340625e+00 - 1.0070009e+00 - 9.6683648e-01 - 1.0895919e+00 - 1.0634462e+00 - 1.1237239e+00 - 1.0323374e+00 - 1.0874452e+00 - 1.0702988e+00 - 1.1606493e+00 - 1.0778037e+00 - 1.1069758e+00 - 1.0971875e+00 - 1.1648603e+00 - 1.1411796e+00 - 1.0844156e+00 - 1.1252493e+00 - 1.1168341e+00 - 1.1970789e+00 - 1.2069462e+00 - 1.1251046e+00 - 1.1235672e+00 - 1.2132829e+00 - 1.2522652e+00 - 1.2497065e+00 - 1.1799706e+00 - 1.1897299e+00 - 1.3029934e+00 - 1.2601134e+00 - 1.2562267e+00 diff --git a/tf-idf b/tf-idf index e1502b3..ce3400a 100755 --- a/tf-idf +++ b/tf-idf @@ -15,7 +15,7 @@ def main stopwords = [] if cfg[:filter_stopwords] - stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i| + stopwords = ReadFile.readlines(cfg[:filter_stopwords]).map{ |i| i.split('|').first.strip }.reject{ |i| i=='' } end @@ -23,17 +23,17 @@ def main docs = {} cfg[:documents].each { |i| if cfg[:one_item_per_line] - docs[i] = ReadFile.new(i).readlines_strip + docs[i] = ReadFile.readlines_strip i else - docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip } + docs[i] = ReadFile.read(i).split(/\s/).map{ |i| i.strip } end } - idf_values = idf docs + idf_values = TFIDF::idf docs docs.each_pair { |name, words| - just_tf = tf words, stopwords - just_tf = ntf(just_tf) if cfg[:ntf] + just_tf = TFIDF::tf words, stopwords + just_tf = TFIDF::ntf(just_tf) if cfg[:ntf] tf_idf = {}; tf_idf.default = 0.0 if cfg[:idf] just_tf.each_pair { |word,f| diff --git a/traintestsplit b/traintestsplit index 7cc5bcf..eb71354 100755 --- a/traintestsplit +++ b/traintestsplit @@ -13,10 +13,10 @@ cfg = Trollop::options do end fn = cfg[:foreign] fn_ext = fn.split('.').last -f = ReadFile.new(fn).readlines +f = ReadFile.readlines fn en = cfg[:english] en_ext = en.split('.').last -e = ReadFile(en).readlines +e = ReadFile.readlines en size = cfg[:size] nlines_f = `wc -l #{fn}`.split()[0].to_i nlines_e = `wc -l 
#{en}`.split()[0].to_i @@ -32,15 +32,15 @@ cfg[:repeat].times { b = a.sample(size) ax = a.reject{|j| b.include? j} `mkdir split_#{i}` - new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}") - new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}") + new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}" + new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}" ax.each { |j| new_f.write f[j] new_e.write e[j] } new_f.close; new_e.close - new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}") - new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}") + new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}" + new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}" b.each { |j| new_f.write f[j] new_e.write e[j] -- cgit v1.2.3
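
Note (not part of the patch): the lin_reg script introduced above fits a linear regression by batch gradient descent over nlp_ruby SparseVectors and reports R^2 on STDERR. The snippet below is a minimal standalone sketch of that same update rule in plain Ruby — dense arrays and the small dot helper are stand-ins for the SparseVector API, the handful of data points only loosely echo the first rows of test/lin_reg/{x,y}.dat, and a fixed iteration count replaces lin_reg's convergence-based stopping criterion; treat every value here as illustrative.

#!/usr/bin/env ruby
# Standalone sketch of the batch gradient-descent update used by lin_reg.
# Plain Ruby only; no nlp_ruby. Data, learning rate and iteration count
# are illustrative assumptions, not taken from the patch.

# rows carry a leading 1.0 bias term, as lin_reg's read_data does
x = [[1.0, 2.07], [1.0, 2.37], [1.0, 2.54], [1.0, 2.55]]
y = [0.78, 0.92, 0.91, 0.91]

def dot a, b
  a.zip(b).map { |u, v| u * v }.sum
end

learning_rate = 0.07           # lin_reg's default --learning_rate
model = [0.0] * x.first.size   # start from the zero vector

1000.times do
  # accumulate the gradient averaged over all examples (batch update)
  u = [0.0] * model.size
  x.each_with_index do |row, j|
    loss = dot(model, row) - y[j]
    row.each_with_index { |v, k| u[k] += v * loss / y.size }
  end
  model = model.each_with_index.map { |w, k| w - learning_rate * u[k] }
end

# report R^2 = 1 - RSS/TSS, as lin_reg prints on STDERR
mean_y = y.sum / y.size
tss = y.map { |v| (v - mean_y)**2 }.sum
rss = y.each_with_index.map { |v, j| (v - dot(model, x[j]))**2 }.sum
puts "model: #{model.inspect}  R^2=#{1 - rss / tss}"

The actual script differs in two ways worth noting: it stops once the model stays approximately unchanged (SparseVector#approx_eql?) for --stop consecutive iterations rather than after a fixed number of passes, and with --scale_features it divides every feature by its maximum value before training.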