author     Patrick Simianer <p@simianer.de>    2014-02-16 00:12:32 +0100
committer  Patrick Simianer <p@simianer.de>    2014-02-16 00:12:32 +0100
commit     81a637ae52d2a1d0bc751b44c193765cdc1091f1
tree       19708fb523ef32cbeccc4d87133f115650e13280
parent     99ae15932eae7e727b74f723107cf42aad80ba3f
nlp_ruby 0.3
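This release port touches every script: one-shot reads move from ReadFile instances to the class-level helpers ReadFile.read, ReadFile.readlines and ReadFile.readlines_strip; SparseVector gains real constructors (SparseVector.from_a, SparseVector.from_kv, SparseVector.mean) replacing the free functions read_feature_string and mean_sparse_vector and the mutating v.from_a; the BLEU and TF-IDF helpers now live in the BLEU:: and TFIDF:: namespaces, so per_sentence_bleu drops its local copy of the metric. firstisupper is rewritten as firstlower (the same filter: keep lines whose first character is lowercase), a broken ReadFile(en) call in traintestsplit is fixed along the way, and linreg.rb becomes lin_reg together with its test data.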
-rwxr-xr-x  firstisupper                                            9
-rwxr-xr-x  firstlower                                             12
-rwxr-xr-x  kbest_bleu_oracles                                      2
-rwxr-xr-x  kmeans                                                  7
-rwxr-xr-x  lin_reg (renamed from linreg.rb)                       11
-rwxr-xr-x  per_sentence_bleu                                      30
-rw-r--r--  test/lin_reg/x.dat (renamed from test/linreg/x.dat)     0
-rw-r--r--  test/lin_reg/y.dat (renamed from test/linreg/y.dat)     0
-rwxr-xr-x  tf-idf                                                 12
-rwxr-xr-x  traintestsplit                                         12
10 files changed, 36 insertions, 59 deletions
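The recurring edit below is mechanical, so here is its before/after shape in one place; a minimal sketch, assuming nlp_ruby 0.3 is installed and that the class-level helpers behave like their 0.2 instance counterparts (file names and the key=value syntax for SparseVector.from_kv are illustrative assumptions, the latter inferred from the read_feature_string call it replaces):

    require 'nlp_ruby'

    # 0.2: build an instance even for one-shot reads
    #   refs = ReadFile.new('refs.txt').readlines_strip
    # 0.3: class-level helpers; instances remain for streaming with #gets
    refs  = ReadFile.readlines_strip 'refs.txt'
    input = ReadFile.new 'hyps.txt'

    # 0.2: v = SparseVector.new; v.from_a(a)  /  read_feature_string(s)  /  mean_sparse_vector(vs)
    # 0.3: real constructors
    v = SparseVector.from_a [1.0, 2.0, 3.0]
    w = SparseVector.from_kv 'f1=0.5 f2=1.5'   # feature-string syntax assumed
    m = SparseVector.mean [v, w]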
diff --git a/firstisupper b/firstisupper
deleted file mode 100755
index 516dd8a..0000000
--- a/firstisupper
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'nlp_ruby'
-
-
-while line = STDIN.gets
-  puts line.strip if downcase? line[0]
-end
-
diff --git a/firstlower b/firstlower
new file mode 100755
index 0000000..fb63fcd
--- /dev/null
+++ b/firstlower
@@ -0,0 +1,12 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby'
+
+
+while line = STDIN.gets
+  line.strip!
+  if line && line!='' && line[0].downcase?
+    puts line
+  end
+end
+
diff --git a/kbest_bleu_oracles b/kbest_bleu_oracles
index 1a41019..adc695e 100755
--- a/kbest_bleu_oracles
+++ b/kbest_bleu_oracles
@@ -23,7 +23,7 @@ def main
   debug = cfg[:debug]
   n = cfg[:n]
   kbest_lists = read_kbest_lists cfg[:kbest_lists]
-  references = ReadFile.new(cfg[:references]).readlines_strip
+  references = ReadFile.readlines_strip cfg[:references]
   context = get_context kbest_lists, references, n
   kbest_lists.each_with_index { |kbest,j|
     scores = []
diff --git a/kmeans b/kmeans
--- a/kmeans
+++ b/kmeans
@@ -8,7 +8,8 @@ def read_data fn
   data = {}
   ReadFile.new(fn).readlines_strip.map{ |i|
     a = i.split ' ', 2
-    data[a.first] = read_feature_string a.last
+    v = SparseVector.from_kv a.last
+    data[a.first] = v
   }
   return data
 end
@@ -22,7 +23,7 @@ def rand_means_init data, k
   prng = Random.new
   a = []
   0.upto(k-1) do
-    a << mean_sparse_vector(data.values.sample k, random:prng)
+    a << SparseVector.mean(data.values.sample k, random:prng)
   end
   return a
 end
@@ -51,7 +52,7 @@ end
 def update assignment, data
   new_centroids = []
   assignment.each_pair { |centroid_index,a|
-    new_centroids << mean_sparse_vector(assignment[centroid_index].map{ |i| data[i] })
+    new_centroids << SparseVector.mean(assignment[centroid_index].map{ |i| data[i] })
   }
   return new_centroids
 end
diff --git a/linreg.rb b/lin_reg
--- a/linreg.rb
+++ b/lin_reg
@@ -9,11 +9,10 @@ def read_data fn, scale
   data = []
   while line = f.gets
     line.strip!
-    v = SparseVector.new
     a = []
     a << 1.0
     tokenize(line).each { |i| a << i.to_f }
-    v.from_a(a)
+    v = SparseVector.from_a a
     data << v
   end
   if scale
@@ -22,7 +21,7 @@ def read_data fn, scale
       data.each { |i| i[k] /= max }
     }
   end
-  return data 
+  return data
 end
 
 def main
@@ -36,7 +35,7 @@ def main
   end
   data = read_data cfg[:input], cfg[:scale_features]
   zeros = [0.0]*data[0].size
-  t = ReadFile.new(cfg[:output]).readlines.map{ |i| i.to_f }
+  t = ReadFile.readlines(cfg[:output]).map{ |i| i.to_f }
   model = SparseVector.new zeros
   stop = 0
   prev_model = nil
@@ -48,12 +47,12 @@ def main
     data.each_with_index { |d,j|
       loss = model.dot(d) - t[j]
       overall_loss += loss**2
-      u += d * loss *(1.0/t.size)
+      u += d * (loss * (1.0/t.size))
     }
     STDERR.write "#{i} #{overall_loss/data.size}\n" if cfg[:show_loss]
     u *= cfg[:learning_rate]
     model -= u
-    if model.approx_eql? prev_model 
+    if model.approx_eql? prev_model
       stop += 1
     else
       stop = 0
diff --git a/per_sentence_bleu b/per_sentence_bleu
index 724b1e1..b7243df 100755
--- a/per_sentence_bleu
+++ b/per_sentence_bleu
@@ -4,32 +4,6 @@
 require 'nlp_ruby'
 require 'trollop'
 
-# reference-length hack as in (Nakov et al., 2012)
-def brevity_penalty hypothesis, reference, hack=0
-  a = tokenize hypothesis; b = tokenize reference
-  return 1.0 if a.size>=b.size
-  return Math.exp(1.0 - ((b.size.to_f+hack)/a.size));
-end
-
-def per_sentence_bleu hypothesis, reference, n=4, hack=0
-  h_ng = {}; r_ng = {}
-  (1).upto(n) {|i| h_ng[i] = []; r_ng[i] = []}
-  ngrams(hypothesis, n) {|i| h_ng[i.size] << i}
-  ngrams(reference, n) {|i| r_ng[i.size] << i}
-  m = [n, reference.split.size].min
-  weight = 1.0/m
-  add = 0.0
-  sum = 0
-  (1).upto(m) { |i|
-    counts_clipped = 0
-    counts_sum = h_ng[i].size
-    h_ng[i].uniq.each {|j| counts_clipped += r_ng[i].count(j)}
-    add = 1.0 if i >= 2
-    sum += weight * Math.log((counts_clipped + add)/(counts_sum + add));
-  }
-  return brevity_penalty(hypothesis, reference, hack) * Math.exp(sum)
-end
-
 def main
   cfg = Trollop::options do
     opt :input, "input", :type => :string, :default => '-'
@@ -38,7 +12,7 @@ def main
     opt :n, "N", :default => 4
   end
 
-  refs = ReadFile.new(cfg[:references]).readlines_strip
+  refs = ReadFile.readlines_strip cfg[:references]
   i = -1
   input = ReadFile.new cfg[:input]
   while line = input.gets
@@ -47,7 +21,7 @@ def main
       puts 0.0
       next
     end
-    puts per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack]
+    puts BLEU::per_sentence_bleu line.strip, refs[i], cfg[:n], cfg[:len_hack]
   end
   input.close
 end
diff --git a/test/linreg/x.dat b/test/lin_reg/x.dat
index 3d93394..3d93394 100644
--- a/test/linreg/x.dat
+++ b/test/lin_reg/x.dat
diff --git a/test/linreg/y.dat b/test/lin_reg/y.dat
index 1f4f963..1f4f963 100644
--- a/test/linreg/y.dat
+++ b/test/lin_reg/y.dat
diff --git a/tf-idf b/tf-idf
--- a/tf-idf
+++ b/tf-idf
@@ -15,7 +15,7 @@ def main
 
   stopwords = []
   if cfg[:filter_stopwords]
-    stopwords = ReadFile.new(cfg[:filter_stopwords]).readlines.map{ |i|
+    stopwords = ReadFile.readlines(cfg[:filter_stopwords]).map{ |i|
       i.split('|').first.strip
     }.reject{ |i| i=='' }
   end
@@ -23,17 +23,17 @@ def main
   docs = {}
   cfg[:documents].each { |i|
     if cfg[:one_item_per_line]
-      docs[i] = ReadFile.new(i).readlines_strip
+      docs[i] = ReadFile.readlines_strip i
     else
-      docs[i] = ReadFile.new(i).read.split(/\s/).map{ |i| i.strip }
+      docs[i] = ReadFile.read(i).split(/\s/).map{ |i| i.strip }
     end
   }
 
-  idf_values = idf docs
+  idf_values = TFIDF::idf docs
 
   docs.each_pair { |name, words|
-    just_tf = tf words, stopwords
-    just_tf = ntf(just_tf) if cfg[:ntf]
+    just_tf = TFIDF::tf words, stopwords
+    just_tf = TFIDF::ntf(just_tf) if cfg[:ntf]
     tf_idf = {}; tf_idf.default = 0.0
     if cfg[:idf]
       just_tf.each_pair { |word,f|
diff --git a/traintestsplit b/traintestsplit
index 7cc5bcf..eb71354 100755
--- a/traintestsplit
+++ b/traintestsplit
@@ -13,10 +13,10 @@ cfg = Trollop::options do
 end
 fn = cfg[:foreign]
 fn_ext = fn.split('.').last
-f = ReadFile.new(fn).readlines
+f = ReadFile.readlines fn
 en = cfg[:english]
 en_ext = en.split('.').last
-e = ReadFile(en).readlines
+e = ReadFile.readlines en
 size = cfg[:size]
 nlines_f = `wc -l #{fn}`.split()[0].to_i
 nlines_e = `wc -l #{en}`.split()[0].to_i
@@ -32,15 +32,15 @@ cfg[:repeat].times {
   b = a.sample(size)
   ax = a.reject{|j| b.include? j}
   `mkdir split_#{i}`
-  new_f = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{fn_ext}")
-  new_e = WriteFile.new("split_#{i}/#{prefix}.train.#{i}.#{en_ext}")
+  new_f = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{fn_ext}"
+  new_e = WriteFile.new "split_#{i}/#{prefix}.train.#{i}.#{en_ext}"
   ax.each { |j|
     new_f.write f[j]
     new_e.write e[j]
   }
   new_f.close; new_e.close
-  new_f = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{fn_ext}")
-  new_e = WriteFile.new("split_#{i}/#{prefix}.test.#{i}.#{en_ext}")
+  new_f = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{fn_ext}"
+  new_e = WriteFile.new "split_#{i}/#{prefix}.test.#{i}.#{en_ext}"
   b.each { |j|
     new_f.write f[j]
     new_e.write e[j]
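One substantive fix hides in lin_reg's update: u += d * (loss * (1.0/t.size)) makes explicit that the feature vector is scaled by a single scalar, the per-example gradient contribution of the mean squared error. A dependency-free sanity check of one batch step, with plain arrays standing in for SparseVector and toy data that is purely illustrative:

    #!/usr/bin/env ruby
    # One batch gradient-descent step for least squares, mirroring the lin_reg loop.
    data = [[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]]  # leading 1.0 is the bias feature, as in read_data
    t    = [2.0, 3.0, 4.0]                       # targets (y = 1 + x)
    model = [0.0, 0.0]
    learning_rate = 0.1

    u = [0.0, 0.0]
    data.each_with_index do |d, j|
      loss = model.zip(d).map { |m, x| m * x }.sum - t[j]      # model.dot(d) - t[j]
      d.each_with_index { |x, k| u[k] += x * (loss * (1.0 / t.size)) }
    end
    model = model.zip(u).map { |m, g| m - learning_rate * g }  # u *= learning_rate; model -= u
    p model  # => [0.3, 0.667], moving from [0.0, 0.0] toward the least-squares fit [1.0, 1.0]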