From 4059a5d048cb0f72872c98073ef1ce120a30d78c Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Mon, 16 Jun 2014 17:44:07 +0200
Subject: renaming to zipf

---
 lib/nlp_ruby.rb              |  18 ----
 lib/nlp_ruby/SparseVector.rb | 172 ------------------------------------
 lib/nlp_ruby/Translation.rb  |  72 ---------------
 lib/nlp_ruby/bleu.rb         | 130 ---------------------------
 lib/nlp_ruby/dag.rb          | 205 -------------------------------------------
 lib/nlp_ruby/fileutil.rb     |  88 -------------------
 lib/nlp_ruby/grammar.rb      | 122 -------------------------
 lib/nlp_ruby/hg.rb           | 173 ------------------------------------
 lib/nlp_ruby/misc.rb         | 114 ------------------------
 lib/nlp_ruby/semirings.rb    |  79 -----------------
 lib/nlp_ruby/stringutil.rb   |  22 -----
 lib/nlp_ruby/tfidf.rb        |  38 --------
 lib/zipf.rb                  |  18 ++++
 lib/zipf/SparseVector.rb     | 172 ++++++++++++++++++++++++++++++++++++
 lib/zipf/Translation.rb      |  72 +++++++++++++++
 lib/zipf/bleu.rb             | 130 +++++++++++++++++++++++++++
 lib/zipf/dag.rb              | 205 +++++++++++++++++++++++++++++++++++++++++++
 lib/zipf/fileutil.rb         |  88 +++++++++++++++++++
 lib/zipf/grammar.rb          | 123 ++++++++++++++++++++++++++
 lib/zipf/hg.rb               | 173 ++++++++++++++++++++++++++++++++++++
 lib/zipf/misc.rb             | 114 ++++++++++++++++++++++++
 lib/zipf/semirings.rb        |  81 +++++++++++++++++
 lib/zipf/stringutil.rb       |  22 +++++
 lib/zipf/tfidf.rb            |  38 ++++++++
 24 files changed, 1236 insertions(+), 1233 deletions(-)
 delete mode 100755 lib/nlp_ruby.rb
 delete mode 100644 lib/nlp_ruby/SparseVector.rb
 delete mode 100644 lib/nlp_ruby/Translation.rb
 delete mode 100644 lib/nlp_ruby/bleu.rb
 delete mode 100644 lib/nlp_ruby/dag.rb
 delete mode 100644 lib/nlp_ruby/fileutil.rb
 delete mode 100644 lib/nlp_ruby/grammar.rb
 delete mode 100644 lib/nlp_ruby/hg.rb
 delete mode 100644 lib/nlp_ruby/misc.rb
 delete mode 100644 lib/nlp_ruby/semirings.rb
 delete mode 100644 lib/nlp_ruby/stringutil.rb
 delete mode 100644 lib/nlp_ruby/tfidf.rb
 create mode 100755 lib/zipf.rb
 create mode 100644 lib/zipf/SparseVector.rb
 create mode 100644 lib/zipf/Translation.rb
 create mode 100644 lib/zipf/bleu.rb
 create mode 100644 lib/zipf/dag.rb
 create mode 100644 lib/zipf/fileutil.rb
 create mode 100644 lib/zipf/grammar.rb
 create mode 100644 lib/zipf/hg.rb
 create mode 100644 lib/zipf/misc.rb
 create mode 100644 lib/zipf/semirings.rb
 create mode 100644 lib/zipf/stringutil.rb
 create mode 100644 lib/zipf/tfidf.rb

(limited to 'lib')

diff --git a/lib/nlp_ruby.rb b/lib/nlp_ruby.rb
deleted file mode 100755
index 0e26e97..0000000
--- a/lib/nlp_ruby.rb
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'nlp_ruby/stringutil'
-require 'nlp_ruby/fileutil'
-require 'nlp_ruby/SparseVector'
-require 'nlp_ruby/tfidf'
-require 'nlp_ruby/Translation'
-require 'nlp_ruby/dag'
-require 'nlp_ruby/semirings'
-require 'nlp_ruby/bleu'
-require 'nlp_ruby/misc'
-require 'nlp_ruby/hg'
-require 'nlp_ruby/grammar'
-
-STDIN.set_encoding  'utf-8'
-STDOUT.set_encoding 'utf-8'
-STDERR.set_encoding 'utf-8'
-
diff --git a/lib/nlp_ruby/SparseVector.rb b/lib/nlp_ruby/SparseVector.rb
deleted file mode 100644
index 3096412..0000000
--- a/lib/nlp_ruby/SparseVector.rb
+++ /dev/null
@@ -1,172 +0,0 @@
-class SparseVector < Hash
-
-  def initialize arg=nil
-    super
-    self.default = 0
-    if arg.is_a? Array
-      from_a arg
-    end
-  end
-
-  def from_a a
-    a.each_with_index { |i,j| self[j] = i }
-  end
-
-  def self.from_a a
-    v = SparseVector.new
-    v.from_a a
-    return v
-  end
-
-  def from_h h
-    h.each_pair { |k,v| self[k] = v }
-  end
-
-  def self.from_h h
-    v = SparseVector.new
-    v.from_h h
-    return v
-  end
-
-  def from_s s
-    from_h eval(s)
-  end
-
-  def self.from_s s
-    v = SparseVector.new
-    v.from_s s
-    return v
-  end
-
-  def to_kv sep='=', join=' '
-    a = []
-    self.each_pair { |k,v|
-      a << "#{k}#{sep}#{v}"
-    }
-    return a.join join
-  end
-
-  def from_kv s
-    s.split.each { |i|
-      k,v = i.split('=')
-      self[k] = v.to_f
-    }
-  end
-
-  def self.from_kv s
-    v = SparseVector.new
-    v.from_kv s
-    return v
-  end
-
-  def from_file fn, sep='='
-    f = ReadFile.new(fn)
-    while line = f.gets
-      key, value = line.strip.split sep
-      value = value.to_f
-      self[key] = value
-    end
-  end
-
-  def self.from_file fn, sep='='
-    v = SparseVector.new
-    v.from_file fn, sep
-    return v
-  end
-
-  def join_keys other
-    self.keys + other.keys
-  end
-
-  def sum
-    self.values.inject(:+)
-  end
-
-  def approx_eql? other, p=10**-10
-    return false if !other
-    return false if other.size!=self.size
-    return false if other.keys.sort!=self.keys.sort
-    self.keys.each { |k|
-      return false if (self[k]-other[k]).abs>p
-    }
-    return true
-  end
-
-  def average
-    self.sum/self.size.to_f
-  end
-
-  def variance
-    avg = self.average
-    var = 0.0
-    self.values.each { |i| var += (avg - i)**2 }
-    return var
-  end
-
-  def stddev
-    Math.sqrt self.variance
-  end
-
-  def dot other
-    sum = 0.0
-    self.each_pair { |k,v| sum += v * other[k] }
-    return sum
-  end
-
-  def zeros n
-    (0).upto(n-1) { |i| self[i] = 0.0 }
-  end
-
-  def magnitude
-    Math.sqrt self.values.inject { |sum,i| sum+i**2 }
-  end
-
-  def cosinus_sim other
-    self.dot(other)/(self.magnitude*other.magnitude)
-  end
-
-  def euclidian_dist other
-    dims = [self.keys, other.keys].flatten.uniq
-    sum = 0.0
-    dims.each { |d| sum += (self[d] - other[d])**2 }
-    return Math.sqrt(sum)
-  end
-
-  def + other
-    new = SparseVector.new
-    join_keys(other).each { |k|
-      new[k] = self[k]+other[k]
-    }
-    return new
-  end
-
-  def - other
-    new = SparseVector.new
-    join_keys(other).each { |k|
-      new[k] = self[k]-other[k]
-    }
-    return new
-  end
-
-  def * scalar
-    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
-    new = SparseVector.new
-    self.keys.each { |k|
-      new[k] = self[k] * scalar
-    }
-    return new
-  end
-
-  def self.mean a
-    mean = SparseVector.new
-    a.each { |i|
-      i.each_pair { |k,v|
-        mean[k] += v
-      }
-    }
-    n = a.size.to_f
-    mean.each_pair { |k,v| mean[k] = v/n }
-    return mean
-  end
-end
-
diff --git a/lib/nlp_ruby/Translation.rb b/lib/nlp_ruby/Translation.rb
deleted file mode 100644
index 3759a1d..0000000
--- a/lib/nlp_ruby/Translation.rb
+++ /dev/null
@@ -1,72 +0,0 @@
-class Translation
-  attr_accessor :id, :s, :raw, :f, :scores, :rank
-
-  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
-    @id = id
-    @raw = raw
-    @s = s
-    @f = f
-    @scores = scores
-    @rank = rank
-  end
-
-  def from_s t, strip_alignment=true, rank=nil
-    id, raw, features, score = splitpipe(t, 3)
-    raw.strip!
-    @raw = raw
-    if strip_alignment # the way moses does it
-      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
-      @s.strip!
-    else
-      @s = raw
-    end
-    @id = id.to_i
-    @f = SparseVector.from_kv features
-    @scores[:decoder] = score.to_f
-    @rank = rank
-  end
-
-  def self.from_s s
-    t = self.new
-    t.from_s s
-    return t
-  end
-
-  def to_s include_features=true
-    [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features
-    [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features
-  end
-
-  def to_s2
-    [@rank, @s, @score, @scores.to_s].join ' ||| '
-  end
-end
-
-def read_kbest_lists fn, translation_type=Translation
-  kbest_lists = []
-  cur = []
-  f = ReadFile.new fn
-  prev = -1
-  c = 0
-  id = 0
-  while line = f.gets
-    t = translation_type.new
-    t.from_s line
-    c = splitpipe(line)[0].to_i
-    if c != prev
-      if cur.size > 0
-        kbest_lists << cur
-        cur = []
-      end
-      prev = c
-      id = 0
-    end
-    t.id = id
-    cur << t
-    id += 1
-  end
-  kbest_lists << cur # last one
-  f.close
-  return kbest_lists
-end
-
diff --git a/lib/nlp_ruby/bleu.rb b/lib/nlp_ruby/bleu.rb
deleted file mode 100644
index 56f341b..0000000
--- a/lib/nlp_ruby/bleu.rb
+++ /dev/null
@@ -1,130 +0,0 @@
-module BLEU
-
-
-class BLEU::NgramCounts
-  attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n
-
-  def initialize(n)
-    @n = 0
-    @sum = []
-    @clipped = []
-    @ref_len = 0.0
-    @hyp_len = 0.0
-    grow(n)
-  end
-
-  def grow(n)
-    (n-@n).times {
-      @sum << 0.0
-      @clipped << 0.0
-    }
-    @n = n
-  end
-
-  def plus_eq(other)
-    if other.n > @n then grow(other.n) end
-    0.upto(other.n-1) { |m|
-      @sum[m] += other.sum[m]
-      @clipped[m] += other.clipped[m]
-    }
-    @ref_len += other.ref_len
-    @hyp_len += other.hyp_len
-  end
-
-  def to_s
-    return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
-  end
-end
-
-class BLEU::Ngrams
-  def initialize
-    @h_ = {}
-    @h_.default = 0
-  end
-
-  def add(k)
-    if k.class == Array then k = k.join ' ' end
-    @h_[k] += 1
-  end
-
-  def get_count(k)
-    if k.class == Array then k = k.join ' ' end
-    return @h_[k]
-  end
-
-  def each
-    @h_.each_pair { |k,v|
-      yield k.split, v
-    }
-  end
-
-  def to_s
-    @h_.to_s
-  end
-end
-
-def BLEU::get_counts hypothesis, reference, n, times=1
-  p = NgramCounts.new n
-  r = Ngrams.new
-  ngrams(reference, n) { |ng| r.add ng }
-  h = Ngrams.new
-  ngrams(hypothesis, n) { |ng| h.add ng }
-  h.each { |ng,count|
-    sz = ng.size-1
-    p.sum[sz] += count * times
-    p.clipped[sz] += [r.get_count(ng), count].min * times
-  }
-  p.ref_len = tokenize(reference.strip).size * times
-  p.hyp_len = tokenize(hypothesis.strip).size * times
-  return p
-end
-
-def BLEU::brevity_penalty c, r, smooth=0.0
-  return [0.0, 1.0-((r+smooth)/c)].min
-end
-
-def BLEU::bleu counts, n, debug=false
-  corpus_stats = NgramCounts.new n
-  counts.each { |i| corpus_stats.plus_eq i }
-  logbleu = 0.0
-  0.upto(n-1) { |m|
-    STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
-    return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
-    logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
-  }
-  logbleu /= n
-  if debug
-    STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
-    STDERR.write "sum #{Math.exp(sum)}\n"
-  end
-  logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
-  return Math.exp logbleu
-end
-
-def BLEU::hbleu counts, n, debug=false
-  (100*bleu(counts, n, debug)).round(3)
-end
-
-def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
-  h_ng = {}; r_ng = {}
-  (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
-  ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
-  ngrams(reference, n) { |i| r_ng[i.size] << i }
-  m = [n, reference.split.size].min
-  add = 0.0
-  logbleu = 0.0
-  (1).upto(m) { |i|
-    counts_clipped = 0
-    counts_sum = h_ng[i].size
-    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
-    add = 1.0 if i >= 2
-    logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
-  }
-  logbleu /= m
-  logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
-  return Math.exp logbleu
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/dag.rb b/lib/nlp_ruby/dag.rb
deleted file mode 100644
index 94afa23..0000000
--- a/lib/nlp_ruby/dag.rb
+++ /dev/null
@@ -1,205 +0,0 @@
-module DAG
-
-require 'json'
-
-
-class DAG::Node
-  attr_accessor :label, :outgoing, :incoming, :score, :mark
-
-  def initialize label=nil, outgoing=[], incoming=[], score=nil
-    @label    = label
-    @outgoing = outgoing
-    @incoming = incoming
-    @score    = nil
-  end
-
-  def add_edge head, weight=0
-    exit if self==head # no self-cycles!
-    @outgoing << DAG::Edge.new(self, head, weight)
-    return @outgoing.last
-  end
-
-  def to_s
-    "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"
-  end
-
-  def repr
-    "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
-  end
-end
-
-class DAG::Edge
-  attr_accessor :tail, :head, :weight, :mark
-
-  def initialize tail=nil, head=nil, weight=0
-    @tail   = tail
-    @head   = head
-    @weight = weight
-    @mark   = false # did we already follow this edge? -- for topological sorting
-  end
-
-  def to_s
-    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
-    s += " x" if @mark
-    s += ">"
-    s
-  end
-end
-
-# depth-first search
-#  w/o markings as we do not have cycles
-def DAG::dfs n, target_label
-  return n if n.label==target_label # assumes uniq labels!
-  stack = n.outgoing.map { |i| i.head }
-  while !stack.empty?
-    m = stack.pop
-    return DAG::dfs m, target_label
-  end
-  return nil
-end
-
-# breadth-first search
-#  w/o markings as we do not have cycles
-def DAG::bfs n, target_label
-  queue = [n]
-  while !queue.empty?
-    m = queue.shift
-    return m if m.label==target_label
-    m.outgoing.each { |e| queue << e.head }
-  end
-  return nil
-end
-
-# topological sort
-def DAG::topological_sort graph
-  sorted = []
-  s = graph.reject { |n| !n.incoming.empty? }
-  while !s.empty?
-    sorted << s.shift
-    sorted.last.outgoing.each { |e|
-      e.mark = true
-      s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
-    }
-  end
-  return sorted
-end
-
-# initialize graph scores with semiring One
-def DAG::init graph, semiring, source_node
-  graph.each {|n| n.score=semiring.null}
-  source_node.score = semiring.one
-end
-
-# viterbi
-def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
-  toposorted = DAG::topological_sort(graph)
-  DAG::init(graph, semiring, source_node)
-  toposorted.each { |n|
-    n.incoming.each { |e|
-      # update
-      n.score = \
-        semiring.add.call(n.score, \
-                          semiring.multiply.call(e.tail.score, e.weight)
-        )
-    }
-  }
-end
-
-# forward viterbi
-def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
-  toposorted = DAG::topological_sort(graph)
-  DAG::init(graph, semiring, source_node)
-  toposorted.each { |n|
-    n.outgoing.each { |e|
-      e.head.score = \
-        semiring.add.call(e.head.score, \
-                          semiring.multiply.call(n.score, e.weight)
-        )
-    }
-  }
-end
-
-# Dijkstra algorithm
-#  for A*-search we would need an optimistic estimate of
-#  future cost at each node
-def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
-  DAG::init(graph, semiring, source_node)
-  q = PriorityQueue.new graph
-  while !q.empty?
-    n = q.pop
-    n.outgoing.each { |e|
-      e.head.score = \
-        semiring.add.call(e.head.score, \
-                          semiring.multiply.call(n.score, e.weight))
-      q.sort!
-    }
-  end
-end
-
-# Bellman-Ford algorithm
-def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
-  DAG::init(graph, semiring, source_node)
-  edges = []
-  graph.each { |n| edges |= n.outgoing }
-  # relax edges
-  (graph.size-1).times{ |i|
-    edges.each { |e|
-      e.head.score = \
-        semiring.add.call(e.head.score, \
-                          semiring.multiply.call(e.tail.score, e.weight))
-    }
-  }
-  # we do not allow cycles (negative or positive)
-end
-
-# Floyd algorithm
-def DAG::floyd(graph, semiring=nil)
-  dist_matrix = []
-  graph.each_index { |i|
-    dist_matrix << []
-    graph.each_index { |j|
-      val = 1.0/0.0
-      val = 0.0 if i==j
-      dist_matrix.last << val
-    }
-  }
-  edges = []
-  graph.each { |n| edges |= n.outgoing }
-  edges.each { |e|
-    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
-  }
-  0.upto(graph.size-1) { |k|
-    0.upto(graph.size-1) { |i|
-      0.upto(graph.size-1) { |j|
-        if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
-          dist_matrix  [i][j] = dist_matrix[i][k] + dist_matrix[k][j]
-        end
-      }
-    }
-  }
-  return dist_matrix
-end
-
-
-# returns a list of nodes (graph) and a hash for finding
-# nodes by their label (these need to be unique!)
-def DAG::read_graph_from_json fn, semiring=RealSemiring.new
-  graph = []
-  nodes_by_label = {}
-  h = JSON.parse File.new(fn).read
-  h['nodes'].each { |i|
-    n = DAG::Node.new i['label']
-    graph << n
-    nodes_by_label[n.label] = n
-  }
-  h['edges'].each { |i|
-    n = nodes_by_label[i['tail']]
-    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
-    nodes_by_label[i['head']].incoming << a
-  }
-  return graph, nodes_by_label
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/fileutil.rb b/lib/nlp_ruby/fileutil.rb
deleted file mode 100644
index eb69136..0000000
--- a/lib/nlp_ruby/fileutil.rb
+++ /dev/null
@@ -1,88 +0,0 @@
-require 'zlib'
-
-
-class ReadFile
-
-  def initialize fn, encoding='utf-8'
-    if fn.split('.').last == 'gz'
-      @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
-    elsif fn == '-'
-      @f = STDIN
-      STDIN.set_encoding encoding
-    else
-      @f = File.new fn, 'r'
-      @f.set_encoding encoding
-    end
-  end
-
-  def gets
-    @f.gets { |line| yield line }
-  end
-
-  def readlines
-    @f.readlines
-  end
-
-  def self.readlines fn, encoding='utf-8'
-    f = ReadFile.new fn, encoding
-    r = f.readlines
-    f.close
-    return r
-  end
-
-  def readlines_strip
-    self.readlines.map{ |i| i.strip }
-  end
-
-  def self.readlines_strip fn, encoding='utf-8'
-    f = ReadFile.new fn, encoding
-    r = f.readlines_strip
-    f.close
-    return r
-  end
-
-  def read
-    @f.read
-  end
-
-  def self.read fn, encoding='utf-8'
-    f = ReadFile.new fn, encoding
-    r = f.read
-    f.close
-    return r
-  end
-
-  def close
-    @f.close if @f!=STDIN
-  end
-end
-
-class WriteFile
-
-  def initialize fn, encoding='utf-8'
-    if fn.split('.').last == 'gz'
-      @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
-    elsif fn == '-'
-      @f = STDOUT
-      STDOUT.set_encoding encoding
-    else
-      @f = File.new fn, 'w+'
-      @f.set_encoding encoding
-    end
-  end
-
-  def write s
-    @f.write s
-  end
-
-  def self.write s, fn, encoding='utf-8'
-    f = WriteFile.new fn, encoding
-    f.write s
-    f.close
-  end
-
-  def close
-    @f.close if @f!=STDIN
-  end
-end
-
diff --git a/lib/nlp_ruby/grammar.rb b/lib/nlp_ruby/grammar.rb
deleted file mode 100644
index 7bd8fe6..0000000
--- a/lib/nlp_ruby/grammar.rb
+++ /dev/null
@@ -1,122 +0,0 @@
-module Grammar
-
-class T
-  attr_accessor :word
-
-  def initialize word
-    @word = word
-  end
-
-  def to_s
-    "T<#{@word}>"
-  end
-end
-
-class NT
-  attr_accessor :symbol, :index, :span
-
-  def initialize symbol, index=0
-    @symbol = symbol
-    @index = index
-    @span = Span.new
-  end
-
-  def to_s
-    "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
-  end
-end
-
-class Rule
-  attr_accessor :lhs, :rhs, :e
-
-  def initialize lhs=nil, rhs=[], e=''
-    @lhs = lhs
-    @rhs = rhs
-    @e = e
-  end
-
-  def to_s
-    "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
-  end
-
-  def arity
-    rhs.select { |i| i.class == NT }.size
-  end
-
-  def from_s s
-    _ = splitpipe s, 3
-    @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "")
-    _[1].split.each { |x|
-      x.strip!
-      if x[0]=='[' && x[x.size-1] == ']'
-        @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0])
-      else
-        @rhs << T.new(x)
-      end
-    }
-    @e = _[2]
-  end
-
-  def self.from_s s
-    r = self.new
-    r.from_s s
-    return r
-  end
-end
-
-class Span
-  attr_accessor :left, :right
-
-  def initialize left=nil, right=nil
-    @left = left
-    @right = right
-  end
-end
-
-class Grammar
-  attr_accessor :rules, :startn, :startt, :flat
-
-  def initialize fn
-    @rules = []; @startn = []; @startt = [] ;@flat = []
-    ReadFile.readlines_strip(fn).each_with_index { |s,i|
-      STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
-      @rules << Rule.from_s(s)
-      if @rules.last.rhs.first.class == NT
-        @startn << @rules.last
-      else
-        if rules.last.arity == 0
-          @flat << @rules.last
-        else
-          @startt << @rules.last
-        end
-      end
-    }
-    STDERR.write "\n"
-  end
-
-  def to_s
-    s = ''
-    @rules.each { |r| s += r.to_s+"\n" }
-    return s
-  end
-
-  def add_glue_rules
-    @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
-      @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
-      @startn << @rules.last
-      @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
-      @startn << @rules.last
-    }
-  end
-
-  def add_pass_through_rules s
-    s.each { |word|
-      @rules << Rule.new(NT.new('X'), [T.new(word)])
-      @flat << @rules.last
-    }
-  end
-end
-
-
-end # module
-
diff --git a/lib/nlp_ruby/hg.rb b/lib/nlp_ruby/hg.rb
deleted file mode 100644
index b8b147e..0000000
--- a/lib/nlp_ruby/hg.rb
+++ /dev/null
@@ -1,173 +0,0 @@
-#!/usr/bin/env ruby
-
-require 'nlp_ruby'
-require 'json'
-
-
-module HG
-
-
-class HG::Node
-  attr_accessor :label, :cat, :outgoing, :incoming, :score
-
-  def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
-    @label    = label
-    @cat      = cat
-    @outgoing = outgoing
-    @incoming = incoming
-    @score    = nil
-  end
-
-  def to_s
-    "Node<label:\"#{@label}\", cat:\"#{@cat}\", outgoing:#{@outgoing.size}, incoming:#{@incoming.size}>"
-  end
-end
-
-class HG::Hypergraph
-  attr_accessor :nodes, :edges
-
-  def initialize nodes=[], edges=[]
-    @nodes = nodes
-    @edges = edges
-  end
-
-  def arity
-    @edges.map { |e| e.arity }.max
-  end
-
-  def to_s
-    "Hypergraph<nodes:[#{@nodes.to_s}], edges:[#{@edges.to_s}], arity:#{arity}>"
-  end
-end
-
-class HG::Hyperedge
-  attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right
-
-  def initialize head=nil, tails=[], weight=0.0, f={}
-    @head   = head
-    @tails  = tails
-    @weight = weight
-    @f      = f
-    @mark   = 0
-  end
-
-  def arity
-    return @tails.size
-  end
-
-  def marked?
-    arity == @mark
-  end
-
-  def to_s
-    "Hyperedge<head:\"#{@head.label}\", \"tails:#{@tails.map{|n|n.label}}, arity:#{arity}, weight:#{@weight}, f:#{f.to_s}, mark:#{@mark}>"
-  end
-end
-
-def HG::topological_sort nodes
-  sorted = []
-  s = nodes.reject { |n| !n.incoming.empty? }
-  while !s.empty?
-    sorted << s.shift
-    sorted.last.outgoing.each { |e|
-      next if e.marked?
-      e.mark += 1
-      s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty?
-    }
-  end
-  return sorted
-end
-
-def HG::init nodes, semiring, root
-  nodes.each { |n| n.score=semiring.null }
-  root.score = semiring.one
-end
-
-def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new
-  toposorted = topological_sort hypergraph.nodes
-  init toposorted, semiring, root
-  toposorted.each { |n|
-    n.incoming.each { |e|
-      s = semiring.one
-      e.tails.each { |m|
-        s = semiring.multiply.call(s, m.score)
-      }
-      n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
-    }
-  }
-end
-
-def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
-  toposorted = topological_sort hypergraph.nodes
-  init toposorted, semiring, root
-  best_path = []
-  toposorted.each { |n|
-    best_edge = nil
-    n.incoming.each { |e|
-      s = semiring.one
-      e.tails.each { |m|
-        s = semiring.multiply.call(s, m.score)
-      }
-      if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add
-        best_edge = e
-      end
-      n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
-    }
-    best_path << best_edge
-  }
-  return best_path, toposorted.last.score
-end
-
-def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
-  nodes = []
-  edges = []
-  nodes_by_label = {}
-  nodes_by_index = []
-  h = JSON.parse File.new(fn).read
-  w = SparseVector.from_h h['weights']
-  h['nodes'].each { |i|
-    n = Node.new i['label'], i['cat']
-    nodes << n
-    nodes_by_label[n.label] = n
-    nodes_by_index << n
-  }
-  h['edges'].each { |i|
-    e = Hyperedge.new(nodes_by_label[i['head']], \
-                      i['tails'].map{|j| nodes_by_label[j]}.to_a, \
-                      semiring.convert.call(i['weight'].to_f), \
-                      {})
-    e.f = SparseVector.from_h i['f']
-    if log_weights
-      e.weight = Math.exp(w.dot(e.f))
-    else
-      e.weight = w.dot(e.f)
-    end
-    e.tails.each { |m|
-      m.outgoing << e
-    }
-    e.head.incoming << e
-    edges << e
-  }
-  return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
-end
-
-def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new
-  toposorted = topological_sort hypergraph.nodes
-  paths = [[]]
-  toposorted.each { |n|
-    next if n.incoming.empty?
-    new_paths = []
-    while !paths.empty?
-      p = paths.pop
-      n.incoming.each { |e|
-        new_paths << p+[e]
-      }
-    end
-    paths = new_paths
-  }
-  return paths
-end
-
-
-end #module
-
diff --git a/lib/nlp_ruby/misc.rb b/lib/nlp_ruby/misc.rb
deleted file mode 100644
index 0319a5f..0000000
--- a/lib/nlp_ruby/misc.rb
+++ /dev/null
@@ -1,114 +0,0 @@
-require 'timeout'
-
-
-class Array
-  def max_index
-    self.index(self.max)
-  end
-
-  def is_subset_of? other
-    self.each { |i|
-      if other.include? i
-       return false
-      end
-    }
-    return true
-  end
-
-  def sum
-    self.inject(:+)
-  end
-
-  def mean
-    self.sum.to_f/self.size
-  end
-end
-
-class String
-
-  def downcase?
-    self[/[[:lower:]]/]
-  end
-end
-
-class PriorityQueue
-# This assumes that elements in the queue
-# have a numerical member named 'score'.
-
-  def initialize a=Array.new
-    @queue = Array.new a
-    sort!
-  end
-
-  def sort!
-    @queue.sort_by! { |i| -i.score }
-  end
-
-  def pop
-    @queue.pop
-  end
-
-  def push i
-    @queue << i
-    sort!
-  end
-
-  def empty?
-    @queue.empty?
-  end
-end
-
-def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
-  STDERR.write cmd+"\n" if debug
-  pipe_in, pipe_out = IO.pipe
-  pid = Process.spawn(cmd, :out => pipe_out)
-  begin
-    Timeout.timeout(t) { Process.wait pid }
-  rescue Timeout::Error
-    Process.kill('TERM', pid) if !ignore_fail
-  end
-  pipe_out.close
-  return pipe_in.read
-end
-
-def read_phrase_table fn
-  table = {}
-  f = ReadFile.new fn
-  while raw_rule = f.gets
-    french, english, features = splitpipe(raw_rule)
-    feature_map = SparseVector.from_kv  features
-    if table.has_key? french
-      table[french] << [english, feature_map ]
-    else
-      table[french] = [[english, feature_map]]
-    end
-  end
-  f.close
-  return table
-end
-
-def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
-  require 'open3'
-  cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
-  cmd += " -r" if unique
-  o,_ = Open3.capture2 "#{cmd}  2>/dev/null"
-  a = []; j = -1
-  o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
-  return a
-end
-
-def read_config fn
-  f = ReadFile.new fn
-  cfg = {}
-  while line = f.gets
-    line.strip!
-    next if /^\s*$/.match line
-    next if line[0]=='#'
-    content = line.split('#', 2).first
-    k, v = content.split(/\s*=\s*/, 2)
-    k.strip!; v.strip!
-    cfg[k] = v
-  end
-  return cfg
-end
-
diff --git a/lib/nlp_ruby/semirings.rb b/lib/nlp_ruby/semirings.rb
deleted file mode 100644
index fda4683..0000000
--- a/lib/nlp_ruby/semirings.rb
+++ /dev/null
@@ -1,79 +0,0 @@
-# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
-# as described in:
-# 'Dynamic Programming Algorithms in
-#  Semiring and Hypergraph Frameworks' (Liang Huang)
-class Semiring
-  attr_accessor :add, :multiply, :one, :null, :convert
-end
-
-class BooleanSemiring < Semiring
-  def initialize
-    @add = Proc.new { |a,b| a||b }
-    @multiply =  Proc.new { |a,b| a&&b }
-    @one = true
-    @null = false
-    @convert = Proc.new { |v| true && v!=0 }
-  end
-end
-
-class ViterbiSemiring < Semiring
-  def initialize
-    @add = Proc.new { |a,b| [a,b].max }
-    @multiply =  Proc.new { |a,b| a*b }
-    @one = 1.0
-    @null = 0.0
-    @convert = Proc.new { |v| v }
-  end
-end
-
-class ViterbiLogSemiring < Semiring
-  def initialize
-    @add = Proc.new { |a,b| [a,b].max }
-    @multiply =  Proc.new { |a,b| a+b }
-    @one = 0.0
-    @null = -1.0/0.0
-    @convert = Proc.new { |v| v }
-  end
-end
-
-class InsideSemiring < Semiring
-  def initialize
-    @add = Proc.new { |a,b| a+b }
-    @multiply =  Proc.new { |a,b| a*b }
-    @one = 1.0
-    @null = 0.0
-    @convert = Proc.new { |v| v }
-  end
-end
-
-class RealSemiring < Semiring
-  def initialize
-    @add = Proc.new { |a,b| [a,b].min }
-    @multiply =  Proc.new { |a,b| a+b }
-    @one = 0.0
-    @null = 1.0/0.0
-    @convert = Proc.new { |v| v }
-  end
-end
-
-# for longest/worst paths
-class RealxSemiring < Semiring
-  def initialize
-    @add = Proc.new { |a,b| [a,b].max }
-    @multiply =  Proc.new { |a,b| a+b }
-    @one = -1.0/0.0
-    @null = 0.0
-    @convert = Proc.new { |v| v }
-  end
-end
-
-class CountingSemiring < Semiring
-  def initialize
-    @add = Proc.new { |a,b| a+b }
-    @multiply =  Proc.new { |a,b| a*b }
-    @one = 1.0
-    @null = 0.0
-    @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
-  end
-end
-
diff --git a/lib/nlp_ruby/stringutil.rb b/lib/nlp_ruby/stringutil.rb
deleted file mode 100644
index aa9be00..0000000
--- a/lib/nlp_ruby/stringutil.rb
+++ /dev/null
@@ -1,22 +0,0 @@
-def tokenize s
-  s.strip.split
-end
-
-def ngrams(s, n, fix=false)
-  a = tokenize s
-  a.each_with_index { |tok, i|
-    tok.strip!
-    0.upto([n-1, a.size-i-1].min) { |m|
-      yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n)
-    }
-  }
-end
-
-def bag_of_words s, stopwords=[]
-  s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
-end
-
-def splitpipe s, n=3
-  s.strip.split("|"*n)
-end
-
diff --git a/lib/nlp_ruby/tfidf.rb b/lib/nlp_ruby/tfidf.rb
deleted file mode 100644
index 13a40a3..0000000
--- a/lib/nlp_ruby/tfidf.rb
+++ /dev/null
@@ -1,38 +0,0 @@
-module TFIDF
-
-
-# returns key='raw frequency' for an
-# array-like object
-def TFIDF::tf array, stopwords=[]
-  v = {}; v.default = 0
-  array.uniq.each { |i|
-   next if stopwords.include? i
-   v[i] = array.count(i).to_f
-  }
-  return v
-end
-
-# smoothes raw frequencies of tf() in-place
-# a is a smoothing term
-def TFIDF::ntf hash, a=0.4
-  max = hash.values.max.to_f
-  hash.each_pair { |k,v|
-    hash[k] = a + (1-a)*(v/max)
-  }
-end
-
-# returns idf value for each word in a vocabulary
-def TFIDF::idf list_of_hashes
-  vocab = list_of_hashes.values.flatten.uniq
-  n = list_of_hashes.size.to_f
-  idf = {}
-  vocab.each { |i|
-    df = list_of_hashes.values.flatten.count i
-    idf[i] = Math.log(n/df)
-  }
-  return idf
-end
-
-
-end #module
-
diff --git a/lib/zipf.rb b/lib/zipf.rb
new file mode 100755
index 0000000..0e26e97
--- /dev/null
+++ b/lib/zipf.rb
@@ -0,0 +1,18 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby/stringutil'
+require 'nlp_ruby/fileutil'
+require 'nlp_ruby/SparseVector'
+require 'nlp_ruby/tfidf'
+require 'nlp_ruby/Translation'
+require 'nlp_ruby/dag'
+require 'nlp_ruby/semirings'
+require 'nlp_ruby/bleu'
+require 'nlp_ruby/misc'
+require 'nlp_ruby/hg'
+require 'nlp_ruby/grammar'
+
+STDIN.set_encoding  'utf-8'
+STDOUT.set_encoding 'utf-8'
+STDERR.set_encoding 'utf-8'
+
diff --git a/lib/zipf/SparseVector.rb b/lib/zipf/SparseVector.rb
new file mode 100644
index 0000000..3096412
--- /dev/null
+++ b/lib/zipf/SparseVector.rb
@@ -0,0 +1,172 @@
+class SparseVector < Hash
+
+  def initialize arg=nil
+    super
+    self.default = 0
+    if arg.is_a? Array
+      from_a arg
+    end
+  end
+
+  def from_a a
+    a.each_with_index { |i,j| self[j] = i }
+  end
+
+  def self.from_a a
+    v = SparseVector.new
+    v.from_a a
+    return v
+  end
+
+  def from_h h
+    h.each_pair { |k,v| self[k] = v }
+  end
+
+  def self.from_h h
+    v = SparseVector.new
+    v.from_h h
+    return v
+  end
+
+  def from_s s
+    from_h eval(s)
+  end
+
+  def self.from_s s
+    v = SparseVector.new
+    v.from_s s
+    return v
+  end
+
+  def to_kv sep='=', join=' '
+    a = []
+    self.each_pair { |k,v|
+      a << "#{k}#{sep}#{v}"
+    }
+    return a.join join
+  end
+
+  def from_kv s
+    s.split.each { |i|
+      k,v = i.split('=')
+      self[k] = v.to_f
+    }
+  end
+
+  def self.from_kv s
+    v = SparseVector.new
+    v.from_kv s
+    return v
+  end
+
+  def from_file fn, sep='='
+    f = ReadFile.new(fn)
+    while line = f.gets
+      key, value = line.strip.split sep
+      value = value.to_f
+      self[key] = value
+    end
+  end
+
+  def self.from_file fn, sep='='
+    v = SparseVector.new
+    v.from_file fn, sep
+    return v
+  end
+
+  def join_keys other
+    self.keys + other.keys
+  end
+
+  def sum
+    self.values.inject(:+)
+  end
+
+  def approx_eql? other, p=10**-10
+    return false if !other
+    return false if other.size!=self.size
+    return false if other.keys.sort!=self.keys.sort
+    self.keys.each { |k|
+      return false if (self[k]-other[k]).abs>p
+    }
+    return true
+  end
+
+  def average
+    self.sum/self.size.to_f
+  end
+
+  def variance
+    avg = self.average
+    var = 0.0
+    self.values.each { |i| var += (avg - i)**2 }
+    return var
+  end
+
+  def stddev
+    Math.sqrt self.variance
+  end
+
+  def dot other
+    sum = 0.0
+    self.each_pair { |k,v| sum += v * other[k] }
+    return sum
+  end
+
+  def zeros n
+    (0).upto(n-1) { |i| self[i] = 0.0 }
+  end
+
+  def magnitude
+    Math.sqrt self.values.inject { |sum,i| sum+i**2 }
+  end
+
+  def cosinus_sim other
+    self.dot(other)/(self.magnitude*other.magnitude)
+  end
+
+  def euclidian_dist other
+    dims = [self.keys, other.keys].flatten.uniq
+    sum = 0.0
+    dims.each { |d| sum += (self[d] - other[d])**2 }
+    return Math.sqrt(sum)
+  end
+
+  def + other
+    new = SparseVector.new
+    join_keys(other).each { |k|
+      new[k] = self[k]+other[k]
+    }
+    return new
+  end
+
+  def - other
+    new = SparseVector.new
+    join_keys(other).each { |k|
+      new[k] = self[k]-other[k]
+    }
+    return new
+  end
+
+  def * scalar
+    raise ArgumentError, "Arg is not numeric #{scalar}" unless scalar.is_a? Numeric
+    new = SparseVector.new
+    self.keys.each { |k|
+      new[k] = self[k] * scalar
+    }
+    return new
+  end
+
+  def self.mean a
+    mean = SparseVector.new
+    a.each { |i|
+      i.each_pair { |k,v|
+        mean[k] += v
+      }
+    }
+    n = a.size.to_f
+    mean.each_pair { |k,v| mean[k] = v/n }
+    return mean
+  end
+end
+
diff --git a/lib/zipf/Translation.rb b/lib/zipf/Translation.rb
new file mode 100644
index 0000000..3759a1d
--- /dev/null
+++ b/lib/zipf/Translation.rb
@@ -0,0 +1,72 @@
+class Translation
+  attr_accessor :id, :s, :raw, :f, :scores, :rank
+
+  def initialize id=nil, raw=nil, s=nil, f=nil, scores={}, rank=nil
+    @id = id
+    @raw = raw
+    @s = s
+    @f = f
+    @scores = scores
+    @rank = rank
+  end
+
+  def from_s t, strip_alignment=true, rank=nil
+    id, raw, features, score = splitpipe(t, 3)
+    raw.strip!
+    @raw = raw
+    if strip_alignment # the way moses does it
+      @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
+      @s.strip!
+    else
+      @s = raw
+    end
+    @id = id.to_i
+    @f = SparseVector.from_kv features
+    @scores[:decoder] = score.to_f
+    @rank = rank
+  end
+
+  def self.from_s s
+    t = self.new
+    t.from_s s
+    return t
+  end
+
+  def to_s include_features=true
+    [@id, @s, @f.to_kv('=', ' '), @scores[:decoder]].join(' ||| ') if include_features
+    [@id, @s, @scores[:decoder]].join(' ||| ') if !include_features
+  end
+
+  def to_s2
+    [@rank, @s, @score, @scores.to_s].join ' ||| '
+  end
+end
+
+def read_kbest_lists fn, translation_type=Translation
+  kbest_lists = []
+  cur = []
+  f = ReadFile.new fn
+  prev = -1
+  c = 0
+  id = 0
+  while line = f.gets
+    t = translation_type.new
+    t.from_s line
+    c = splitpipe(line)[0].to_i
+    if c != prev
+      if cur.size > 0
+        kbest_lists << cur
+        cur = []
+      end
+      prev = c
+      id = 0
+    end
+    t.id = id
+    cur << t
+    id += 1
+  end
+  kbest_lists << cur # last one
+  f.close
+  return kbest_lists
+end
+
diff --git a/lib/zipf/bleu.rb b/lib/zipf/bleu.rb
new file mode 100644
index 0000000..69de00b
--- /dev/null
+++ b/lib/zipf/bleu.rb
@@ -0,0 +1,130 @@
+module BLEU
+
+
+class BLEU::NgramCounts
+  attr_accessor :sum, :clipped, :ref_len, :hyp_len, :n
+
+  def initialize(n)
+    @n = 0
+    @sum = []
+    @clipped = []
+    @ref_len = 0.0
+    @hyp_len = 0.0
+    grow(n)
+  end
+
+  def grow(n)
+    (n-@n).times {
+      @sum << 0.0
+      @clipped << 0.0
+    }
+    @n = n
+  end
+
+  def plus_eq(other)
+    if other.n > @n then grow(other.n) end
+    0.upto(other.n-1) { |m|
+      @sum[m] += other.sum[m]
+      @clipped[m] += other.clipped[m]
+    }
+    @ref_len += other.ref_len
+    @hyp_len += other.hyp_len
+  end
+
+  def to_s
+    return "n=#{n} sum=#{sum} clipped=#{clipped} ref_len=#{ref_len} hyp_len=#{hyp_len}"
+  end
+end
+
+class BLEU::Ngrams
+  def initialize
+    @h_ = {}
+    @h_.default = 0
+  end
+
+  def add(k)
+    if k.class == Array then k = k.join ' ' end
+    @h_[k] += 1
+  end
+
+  def get_count(k)
+    if k.class == Array then k = k.join ' ' end
+    return @h_[k]
+  end
+
+  def each
+    @h_.each_pair { |k,v|
+      yield k.split, v
+    }
+  end
+
+  def to_s
+    @h_.to_s
+  end
+end
+
+def BLEU::get_counts hypothesis, reference, n, times=1
+  p = NgramCounts.new n
+  r = Ngrams.new
+  ngrams(reference, n) { |ng| r.add ng }
+  h = Ngrams.new
+  ngrams(hypothesis, n) { |ng| h.add ng }
+  h.each { |ng,count|
+    sz = ng.size-1
+    p.sum[sz] += count * times
+    p.clipped[sz] += [r.get_count(ng), count].min * times
+  }
+  p.ref_len = tokenize(reference.strip).size * times
+  p.hyp_len = tokenize(hypothesis.strip).size * times
+  return p
+end
+
+def BLEU::brevity_penalty c, r, smooth=0.0
+  return [0.0, 1.0-((r+smooth)/c)].min
+end
+
+def BLEU::bleu counts, n, debug=false
+  corpus_stats = NgramCounts.new n
+  counts.each { |i| corpus_stats.plus_eq i }
+  logbleu = 0.0
+  0.upto(n-1) { |m|
+    STDERR.write "#{m+1} #{corpus_stats.clipped[m]} / #{corpus_stats.sum[m]}\n" if debug
+    return 0.0 if corpus_stats.clipped[m] == 0 or corpus_stats.sum == 0
+    logbleu += Math.log(corpus_stats.clipped[m]) - Math.log(corpus_stats.sum[m])
+  }
+  logbleu /= n
+  if debug
+    STDERR.write "BP #{brevity_penalty(corpus_stats.hyp_len, corpus_stats.ref_len)}\n"
+    STDERR.write "sum #{Math.exp(sum)}\n"
+  end
+  logbleu += brevity_penalty corpus_stats.hyp_len, corpus_stats.ref_len
+  return Math.exp logbleu
+end
+
+def BLEU::hbleu counts, n, debug=false
+  (100*bleu(counts, n, debug)).round(3)
+end
+
+def BLEU::per_sentence_bleu hypothesis, reference, n=4, smooth=0.0
+  h_ng = {}; r_ng = {}
+  (1).upto(n) { |i| h_ng[i] = []; r_ng[i] = [] }
+  ngrams(hypothesis, n) { |i| h_ng[i.size] << i }
+  ngrams(reference, n) { |i| r_ng[i.size] << i }
+  m = [n, reference.split.size].min
+  add = 0.0
+  logbleu = 0.0
+  (1).upto(m) { |i|
+    counts_clipped = 0
+    counts_sum = h_ng[i].size
+    h_ng[i].uniq.each { |j| counts_clipped += r_ng[i].count(j) }
+    add = 1.0 if i >= 2
+    logbleu += Math.log(counts_clipped+add) - Math.log(counts_sum+add);
+  }
+  logbleu /= m
+  logbleu += brevity_penalty hypothesis.strip.split.size, reference.strip.split.size, smooth
+  return Math.exp logbleu
+end
+
+
+end #module
+
diff --git a/lib/zipf/dag.rb b/lib/zipf/dag.rb
new file mode 100644
index 0000000..45ede20
--- /dev/null
+++ b/lib/zipf/dag.rb
@@ -0,0 +1,205 @@
+module DAG
+
+require 'json'
+
+
+class DAG::Node
+  attr_accessor :label, :outgoing, :incoming, :score, :mark
+
+  def initialize label=nil, outgoing=[], incoming=[], score=nil
+    @label    = label
+    @outgoing = outgoing
+    @incoming = incoming
+    @score    = nil
+  end
+
+  def add_edge head, weight=0
+    exit if self==head # no self-cycles!
+    @outgoing << DAG::Edge.new(self, head, weight)
+    return @outgoing.last
+  end
+
+  def to_s
+    "DAG::Node<label:#{label}, outgoing:#{outgoing.size}, incoming:#{incoming.size}>"
+  end
+
+  def repr
+    "#{to_s} #{@score} out:#{@outgoing} in:[#{@incoming.map{|e| e.to_s}.join ', '}]"
+  end
+end
+
+class DAG::Edge
+  attr_accessor :tail, :head, :weight, :mark
+
+  def initialize tail=nil, head=nil, weight=0
+    @tail   = tail
+    @head   = head
+    @weight = weight
+    @mark   = false # did we already follow this edge? -- for topological sorting
+  end
+
+  def to_s
+    s = "DAG::Edge<#{@tail} ->[#{weight}] #{@head}"
+    s += " x" if @mark
+    s += ">"
+    s
+  end
+end
+
+# depth-first search
+#  w/o markings as we do not have cycles
+def DAG::dfs n, target_label
+  return n if n.label==target_label # assumes uniq labels!
+  stack = n.outgoing.map { |i| i.head }
+  while !stack.empty?
+    m = stack.pop
+    return DAG::dfs m, target_label
+  end
+  return nil
+end
+
+# breadth-first search
+#  w/o markings as we do not have cycles
+def DAG::bfs n, target_label
+  queue = [n]
+  while !queue.empty?
+    m = queue.shift
+    return m if m.label==target_label
+    m.outgoing.each { |e| queue << e.head }
+  end
+  return nil
+end
+
+# topological sort
+def DAG::topological_sort graph
+  sorted = []
+  s = graph.reject { |n| !n.incoming.empty? }
+  while !s.empty?
+    sorted << s.shift
+    sorted.last.outgoing.each { |e|
+      e.mark = true
+      s << e.head if e.head.incoming.reject{|f| f.mark}.empty?
+    }
+  end
+  return sorted
+end
+
+# initialize graph scores with semiring One
+def DAG::init graph, semiring, source_node
+  graph.each {|n| n.score=semiring.null}
+  source_node.score = semiring.one
+end
+
+# viterbi
+def DAG::viterbi graph, semiring=ViterbiSemiring, source_node
+  toposorted = DAG::topological_sort(graph)
+  DAG::init(graph, semiring, source_node)
+  toposorted.each { |n|
+    n.incoming.each { |e|
+      # update
+      n.score = \
+        semiring.add.call(n.score, \
+                          semiring.multiply.call(e.tail.score, e.weight)
+        )
+    }
+  }
+end
+
+# forward viterbi
+def DAG::viterbi_forward graph, semiring=ViterbiSemiring, source_node
+  toposorted = DAG::topological_sort(graph)
+  DAG::init(graph, semiring, source_node)
+  toposorted.each { |n|
+    n.outgoing.each { |e|
+      e.head.score = \
+        semiring.add.call(e.head.score, \
+                          semiring.multiply.call(n.score, e.weight)
+        )
+    }
+  }
+end
+
+# Dijkstra algorithm
+#  for A*-search we would need an optimistic estimate of
+#  future cost at each node
+def DAG::dijkstra graph, semiring=RealSemiring.new, source_node
+  DAG::init(graph, semiring, source_node)
+  q = PriorityQueue.new graph
+  while !q.empty?
+    n = q.pop
+    n.outgoing.each { |e|
+      e.head.score = \
+        semiring.add.call(e.head.score, \
+                          semiring.multiply.call(n.score, e.weight))
+      q.sort!
+    }
+  end
+end
+
+# Bellman-Ford algorithm
+def DAG::bellman_ford(graph, semiring=RealSemiring.new, source_node)
+  DAG::init(graph, semiring, source_node)
+  edges = []
+  graph.each { |n| edges |= n.outgoing }
+  # relax edges
+  (graph.size-1).times{ |i|
+    edges.each { |e|
+      e.head.score = \
+        semiring.add.call(e.head.score, \
+                          semiring.multiply.call(e.tail.score, e.weight))
+    }
+  }
+  # we do not allow cycles (negative or positive)
+end
+
+# Floyd algorithm
+def DAG::floyd(graph, semiring=nil)
+  dist_matrix = []
+  graph.each_index { |i|
+    dist_matrix << []
+    graph.each_index { |j|
+      val = 1.0/0.0
+      val = 0.0 if i==j
+      dist_matrix.last << val
+    }
+  }
+  edges = []
+  graph.each { |n| edges |= n.outgoing }
+  edges.each { |e|
+    dist_matrix[graph.index(e.tail)][graph.index(e.head)] = e.weight
+  }
+  0.upto(graph.size-1) { |k|
+    0.upto(graph.size-1) { |i|
+      0.upto(graph.size-1) { |j|
+        if dist_matrix[i][k] + dist_matrix[k][j] < dist_matrix[i][j]
+          dist_matrix  [i][j] = dist_matrix[i][k] + dist_matrix[k][j]
+        end
+      }
+    }
+  }
+  return dist_matrix
+end
+
+
+# returns a list of nodes (graph) and a hash for finding
+# nodes by their label (these need to be unique!)
+def DAG::read_graph_from_json fn, semiring=RealSemiring.new
+  graph = []
+  nodes_by_label = {}
+  h = JSON.parse File.new(fn).read
+  h['nodes'].each { |i|
+    n = DAG::Node.new i['label']
+    graph << n
+    nodes_by_label[n.label] = n
+  }
+  h['edges'].each { |i|
+    n = nodes_by_label[i['tail']]
+    a = n.add_edge(nodes_by_label[i['head']], semiring.convert.call(i['weight'].to_f))
+    nodes_by_label[i['head']].incoming << a
+  }
+  return graph, nodes_by_label
+end
+
+
+end #module
+
diff --git a/lib/zipf/fileutil.rb b/lib/zipf/fileutil.rb
new file mode 100644
index 0000000..eb69136
--- /dev/null
+++ b/lib/zipf/fileutil.rb
@@ -0,0 +1,88 @@
+require 'zlib'
+
+
+class ReadFile
+
+  def initialize fn, encoding='utf-8'
+    if fn.split('.').last == 'gz'
+      @f = Zlib::GzipReader.new(File.new(fn, 'rb'), :external_encoding=>encoding)
+    elsif fn == '-'
+      @f = STDIN
+      STDIN.set_encoding encoding
+    else
+      @f = File.new fn, 'r'
+      @f.set_encoding encoding
+    end
+  end
+
+  def gets
+    @f.gets { |line| yield line }
+  end
+
+  def readlines
+    @f.readlines
+  end
+
+  def self.readlines fn, encoding='utf-8'
+    f = ReadFile.new fn, encoding
+    r = f.readlines
+    f.close
+    return r
+  end
+
+  def readlines_strip
+    self.readlines.map{ |i| i.strip }
+  end
+
+  def self.readlines_strip fn, encoding='utf-8'
+    f = ReadFile.new fn, encoding
+    r = f.readlines_strip
+    f.close
+    return r
+  end
+
+  def read
+    @f.read
+  end
+
+  def self.read fn, encoding='utf-8'
+    f = ReadFile.new fn, encoding
+    r = f.read
+    f.close
+    return r
+  end
+
+  def close
+    @f.close if @f!=STDIN
+  end
+end
+
+class WriteFile
+
+  def initialize fn, encoding='utf-8'
+    if fn.split('.').last == 'gz'
+      @f = Zlib::GzipWriter.new(File.new(fn, 'wb+'), :external_encoding=>encoding)
+    elsif fn == '-'
+      @f = STDOUT
+      STDOUT.set_encoding encoding
+    else
+      @f = File.new fn, 'w+'
+      @f.set_encoding encoding
+    end
+  end
+
+  def write s
+    @f.write s
+  end
+
+  def self.write s, fn, encoding='utf-8'
+    f = WriteFile.new fn, encoding
+    f.write s
+    f.close
+  end
+
+  def close
+    @f.close if @f!=STDIN
+  end
+end
+
diff --git a/lib/zipf/grammar.rb b/lib/zipf/grammar.rb
new file mode 100644
index 0000000..568b9fc
--- /dev/null
+++ b/lib/zipf/grammar.rb
@@ -0,0 +1,123 @@
+module Grammar
+
+
+class T
+  attr_accessor :word
+
+  def initialize word
+    @word = word
+  end
+
+  def to_s
+    "T<#{@word}>"
+  end
+end
+
+class NT
+  attr_accessor :symbol, :index, :span
+
+  def initialize symbol, index=0
+    @symbol = symbol
+    @index = index
+    @span = Span.new
+  end
+
+  def to_s
+    "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>"
+  end
+end
+
+class Rule
+  attr_accessor :lhs, :rhs, :e
+
+  def initialize lhs=nil, rhs=[], e=''
+    @lhs = lhs
+    @rhs = rhs
+    @e = e
+  end
+
+  def to_s
+    "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}"
+  end
+
+  def arity
+    rhs.select { |i| i.class == NT }.size
+  end
+
+  def from_s s
+    _ = splitpipe s, 3
+    @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "")
+    _[1].split.each { |x|
+      x.strip!
+      if x[0]=='[' && x[x.size-1] == ']'
+        @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0])
+      else
+        @rhs << T.new(x)
+      end
+    }
+    @e = _[2]
+  end
+
+  def self.from_s s
+    r = self.new
+    r.from_s s
+    return r
+  end
+end
+
+class Span
+  attr_accessor :left, :right
+
+  def initialize left=nil, right=nil
+    @left = left
+    @right = right
+  end
+end
+
+class Grammar
+  attr_accessor :rules, :startn, :startt, :flat
+
+  def initialize fn
+    @rules = []; @startn = []; @startt = [] ;@flat = []
+    ReadFile.readlines_strip(fn).each_with_index { |s,i|
+      STDERR.write '.'; STDERR.write " #{i+1}\n" if (i+1)%80==0
+      @rules << Rule.from_s(s)
+      if @rules.last.rhs.first.class == NT
+        @startn << @rules.last
+      else
+        if rules.last.arity == 0
+          @flat << @rules.last
+        else
+          @startt << @rules.last
+        end
+      end
+    }
+    STDERR.write "\n"
+  end
+
+  def to_s
+    s = ''
+    @rules.each { |r| s += r.to_s+"\n" }
+    return s
+  end
+
+  def add_glue_rules
+    @rules.map { |r| r.lhs.symbol }.select { |s| s != 'S' }.uniq.each { |symbol|
+      @rules << Rule.new(NT.new('S'), [NT.new(symbol)])
+      @startn << @rules.last
+      @rules << Rule.new(NT.new('S'), [NT.new('S'), NT.new('X')])
+      @startn << @rules.last
+    }
+  end
+
+  def add_pass_through_rules s
+    s.each { |word|
+      @rules << Rule.new(NT.new('X'), [T.new(word)])
+      @flat << @rules.last
+    }
+  end
+end
+
+
+end #module
+
diff --git a/lib/zipf/hg.rb b/lib/zipf/hg.rb
new file mode 100644
index 0000000..f86bf60
--- /dev/null
+++ b/lib/zipf/hg.rb
@@ -0,0 +1,173 @@
+#!/usr/bin/env ruby
+
+require_relative 'semirings'
+require 'json'
+
+
+module HG
+
+
+class HG::Node
+  attr_accessor :label, :cat, :outgoing, :incoming, :score
+
+  def initialize label=nil, cat=nil, outgoing=[], incoming=[], score=nil
+    @label    = label
+    @cat      = cat
+    @outgoing = outgoing
+    @incoming = incoming
+    @score    = nil
+  end
+
+  def to_s
+    "Node<label:\"#{@label}\", cat:\"#{@cat}\", outgoing:#{@outgoing.size}, incoming:#{@incoming.size}>"
+  end
+end
+
+class HG::Hypergraph
+  attr_accessor :nodes, :edges
+
+  def initialize nodes=[], edges=[]
+    @nodes = nodes
+    @edges = edges
+  end
+
+  def arity
+    @edges.map { |e| e.arity }.max
+  end
+
+  def to_s
+    "Hypergraph<nodes:[#{@nodes.to_s}], edges:[#{@edges.to_s}], arity:#{arity}>"
+  end
+end
+
+class HG::Hyperedge
+  attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right
+
+  def initialize head=nil, tails=[], weight=0.0, f={}
+    @head   = head
+    @tails  = tails
+    @weight = weight
+    @f      = f
+    @mark   = 0
+  end
+
+  def arity
+    return @tails.size
+  end
+
+  def marked?
+    arity == @mark
+  end
+
+  def to_s
+    "Hyperedge<head:\"#{@head.label}\", \"tails:#{@tails.map{|n|n.label}}, arity:#{arity}, weight:#{@weight}, f:#{f.to_s}, mark:#{@mark}>"
+  end
+end
+
+def HG::topological_sort nodes
+  sorted = []
+  s = nodes.reject { |n| !n.incoming.empty? }
+  while !s.empty?
+    sorted << s.shift
+    sorted.last.outgoing.each { |e|
+      next if e.marked?
+      e.mark += 1
+      s << e.head if e.head.incoming.reject{ |f| f.mark==f.arity }.empty?
+    }
+  end
+  return sorted
+end
+
+def HG::init nodes, semiring, root
+  nodes.each { |n| n.score=semiring.null }
+  root.score = semiring.one
+end
+
+def HG::viterbi hypergraph, root, semiring=ViterbiSemiring.new
+  toposorted = topological_sort hypergraph.nodes
+  init toposorted, semiring, root
+  toposorted.each { |n|
+    n.incoming.each { |e|
+      s = semiring.one
+      e.tails.each { |m|
+        s = semiring.multiply.call(s, m.score)
+      }
+      n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
+    }
+  }
+end
+
+def HG::viterbi_path hypergraph, root, semiring=ViterbiSemiring.new
+  toposorted = topological_sort hypergraph.nodes
+  init toposorted, semiring, root
+  best_path = []
+  toposorted.each { |n|
+    best_edge = nil
+    n.incoming.each { |e|
+      s = semiring.one
+      e.tails.each { |m|
+        s = semiring.multiply.call(s, m.score)
+      }
+      if n.score < semiring.multiply.call(s, e.weight) # ViterbiSemiring add
+        best_edge = e
+      end
+      n.score = semiring.add.call(n.score, semiring.multiply.call(s, e.weight))
+    }
+    best_path << best_edge
+  }
+  return best_path, toposorted.last.score
+end
+
+def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=false
+  nodes = []
+  edges = []
+  nodes_by_label = {}
+  nodes_by_index = []
+  h = JSON.parse File.new(fn).read
+  w = SparseVector.from_h h['weights']
+  h['nodes'].each { |i|
+    n = Node.new i['label'], i['cat']
+    nodes << n
+    nodes_by_label[n.label] = n
+    nodes_by_index << n
+  }
+  h['edges'].each { |i|
+    e = Hyperedge.new(nodes_by_label[i['head']], \
+                      i['tails'].map{|j| nodes_by_label[j]}.to_a, \
+                      semiring.convert.call(i['weight'].to_f), \
+                      {})
+    e.f = SparseVector.from_h i['f']
+    if log_weights
+      e.weight = Math.exp(w.dot(e.f))
+    else
+      e.weight = w.dot(e.f)
+    end
+    e.tails.each { |m|
+      m.outgoing << e
+    }
+    e.head.incoming << e
+    edges << e
+  }
+  return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index
+end
+
+def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new
+  toposorted = topological_sort hypergraph.nodes
+  paths = [[]]
+  toposorted.each { |n|
+    next if n.incoming.empty?
+    new_paths = []
+    while !paths.empty?
+      p = paths.pop
+      n.incoming.each { |e|
+        new_paths << p+[e]
+      }
+    end
+    paths = new_paths
+  }
+  return paths
+end
+
+
+end #module
+
diff --git a/lib/zipf/misc.rb b/lib/zipf/misc.rb
new file mode 100644
index 0000000..0319a5f
--- /dev/null
+++ b/lib/zipf/misc.rb
@@ -0,0 +1,114 @@
+require 'timeout'
+
+
+class Array
+  def max_index
+    self.index(self.max)
+  end
+
+  def is_subset_of? other
+    self.each { |i|
+      if other.include? i
+       return false
+      end
+    }
+    return true
+  end
+
+  def sum
+    self.inject(:+)
+  end
+
+  def mean
+    self.sum.to_f/self.size
+  end
+end
+
+class String
+
+  def downcase?
+    self[/[[:lower:]]/]
+  end
+end
+
+class PriorityQueue
+# This assumes that elements in the queue
+# have a numerical member named 'score'.
+
+  def initialize a=Array.new
+    @queue = Array.new a
+    sort!
+  end
+
+  def sort!
+    @queue.sort_by! { |i| -i.score }
+  end
+
+  def pop
+    @queue.pop
+  end
+
+  def push i
+    @queue << i
+    sort!
+  end
+
+  def empty?
+    @queue.empty?
+  end
+end
+
+def spawn_with_timeout cmd, t=4, ignore_fail=false, debug=false
+  STDERR.write cmd+"\n" if debug
+  pipe_in, pipe_out = IO.pipe
+  pid = Process.spawn(cmd, :out => pipe_out)
+  begin
+    Timeout.timeout(t) { Process.wait pid }
+  rescue Timeout::Error
+    Process.kill('TERM', pid) if !ignore_fail
+  end
+  pipe_out.close
+  return pipe_in.read
+end
+
+def read_phrase_table fn
+  table = {}
+  f = ReadFile.new fn
+  while raw_rule = f.gets
+    french, english, features = splitpipe(raw_rule)
+    feature_map = SparseVector.from_kv  features
+    if table.has_key? french
+      table[french] << [english, feature_map ]
+    else
+      table[french] = [[english, feature_map]]
+    end
+  end
+  f.close
+  return table
+end
+
+def cdec_kbest cdec_bin, input, ini, weights, k, unique=true
+  require 'open3'
+  cmd = "echo \"#{input}\" | #{cdec_bin} -c #{ini} -w #{weights} -k #{k}"
+  cmd += " -r" if unique
+  o,_ = Open3.capture2 "#{cmd}  2>/dev/null"
+  a = []; j = -1
+  o.split("\n").map{ |i| j+=1; t=Translation.new; t.from_s(i, false, j); a << t }
+  return a
+end
+
+def read_config fn
+  f = ReadFile.new fn
+  cfg = {}
+  while line = f.gets
+    line.strip!
+    next if /^\s*$/.match line
+    next if line[0]=='#'
+    content = line.split('#', 2).first
+    k, v = content.split(/\s*=\s*/, 2)
+    k.strip!; v.strip!
+    cfg[k] = v
+  end
+  return cfg
+end
+
diff --git a/lib/zipf/semirings.rb b/lib/zipf/semirings.rb
new file mode 100644
index 0000000..920923f
--- /dev/null
+++ b/lib/zipf/semirings.rb
@@ -0,0 +1,81 @@
+# Semirings for directed acyclic graphs (dags) (also directed hypergraphs),
+# as described in:
+# 'Dynamic Programming Algorithms in
+#  Semiring and Hypergraph Frameworks' (Liang Huang)
+#
+
+class Semiring
+  attr_accessor :add, :multiply, :one, :null, :convert
+end
+
+class BooleanSemiring < Semiring
+  def initialize
+    @add = Proc.new { |a,b| a||b }
+    @multiply =  Proc.new { |a,b| a&&b }
+    @one = true
+    @null = false
+    @convert = Proc.new { |v| true && v!=0 }
+  end
+end
+
+class ViterbiSemiring < Semiring
+  def initialize
+    @add = Proc.new { |a,b| [a,b].max }
+    @multiply =  Proc.new { |a,b| a*b }
+    @one = 1.0
+    @null = 0.0
+    @convert = Proc.new { |v| v }
+  end
+end
+
+class ViterbiLogSemiring < Semiring
+  def initialize
+    @add = Proc.new { |a,b| [a,b].max }
+    @multiply =  Proc.new { |a,b| a+b }
+    @one = 0.0
+    @null = -1.0/0.0
+    @convert = Proc.new { |v| v }
+  end
+end
+
+class InsideSemiring < Semiring
+  def initialize
+    @add = Proc.new { |a,b| a+b }
+    @multiply =  Proc.new { |a,b| a*b }
+    @one = 1.0
+    @null = 0.0
+    @convert = Proc.new { |v| v }
+  end
+end
+
+class RealSemiring < Semiring
+  def initialize
+    @add = Proc.new { |a,b| [a,b].min }
+    @multiply =  Proc.new { |a,b| a+b }
+    @one = 0.0
+    @null = 1.0/0.0
+    @convert = Proc.new { |v| v }
+  end
+end
+
+# for longest/worst paths
+class RealxSemiring < Semiring
+  def initialize
+    @add = Proc.new { |a,b| [a,b].max }
+    @multiply =  Proc.new { |a,b| a+b }
+    @one = -1.0/0.0
+    @null = 0.0
+    @convert = Proc.new { |v| v }
+  end
+end
+
+class CountingSemiring < Semiring
+  def initialize
+    @add = Proc.new { |a,b| a+b }
+    @multiply =  Proc.new { |a,b| a*b }
+    @one = 1.0
+    @null = 0.0
+    @convert = Proc.new { |v| if v!=0 then 1 else 0 end }
+  end
+end
+
diff --git a/lib/zipf/stringutil.rb b/lib/zipf/stringutil.rb
new file mode 100644
index 0000000..aa9be00
--- /dev/null
+++ b/lib/zipf/stringutil.rb
@@ -0,0 +1,22 @@
+def tokenize s
+  s.strip.split
+end
+
+def ngrams(s, n, fix=false)
+  a = tokenize s
+  a.each_with_index { |tok, i|
+    tok.strip!
+    0.upto([n-1, a.size-i-1].min) { |m|
+      yield a[i..i+m] if !fix||(fix&&a[i..i+m].size==n)
+    }
+  }
+end
+
+def bag_of_words s, stopwords=[]
+  s.strip.split.uniq.sort.reject{ |w| stopwords.include? w }
+end
+
+def splitpipe s, n=3
+  s.strip.split("|"*n)
+end
+
diff --git a/lib/zipf/tfidf.rb b/lib/zipf/tfidf.rb
new file mode 100644
index 0000000..13a40a3
--- /dev/null
+++ b/lib/zipf/tfidf.rb
@@ -0,0 +1,38 @@
+module TFIDF
+
+
+# returns key='raw frequency' for an
+# array-like object
+def TFIDF::tf array, stopwords=[]
+  v = {}; v.default = 0
+  array.uniq.each { |i|
+   next if stopwords.include? i
+   v[i] = array.count(i).to_f
+  }
+  return v
+end
+
+# smoothes raw frequencies of tf() in-place
+# a is a smoothing term
+def TFIDF::ntf hash, a=0.4
+  max = hash.values.max.to_f
+  hash.each_pair { |k,v|
+    hash[k] = a + (1-a)*(v/max)
+  }
+end
+
+# returns idf value for each word in a vocabulary
+def TFIDF::idf list_of_hashes
+  vocab = list_of_hashes.values.flatten.uniq
+  n = list_of_hashes.size.to_f
+  idf = {}
+  vocab.each { |i|
+    df = list_of_hashes.values.flatten.count i
+    idf[i] = Math.log(n/df)
+  }
+  return idf
+end
+
+
+end #module
+
-- 
cgit v1.2.3