#!/usr/bin/env ruby

require 'zipf'
require 'trollop'


# Reads the documents given on the command line, computes a term-frequency
# based score for every term of every document and prints one
# [document name, scores] pair per line.
#
# Command line options (parsed with Trollop):
#   --documents          one or more input files
#   --filter-stopwords   file with stopwords; only the text before the first
#                        '|' of each line is used
#   --one-item-per-line  treat every line of a document as one item instead of
#                        splitting the document on whitespace
#   --ntf                pass the tf values through TFIDF::ntf
#   --idf                multiply every tf value with the term's idf value
def main
  cfg = Trollop::options do
    # :strings (plural) makes the option value an Array of file names. With
    # :string the value is a single String, which has no #each on Ruby >= 1.9.
    opt :documents, "input files (documents)", :type => :strings, :required => true
    opt :filter_stopwords, "filter stopwords (give file)", :type => :string, :default => nil
    opt :one_item_per_line, "one item per line (allow multi-word items)", :type => :bool, :default => false
    opt :ntf, "length-normalize tf values", :type => :bool
    opt :idf, "weight tf by idf", :type => :bool
  end

  stopwords = []
  if cfg[:filter_stopwords]
    stopwords = ReadFile.readlines(cfg[:filter_stopwords]).map { |line|
      # split may return an empty array (e.g. for "" or "|"), in which case
      # #first is nil; to_s keeps such lines from raising.
      line.split('|').first.to_s.strip
    }.reject { |word| word.empty? }
  end

  # document name => list of items (words or whole lines)
  docs = {}
  cfg[:documents].each { |path|
    if cfg[:one_item_per_line]
      docs[path] = ReadFile.readlines_strip path
    else
      # Without a pattern, split breaks on runs of whitespace and never
      # returns empty tokens, so repeated blanks/newlines do not show up as
      # an empty-string term.
      docs[path] = ReadFile.read(path).split
    end
  }

  # NOTE(review): computed from the unfiltered items of all documents;
  # presumably a term => idf mapping -- confirm against zipf's TFIDF.
  idf_values = TFIDF::idf docs

  # Replace every document's item list with its scores. Only existing keys are
  # reassigned, so the hash can be updated while iterating over it.
  docs.each_pair { |name, words|
    just_tf = TFIDF::tf words, stopwords
    just_tf = TFIDF::ntf(just_tf) if cfg[:ntf]
    scores = just_tf
    if cfg[:idf]
      scores = Hash.new(0.0)
      just_tf.each_pair { |word, f|
        scores[word] = idf_values[word] * f
      }
    end
    docs[name] = scores
  }

  # Each entry is a [name, scores] pair; print its default string form.
  docs.each { |entry| puts entry.to_s }
end


# Script entry point: parse the command line and print the scores.
main