#!/usr/bin/env ruby
#
# ng -- print the word n-grams of every line read from standard input.
#
# Originally added as the executable `ng` in commit
# db6a6ecfa350cae29739c59df1210d8f76a479c9 ("init", Patrick Simianer,
# 2013-12-05).

# Yields the word n-grams of +s+, each as an Array of tokens.
#
# +s+ is stripped and split on whitespace. For every start position the
# n-grams beginning there are yielded shortest first (1 token, 2 tokens, ...
# up to +n+ tokens), never running past the end of the token list.
#
# @param s [String] the text to tokenize
# @param n [Integer] the maximum n-gram order
# @param fix [Boolean] when truthy, only n-grams of exactly +n+ tokens are
#   yielded; otherwise every order from 1 to +n+ is yielded
# @yieldparam ngram [Array<String>] the tokens of one n-gram
# @return [Array<String>] the token list
def ngrams_it(s, n, fix=false)
  tokens = s.strip.split
  tokens.each_with_index do |tok, i|
    tok.strip!
    # m is the number of tokens that follow the start token; it is capped so
    # that the slice never extends beyond the last token.
    0.upto([n - 1, tokens.size - i - 1].min) do |m|
      ngram = tokens[i..i + m]
      # The previous `!(fix ^ (size == n))` test was only right for a truthy
      # +fix+: with fix=false it dropped the order-n n-grams and kept just
      # the shorter ones. Without +fix+ every order must pass.
      yield ngram if !fix || ngram.size == n
    end
  end
end

# Reads STDIN line by line and prints the n-grams of each line to STDOUT.
#
# The n-grams of one line are joined with +sep+ and written with a single
# +puts+; lines that produce no n-grams print nothing.
#
# @param n [Integer] the maximum n-gram order
# @param fix [Boolean] passed through to +ngrams_it+
# @param sep [String] the string placed between the n-grams of one line
def main(n, fix, sep)
  STDIN.set_encoding 'utf-8'
  STDOUT.set_encoding 'utf-8'
  while (line = STDIN.gets)
    ngrams = []
    ngrams_it(line, n, fix) { |ng| ngrams << ng.join(' ') }
    ngrams.reject! { |ng| ng.strip.size == 0 }
    puts ngrams.join(sep) if ngrams.size > 0
  end
end

# Writes the usage line to STDERR and exits with status 1.
def usage
  STDERR.write "./ng [-n ] [--fix] [--separator ] < \n"
  exit 1
end

if __FILE__ == $0
  require 'trollop'
  opts = Trollop::options do
    opt :n, "Ngrams", :type => :int, :default => 4
    opt :fix, "Don't output lower order Ngrams.", :type => :bool, :default => true
    opt :separator, "separte ngrams of a line by this string", :type => :string, :default => "\n"
  end
  # NOTE(review): this count check runs after Trollop::options. If Trollop
  # removes the options it parsed from ARGV (confirm), only leftover
  # arguments are counted here.
  usage unless [0, 2, 4, 6].include?(ARGV.size)
  main(opts[:n], opts[:fix], opts[:separator])
end