#!/usr/bin/env ruby
#
# pt_bloom -- 'phrase-table => bloom filter' script.
#
# Reads lines from STDIN, takes the first two '|||'-separated fields of each
# line (named src and tgt below; presumably the source and target phrase of a
# phrase-table entry -- confirm against the input format), and inserts the
# string "<src> ||| <tgt>" into a Bloom filter. After all input is consumed
# the filter is dumped to 'pt.bloom' (path relative to the working directory).
#
# Options (declared via Trollop below):
#   size        number of entries in the filter (int, required)
#   error_rate  error rate (float, default 0.01)
#
# Provenance: added in commit 99bb47b07e5cbbb5f91e0f5c6c3138a66ca17fe8
# by Patrick Simianer, Thu, 6 Feb 2014 10:35:59 +0100.

require 'bloom-filter'
require 'trollop'

STDIN.set_encoding 'utf-8'
STDOUT.set_encoding 'utf-8'

cfg = Trollop::options do
  opt :size, "number of entries in the filter", :type => :int, :required => true
  opt :error_rate, "error rate", :type => :float, :default => 0.01
end

f = BloomFilter.new cfg[:size], cfg[:error_rate]

STDIN.each_line.with_index(1) do |line, lineno|
  # The original called a `splitpipe` helper that this script neither defines
  # nor requires; split on the field separator directly. The limit of 3 keeps
  # the first two fields intact and leaves any remaining fields unsplit.
  src, tgt = line.split('|||', 3)

  # A line without a second field would leave tgt nil and crash on strip;
  # report it and move on instead of aborting the whole run.
  if tgt.nil?
    warn "pt_bloom: skipping line #{lineno}: fewer than two '|||'-separated fields"
    next
  end

  f.insert("#{src.strip} ||| #{tgt.strip}")
end

f.dump('pt.bloom')
# NOTE(review): kept from the original; confirm that the installed version of
# the bloom-filter gem provides BloomFilter#close.
f.close