summary | refs | log | tree | commit | diff
path: root/pt_bloom
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2014-02-06 10:35:59 +0100
committer: Patrick Simianer <p@simianer.de> 2014-02-06 10:35:59 +0100
commit: 99bb47b07e5cbbb5f91e0f5c6c3138a66ca17fe8 (patch)
tree: e8b3192f8eedd010a576afbe7b7a5ac5478e7bd3 /pt_bloom
parent: aaca4353663e46ce46a64851dd8e21f5590f7c6c (diff)
'phrase-table => bloom filter' script
Diffstat (limited to 'pt_bloom')
-rwxr-xr-x pt_bloom 24
1 files changed, 24 insertions, 0 deletions
diff --git a/pt_bloom b/pt_bloom
new file mode 100755
index 0000000..2c3928f
--- /dev/null
+++ b/pt_bloom
@@ -0,0 +1,24 @@
+#!/usr/bin/env ruby
+# Builds a Bloom filter from phrase-table lines on STDIN; each key is "src ||| tgt". Output: pt.bloom
+require 'bloom-filter'
+require 'trollop'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+cfg = Trollop::options do
+ opt :size, "number of entries in the filter", :type => :int, :required => true
+ opt :error_rate, "error rate", :type => :float, :default => 0.01
+end
+
+f = BloomFilter.new cfg[:size], cfg[:error_rate]
+while line = STDIN.gets
+ # NOTE(review): splitpipe is not defined in this script; confirm which library provides it.
+ src, tgt = splitpipe(line)[0..1]
+ next unless src && tgt # fewer than two fields (e.g. blank line): skip rather than crash on nil
+ f.insert(src.strip+" ||| "+tgt.strip) # non-mutating strip: same key, safe for frozen strings
+end
+
+f.dump('pt.bloom')
+f.close
+