summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Patrick Simianer <p@simianer.de> 2015-12-23 13:53:53 +0100
committer: Patrick Simianer <p@simianer.de> 2015-12-23 13:53:53 +0100
commit: bad89c4f793591c550f2fce3d6669d60b156dd34 (patch)
tree: 6f03e53686a36937aa2123fc2ce74ce861195d23
parent: c69080adb7cf6dbf25c0ed1129fe988163bc26fd (diff)
make_rule_features: produce cdec's rule features (ids and bigrams) from a grammar
-rwxr-xr-x make_rule_features 44
1 files changed, 44 insertions, 0 deletions
diff --git a/make_rule_features b/make_rule_features
new file mode 100755
index 0000000..7adb6e9
--- /dev/null
+++ b/make_rule_features
@@ -0,0 +1,44 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+# Builds the rule-identity feature name for one grammar rule.
+#
+# Every non-terminal of the form [X,<digit 1-9>] in `src` is rewritten to
+# "NX" (its index is dropped); in `tgt` it is rewritten to "N<digit>" (its
+# index is kept). Spaces on both sides are then replaced by underscores.
+#
+# Returns the String "R:X:<source side>:<target side>".
+def mkrf src, tgt
+  source_side = src.gsub(/\[X,[1-9]\]/, "NX").gsub(" ", "_")
+  target_side = tgt.gsub(/\[X,([1-9])\]/, 'N\1').gsub(" ", "_")
+  "R:X:#{source_side}:#{target_side}"
+end
+
+# Builds the bigram feature names for one side of a grammar rule.
+#
+# s - one side of the rule as a String; the caller's object is not modified.
+# t - side tag that is embedded in every feature name. When it equals "S",
+#     each [X,<digit 1-9>] non-terminal becomes a bare "X"; for any other
+#     tag it becomes "X<digit>".
+#
+# The rewritten side is wrapped as "<r> ... </r>" and handed to `ngrams`
+# with the arguments 2 and true. `ngrams` is not defined in this file
+# (presumably provided by the required 'zipf' library -- verify there what
+# exactly it yields).
+#
+# Returns an Array holding one "RB<t>:<yielded tokens joined by '_'>" String
+# per block invocation made by `ngrams`.
+def mkrbf s, t
+  replacement = t == "S" ? "X" : 'X\1'
+  side = s.gsub(/\[X,([1-9])\]/, replacement)
+  bracketed = "<r> #{side} </r>"
+  features = []
+  ngrams(bracketed, 2, true) do |ng|
+    features << "RB#{t}:#{ng.join("_")}"
+  end
+  features
+end
+
+# Read grammar rules from STDIN, one per line, and print every distinct
+# feature name once, in the order it was first produced.
+#
+# `splitpipe` is not defined in this file (presumably provided by the
+# required 'zipf' library); the second and third values it returns are used
+# as the source and target side of the rule.
+h = {} # used as an insertion-ordered set: the keys are the feature names
+while line = STDIN.gets
+  _, src, tgt, _, _ = splitpipe line.strip
+  # A blank or truncated line leaves src/tgt nil; skip it with a warning
+  # instead of aborting the whole run with NoMethodError on nil.strip!.
+  unless src && tgt
+    warn "make_rule_features: skipping malformed line #{STDIN.lineno}"
+    next
+  end
+  src.strip!
+  tgt.strip!
+  # Source-side bigrams first, then target-side bigrams, then the rule id.
+  (mkrbf(src, "S") + mkrbf(tgt, "T")).each { |f| h[f] = true }
+  h[mkrf(src, tgt)] = true
+end
+
+h.each_key { |f| puts f }
+