From bad89c4f793591c550f2fce3d6669d60b156dd34 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Wed, 23 Dec 2015 13:53:53 +0100
Subject: make_rule_features: produce cdec's rule features (ids and bigrams)
 from a grammar

---
 make_rule_features | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100755 make_rule_features

(limited to 'make_rule_features')

diff --git a/make_rule_features b/make_rule_features
new file mode 100755
index 0000000..7adb6e9
--- /dev/null
+++ b/make_rule_features
@@ -0,0 +1,44 @@
+#!/usr/bin/env ruby
+
+require 'zipf'
+
+def mkrf src, tgt
+  s = src.gsub /\[X,[1-9]\]/, "NX"
+  t = tgt.gsub /\[X,([1-9])\]/,'N\1'
+  return "R:X:#{s.gsub(" ","_")}:#{t.gsub(" ","_")}"
+end
+
+def mkrbf s, t
+  s = String.new s
+  if t == "S"
+    s.gsub! /\[X,[1-9]\]/, "X"
+  else
+    s.gsub! /\[X,([1-9])\]/, 'X\1' 
+  end
+  s.reverse!
+  s += " >r<"
+  s.reverse!
+  s += " </r>"
+  a = []
+  ngrams(s, 2, true) { |ng|
+    a << "RB#{t}:#{ng.join "_"}"
+  }
+  return a
+end
+
+h = {}
+while line = STDIN.gets
+  _,src,tgt,_,_ = splitpipe line.strip
+  src.strip!
+  tgt.strip!
+  mkrbf(src, "S").each { |f|
+    h[f] = true
+  }
+  mkrbf(tgt, "T").each { |f|
+    h[f] = true
+  }
+  h [mkrf(src, tgt)] = true
+end
+
+h.keys.each { |f| puts f }
+
-- 
cgit v1.2.3