#!/usr/bin/env ruby
#
# make_rule_features: produce cdec's rule features (ids and bigrams) from a
# grammar.
#
# Reads grammar rules from STDIN, one rule per line. Each line is split with
# zipf's `splitpipe`; the second field is used as the source side and the
# third field as the target side of the rule.
# NOTE(review): `splitpipe` is presumably splitting on "|||" -- confirm
# against the zipf gem.
#
# After all input has been read, every distinct feature name is printed once
# on STDOUT, in order of first appearance:
#
#   R:X:<src>:<tgt>   rule id feature      (see mkrf)
#   RBS:<a>_<b>       source-side bigrams  (see mkrbf)
#   RBT:<a>_<b>       target-side bigrams  (see mkrbf)

require 'zipf'

# Matches an indexed non-terminal such as "[X,1]"; the index (1-9) is
# captured so that replacements can refer to it as \1.
NONTERMINAL = /\[X,([1-9])\]/

# Builds the rule id feature for a rule.
#
# Source-side non-terminals become "NX" (index dropped), target-side
# non-terminals become "N<index>", and spaces are replaced by underscores.
#
#   mkrf("[X,1] haus", "[X,1] house")  #=> "R:X:NX_haus:N1_house"
#
# @param src [String] source side of the rule
# @param tgt [String] target side of the rule
# @return [String] the feature name
def mkrf(src, tgt)
  source = src.gsub(NONTERMINAL, 'NX').tr(' ', '_')
  target = tgt.gsub(NONTERMINAL, 'N\1').tr(' ', '_')
  "R:X:#{source}:#{target}"
end

# Builds the bigram features for one side of a rule.
#
# When +t+ is "S", non-terminals are rewritten to "X" (index dropped);
# for any other +t+ they are rewritten to "X<index>". The rewritten side is
# prefixed with the "<r>" marker and handed to zipf's `ngrams`; every n-gram
# it yields becomes one "RB<t>:<tok>_<tok>" feature.
# NOTE(review): `ngrams(str, 2, true)` is assumed to yield exactly the pairs
# of adjacent tokens -- confirm against the zipf gem.
#
# The argument is not modified.
#
# @param s [String] one side of a rule
# @param t [String] side tag, "S" for source or "T" for target
# @return [Array<String>] feature names in the order `ngrams` yields them
def mkrbf(s, t)
  replacement = t == 'S' ? 'X' : 'X\1'
  # Leading "<r> " and trailing " " reproduce the padding of the original
  # implementation.
  padded = "<r> #{s.gsub(NONTERMINAL, replacement)} "
  features = []
  ngrams(padded, 2, true) { |ng| features << "RB#{t}:#{ng.join('_')}" }
  features
end

# Reads rules from +input+ and prints each distinct feature once.
#
# Blank lines are skipped silently. Lines that do not provide both a source
# and a target field are skipped with a warning on STDERR instead of
# aborting the whole run with a NoMethodError on nil.
#
# @param input [IO] stream to read grammar rules from
# @return [void]
def main(input = STDIN)
  seen = {} # Hash used as an insertion-ordered set of feature names
  input.each_line.with_index(1) do |line, lineno|
    line = line.strip
    next if line.empty?

    _, src, tgt = splitpipe(line)
    if src.nil? || tgt.nil?
      warn "make_rule_features: skipping malformed rule on line #{lineno}"
      next
    end

    src = src.strip
    tgt = tgt.strip
    features = [*mkrbf(src, 'S'), *mkrbf(tgt, 'T'), mkrf(src, tgt)]
    features.each { |f| seen[f] = true }
  end
  seen.each_key { |f| puts f }
end

main