diff options
author | Patrick Simianer <p@simianer.de> | 2015-12-23 13:53:53 +0100 |
---|---|---|
committer | Patrick Simianer <p@simianer.de> | 2015-12-23 13:53:53 +0100 |
commit | bad89c4f793591c550f2fce3d6669d60b156dd34 (patch) | |
tree | 6f03e53686a36937aa2123fc2ce74ce861195d23 | |
parent | c69080adb7cf6dbf25c0ed1129fe988163bc26fd (diff) |
make_rule_features: produce cdec's rule features (ids and bigrams) from a grammar
-rwxr-xr-x | make_rule_features | 44 |
1 files changed, 44 insertions, 0 deletions
#!/usr/bin/env ruby

# make_rule_features: reads grammar rules from STDIN (one per line) and prints
# every distinct rule feature name once, in order of first appearance:
#   * one rule-id feature per rule       ("R:X:<source>:<target>")
#   * source-side bigram features        ("RBS:<tok>_<tok>")
#   * target-side bigram features        ("RBT:<tok>_<tok>")
#
# `ngrams` and `splitpipe` are not defined in this script; presumably they are
# provided by the zipf library required below -- verify against that gem.

require 'zipf'

# Builds the rule-id feature name for one rule.
#
# Non-terminals of the form "[X,<digit 1-9>]" become "NX" on the source side
# and "N<digit>" on the target side; spaces are replaced by underscores.
#
# @param src [String] source side of the rule
# @param tgt [String] target side of the rule
# @return [String] feature name of the form "R:X:<source>:<target>"
def mkrf(src, tgt)
  s = src.gsub(/\[X,[1-9]\]/, 'NX')
  t = tgt.gsub(/\[X,([1-9])\]/, 'N\1')
  "R:X:#{s.tr(' ', '_')}:#{t.tr(' ', '_')}"
end

# Builds the bigram feature names for one side of a rule.
#
# Non-terminals are rewritten first: to a bare "X" when t == "S", otherwise to
# "X<digit>". The side is then wrapped in "<r>" / "</r>" boundary markers and
# handed to `ngrams`. The argument string is not modified.
#
# @param s [String] one side of a rule
# @param t [String] side tag used in the feature prefix ("S" or "T" in this script)
# @return [Array<String>] feature names of the form "RB<t>:<tok>_<tok>"
def mkrbf(s, t)
  side =
    if t == 'S'
      s.gsub(/\[X,[1-9]\]/, 'X')
    else
      s.gsub(/\[X,([1-9])\]/, 'X\1')
    end
  padded = "<r> #{side} </r>"

  feats = []
  # NOTE(review): (2, true) presumably restricts the yield to exact bigrams --
  # confirm against zipf's `ngrams`.
  ngrams(padded, 2, true) { |ng| feats << "RB#{t}:#{ng.join('_')}" }
  feats
end

# Hash used as an insertion-ordered set of feature names.
features = {}

while (line = STDIN.gets)
  stripped = line.strip
  # Fields 2 and 3 of the split line are the source and target sides.
  _, src, tgt = splitpipe(stripped)

  # Blank or truncated lines have no source/target field; skip them instead of
  # crashing on nil (blank lines silently, anything else with a warning).
  unless src && tgt
    warn "make_rule_features: skipping malformed line: #{stripped}" unless stripped.empty?
    next
  end

  src = src.strip
  tgt = tgt.strip

  mkrbf(src, 'S').each { |f| features[f] = true }
  mkrbf(tgt, 'T').each { |f| features[f] = true }
  features[mkrf(src, tgt)] = true
end

features.each_key { |f| puts f }