init

author: Patrick Simianer <p@simianer.de> 2014-06-14 16:46:27 +0200
committer: Patrick Simianer <p@simianer.de> 2014-06-14 16:46:27 +0200
commit: 26c490f404731d053a6205719b6246502c07b449 (patch)
tree: 3aa721098f1251dfbf2249ecd2736434c13b1d48 /simple_decoder/simple_decoder.rb
1 files changed, 171 insertions, 0 deletions
diff --git a/simple_decoder/simple_decoder.rb b/simple_decoder/simple_decoder.rb
new file mode 100755
index 0000000..fca4eb4
--- /dev/null
+++ b/simple_decoder/simple_decoder.rb
@@ -0,0 +1,171 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby'
+require 'trollop'
+
+
+# Array subclass used as the decoder's hypothesis stack. Its sort! orders
+# the elements by their #score, lowest first, so Array#pop hands back the
+# highest-scoring element.
+class OrderedHypothesisStack < Array
+
+  # Sorts in place by ascending element score and returns self.
+  # Any block given by the caller is ignored.
+  def sort!
+    sort_by!(&:score)
+  end
+end
+
+# Plain record with three readable and writable fields: left, right and
+# features. The constructor stores its arguments unchanged.
+class Rule
+  attr_accessor :left, :right, :features
+
+  def initialize(left, right, features)
+    @left, @right, @features = left, right, features
+  end
+end
+
+# Records which positions of an input have already been covered.
+# Backed by an array with one slot per position; a truthy slot means
+# "covered", nil means "free".
+class Coverage
+
+  # a: the backing array. It is stored by reference, not copied.
+  def initialize a
+    @_a = a
+    @_size = nil # memoized result of #size, nil when not yet computed
+  end
+
+  # Marks every position in the inclusive range tuple[0]..tuple[1] as
+  # covered.
+  def set tuple
+    # The cached count is stale as soon as a slot changes, so drop it first.
+    @_size = nil
+    tuple[0].upto(tuple[1]) { |i| @_a[i] = true }
+  end
+
+  # Returns an independent Coverage whose slots are normalized to
+  # true/nil; changing the copy does not affect the receiver.
+  def clone
+    Coverage.new(@_a.map { |slot| slot ? true : nil })
+  end
+
+  # String form: the slot array followed by the number of covered slots.
+  def to_s
+    [@_a.to_s, size].to_s
+  end
+
+  # True when no position in the inclusive range tuple[0]..tuple[1] is
+  # covered yet.
+  def free? tuple
+    tuple[0].upto(tuple[1]).none? { |i| @_a[i] }
+  end
+
+  # Number of covered positions. Memoized until the next call to #set.
+  def size
+    @_size ||= @_a.count { |slot| slot }
+  end
+end
+
+# One node in a chain of decoding steps: the rule applied, the score so
+# far, the coverage after applying it, the hypothesis it extends and the
+# span it was applied to. `final` starts out false.
+class Hypothesis
+  attr_accessor :rule, :score, :coverage, :previous, :span, :final
+
+  def initialize(rule, score, coverage, previous, span)
+    @rule, @score, @coverage = rule, score, coverage
+    @previous, @span = previous, span
+    @final = false
+  end
+
+  # Debug rendering; the slot for `previous` is still a "TODO" placeholder.
+  def to_s
+    fields = [@score, @rule, @coverage.to_s, "TODO", span.to_s]
+    fields.to_s
+  end
+
+  # Delegates to the coverage: true when `span` is still free there.
+  def applicable?(span)
+    @coverage.free?(span)
+  end
+end
+
+# Builds a hash from every contiguous span [i, j] (inclusive indices into
+# tokenized_input) to its list of translation options.
+# A single-token span always starts with a pass-through option: the token
+# itself with {'f2e'=>-1.0}. Longer spans start empty. Every entry that
+# phrase_table holds under the span's space-joined text is appended.
+def get_translation_options tokenized_input, phrase_table
+  last = tokenized_input.size - 1
+  options = {}
+  (0..last).each do |from|
+    (from..last).each do |to|
+      source = tokenized_input[from..to].join ' '
+      candidates = from == to ? [[source, {'f2e'=>-1.0}]] : []
+      phrase_table[source].each { |rhs| candidates << rhs } if phrase_table.has_key? source
+      options[[from, to]] = candidates
+    end
+  end
+  options
+end
+
+# Renders a completed hypothesis as one output line:
+#   "<score> <segment> | <segment> ||| [<scores along the chain>]"
+# The chain is walked backwards through `previous` until the rule-less
+# start hypothesis. Each rule's text is stored whole in the slot of its
+# span's first source index, so segments come out in source order and a
+# target phrase longer or shorter than its source span cannot overwrite,
+# or leave gaps between, neighbouring segments.
+def format_derivation hyp, input_size
+  segments = Array.new(input_size) { '' }
+  scores = []
+  cur = hyp
+  while cur.rule
+    scores << cur.score
+    segments[cur.span[0]] = "#{cur.rule.split.join(' ')} |"
+    cur = cur.previous
+  end
+  "#{hyp.score} #{segments.reject(&:empty?).join(' ')}|| #{scores}"
+end
+
+# Entry point: reads one input line (or uses the fixed debug input),
+# builds the translation options and runs a stack search in which the
+# best-scoring hypothesis is popped and expanded first. Every hypothesis
+# that covers the whole input is printed to STDOUT when it is popped.
+#
+# Options: --debug       use the input 'a b' and ./data/debug.phrases
+#          --stack-size  maximum number of hypotheses kept (default 100)
+def main
+  cfg = Trollop::options do
+    opt :debug, "debug mode", :type => :bool, :default => false
+    opt :stack_size, "max. stack size", :type => :int, :default => 100
+  end
+  if cfg[:debug]
+    input = 'a b'
+    phrase_table = read_phrase_table './data/debug.phrases'
+  else
+    # gets returns nil at EOF; to_s turns that into an empty input
+    # instead of a NoMethodError on nil.
+    input = STDIN.gets.to_s.strip
+    phrase_table = read_phrase_table './data/example.phrases'
+  end
+  tokenized_input = tokenize input
+  translation_options = get_translation_options tokenized_input, phrase_table
+  initial_coverage = Coverage.new Array.new(tokenized_input.size)
+  stack = OrderedHypothesisStack.new
+  stack.push Hypothesis.new(nil, 0.0, initial_coverage, nil, nil)
+  STDERR.write "input size: #{tokenized_input.size}\n"
+  until stack.empty?
+    hyp = stack.pop
+    # A complete hypothesis has no free span left, so it is emitted once
+    # here instead of being re-checked for every span. The rule-less start
+    # hypothesis is never emitted, which keeps empty input silent.
+    if hyp.rule && hyp.coverage.size == tokenized_input.size
+      puts format_derivation(hyp, tokenized_input.size)
+      hyp.final = true
+      next
+    end
+    translation_options.each_pair { |span, options|
+      next unless hyp.applicable? span
+      # One coverage object is shared by all options of this span; it is
+      # never modified after this point.
+      new_coverage = hyp.coverage.clone
+      new_coverage.set span
+      options.each { |option|
+        stack.push Hypothesis.new(option[0], hyp.score+option[1]['f2e'], new_coverage, hyp, span)
+      }
+    }
+    stack.sort!
+    # Prune in place. Slicing with stack[a..b] returns a plain Array on
+    # Ruby >= 3.0, which would lose the score-based sort! on the next pass.
+    overflow = stack.size - cfg[:stack_size]
+    stack.shift overflow if overflow > 0
+  end
+end
+
+
+# Run the decoder as soon as this file is loaded.
+main
+
author	Patrick Simianer <p@simianer.de>	2014-06-14 16:46:27 +0200
committer	Patrick Simianer <p@simianer.de>	2014-06-14 16:46:27 +0200
commit	26c490f404731d053a6205719b6246502c07b449 (patch)
tree	3aa721098f1251dfbf2249ecd2736434c13b1d48 /simple_decoder/simple_decoder.rb