#!/usr/bin/env ruby
#
# simple_decoder/simple_decoder.rb
#
# Provenance: added in commit 26c490f404731d053a6205719b6246502c07b449
# ("init") by Patrick Simianer, Sat, 14 Jun 2014 16:46:27 +0200.
#
# Reads one input sentence, collects translation options for every contiguous
# span of its tokens from a phrase table, and then searches over hypotheses
# that cover the input span by span. Every hypothesis that covers the whole
# input is printed to STDOUT together with its score.

require 'nlp_ruby'
require 'trollop'


# Array of hypotheses that can be ordered by ascending score, so that `pop`
# yields the highest-scoring hypothesis after a call to `sort!`.
class OrderedHypothesisStack < Array

  # Sorts the stack in place by ascending hypothesis score.
  def sort!
    sort_by!(&:score)
  end
end

# Plain container for a rule's left side, right side and features.
# (Not referenced by the rest of this script.)
class Rule
  attr_accessor :left, :right, :features

  def initialize(left, right, features)
    @left = left
    @right = right
    @features = features
  end
end

# Tracks which input positions are already covered. Position i is covered
# when the i-th entry of the underlying array is truthy.
class Coverage

  # @param a [Array] one entry per input position (truthy = covered);
  #   the array is stored as-is, not copied
  def initialize(a)
    @_a = a
    @_size = nil
  end

  # Marks every position in the inclusive span tuple[0]..tuple[1] as covered.
  def set(tuple)
    # Drop the memoized count first: it would be stale after the mutation.
    @_size = nil
    tuple[0].upto(tuple[1]) { |i| @_a[i] = true }
  end

  # Returns an independent Coverage with the same positions covered
  # (entries are normalized to true/nil).
  def clone
    Coverage.new(@_a.map { |e| e ? true : nil })
  end

  # String form: the underlying array and the number of covered positions.
  # Must be a String so that `puts` and interpolation show it.
  def to_s
    [@_a.to_s, size].to_s
  end

  # True if no position in the inclusive span tuple[0]..tuple[1] is covered.
  def free?(tuple)
    tuple[0].upto(tuple[1]).none? { |i| @_a[i] }
  end

  # Number of covered positions (memoized until the next `set`).
  def size
    @_size ||= @_a.count { |e| e }
  end
end

# One search state: the option applied last (`rule`, a target-side string in
# this script, nil for the initial hypothesis), the accumulated score, the
# coverage after applying it, the hypothesis it extends and the span it covers.
class Hypothesis
  attr_accessor :rule, :score, :coverage, :previous, :span, :final

  def initialize(rule, score, coverage, previous, span)
    @rule = rule
    @score = score
    @coverage = coverage
    @previous = previous
    @span = span
    @final = false # set once the hypothesis has been printed
  end

  def to_s
    [@score, @rule, @coverage.to_s, "TODO", span.to_s].to_s
  end

  # True if none of the positions in `span` is covered yet.
  def applicable?(span)
    @coverage.free?(span)
  end
end

# Collects the translation options for every contiguous span of the input.
#
# Single-token spans always get a default option: the token itself with
# {'f2e' => -1.0}. Entries found in the phrase table under the span's
# space-joined tokens are appended after it.
#
# @param tokenized_input [Array<String>] input tokens
# @param phrase_table [#has_key?, #[]] presumably maps a source phrase to a
#   list of [target_string, features] pairs -- confirm against
#   read_phrase_table
# @return [Hash] maps [i, j] (inclusive token span) to its list of options
def get_translation_options(tokenized_input, phrase_table)
  translation_options = {}
  last = tokenized_input.size - 1
  0.upto(last) do |i|
    i.upto(last) do |j|
      str = tokenized_input[i..j].join ' '
      options = i == j ? [[str, {'f2e'=>-1.0}]] : []
      if phrase_table.has_key? str
        phrase_table[str].each { |right_side| options << right_side }
      end
      translation_options[[i, j]] = options
    end
  end
  translation_options
end

# Renders a complete hypothesis as one output line (without trailing newline).
#
# Walks the chain of `previous` links back to the initial hypothesis, writes
# each hypothesis' target tokens into the slots starting at its span start,
# and appends " |" to the last slot written for that hypothesis.
#
# @param hyp [Hypothesis] hypothesis to render
# @param input_size [Integer] number of input tokens (initial slot count)
# @return [String] "<score> <slots joined by space>|| <scores along the chain>"
def format_derivation(hyp, input_size)
  slots = Array.new(input_size, '')
  scores = []
  cur_hyp = hyp
  while cur_hyp.rule # the initial hypothesis has no rule
    i = cur_hyp.span[0]
    scores << cur_hyp.score
    cur_hyp.rule.split.each do |tok|
      slots[i] = tok
      i += 1
    end
    slots[i-1] += " |"
    cur_hyp = cur_hyp.previous
  end
  "#{hyp.score} #{slots.join(' ')}|| #{scores.to_s}"
end

# Entry point: parses options, reads the input and phrase table, runs the
# search and prints every complete hypothesis that is popped from the stack.
def main
  cfg = Trollop::options do
    opt :debug, "debug mode", :type => :bool, :default => false
    opt :stack_size, "max. stack size", :type => :int, :default => 100
  end
  if cfg[:debug]
    input = 'a b'
    phrase_table = read_phrase_table './data/debug.phrases'
  else
    line = STDIN.gets
    # gets returns nil at EOF; fail with a message instead of NoMethodError.
    abort 'simple_decoder: no input on STDIN' if line.nil?
    input = line.strip
    phrase_table = read_phrase_table './data/example.phrases'
  end
  tokenized_input = tokenize input
  input_size = tokenized_input.size
  translation_options = get_translation_options tokenized_input, phrase_table
  initial_coverage = Coverage.new Array.new(input_size)
  empty_hypothesis = Hypothesis.new nil, 0.0, initial_coverage, nil, nil
  stack = OrderedHypothesisStack.new
  stack.push empty_hypothesis
  STDERR.write "input size: #{input_size}\n"
  until stack.empty?
    hyp = stack.pop
    translation_options.each_pair do |span, options|
      if hyp.applicable? span
        # All expansions over this span share one coverage object; it is not
        # modified again after this point.
        new_coverage = hyp.coverage.clone
        new_coverage.set span
        options.each do |option|
          stack.push Hypothesis.new(option[0], hyp.score+option[1]['f2e'], new_coverage, hyp, span)
        end
      elsif hyp.coverage.size == input_size && !hyp.final
        # Complete hypothesis: print it once, however many spans are checked.
        puts format_derivation(hyp, input_size)
        hyp.final = true
      end
    end
    stack.sort!
    # Prune in place so the stack stays an OrderedHypothesisStack: on
    # Ruby >= 3.0, slicing with a range returns a plain Array, whose sort!
    # cannot compare Hypothesis objects. The stack is sorted ascending, so the
    # lowest-scoring entries sit at the front.
    excess = stack.size - cfg[:stack_size]
    stack.shift(excess) if excess > 0
  end
end


main