simple_decoder/simple_decoder.rb


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

#!/usr/bin/env ruby

require 'nlp_ruby'
require 'trollop'


# Array subclass whose elements are ordered by their +score+.
class OrderedHypothesisStack < Array

  # Reorders the stack in place by ascending element +score+, so the element
  # with the highest score ends up last (where +pop+ takes it from).
  # Returns the stack itself.
  def sort!
    sort_by!(&:score)
  end
end

# Mutable record holding the left side, right side and features of a rule.
class Rule
  attr_accessor :left, :right, :features

  # Stores the three components as given; nothing is copied or validated.
  def initialize(left, right, features)
    @left, @right, @features = left, right, features
  end
end

# Tracks which positions of a sequence are already covered.
#
# Backed by an array with one slot per position; a truthy slot means the
# position is covered.
class Coverage

  # a:: array with one entry per position (truthy = covered). The array is
  #     kept by reference, not copied.
  def initialize a
    @_a = a
    @_size = nil # memoized result of #size; nil while unknown
  end

  # Marks every position of the inclusive span +tuple+ = [first, last] as
  # covered.
  def set tuple
    # The cached count is no longer valid once the coverage changes.
    @_size = nil
    tuple[0].upto(tuple[1]) { |i| @_a[i] = true }
  end

  # Returns an independent Coverage with the same covered positions, so that
  # marking positions on the copy leaves this object untouched.
  def clone
    Coverage.new(@_a.map { |covered| covered ? true : nil })
  end

  # Returns a two-element array: the printed slot array and the number of
  # covered positions.
  # NOTE(review): this is an Array rather than a String; Hypothesis#to_s embeds
  # it in its own array, so the shape is kept as is.
  def to_s
    [@_a.to_s, size]
  end

  # True when no position inside the inclusive span +tuple+ = [first, last]
  # is covered yet.
  def free? tuple
    (tuple[0]..tuple[1]).none? { |i| @_a[i] }
  end

  # Number of covered positions. The count is cached until the next #set.
  def size
    @_size ||= @_a.count { |covered| covered }
  end
end

# One search state: the rule that was applied, the accumulated score, the
# coverage after applying it, the hypothesis it was built from and the span
# the rule was applied to.
class Hypothesis
  attr_accessor :rule, :score, :coverage, :previous, :span, :final

  # All arguments are stored as given. +final+ always starts out false.
  def initialize(rule, score, coverage, previous, span)
    @rule, @score, @coverage = rule, score, coverage
    @previous, @span = previous, span
    @final = false
  end

  # Printable summary: score, rule, coverage and span; the "TODO" entry is a
  # fixed placeholder.
  def to_s
    fields = [@score, @rule, @coverage.to_s, "TODO", span.to_s]
    fields.to_s
  end

  # True when +span+ is still free in this hypothesis' coverage.
  def applicable?(span)
    @coverage.free?(span)
  end
end

# Collects the translation options for every contiguous span of the input.
#
# tokenized_input:: array of source tokens
# phrase_table::    maps a space-joined source phrase to a list of options
#
# Returns a hash keyed by [first, last] (inclusive token indices). Every span
# gets an entry, even when it ends up empty. Single-token spans start with a
# pass-through option that keeps the token itself at an 'f2e' score of -1.0;
# the phrase-table entries for the span's phrase are appended after it.
def get_translation_options(tokenized_input, phrase_table)
  last_index = tokenized_input.size - 1
  options_by_span = {}
  (0..last_index).each do |first|
    (first..last_index).each do |last|
      phrase = tokenized_input[first..last].join(' ')
      candidates = first == last ? [[phrase, {'f2e'=>-1.0}]] : []
      if phrase_table.has_key?(phrase)
        phrase_table[phrase].each { |entry| candidates << entry }
      end
      options_by_span[[first, last]] = candidates
    end
  end
  options_by_span
end

# Renders one complete hypothesis as an output line.
#
# Walks the +previous+ chain from +hyp+ back to the initial hypothesis (the one
# whose rule is nil). Each rule's tokens are written into the slots starting at
# the first index of the rule's span, and the last token of every rule gets a
# " |" marker. The accumulated score of each visited hypothesis is collected in
# visiting order, i.e. the most recently applied rule comes first.
#
# hyp::        the complete hypothesis to print
# input_size:: number of source tokens (one output slot per token)
#
# Returns the line, terminated by a newline.
def format_derivation hyp, input_size
  slots = Array.new(input_size) { '' }
  scores = []
  cur_hyp = hyp
  while cur_hyp.rule
    pos = cur_hyp.span[0]
    scores << cur_hyp.score
    cur_hyp.rule.split.each { |tok|
      slots[pos] = tok
      pos += 1
    }
    slots[pos-1] += " |"
    cur_hyp = cur_hyp.previous
  end
  "#{hyp.score} #{slots.join(' ')}|| #{scores.to_s}\n"
end

# Command-line entry point.
#
# Reads one input line from STDIN (or uses the fixed input 'a b' with --debug),
# loads the matching phrase table and searches over hypotheses. The stack is
# kept sorted by score: the best hypothesis is popped, expanded with every
# option of every span that is still free, and the stack is then cut back to
# at most --stack-size entries. A popped hypothesis that covers the whole input
# is printed to STDOUT instead of being expanded.
#
# +tokenize+ and +read_phrase_table+ are not defined in this file; presumably
# they come from nlp_ruby -- verify.
def main
  cfg = Trollop::options do
    opt :debug, "debug mode", :type => :bool, :default => false
    opt :stack_size, "max. stack size", :type => :int, :default => 100
  end
  if !cfg[:debug]
    # gets returns nil at end of input; treat that as an empty line.
    input = (STDIN.gets || '').strip
    phrase_table = read_phrase_table './data/example.phrases'
  else
    input = 'a b'
    phrase_table = read_phrase_table './data/debug.phrases'
  end
  tokenized_input = tokenize input
  input_size = tokenized_input.size
  translation_options = get_translation_options tokenized_input, phrase_table
  # One nil slot per source token: nothing is covered yet.
  initial_coverage = Coverage.new Array.new(input_size)
  empty_hypothesis = Hypothesis.new nil, 0.0, initial_coverage, nil, nil
  stack = OrderedHypothesisStack.new
  stack.push empty_hypothesis
  STDERR.write "input size: #{input_size}\n"
  until stack.empty?
    hyp = stack.pop
    if input_size > 0 && hyp.coverage.size == input_size
      # Every position is covered, so no span can be applied any more:
      # print the translation once instead of expanding.
      unless hyp.final
        puts format_derivation(hyp, input_size)
        hyp.final = true
      end
    else
      translation_options.each_pair { |span, options|
        next unless hyp.applicable? span
        # All hypotheses created for this span share one coverage object; it
        # is not modified again after this point.
        new_coverage = hyp.coverage.clone
        new_coverage.set span
        options.each { |option|
          stack.push Hypothesis.new option[0], hyp.score+option[1]['f2e'], new_coverage, hyp, span
        }
      }
    end
    stack.sort!
    # Drop the lowest-scoring entries in place. Slicing (stack[a..b]) would
    # return a plain Array on Ruby >= 3.0 and lose the score-based sort!.
    excess = stack.size - cfg[:stack_size]
    stack.shift excess if excess > 0
  end
end


# Run the decoder as soon as this file is loaded.
main