diff options
-rw-r--r-- | example/json/test.json | 57 | ||||
-rw-r--r-- | example/toy/cdec.ini | 3 | ||||
-rw-r--r-- | grammar.rb | 80 | ||||
-rw-r--r-- | hg.rb | 41 | ||||
-rw-r--r-- | parse.rb | 2 | ||||
-rwxr-xr-x | test.rb | 13 |
6 files changed, 98 insertions, 98 deletions
diff --git a/example/json/test.json b/example/json/test.json index c423ddd..ad43ae4 100644 --- a/example/json/test.json +++ b/example/json/test.json @@ -1,48 +1,33 @@ { "weights":{ -"PhraseModel_0":0.0, "PhraseModel_1":0.0, "PhraseModel_2":0.0, "PhraseModel_3":0.0, "PhraseModel_4":0.0, "PhraseModel_5":0.0, "PhraseModel_6":0.0, "PhraseModel_7":0.0, "PhraseModel_8":0.0, "PhraseModel_9":0.0, "PhraseModel_10":0.0, "PhraseModel_11":0.0, "PhraseModel_12":0.0, "PhraseModel_13":0.0, "PhraseModel_14":0.0, "PhraseModel_15":0.0, "PhraseModel_16":0.0, "PhraseModel_17":0.0, "PhraseModel_18":0.0, "PhraseModel_19":0.0, "PhraseModel_20":0.0, "PhraseModel_21":0.0, "PhraseModel_22":0.0, "PhraseModel_23":0.0, "PhraseModel_24":0.0, "PhraseModel_25":0.0, "PhraseModel_26":0.0, "PhraseModel_27":0.0, "PhraseModel_28":0.0, "PhraseModel_29":0.0, "PhraseModel_30":0.0, "PhraseModel_31":0.0, "PhraseModel_32":0.0, "PhraseModel_33":0.0, "PhraseModel_34":0.0, "PhraseModel_35":0.0, "PhraseModel_36":0.0, "PhraseModel_37":0.0, "PhraseModel_38":0.0, "PhraseModel_39":0.0, "PhraseModel_40":0.0, "PhraseModel_41":0.0, "PhraseModel_42":0.0, "PhraseModel_43":0.0, "PhraseModel_44":0.0, "PhraseModel_45":0.0, "PhraseModel_46":0.0, "PhraseModel_47":0.0, "PhraseModel_48":0.0, "PhraseModel_49":0.0, "PhraseModel_50":0.0, "PhraseModel_51":0.0, "PhraseModel_52":0.0, "PhraseModel_53":0.0, "PhraseModel_54":0.0, "PhraseModel_55":0.0, "PhraseModel_56":0.0, "PhraseModel_57":0.0, "PhraseModel_58":0.0, "PhraseModel_59":0.0, "PhraseModel_60":0.0, "PhraseModel_61":0.0, "PhraseModel_62":0.0, "PhraseModel_63":0.0, "PhraseModel_64":0.0, "PhraseModel_65":0.0, "PhraseModel_66":0.0, "PhraseModel_67":0.0, "PhraseModel_68":0.0, "PhraseModel_69":0.0, "PhraseModel_70":0.0, "PhraseModel_71":0.0, "PhraseModel_72":0.0, "PhraseModel_73":0.0, "PhraseModel_74":0.0, "PhraseModel_75":0.0, "PhraseModel_76":0.0, "PhraseModel_77":0.0, "PhraseModel_78":0.0, "PhraseModel_79":0.0, "PhraseModel_80":0.0, "PhraseModel_81":0.0, "PhraseModel_82":0.0, "PhraseModel_83":0.0, "PhraseModel_84":0.0, "PhraseModel_85":0.0, "PhraseModel_86":0.0, "PhraseModel_87":0.0, "PhraseModel_88":0.0, "PhraseModel_89":0.0, "PhraseModel_90":0.0, "PhraseModel_91":0.0, "PhraseModel_92":0.0, "PhraseModel_93":0.0, "PhraseModel_94":0.0, "PhraseModel_95":0.0, "PhraseModel_96":0.0, "PhraseModel_97":0.0, "PhraseModel_98":0.0, "PhraseModel_99":0.0, "logp":2.0, "use_i":0.0, "use_a":0.0, "use_house":0.0, "use_shell":1.0 +"PhraseModel_0":0.0, "PhraseModel_1":0.0, "PhraseModel_2":0.0, "PhraseModel_3":0.0, "PhraseModel_4":0.0, "PhraseModel_5":0.0, "PhraseModel_6":0.0, "PhraseModel_7":0.0, "PhraseModel_8":0.0, "PhraseModel_9":0.0, "PhraseModel_10":0.0, "PhraseModel_11":0.0, "PhraseModel_12":0.0, "PhraseModel_13":0.0, "PhraseModel_14":0.0, "PhraseModel_15":0.0, "PhraseModel_16":0.0, "PhraseModel_17":0.0, "PhraseModel_18":0.0, "PhraseModel_19":0.0, "PhraseModel_20":0.0, "PhraseModel_21":0.0, "PhraseModel_22":0.0, "PhraseModel_23":0.0, "PhraseModel_24":0.0, "PhraseModel_25":0.0, "PhraseModel_26":0.0, "PhraseModel_27":0.0, "PhraseModel_28":0.0, "PhraseModel_29":0.0, "PhraseModel_30":0.0, "PhraseModel_31":0.0, "PhraseModel_32":0.0, "PhraseModel_33":0.0, "PhraseModel_34":0.0, "PhraseModel_35":0.0, "PhraseModel_36":0.0, "PhraseModel_37":0.0, "PhraseModel_38":0.0, "PhraseModel_39":0.0, "PhraseModel_40":0.0, "PhraseModel_41":0.0, "PhraseModel_42":0.0, "PhraseModel_43":0.0, "PhraseModel_44":0.0, "PhraseModel_45":0.0, "PhraseModel_46":0.0, "PhraseModel_47":0.0, "PhraseModel_48":0.0, "PhraseModel_49":0.0, "PhraseModel_50":0.0, "PhraseModel_51":0.0, "PhraseModel_52":0.0, "PhraseModel_53":0.0, "PhraseModel_54":0.0, "PhraseModel_55":0.0, "PhraseModel_56":0.0, "PhraseModel_57":0.0, "PhraseModel_58":0.0, "PhraseModel_59":0.0, "PhraseModel_60":0.0, "PhraseModel_61":0.0, "PhraseModel_62":0.0, "PhraseModel_63":0.0, "PhraseModel_64":0.0, "PhraseModel_65":0.0, "PhraseModel_66":0.0, "PhraseModel_67":0.0, "PhraseModel_68":0.0, "PhraseModel_69":0.0, "PhraseModel_70":0.0, "PhraseModel_71":0.0, "PhraseModel_72":0.0, "PhraseModel_73":0.0, "PhraseModel_74":0.0, "PhraseModel_75":0.0, "PhraseModel_76":0.0, "PhraseModel_77":0.0, "PhraseModel_78":0.0, "PhraseModel_79":0.0, "PhraseModel_80":0.0, "PhraseModel_81":0.0, "PhraseModel_82":0.0, "PhraseModel_83":0.0, "PhraseModel_84":0.0, "PhraseModel_85":0.0, "PhraseModel_86":0.0, "PhraseModel_87":0.0, "PhraseModel_88":0.0, "PhraseModel_89":0.0, "PhraseModel_90":0.0, "PhraseModel_91":0.0, "PhraseModel_92":0.0, "PhraseModel_93":0.0, "PhraseModel_94":0.0, "PhraseModel_95":0.0, "PhraseModel_96":0.0, "PhraseModel_97":0.0, "PhraseModel_98":0.0, "PhraseModel_99":0.0, "logp":2.0, "use_i":0.0, "use_a":0.0, "use_house":15.0, "use_shell":1.0 }, "nodes": [ { "label":"root", "cat":"root" }, -{ "label":"0", "cat":"I", "left":4, "right":5 }, -{ "label":"1", "cat":"Z", "left":4, "right":5 }, -{ "label":"2", "cat":"H", "left":3, "right":5 }, -{ "label":"3", "cat":"JJ", "left":3, "right":5 }, -{ "label":"4", "cat":"V", "left":1, "right":2 }, -{ "label":"5", "cat":"JJ", "left":3, "right":4 }, -{ "label":"6", "cat":"NP", "left":0, "right":1 }, -{ "label":"7", "cat":"NN", "left":3, "right":5 }, -{ "label":"8", "cat":"NP", "left":2, "right":5 }, -{ "label":"9", "cat":"VP", "left":1, "right":5 }, -{ "label":"10", "cat":"Q", "left":0, "right":5 }, -{ "label":"11", "cat":"C", "left":0, "right":5 }, -{ "label":"12", "cat":"B", "left":0, "right":5 }, -{ "label":"13", "cat":"S", "left":0, "right":5 }, -{ "label":"14", "cat":"Goal", "left":0, "right":5 } +{ "label":"0", "cat":"NP", "left":0, "right":1 }, +{ "label":"1", "cat":"V", "left":1, "right":2 }, +{ "label":"2", "cat":"JJ", "left":3, "right":4 }, +{ "label":"3", "cat":"NN", "left":3, "right":5 }, +{ "label":"4", "cat":"NP", "left":2, "right":5 }, +{ "label":"5", "cat":"VP", "left":1, "right":5 }, +{ "label":"6", "cat":"S", "left":0, "right":5 }, +{ "label":"7", "cat":"Goal", "left":0, "right":5 } ], "edges": [ -{"head":"6", "rule":"[NP] ||| ich ||| i ||| logp=-0.5 use_i=1.0", "left":0, "right":1, "spans":"", "tails":[ "root" ], "f":{"logp":-0.5, "use_i":1.0}, "weight":0.367879441171 }, -{"head":"4", "rule":"[V] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0", "left":1, "right":2, "spans":"", "tails":[ "root" ], "f":{"logp":-0.25, "use_saw":1.0}, "weight":0.606530659713 }, -{"head":"5", "rule":"[JJ] ||| kleines ||| small ||| logp=0.0 use_small=1.0", "left":3, "right":4, "spans":"", "tails":[ "root" ], "f":{"logp":0.0, "use_small":1.0}, "weight":1.0 }, -{"head":"5", "rule":"[JJ] ||| kleines ||| little ||| logp=0.0 use_little=1.0", "left":3, "right":4, "spans":"", "tails":[ "root" ], "f":{"logp":0.0, "use_little":1.0}, "weight":1.0 }, -{"head":"0", "rule":"[I] ||| haus ||| house ||| logp=0.0", "left":4, "right":5, "spans":"", "tails":[ "root" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"1", "rule":"[Z] ||| [I,1] ||| [1] ||| logp=0.0", "left":4, "right":5, "spans":"I|||0|||(4,5);", "tails":[ "0" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"2", "rule":"[H] ||| kleines [Z,1] ||| small [1] ||| logp=0.0", "left":3, "right":5, "spans":"Z|||0|||(4,5);", "tails":[ "1" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"7", "rule":"[NN] ||| [JJ,1] haus ||| [1] house ||| logp=0.0 use_house=1.0", "left":3, "right":5, "spans":"JJ|||0|||(3,4);", "tails":[ "5" ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, -{"head":"7", "rule":"[NN] ||| [JJ,1] haus ||| [1] shell ||| logp=0.0 use_shell=1.0", "left":3, "right":5, "spans":"JJ|||0|||(3,4);", "tails":[ "5" ], "f":{"logp":0.0, "use_shell":1.0}, "weight":2.71828182846 }, -{"head":"3", "rule":"[JJ] ||| [H,1] ||| [1] ||| logp=0.0", "left":3, "right":5, "spans":"H|||0|||(3,5);", "tails":[ "2" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"8", "rule":"[NP] ||| ein [NN,1] ||| a [1] ||| logp=0.0 use_a=1.0", "left":2, "right":5, "spans":"NN|||0|||(3,5);", "tails":[ "7" ], "f":{"logp":0.0, "use_a":1.0}, "weight":1.0 }, -{"head":"9", "rule":"[VP] ||| [V,1] [NP,2] ||| [1] [2] ||| logp=0.0", "left":1, "right":5, "spans":"V|||0|||(1,2);NP|||1|||(2,5);", "tails":[ "4","8" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"10", "rule":"[Q] ||| [NP,1] [VP,2] ||| [1] [2] ||| logp=0.0", "left":0, "right":5, "spans":"NP|||0|||(0,1);VP|||1|||(1,5);", "tails":[ "6","9" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"13", "rule":"[S] ||| ich sah ein [JJ,1] ||| i saw a [1] ||| logp=0.0", "left":0, "right":5, "spans":"JJ|||0|||(3,5);", "tails":[ "3" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"13", "rule":"[S] ||| ich [V,1] ein [JJ,2] haus ||| i [1] a [2] ||| logp=0.0", "left":0, "right":5, "spans":"V|||0|||(1,2);JJ|||1|||(3,4);", "tails":[ "4","5" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"13", "rule":"[S] ||| ich sah ein kleines haus ||| i saw a small house ||| logp=0.0", "left":0, "right":5, "spans":"", "tails":[ "root" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"11", "rule":"[C] ||| [Q,1] ||| [1] ||| logp=0.0", "left":0, "right":5, "spans":"Q|||0|||(0,5);", "tails":[ "10" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"12", "rule":"[B] ||| [C,1] ||| [1] ||| logp=0.0", "left":0, "right":5, "spans":"C|||0|||(0,5);", "tails":[ "11" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"13", "rule":"[S] ||| [B,1] ||| [1] ||| logp=0.0", "left":0, "right":5, "spans":"B|||0|||(0,5);", "tails":[ "12" ], "f":{"logp":0.0}, "weight":1.0 }, -{"head":"14", "rule":"[Goal] ||| [S,1] ||| [1] ||| ", "left":0, "right":5, "spans":"S|||0|||(0,5);", "tails":[ "13" ], "f":{}, "weight":1.0 } +{"head":"0", "rule":"[NP@0:1] ||| ich ||| i ||| logp=-0.5 use_i=1.0", "tails":[ "root" ], "f":{"logp":-0.5, "use_i":1.0}, "weight":0.367879441171 }, +{"head":"1", "rule":"[V@1:2] ||| sah ||| saw ||| logp=-0.25 use_saw=1.0", "tails":[ "root" ], "f":{"logp":-0.25, "use_saw":1.0}, "weight":0.606530659713 }, +{"head":"2", "rule":"[JJ@3:4] ||| kleines ||| small ||| logp=0.0 use_small=1.0", "tails":[ "root" ], "f":{"logp":0.0, "use_small":1.0}, "weight":1.0 }, +{"head":"2", "rule":"[JJ@3:4] ||| kleines ||| little ||| logp=0.0 use_little=1.0", "tails":[ "root" ], "f":{"logp":0.0, "use_little":1.0}, "weight":1.0 }, +{"head":"3", "rule":"[NN@3:5] ||| kleines haus ||| small house ||| logp=0.0 use_house=1.0", "tails":[ "root" ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":"3", "rule":"[NN@3:5] ||| kleines haus ||| little house ||| logp=0.0 use_house=1.0", "tails":[ "root" ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":"3", "rule":"[NN@3:5] ||| [JJ@3:4,1] haus ||| [JJ@3:4,1] house ||| logp=0.0 use_house=1.0", "tails":[ "2" ], "f":{"logp":0.0, "use_house":1.0}, "weight":1.0 }, +{"head":"3", "rule":"[NN@3:5] ||| [JJ@3:4,1] haus ||| [JJ@3:4,1] shell ||| logp=0.0 use_shell=1.0", "tails":[ "2" ], "f":{"logp":0.0, "use_shell":1.0}, "weight":2.71828182846 }, +{"head":"4", "rule":"[NP@2:5] ||| ein [NN@3:5,1] ||| a [NN@3:5,1] ||| logp=0.0 use_a=1.0", "tails":[ "3" ], "f":{"logp":0.0, "use_a":1.0}, "weight":1.0 }, +{"head":"5", "rule":"[VP@1:5] ||| [V@1:2,1] [NP@2:5,2] ||| [V@1:2,1] [NP@2:5,2] ||| logp=0.0", "tails":[ "1","4" ], "f":{"logp":0.0}, "weight":1.0 }, +{"head":"6", "rule":"[S@0:5] ||| [NP@0:1,1] [VP@1:5,2] ||| [NP@0:1,1] [VP@1:5,2] ||| logp=0.0", "tails":[ "0","5" ], "f":{"logp":0.0}, "weight":1.0 }, +{"head":"7", "rule":"[Goal@0:5] ||| [S@0:5,1] ||| [S@0:5,1] ||| ", "tails":[ "6" ], "f":{}, "weight":1.0 } ] } diff --git a/example/toy/cdec.ini b/example/toy/cdec.ini index f9c75ff..8276d9b 100644 --- a/example/toy/cdec.ini +++ b/example/toy/cdec.ini @@ -1,4 +1,5 @@ formalism=scfg -grammar=grammar-test +grammar=grammar +#grammar=grammar-test #add_pass_through_rules=true #weights=weights @@ -13,73 +13,79 @@ class T end class NT - attr_accessor :symbol, :index, :span + attr_accessor :symbol, :index, :left, :right - def initialize symbol, index=0 + def initialize symbol=nil, index=nil, left=nil, right=nil @symbol = symbol @index = index - @span = Span.new + @left = left + @right = right + end + + def from_s s + s.delete! '[]' + @symbol, meta = s.split '@' + span, index = meta.split ',' + @left, @right = span.split(':').map { |x| x.to_i } + @index = index.to_i if index + end + + def self.from_s s + n = NT.new + n.from_s s + return n end def to_s - "NT(#{@span.left},#{@span.right})<#{@symbol},#{@index}>" + "NT(#{@left},#{@right})<#{@symbol},#{@index}>" end end class Rule - attr_accessor :lhs, :rhs, :e + attr_accessor :lhs, :rhs, :target, :map - def initialize lhs=nil, rhs=[], e='', span=nil + def initialize lhs=nil, rhs=[], left=nil, right=nil, target=[] @lhs = lhs @rhs = rhs - @e = e - @lhs.span = span if span + @lhs.left = left if lhs + @lhs.right = right if lhs + @target = target + @arity_ = nil end def to_s - "#{lhs} -> #{rhs.map{ |i| i.to_s }.join ' '} [arity=#{arity}] ||| #{@e}" + "#{@lhs} -> #{@rhs.map{ |i| i.to_s }.join ' '} ||| #{@target.map{ |i| i.to_s }.join ' '} [arity=#{arity}]" #FIXME end def arity - rhs.select { |i| i.class == NT }.size + return @arity_ if @arity_ + return rhs.select { |i| i.class == NT }.size end - def from_s s, tail_spans=nil - nt_dict = {} - tail_spans.split(';').each { |i| - symbol, idx, span = i.split('|||') - nt_dict[idx.to_i] = span.gsub('(','').gsub(')','').split(',').map{|i|i.to_i} - } - _ = splitpipe s, 3 - @lhs = NT.new _[0].strip.gsub!(/(\[|\])/, "") - q = 0 - _[1].split.each { |x| + def read_right_ s + a = [] + s.split.each { |x| x.strip! if x[0]=='[' && x[x.size-1] == ']' - @rhs << NT.new(x.gsub!(/(\[|\])/, "").split(',')[0]) - @rhs.last.span.left = nt_dict[q][0] - @rhs.last.span.right = nt_dict[q][1] - q += 1 + a << NT.from_s(x) else - @rhs << T.new(x) + a << T.new(x) end } - @e = _[2] + return a end - def self.from_s s, tail_spans=nil - r = self.new - r.from_s s, tail_spans - return r + def from_s s + lhs, rhs, target = splitpipe s, 3 + @lhs = NT.from_s lhs + @rhs = read_right_ rhs + @target = read_right_ target end -end -class Span - attr_accessor :left, :right - - def initialize left=nil, right=nil - @left = left - @right = right + def self.from_s s + r = self.new + r.from_s s + return r end end @@ -42,17 +42,15 @@ class HG::Hypergraph end class HG::Hyperedge - attr_accessor :head, :tails, :weight, :f, :mark, :rule, :left, :right + attr_accessor :head, :tails, :weight, :f, :mark, :rule - def initialize head=nil, tails=[], weight=0.0, f={}, rule=nil, left=nil, right=nil, tail_spans=nil + def initialize head=nil, tails=[], weight=0.0, f=SparseVector.new, rule=nil @head = head @tails = tails @weight = weight @f = f @mark = 0 - @rule = Grammar::Rule.from_s rule, tail_spans if rule - @rule.lhs.span.left = left if left - @rule.lhs.span.right = right if right + @rule = Grammar::Rule.from_s rule if rule end def arity @@ -150,19 +148,18 @@ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=fal nodes_by_index = [] h = JSON.parse File.new(fn).read w = SparseVector.from_h h['weights'] - h['nodes'].each { |i| - n = Node.new i['label'], i['cat'] + h['nodes'].each { |x| + n = Node.new x['label'], x['cat'] nodes << n nodes_by_label[n.label] = n nodes_by_index << n } - h['edges'].each { |i| - e = Hyperedge.new(nodes_by_label[i['head']], \ - i['tails'].map{|j| nodes_by_label[j]}.to_a, \ - semiring.convert.call(i['weight'].to_f), \ - {}, \ - i['rule'], i['left'], i['right'], i['spans']) - e.f = SparseVector.from_h i['f'] + h['edges'].each { |x| + e = Hyperedge.new(nodes_by_label[x['head']], \ + x['tails'].map { |j| nodes_by_label[j] }.to_a, \ + semiring.convert.call(x['weight'].to_f), \ + SparseVector.from_h(x['f']), \ + x['rule']) if log_weights e.weight = Math.exp(w.dot(e.f)) else @@ -177,7 +174,7 @@ def HG::read_hypergraph_from_json fn, semiring=RealSemiring.new, log_weights=fal return Hypergraph.new(nodes, edges), nodes_by_label, nodes_by_index end -def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new +def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new #FIXME? toposorted = topological_sort hypergraph.nodes paths = [[]] toposorted.each { |n| @@ -194,6 +191,20 @@ def HG::all_paths hypergraph, root, semiring=ViterbiSemiring.new return paths end +def HG::derive path, cur, carry + edge = path.select { |e| e.rule.lhs.symbol==cur.symbol \ + && e.rule.lhs.left==cur.left \ + && e.rule.lhs.right==cur.right }.first + edge.rule.target.each { |i| + if i.class == Grammar::NT + derive path, i, carry + else + carry << i + end + } + return carry +end + end #module @@ -162,7 +162,7 @@ def main n = input.size STDERR.write "> reading grammar\n" - grammar = Grammar::Grammar.new 'example/grammar.3.gz' + grammar = Grammar::Grammar.new 'example/grammars/grammar.3.gz' STDERR.write ">> adding glue grammar\n" #grammar.add_glue_rules STDERR.write ">> adding pass-through grammar\n" @@ -2,14 +2,11 @@ require_relative 'hg' + + semiring = ViterbiSemiring.new hypergraph, nodes_by_label, _ = HG::read_hypergraph_from_json('example/json/test.json', semiring, true) -path, score = HG::viterbi_path hypergraph, nodes_by_label['root'], semiring -path.each { |e| - #puts e.to_s - puts " "+e.rule.to_s -} -s, score = HG::viterbi_string hypergraph, nodes_by_label['root'], semiring -puts s - +path, _ = HG::viterbi_path hypergraph, nodes_by_label['root'], semiring +s = HG::derive path, path.last.rule.lhs, [] +puts s.map { |i| i.word }.join ' ' |