From 6a637210e65194041206be4fafc02188bfe4e01c Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Wed, 5 Feb 2014 21:56:55 +0100
Subject: init
---
bold_reranking.rb | 288 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 288 insertions(+)
create mode 100755 bold_reranking.rb
(limited to 'bold_reranking.rb')
diff --git a/bold_reranking.rb b/bold_reranking.rb
new file mode 100755
index 0000000..3041ced
--- /dev/null
+++ b/bold_reranking.rb
@@ -0,0 +1,288 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby'
+require 'bloom-filter'
+
+
+class FeatureFactory
+
+ def initialize cfg
+ @use_target_ngrams = false
+ if cfg['ff_target_ngrams']
+ @use_target_ngrams = true
+ args = cfg['ff_target_ngrams'].split
+ @target_ngrams_n = args[0].to_i
+ @target_ngrams_fix = true if args.size==2&&args[1]=='fix'
+ end
+ @use_phrase_pairs = false
+ if cfg['ff_phrase_pairs']
+ @use_phrase_pairs = true
+ @phrase_table = nil
+ args = cfg['ff_phrase_pairs'].split
+ if args.size==2
+ @phrase_table = BloomFilter.load args.last
+ end
+ end
+ @additional_phrase_pairs = {}
+ @binary = false
+ @binary = true if cfg['binary_feature_values']
+ @filter_features = false
+ if cfg['filter_features']
+ @filter_features = true
+ @stopwords_target = ReadFile.new(cfg['filter_features']).readlines.map{ |i| i.strip.downcase }
+ end
+ end
+
+ def produce translation, source
+ f = SparseVector.new
+ phrase_pairs(f, translation, source) if @use_phrase_pairs
+ target_ngrams(f, translation.s) if @use_target_ngrams
+ return f
+ end
+
+ def filter a
+ single_char = only_stop = only_num = 1
+ a.each { |i|
+ single_char = 0 if i.size > 1
+ only_stop = 0 if not @stopwords_target.include? i.downcase
+ only_num = 0 if i.gsub(/[0-9]+/, '').size > 0
+ }
+ return [single_char,only_stop,only_num].max==1
+ end
+
+ def phrase_pairs f, translation, source
+ target_phrases = translation.get_phrases
+ return if !target_phrases
+ spans = translation.get_spans
+ src_tok = source.split.map{ |i| i.strip }
+ src_sz = 0.0
+ name = nil
+ spans.each_with_index { |i,j|
+ next if @filter_features && filter(target_phrases[j])
+ i.pop if i.size==2 && i[0]==i[1]
+ if i.size == 2
+ next if !src_tok[i[0]..i[1]]
+ if @phrase_table
+ pp = "#{src_tok[i[0]..i[1]].join ' '} ||| #{target_phrases[j]}"
+ next if !(@phrase_table.include?(pp) || @additional_phrase_pairs.has_key?(pp))
+ end
+ name = "PP:#{src_tok[i[0]..i[1]].join ','}~#{target_phrases[j].split.join ','}"
+ src_sz = src_tok[i[0]..i[1]].size.to_f
+ else
+ if @phrase_table
+ pp ="#{src_tok[i[0]]} ||| #{target_phrases[j]}"
+ next if !(@phrase_table.include?(pp) || @additional_phrase_pairs.has_key?(pp))
+ end
+ if i[0] >= 0
+ name = "PP:#{src_tok[i[0]]}~#{target_phrases[j]}"
+ src_sz = 1.0
+ end
+ end
+ if @binary
+ f[name] = 1.0
+ else
+ f[name] = src_sz
+ end
+ }
+ end
+
+ def add_phrase_pairs pairs
+ pairs.each { |i| @additional_phrase_pairs[i] = true }
+ end
+
+ def target_ngrams f, s
+ ngrams(s, @target_ngrams_n, @target_ngrams_fix) { |ng|
+ next if @filter_features && filter(ng)
+ name = "NG:"+ng.join("_")
+ if @binary
+ f[name] = 1.0
+ else
+ f[name] += ng.size
+ end
+ }
+ end
+end
+
+class MosesKbestEntryWithPhraseAlignment < Translation
+
+ def initialize
+ super
+ @other_score = -1.0/0
+ end
+
+ def get_phrases
+ @raw.split(/\|-?\d+\||\|\d+-\d+\|/).map{ |i| i.strip }.reject{ |i| i=='' }
+ end
+
+ def _span span
+ if span == '-1'
+ return [-1]
+ else
+ return span.split('-').map { |i| i.to_i }
+ end
+ end
+
+ def get_spans
+ @raw.scan(/\|-?\d+\||\|\d+-\d+\|/).map{ |i| i[1..-2] }.map{ |i| _span i }
+ end
+
+ def other_score model=nil
+ if model
+ @other_score = model.dot(@f)
+ end
+ return @other_score
+ end
+end
+
+class ConstrainedSearchOracle < MosesKbestEntryWithPhraseAlignment
+
+ def from_s s
+ @id = -1
+ @raw = s.strip.split(' : ', 2)[1].gsub(/(\[|\])/, '|')
+ @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
+ @score = 1.0/0
+ @other_score = -1.0/0
+ end
+end
+
+def structured_update model, hypothesis, oracle
+ if hypothesis.s != oracle.s
+ model += oracle.f - hypothesis.f
+ return [model, 1]
+ end
+ return [model, 0]
+end
+
+def ranking_update w, hypothesis, oracle
+ if oracle.other_score <= hypothesis.other_score \
+ && oracle.s != hypothesis.s
+ model += oracle.f - hypothesis.f
+ return [model, 1]
+ end
+ return [model, 0]
+end
+
+def write_model fn, w
+ f = WriteFile.new fn
+ f.write w.to_s+"\n"
+ f.close
+end
+
+def read_additional_phrase_pairs fn
+ f = ReadFile.new fn
+ add = {}
+ while line = f.gets
+ id, phrase_pair = line.split ' ', 2
+ id = id.to_i-1
+ s, t = splitpipe phrase_pair, 3
+ phrase_pair = "#{s.strip} ||| #{t.strip}"
+ if add.has_key? id
+ add[id] << phrase_pair
+ else
+ add[id] = [phrase_pair]
+ end
+ end
+ return add
+end
+
+def usage
+ STDERR.write "#{__FILE__}