summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--README.md11
-rwxr-xr-xbold_reranking.rb288
-rw-r--r--example/example.ini19
-rw-r--r--example/example.lists.gzbin0 -> 77183 bytes
-rw-r--r--example/example.oracles10
-rw-r--r--example/example.output10
-rw-r--r--example/example.source10
7 files changed, 348 insertions, 0 deletions
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..33eca72
--- /dev/null
+++ b/README.md
@@ -0,0 +1,11 @@
+bold_reranker
+=============
+
+Reranker that does bold (online) updates on oracles.
+Makes use of my little nlp library for ruby, see here
+https://github.com/pks/nlp_ruby
+
+<code>
+./bold_reranking.rb example/example.ini
+</code>
+
diff --git a/bold_reranking.rb b/bold_reranking.rb
new file mode 100755
index 0000000..3041ced
--- /dev/null
+++ b/bold_reranking.rb
@@ -0,0 +1,288 @@
+#!/usr/bin/env ruby
+
+require 'nlp_ruby'
+require 'bloom-filter'
+
+
+class FeatureFactory
+
+ def initialize cfg
+ @use_target_ngrams = false
+ if cfg['ff_target_ngrams']
+ @use_target_ngrams = true
+ args = cfg['ff_target_ngrams'].split
+ @target_ngrams_n = args[0].to_i
+ @target_ngrams_fix = true if args.size==2&&args[1]=='fix'
+ end
+ @use_phrase_pairs = false
+ if cfg['ff_phrase_pairs']
+ @use_phrase_pairs = true
+ @phrase_table = nil
+ args = cfg['ff_phrase_pairs'].split
+ if args.size==2
+ @phrase_table = BloomFilter.load args.last
+ end
+ end
+ @additional_phrase_pairs = {}
+ @binary = false
+ @binary = true if cfg['binary_feature_values']
+ @filter_features = false
+ if cfg['filter_features']
+ @filter_features = true
+ @stopwords_target = ReadFile.new(cfg['filter_features']).readlines.map{ |i| i.strip.downcase }
+ end
+ end
+
+ def produce translation, source
+ f = SparseVector.new
+ phrase_pairs(f, translation, source) if @use_phrase_pairs
+ target_ngrams(f, translation.s) if @use_target_ngrams
+ return f
+ end
+
+ def filter a
+ single_char = only_stop = only_num = 1
+ a.each { |i|
+ single_char = 0 if i.size > 1
+ only_stop = 0 if not @stopwords_target.include? i.downcase
+ only_num = 0 if i.gsub(/[0-9]+/, '').size > 0
+ }
+ return [single_char,only_stop,only_num].max==1
+ end
+
+ def phrase_pairs f, translation, source
+ target_phrases = translation.get_phrases
+ return if !target_phrases
+ spans = translation.get_spans
+ src_tok = source.split.map{ |i| i.strip }
+ src_sz = 0.0
+ name = nil
+ spans.each_with_index { |i,j|
+ next if @filter_features && filter(target_phrases[j])
+ i.pop if i.size==2 && i[0]==i[1]
+ if i.size == 2
+ next if !src_tok[i[0]..i[1]]
+ if @phrase_table
+ pp = "#{src_tok[i[0]..i[1]].join ' '} ||| #{target_phrases[j]}"
+ next if !(@phrase_table.include?(pp) || @additional_phrase_pairs.has_key?(pp))
+ end
+ name = "PP:#{src_tok[i[0]..i[1]].join ','}~#{target_phrases[j].split.join ','}"
+ src_sz = src_tok[i[0]..i[1]].size.to_f
+ else
+ if @phrase_table
+ pp ="#{src_tok[i[0]]} ||| #{target_phrases[j]}"
+ next if !(@phrase_table.include?(pp) || @additional_phrase_pairs.has_key?(pp))
+ end
+ if i[0] >= 0
+ name = "PP:#{src_tok[i[0]]}~#{target_phrases[j]}"
+ src_sz = 1.0
+ end
+ end
+ if @binary
+ f[name] = 1.0
+ else
+ f[name] = src_sz
+ end
+ }
+ end
+
+ def add_phrase_pairs pairs
+ pairs.each { |i| @additional_phrase_pairs[i] = true }
+ end
+
+ def target_ngrams f, s
+ ngrams(s, @target_ngrams_n, @target_ngrams_fix) { |ng|
+ next if @filter_features && filter(ng)
+ name = "NG:"+ng.join("_")
+ if @binary
+ f[name] = 1.0
+ else
+ f[name] += ng.size
+ end
+ }
+ end
+end
+
+class MosesKbestEntryWithPhraseAlignment < Translation
+
+ def initialize
+ super
+ @other_score = -1.0/0
+ end
+
+ def get_phrases
+ @raw.split(/\|-?\d+\||\|\d+-\d+\|/).map{ |i| i.strip }.reject{ |i| i=='' }
+ end
+
+ def _span span
+ if span == '-1'
+ return [-1]
+ else
+ return span.split('-').map { |i| i.to_i }
+ end
+ end
+
+ def get_spans
+ @raw.scan(/\|-?\d+\||\|\d+-\d+\|/).map{ |i| i[1..-2] }.map{ |i| _span i }
+ end
+
+ def other_score model=nil
+ if model
+ @other_score = model.dot(@f)
+ end
+ return @other_score
+ end
+end
+
+class ConstrainedSearchOracle < MosesKbestEntryWithPhraseAlignment
+
+ def from_s s
+ @id = -1
+ @raw = s.strip.split(' : ', 2)[1].gsub(/(\[|\])/, '|')
+ @s = @raw.gsub(/\s*\|\d+-\d+\||\|-?\d+\|\s*/, ' ').gsub(/\s+/, ' ')
+ @score = 1.0/0
+ @other_score = -1.0/0
+ end
+end
+
+def structured_update model, hypothesis, oracle
+ if hypothesis.s != oracle.s
+ model += oracle.f - hypothesis.f
+ return [model, 1]
+ end
+ return [model, 0]
+end
+
+def ranking_update w, hypothesis, oracle
+ if oracle.other_score <= hypothesis.other_score \
+ && oracle.s != hypothesis.s
+ model += oracle.f - hypothesis.f
+ return [model, 1]
+ end
+ return [model, 0]
+end
+
+def write_model fn, w
+ f = WriteFile.new fn
+ f.write w.to_s+"\n"
+ f.close
+end
+
+def read_additional_phrase_pairs fn
+ f = ReadFile.new fn
+ add = {}
+ while line = f.gets
+ id, phrase_pair = line.split ' ', 2
+ id = id.to_i-1
+ s, t = splitpipe phrase_pair, 3
+ phrase_pair = "#{s.strip} ||| #{t.strip}"
+ if add.has_key? id
+ add[id] << phrase_pair
+ else
+ add[id] = [phrase_pair]
+ end
+ end
+ return add
+end
+
+def usage
+ STDERR.write "#{__FILE__} <config file>\n"
+ exit 1
+end
+
+def main
+ usage if ARGV.size != 1
+ cfg = read_cfg ARGV[0]
+
+ sources = ReadFile.new(cfg['sources']).readlines
+ oracles = ReadFile.new(cfg['oracles']).readlines
+ kbest_lists = read_kbest_lists cfg['kbest_lists'], MosesKbestEntryWithPhraseAlignment
+ iterations = cfg['iterate'].to_i
+ output = WriteFile.new cfg['output']
+ output_model = cfg['output_model']
+ silent = true if cfg['silent']
+ verbose = true if cfg['verbose']
+ cheat = true if cfg['cheat']
+
+ additional_phrase_pairs = nil
+ if cfg['additional_phrase_pairs']
+ additional_phrase_pairs = read_additional_phrase_pairs cfg['additional_phrase_pairs']
+ end
+
+ ff = FeatureFactory.new cfg
+ if !silent
+ STDERR.write "Running online-reranker with config '#{File.expand_path ARGV[0]}'\n"
+ cfg.each_pair { |k,v| STDERR.write " #{k} = #{v}\n" }
+ STDERR.write "\n"
+ end
+
+ model = SparseVector.new
+ if cfg['init_model']
+ model.from_s ReadFile.new(cfg['init_model']).read
+ end
+
+ sz = sources.size
+ start = Time.now
+ iterations.times {
+ |t|
+ overall_errors = 0
+ STDERR.write "Iteration #{t+1} of #{iterations}\n"
+ sources.each_with_index { |i,j|
+ STDERR.write " #{j+1}\n" if (j+1)%10==0 && !silent&&!verbose
+
+ ff.add_phrase_pairs(additional_phrase_pairs[j]) if additional_phrase_pairs
+
+ kbest = kbest_lists[j]
+ kbest.each { |k|
+ k.f = ff.produce k, sources[j]
+ k.other_score model
+ }
+
+ hypothesis = kbest[ kbest.map{ |k| k.other_score }.max_index ]
+
+ if !cheat
+ output.write "#{hypothesis.s}\n"
+ end
+
+ oracle = ConstrainedSearchOracle.new
+ oracle.from_s oracles[j]
+ oracle.f = ff.produce oracle, sources[j]
+ oracle.other_score model
+
+ err = 0
+ case cfg['update']
+ when 'structured'
+ model, err = structured_update model, hypothesis, oracle
+ when 'ranking'
+ model, err = ranking_update model, hypothesis, oracle
+ else
+ STDERR.write "Don't know update method '#{cfg['update']}', exiting.\n"
+ exit 1
+ end
+ overall_errors += err
+
+ if cheat
+ kbest.each { |k| k.other_score model }
+ hypothesis = kbest[ kbest.map{ |k| k.other_score }.max_index ]
+ output.write "#{hypothesis.s}\n"
+ end
+
+ if verbose
+ counts = { 'PP'=>0, 'NG'=>0 }
+ model.each_pair { |k,v|
+ counts[k.split(':').first] += 1
+ }
+ STDERR.write "errors=#{overall_errors}; model size=#{model.size} (PP #{counts['PP']}, ng #{counts['NG']})\n" if verbose
+ end
+ }
+ }
+
+ elapsed = Time.now - start
+ STDERR.write"#{elapsed.round 2} s, #{(elapsed/Float(sz)).round 2} s per kbest; model size: #{model.size}\n\n" if !silent
+ write_model(output_model, model) if output_model
+ output.close
+end
+
+
+main
+
diff --git a/example/example.ini b/example/example.ini
new file mode 100644
index 0000000..46686bd
--- /dev/null
+++ b/example/example.ini
@@ -0,0 +1,19 @@
+sources = example/example.source
+oracles = example/example.oracles
+kbest_lists = example/example.lists.gz
+
+update = structured # ranking
+ff_target_ngrams = 4 # 4 fix
+ff_phrase_pairs = true # true /path/to/phrase_table
+#filter_features = /path/to/target/stopwords_file
+binary_feature_values = true
+iterate = 3
+
+output = -
+output_model = /dev/null
+#additional_phrase_pairs = /path/to/list/of/additional_phrase_pairs
+#init_model = /path/to/model
+
+#silent = true
+#cheat = true
+verbose = true
diff --git a/example/example.lists.gz b/example/example.lists.gz
new file mode 100644
index 0000000..45ab7ba
--- /dev/null
+++ b/example/example.lists.gz
Binary files differ
diff --git a/example/example.oracles b/example/example.oracles
new file mode 100644
index 0000000..ddf62f5
--- /dev/null
+++ b/example/example.oracles
@@ -0,0 +1,10 @@
+11101 4 -86 : Locking device [0] with a large [1-2] number [-1] of [-1] locking [-1] combinations [-1] . [4]
+1111101111111111111111111111011111111011111111111111111111111111111111111111111 78 -42 : The invention relates to a [0-4] locking device [9-10] which [11-12] consists [-1] of [26-27] a [31-33] lock [35-36] and [29-30] key [13-14] and [6-7] which [8] has [19-20] tumblers [18] transferable [-1] into [55-58] the [62-65] release position [59-60] by means [52] of [49-50] the [46-47] key [51] and [40] , [38-39] particularly [22] for a large [23-25] number [-1] of [34] lock [-1] combinations [-1] , [61] proposes [-1] that [-1] at least one [66-67] of the [69-70] tumblers [-1] ( [73] 20 [-1] ) [75] , [77] in [-1] addition [-1] to [-1] or [-1] alternatively to [41-42] its [43] direct control [44-45] by [-1] the [-1] basic code [48] of [-1] the [-1] corresponding [-1] key [-1] , [21] can be [17] brought [-1] into the release position [15-16] by [-1] means [-1] of a magnetic coil [53-54] ( [-1] 43 [-1] ) [-1] which [-1] can [-1] be energised [76] by [-1] a [-1] reading device [72] ( [-1] 44 ) [74] detecting [71] at [-1] least [-1] one [-1] supplementary code [68] of [-1] the [-1] key [-1] . [78]
+111111111111111111111 20 72 : The present invention relates to a [0-3] locking [9] device which consists of a lock [4-5] and a key [6-7] and [8] which [10-11] has [19] tumblers [18] that can be [17] brought [-1] into a release position [15-16] by [12] the key [13-14] . [20]
+11111 4 96 : Such [0] locking devices [1] are known . [2-4]
+1111111111111011111 12 -74 : They [0] can be formed [16-18] , [10-11] for [-1] instance [-1] , [8-9] of [2-3] locking [7] cylinders [-1] which [6] can be [1] locked [-1] by [4] flat keys [5] or [-1] else [-1] also [-1] of [-1] locking devices [15] which [14] are [-1] operated [-1] with [12] magnetic [-1] keys [-1] . [-1]
+111100111110101111111111111111 29 -25 : All [0] of [6-10] these [1] locking devices [2] share [-1] the [19-20] common [-1] feature [-1] that [-1] while [-1] the [23-24] number [-1] of [16-17] locking [-1] combinations [-1] is [3] frequently [12] considerable [14-15] , [18] this [-1] number [-1] however [-1] , could be [27-28] further increased [25-26] in order to increase [21-22] security [-1] . [29]
+1111111101111111111111111111111 30 71 : Particularly [0] with respect to [1-3] locking systems [4] having [5] master [-1] and [6-7] specific [-1] keys [9] there is [10] a need [11-13] to make available [21-24] in [14] a locking system [15-16] a [17] large number of [18-19] keys [20] which have different [25-27] access authorizations [28] . [29-30]
+10111111111111111111111111111 28 66 : Such [2-3] locking systems [4] frequently [0] have the disadvantage that [5-8] the [9-10] main [-1] master [-1] key [11] which [12-13] can be used [19-20] for all locks [14-16] of the locking system [17-18] has [27] only [21-22] a relatively [23-24] simple coding [25-26] . [28]
+111111111111111111011111111110111110011111111101111 50 8 : In the case of [0-3] locking devices [4] which [5-6] are used [11-12] by [7] continuously changing [8-9] users [10] ( for instance [13-16] , [22-23] in [17] hotels [-1] ) [19-21] , [37-38] it [40] is [27] necessary [-1] to ensure that [24-26] the [28] previous [-1] user [-1] cannot [30-31] easily [32] obtain [-1] a [33-34] copy [-1] of the [39] key [-1] , [-1] which [-1] would [-1] raise [-1] the [44] possibility [-1] of [-1] his [-1] unauthorized [41] access into the [42-43] room [48] inhabited [47] by the [45] subsequent [-1] user [-1] . [49-50]
+1111001111101111110111111111 27 44 : In many known [0-2] locking devices [3] an increase in the number of [6-10] locking [-1] combinations [-1] also [12] means [-1] an [13] increase in the [15-16] structural [14] size [-1] of the [24] key [17] , which has a [19-21] detrimental effect on [22-23] convenience [25] . [26-27]
diff --git a/example/example.output b/example/example.output
new file mode 100644
index 0000000..546e694
--- /dev/null
+++ b/example/example.output
@@ -0,0 +1,10 @@
+Locking device with large Schliesskombinationsanzahl .
+The invention relates to a locking device consisting of a lock and key , which by means of the key in the tractor tumblers , and proposes , in particular for a high number of Schließkombinationen before , that at least one of the tumblers ( 20 ) , in addition to or as an alternative to the direct controlling of the basic code of the corresponding key by means of a magnet coil ( 43 ) in the release position , the at least one supplementary code of the key is a reader ( 44 ) is excited .
+The invention relates to a device consisting of a lock and key , by means of the key in the tractor tumblers .
+This type of locking devices are known .
+They can be , for example , of flat keys which lock cylinder or , however , also be formed with Magnetschlüsseln operating mechanisms .
+All of these closing devices have in common is that the number of Schließkomtinationen often a considerable size which , however , in order to increase the security can be increased even further .
+In particular in view of locking systems with lye and minor keys , there is a need , within a locking system , a large number of keys available which have different access authorizations .
+Frequently , such locking systems have the disadvantage that the key , for all of the locks of the locking system is used , only a relatively simple coding .
+If it is decoupled from the constantly changing users are used ( e.g. in Hotelbetrieb ) , so it is also ensured that the Vorbenutzer not in a simple manner a Schlüsselkopie throughout , it can access into the space occupied by the Nachbenutzer relinquished .
+In many known locking devices , with an increase in the number of the Schließkombinationen also a constructional enlargement of the key , which has a disadvantageous effect on the comfort .
diff --git a/example/example.source b/example/example.source
new file mode 100644
index 0000000..56b617c
--- /dev/null
+++ b/example/example.source
@@ -0,0 +1,10 @@
+Schliesseinrichtung mit grosser Schliesskombinationsanzahl .
+Die Erfindung betrifft eine aus Schloß und Schlüssel bestehende Schließeinrichtung , die mittels des Schlüssels in Freigabestellung überführbare Zuhaltungen aufweist und schlägt insbesondere für eine hohe Anzahl von Schließkombinationen vor , daß mindestens eine der Zuhaltungen ( 20 ) zusätzlich oder alternativ zu ihrer direkten Steuerung von dem Grundcode des entsprechenden Schlüssels mittels einer Magnetspule ( 43 ) in Freigabestellung bringbar ist , die von einer mindestens einen Ergänzungscode des Schlüssels erfassenden Leseeinrichtung ( 44 ) erregbar ist .
+Die Erfindung betrifft eine aus Schloß und Schlüssel bestehende Schließeinrichtung , die mittels des Schlüssels in Freigabestellung überführbare Zuhaltungen aufweist .
+Derartige Schließeinrichtungen sind bekannt .
+Sie können beispielsweise von mit Flachschlüsseln schließbaren Schließzylinder oder aber auch von mit Magnetschlüsseln arbeitenden Schließeinrichtungen gebildet sein .
+Allen diesen Schließeinrichtungen ist es gemeinsam , daß die Anzahl der Schließkomtinationen oftmals zwar eine beträchtliche Größe aufweist , die jedoch zur Erhöhung der Sicherheit noch vergrößert werden könnte .
+Insbesondere im Hinblick auf Schließanlagen mit über- und untergeordneten Schlüsseln besteht der Bedarf , innerhalb einer Schließanlage eine Vielzahl von Schlüsseln zur Verfügung zu stellen , die unterschiedliche Zugangsberechtigungen besitzen .
+Häufig besteht bei derartigen Schließanlagen der Nachteil , daß gerade der Generalschlüssel , der für sämtliche Schlösser der Schließanlage benutzbar ist , nur eine relativ einfache Codierung besitzt .
+Handelt es sich um Schließeinrichtungen , die von ständig wechselnden Benutzern benutzt werden ( z . B. im Hotelbetrieb ) , so ist überdies sicherzustellen , daß sich der Vorbenutzer nicht auf einfachem Wege eine Schlüsselkopie beschaffen kann , die ihm unberechtigten Zugang in den vom Nachbenutzer bewohnten Raum verschafft .
+Bei vielen bekannten Schließeinrichtungen geht mit einer Erhöhung der Anzahl der Schließkombinationen auch eine bauliche Vergrößerung des Schlüssels einher , was sich nachteilig auf den Komfort auswirkt .