From 14f7bbab028d781cd2057a348862f911324338fd Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Tue, 23 Jun 2015 16:03:23 +0200 Subject: overhaul --- de-tok.rb | 36 -------- index.php | 2 +- kill | 4 - run_server | 6 -- server.rb | 251 +++++++++++++++++++++++++++++++------------------------ static/debug.css | 7 ++ util/de-tok.rb | 36 ++++++++ util/kill | 4 + util/run_server | 8 ++ util/truecase.rb | 30 +++++++ views/debug.haml | 71 ++++++++++++++++ 11 files changed, 297 insertions(+), 158 deletions(-) delete mode 100755 de-tok.rb delete mode 100755 kill delete mode 100755 run_server create mode 100644 static/debug.css create mode 100755 util/de-tok.rb create mode 100755 util/kill create mode 100755 util/run_server create mode 100755 util/truecase.rb create mode 100644 views/debug.haml diff --git a/de-tok.rb b/de-tok.rb deleted file mode 100755 index 92c563f..0000000 --- a/de-tok.rb +++ /dev/null @@ -1,36 +0,0 @@ -#!/usr/bin/env ruby - -require 'nanomsg' -require 'open3' -require 'trollop' - -conf = Trollop::options do - opt :action, "tokenize (T) or detokenize (D)", :type => :string, :requred => true - opt :addr, "socket address", :short => "-S", :type => :string, :required => true - opt :scripts, "path to scripts directory", :short => "-p", :type => :string, :required => true - opt :lang, "language", :short => "-l", :type => :string, :required => true -end - -sock = NanoMsg::PairSocket.new -sock.bind conf[:addr] -sock.send "hello" - -if conf[:action] == "D" - cmd = "#{conf[:scripts]}/detokenizer.perl -q -b -u -l #{conf[:lang]}" -elsif conf[:action] == "T" - cmd = "#{conf[:scripts]}/tokenizer-no-escape.perl -q -b -a -l #{conf[:lang]}" -else - # ERROR -end -while true - inp = sock.recv - break if !inp||inp=="shutdown" - Open3.popen3(cmd) do |pin, pout, perr| - pin.write inp - pin.close - sock.send pout.gets.strip - end -end - -sock.send "off" - diff --git a/index.php b/index.php index 3947b42..f694b72 100644 --- a/index.php +++ b/index.php @@ -44,7 +44,7 @@ Document overview raw_source_segments as $s) { diff --git a/kill b/kill deleted file mode 100755 index 34d7ed4..0000000 --- a/kill +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -for i in {1..6}; do ps ax | grep -P "(server.rb|atools|net_fa|sa.extract|dtrain)" | grep -v vim | grep -v -P "^\s\+$" | cut -d " " -f $i | xargs kill -9; done - diff --git a/run_server b/run_server deleted file mode 100755 index 91d9872..0000000 --- a/run_server +++ /dev/null @@ -1,6 +0,0 @@ -#!/bin/bash -x - -export LD_LIBRARY_PATH=/fast_scratch/simianer/lfpe/nanomsg-0.5-beta/lib -export PYTHONPATH=~/.local/lib/python2.7/site-packages -./kill;./kill;rm ../example_session/work/lockfile; ./server.rb ../example_session/conf.rb 2>server.err > server.out - diff --git a/server.rb b/server.rb index fdc99fd..2122979 100755 --- a/server.rb +++ b/server.rb @@ -7,28 +7,37 @@ require 'nanomsg' require 'zipf' require 'digest' require 'json' +require 'haml' -# load configuration file and setup global variables -require_relative "#{ARGV[0]}" -$lock = false # lock if currently learning/translating -$last_reply = nil # cache last reply -$confirmed = true # client received translation? -if !FileTest.exist? LOCK_FILE - $db = {} # FIXME: that is supposed to be a database connection - $env = {} +# ############################################################################# +# Load configuration file and setup global variables +# ############################################################################# +require_relative "#{ARGV[0]}" # load configuration for this session +$lock = false # lock if currently learning/translating +$last_reply = nil # cache last reply +$confirmed = true # client received translation? +if !FileTest.exist? LOCK_FILE # locked? + $db = {} # FIXME: that is supposed to be a database connection + $env = {} # environment variables (socket connections to daemons) end +# ############################################################################# +# Daemons +# ############################################################################# $daemons = { - :detokenizer => "/fast_scratch/simianer/lfpe/lfpe/de-tok.rb -a D -S '__ADDR__' -p #{SCRIPTS_DIR} -l #{TARGET_LANG}", - :tokenizer => "/fast_scratch/simianer/lfpe/lfpe/de-tok.rb -a T -S '__ADDR__' -p #{SCRIPTS_DIR} -l #{TARGET_LANG}", + :detokenizer => "/fast_scratch/simianer/lfpe/lfpe/util/de-tok.rb -a D -S '__ADDR__' -p #{SCRIPTS} -l #{TARGET_LANG}", + :tokenizer => "/fast_scratch/simianer/lfpe/lfpe/util/de-tok.rb -a T -S '__ADDR__' -p #{SCRIPTS} -l #{TARGET_LANG}", + :truecaser => "/fast_scratch/simianer/lfpe/lfpe/util/truecase.rb -S '__ADDR__' -m #{MOSES} -n #{DATA_DIR}/truecaser", # FIXME: run as real daemon + :dtrain => "#{CDEC}/training/dtrain/dtrain_net_interface -c #{DATA_DIR}/dtrain.ini -d #{WORK_DIR}/dtrain.debug.json -o #{WORK_DIR}/weights.final -a '__ADDR__'", :extractor => "python -m cdec.sa.extract -c #{DATA_DIR}/sa.ini --online -u -S '__ADDR__'", - :aligner_fwd => "#{CDEC_NET}/word-aligner/net_fa -f #{DATA_DIR}/a/forward.params -m #{FWD_MEAN_SRCLEN_MULT} -T #{FWD_TENSION} --sock_url '__ADDR__'", - :aligner_back => "#{CDEC_NET}/word-aligner/net_fa -f #{DATA_DIR}/a/backward.params -m #{BACK_MEAN_SRCLEN_MULT} -T #{BACK_TENSION} --sock_url '__ADDR__'", - :atools => "#{CDEC_NET}/utils/atools_net -c grow-diag-final-and -S '__ADDR__'", - :dtrain => "#{CDEC_NET}/training/dtrain/dtrain_net_interface -c #{DATA_DIR}/dtrain.ini -o #{WORK_DIR}/weights.final -a '__ADDR__'" + :aligner_fwd => "#{CDEC}/word-aligner/net_fa -f #{DATA_DIR}/forward.params -m #{FWD_MEAN_SRCLEN_MULT} -T #{FWD_TENSION} --sock_url '__ADDR__'", + :aligner_back => "#{CDEC}/word-aligner/net_fa -f #{DATA_DIR}/backward.params -m #{BACK_MEAN_SRCLEN_MULT} -T #{BACK_TENSION} --sock_url '__ADDR__'", + :atools => "#{CDEC}/utils/atools_net -c grow-diag-final-and -S '__ADDR__'" } -# setup Sinatra +# ############################################################################# +# Set-up Sinatra +# ############################################################################# set :bind, SERVER_IP set :port, WEB_PORT set :allow_origin, :any @@ -36,31 +45,40 @@ set :allow_methods, [:get, :post, :options] set :allow_credentials, true set :max_age, "1728000" set :expose_headers, ['Content-Type'] +set :public_folder, File.dirname(__FILE__) + '/static' + + +# ############################################################################# +# Helper functions +# ############################################################################# +def logmsg name, msg + STDERR.write "[#{name}] #{msg}\n" +end def start_daemon cmd, name, addr - STDERR.write "> starting #{name} daemon\n" + logmsg :server, "starting #{name} daemon" cmd.gsub! '__ADDR__', addr pid = fork do exec cmd end sock = NanoMsg::PairSocket.new sock.connect addr - STDERR.write "< got #{sock.recv} from #{name}\n" + logmsg :server, "< got #{sock.recv} from #{name}" return sock, pid end def stop_all_daemons - STDERR.write "shutting down all daemons\n" + logmsg :server, "shutting down all daemons" $env.each { |name,p| - p[:socket].send "shutdown" - STDERR.write "< #{name} is #{p[:socket].recv}\n" + p[:socket].send "shutdown" # every daemon shuts down after receiving this keyword + logmsg :server, "< #{name} is #{p[:socket].recv}" } end -def update_database # FIXME: real database +def update_database $db['progress'] += 1 - j = JSON.generate $db + j = JSON.generate $db # FIXME: real database f = WriteFile.new DB_FILE f.write j.to_s f.close @@ -68,7 +86,7 @@ end def init # database connection - $db = JSON.parse ReadFile.read DB_FILE + $db = JSON.parse ReadFile.read DB_FILE # FIXME: real database # working directory `mkdir -p #{WORK_DIR}/g` # setup environment, start daemons @@ -78,132 +96,144 @@ def init $env[name] = { :socket => sock, :pid => pid } port += 1 } + # lock `touch #{LOCK_FILE}` end +def send_recv daemon, msg # simple pair communcation + socket = $env[daemon][:socket] + logmsg daemon, "> sending message: '#{msg}'" + socket.send msg + logmsg daemon, "waiting ..." + ans = socket.recv.force_encoding("UTF-8").strip + logmsg daemon, "< received answer: '#{ans}'" + + return ans +end + +# ############################################################################# +# Run init() [just once] +# ############################################################################# init if !FileTest.exist?(LOCK_FILE) +# ############################################################################# +# Routes +# ############################################################################# get '/' do cross_origin - "Nothing to see here." + + return "" end -# receive post-edit, send translation -get '/next' do +get '/next' do # (receive post-edit, update models), send next translation cross_origin - return "locked" if $lock + # already processing request? + return "locked" if $lock # return $lock = true - key = params[:key] # FIXME: do something with it + key = params[:key] # FIXME: do something with it + + # received post-edit -> update models + # 0. save raw post-edit + # 1. tokenize + # 2. truecase + # 3. save processed post-edit + # 4. update weights + # 5. update grammar extractor + # 5a. forward alignment + # 5b. backward alignment + # 5c. symmetrize alignment + # 5d. actual update + # 6. update database if params[:example] + # 0. save raw post-edit source, reference = params[:example].strip.split(" ||| ") - # tokenize, lowercase $db['post_edits_raw'] << reference.strip - $env[:tokenizer][:socket].send reference - STDERR.write "[tokenizer] waiting ...\n" - reference = $env[:tokenizer][:socket].recv.force_encoding("UTF-8").strip - STDERR.write "[tokenizer] < received tokenized reference: '#{reference}'\n" - reference.downcase! - # save post-edits - $db['post_edits'] << reference.strip - # update weights - grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar" - annotated_source = " #{source} " - msg = "#{annotated_source} ||| #{reference}" - STDERR.write "[dtrain] > sending '#{msg}' for update\n" - $env[:dtrain][:socket].send msg - STDERR.write "[dtrain] waiting for confirmation ...\n" - STDERR.write "[dtrain] < says it's #{$env[:dtrain][:socket].recv}\n" - # update grammar extractor - # get forward alignment - msg = "#{source} ||| #{reference}" - STDERR.write "[aligner_fwd] > sending '#{msg}' for forced alignment\n" - $env[:aligner_fwd][:socket].send msg - STDERR.write "[aligner_fwd] waiting for alignment ...\n" - a_fwd = $env[:aligner_fwd][:socket].recv.strip - STDERR.write "[aligner_fwd] < got alignment: '#{a_fwd}'\n" - # get backward alignment - msg = "#{source} ||| #{reference}" - STDERR.write "[aligner_back] > sending '#{msg}' for forced alignment\n" - $env[:aligner_back][:socket].send msg - STDERR.write "[aligner_back] waiting for alignment ...\n" - a_back = $env[:aligner_back][:socket].recv.strip - STDERR.write "[aligner_back] < got alignment: '#{a_back}'\n" - # symmetrize alignment - msg = "#{a_fwd} ||| #{a_back}" - STDERR.write "[atools] > sending '#{msg}' to combine alignments\n" - $env[:atools][:socket].send msg - STDERR.write "[atools] waiting for alignment ...\n" - a = $env[:atools][:socket].recv.strip - STDERR.write "[atools] < got alignment '#{a}'\n" - # actual extractor - msg = "TEST ||| #{source} ||| #{reference} ||| #{a}" - STDERR.write "[extractor] > sending '#{msg}' for learning\n" - $env[:extractor][:socket].send "TEST ||| #{source} ||| #{reference} ||| #{a}" - STDERR.write "[extractor] waiting for confirmation ...\n" - STDERR.write "[extractor] < got '#{$env[:extractor][:socket].recv}'\n" - update_database + # 1. tokenize + reference = send_recv :tokenizer, reference + # 2. truecase + reference = send_recv :truecaser, reference + # 3. save processed post-edits + logmsg "db", "saving processed post-edit" + $db['post_edits'] << reference.strip + # 4. update weights + grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar" + annotated_source = " #{source} " + send_recv :dtrain, "#{annotated_source} ||| #{reference}" + # 5. update grammar extractor + # 5a. get forward alignment + a_fwd = send_recv :aligner_fwd, "#{source} ||| #{reference}" + # 5b. get backward alignment + a_back = send_recv :aligner_back, "#{reference} ||| #{source}" + # 5c. symmetrize alignment + a = send_recv :atools, "#{a_fwd} ||| #{a_back}" + # 5d actual extractor + send_recv :extractor, "- ||| #{source} ||| #{reference} ||| #{a}" + # 6. update database + logmsg "db", "updating database" + update_database end source = $db['source_segments'][$db['progress']] raw_source = $db['raw_source_segments'][$db['progress']] if !source # input is done -> displays 'Thank you!' - STDERR.write ">>> end of input, sending 'fi'\n" + logmsg "server", "end of input, sending 'fi'" $lock = false - return "fi" + return "fi" # return elsif !$confirmed + logmsg :server, "locked, re-sending last reply" $lock = false - return $last_reply - else # translate next sentence + return $last_reply # return + else + # translate next sentence + # 1. generate grammar + # 2. translate + # 3. detokenize + # 4. reply source.strip! - # generate grammar for current sentence - grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar" # FIXME: keep grammars? - msg = "- ||| #{source} ||| #{grammar}" # FIXME: content identifier useful? - STDERR.write "[extractor] > asking to generate grammar: '#{msg}'\n" - $env[:extractor][:socket].send msg - STDERR.write "[extractor] waiting for confirmation ...\n" - STDERR.write "[extractor] < says it generated #{$env[:extractor][:socket].recv.force_encoding("UTF-8").strip}\n" - # translation + # 1. generate grammar for current sentence + grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar" + msg = "- ||| #{source} ||| #{grammar}" + send_recv :extractor, msg # FIXME: content identifier useful? + # 2. translation msg = "act:translate ||| #{source} " - STDERR.write "[dtrain] > asking to translate: '#{msg}'\n" - $env[:dtrain][:socket].send msg - STDERR.write "[dtrain] waiting for translation ...\n" - transl = $env[:dtrain][:socket].recv.force_encoding "UTF-8" - STDERR.write "[dtrain] < received translation: '#{transl}'\n" - # detokenizer - $env[:detokenizer][:socket].send transl - STDERR.write "[detokenizer] waiting ...\n" - transl = $env[:detokenizer][:socket].recv.force_encoding("UTF-8").strip - STDERR.write "[detokenizer] < received final translation: '#{transl}'\n" - # reply + transl = send_recv :dtrain, msg + # 3. detokenizer + transl = send_recv :detokenizer, transl + # 4. reply $last_reply = "#{$db['progress']}\t#{source}\t#{transl.strip}\t#{raw_source}" $lock = false $confirmed = false - STDERR.write ">>> response: '#{$last_reply}'" - return $last_reply + logmsg :server, "response: '#{$last_reply}'" + return $last_reply # return end - return "oh oh" # FIXME: do something sensible + return "oh oh" # return FIXME: do something sensible +end + +get '/debug' do # debug view + fn = "#{WORK_DIR}/dtrain.debug.json" + data = JSON.parse ReadFile.read(fn).force_encoding("UTF-8") + + haml :debug, :locals => { :data => data } end -# client confirms received translation -get '/confirm' do +get '/confirm' do # client confirms received translation cross_origin - STDERR.write "confirmed = #{$confirmed}\n" $confirmed = true + logmsg :server, "confirmed = #{$confirmed}" return "#{$confirmed}" end -# stop daemons and shut down server -get '/shutdown' do +get '/shutdown' do # stop daemons and shut down server + logmsg :server, "shutting down daemons" stop_all_daemons - "ready to shutdown" + return "ready to shutdown" end -# reset current session -get '/reset' do +get '/reset' do # reset current session return "locked" if $lock - $db = JSON.parse ReadFile.read DB_FILE # FIXME: database .. + $db = JSON.parse ReadFile.read DB_FILE # FIXME: database .. $db['post_edits'].clear $db['post_edits_raw'].clear update_database @@ -213,10 +243,9 @@ get '/reset' do return "#{$db.to_s}" end -# load other db file than configured -get '/load/:name' do +get '/load/:name' do # load other db file than configured return "locked" if $lock - $db = JSON.parse ReadFile.read "/fast_scratch/simianer/lfpe/example_pattr/#{params[:name]}.json.original" + $db = JSON.parse ReadFile.read "#{DATA_DIR}/#{params[:name]}.json.original" $db['post_edits'].clear $db['post_edits_raw'].clear update_database diff --git a/static/debug.css b/static/debug.css new file mode 100644 index 0000000..4cc77f7 --- /dev/null +++ b/static/debug.css @@ -0,0 +1,7 @@ +.red { color: red } +.green { color: green } +th { border: 1px solid #000 } +td { border: 1px solid #000; text-align:right } +td.noborder { border: 0 } +td.left { text-align: left } + diff --git a/util/de-tok.rb b/util/de-tok.rb new file mode 100755 index 0000000..92c563f --- /dev/null +++ b/util/de-tok.rb @@ -0,0 +1,36 @@ +#!/usr/bin/env ruby + +require 'nanomsg' +require 'open3' +require 'trollop' + +conf = Trollop::options do + opt :action, "tokenize (T) or detokenize (D)", :type => :string, :requred => true + opt :addr, "socket address", :short => "-S", :type => :string, :required => true + opt :scripts, "path to scripts directory", :short => "-p", :type => :string, :required => true + opt :lang, "language", :short => "-l", :type => :string, :required => true +end + +sock = NanoMsg::PairSocket.new +sock.bind conf[:addr] +sock.send "hello" + +if conf[:action] == "D" + cmd = "#{conf[:scripts]}/detokenizer.perl -q -b -u -l #{conf[:lang]}" +elsif conf[:action] == "T" + cmd = "#{conf[:scripts]}/tokenizer-no-escape.perl -q -b -a -l #{conf[:lang]}" +else + # ERROR +end +while true + inp = sock.recv + break if !inp||inp=="shutdown" + Open3.popen3(cmd) do |pin, pout, perr| + pin.write inp + pin.close + sock.send pout.gets.strip + end +end + +sock.send "off" + diff --git a/util/kill b/util/kill new file mode 100755 index 0000000..f1924b2 --- /dev/null +++ b/util/kill @@ -0,0 +1,4 @@ +#!/bin/bash + +for i in {1..6}; do ps ax | grep -P "(server.rb|atools|net_fa|sa.extract|dtrain)" | grep -v vim | grep -v -P "^\s\+$" | cut -d " " -f $i | xargs kill -9 &>/dev/null; done + diff --git a/util/run_server b/util/run_server new file mode 100755 index 0000000..88dadb5 --- /dev/null +++ b/util/run_server @@ -0,0 +1,8 @@ +#!/bin/bash -x + +export LD_LIBRARY_PATH=/fast_scratch/simianer/lfpe/nanomsg-0.5-beta/lib +export PYTHONPATH=/fast_scratch/simianer/lfpe/python +UTIL=/fast_scratch/simianer/lfpe/lfpe/util +DATA=/fast_scratch/simianer/lfpe/data/tiny_test +clear;$UTIL/kill;$UTIL/kill;$UTIL/kill;rm $DATA/work/lockfile; cp $DATA/data.json.original $DATA/data.json; $UTIL/../server.rb $DATA/conf.rb + diff --git a/util/truecase.rb b/util/truecase.rb new file mode 100755 index 0000000..3e97bd5 --- /dev/null +++ b/util/truecase.rb @@ -0,0 +1,30 @@ +#!/usr/bin/env ruby + +require 'nanomsg' +require 'open3' +require 'trollop' + +conf = Trollop::options do + opt :addr, "socket address", :short => "-S", :type => :string, :required => true + opt :moses, "path to moses directory", :short => "-m", :type => :string, :required => true + opt :model, "model file", :short => "-n", :type => :string, :required => true +end + +sock = NanoMsg::PairSocket.new +sock.bind conf[:addr] +sock.send "hello" + +cmd = "#{conf[:moses]}/scripts/recaser/truecase.perl -b --model #{conf[:model]}" +while true + inp = sock.recv + " " # FIXME? + break if !inp||inp=="shutdown" + Open3.popen3(cmd) do |pin, pout, perr| + pin.write inp + pin.close + s = pout.gets.strip + sock.send s #pout.gets.strip + end +end + +sock.send "off" + diff --git a/views/debug.haml b/views/debug.haml new file mode 100644 index 0000000..2f90044 --- /dev/null +++ b/views/debug.haml @@ -0,0 +1,71 @@ +- require 'zipf' +!!! +%html + %head + %title debug view (for TODO) + %link(rel="stylesheet" type="text/css" href="debug.css") + %body + %h1 debug view (for TODO) + %table + %tr + %td.noborder + %strong source: + %td.left #{data["source"]} + %tr + %td.noborder + %strong post-edit: + %td.left #{data["target"]} + %tr + %td.noborder + %strong original mt: + %td.left #{data["1best"]} + %tr + %td.noborder + %strong best match (bleu=#{data["best_match_score"]}): + %td.left #{data["best_match"]} + %h2 meta + %p k: #{data["samples_size"]} + %p number of updates: #{data["num_up"]} + %p updated features: #{data["updated_features"]} + %p learning rate: #{data["learning_rate"]} + %h2 k-best + %p bleu | model score | original rank | translation \n features + %p.red update needed + %ol + - kbest = [] + - data["kbest"].each { |i| x=splitpipe(i); kbest << [ x[0].to_f, x[1].to_f, x[2].to_i, x[3], x[4] ] } + - kbest.sort! { |i,j| j[0] <=> i[0] } + - kbest.each_with_index do |k,j| + - b = kbest[0,j].map { |l| l[0]>k[0] && l[1] + %pre #{k[3]} + - else + %li + %strong #{"%.2f"%(k[0].to_f*100)} | #{k[1]} | #{k[2]} | #{k[4]}
+ %pre #{k[3]} + - if [9,89].include? j + %hr + %h2 weight updates + %table + %tr + %th feature + %th before + %th after + %th diff + %th raw diff + - data["weights_after"].keys.each.sort { |a,b| a[0] <=> b[0] }.each do |k| + %tr + %td.noborder #{k} + %td #{"%+.3f"%data["weights_before"][k].round(4)} + %td #{"%+.3f"%data["weights_after"][k].round(4)} + - diff = data["weights_before"][k].abs-data["weights_after"][k].abs + - if diff < 0 + %td.red #{"%+.3f"%(diff).round(4)} + - elsif diff > 0 + %td.green #{"%+.3f"%(diff).round(4)} + - else + %td #{"%+.3f"%(diff).round(4)} + %td #{"%+.1f"%((data["weights_before"][k].abs-data["weights_after"][k].abs)/data["learning_rate"]).round(2)} + -- cgit v1.2.3