summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2015-06-23 16:03:23 +0200
committer Patrick Simianer <p@simianer.de> 2015-06-23 16:03:23 +0200
commit 14f7bbab028d781cd2057a348862f911324338fd (patch)
tree 170df2d80cb4aea161700e26bd951d06276a81f5
parent 553d54484725614fa73e805b59136a39e6dee295 (diff)
overhaul
-rw-r--r-- index.php 2
-rwxr-xr-x run_server 6
-rwxr-xr-x server.rb 251
-rw-r--r-- static/debug.css 7
-rwxr-xr-x util/de-tok.rb (renamed from de-tok.rb) 0
-rwxr-xr-x util/kill (renamed from kill) 2
-rwxr-xr-x util/run_server 8
-rwxr-xr-x util/truecase.rb 30
-rw-r--r-- views/debug.haml 71
9 files changed, 258 insertions, 119 deletions
diff --git a/index.php b/index.php
index 3947b42..f694b72 100644
--- a/index.php
+++ b/index.php
@@ -44,7 +44,7 @@
<strong>Document overview</strong>
<table id="overview">
<?php
-$j = file_get_contents("/fast_scratch/simianer/lfpe/example_session/".$_GET["key"].".json"); # FIXME: from database
+$j = file_get_contents($_GET["dir"]."/".$_GET["key"].".json"); # FIXME: get overview/state from database
$a = json_decode($j);
$i = 0;
foreach($a->raw_source_segments as $s) {
diff --git a/run_server b/run_server
deleted file mode 100755
index 91d9872..0000000
--- a/run_server
+++ /dev/null
@@ -1,6 +0,0 @@
-#!/bin/bash -x
-
-export LD_LIBRARY_PATH=/fast_scratch/simianer/lfpe/nanomsg-0.5-beta/lib
-export PYTHONPATH=~/.local/lib/python2.7/site-packages
-./kill;./kill;rm ../example_session/work/lockfile; ./server.rb ../example_session/conf.rb 2>server.err > server.out
-
diff --git a/server.rb b/server.rb
index fdc99fd..2122979 100755
--- a/server.rb
+++ b/server.rb
@@ -7,28 +7,37 @@ require 'nanomsg'
require 'zipf'
require 'digest'
require 'json'
+require 'haml'
-# load configuration file and setup global variables
-require_relative "#{ARGV[0]}"
-$lock = false # lock if currently learning/translating
-$last_reply = nil # cache last reply
-$confirmed = true # client received translation?
-if !FileTest.exist? LOCK_FILE
- $db = {} # FIXME: that is supposed to be a database connection
- $env = {}
+# #############################################################################
+# Load configuration file and setup global variables
+# #############################################################################
+require_relative "#{ARGV[0]}" # load configuration for this session
+$lock = false # lock if currently learning/translating
+$last_reply = nil # cache last reply
+$confirmed = true # client received translation?
+if !FileTest.exist? LOCK_FILE # locked?
+ $db = {} # FIXME: that is supposed to be a database connection
+ $env = {} # environment variables (socket connections to daemons)
end
+# #############################################################################
+# Daemons
+# #############################################################################
$daemons = {
- :detokenizer => "/fast_scratch/simianer/lfpe/lfpe/de-tok.rb -a D -S '__ADDR__' -p #{SCRIPTS_DIR} -l #{TARGET_LANG}",
- :tokenizer => "/fast_scratch/simianer/lfpe/lfpe/de-tok.rb -a T -S '__ADDR__' -p #{SCRIPTS_DIR} -l #{TARGET_LANG}",
+ :detokenizer => "/fast_scratch/simianer/lfpe/lfpe/util/de-tok.rb -a D -S '__ADDR__' -p #{SCRIPTS} -l #{TARGET_LANG}",
+ :tokenizer => "/fast_scratch/simianer/lfpe/lfpe/util/de-tok.rb -a T -S '__ADDR__' -p #{SCRIPTS} -l #{TARGET_LANG}",
+ :truecaser => "/fast_scratch/simianer/lfpe/lfpe/util/truecase.rb -S '__ADDR__' -m #{MOSES} -n #{DATA_DIR}/truecaser", # FIXME: run as real daemon
+ :dtrain => "#{CDEC}/training/dtrain/dtrain_net_interface -c #{DATA_DIR}/dtrain.ini -d #{WORK_DIR}/dtrain.debug.json -o #{WORK_DIR}/weights.final -a '__ADDR__'",
:extractor => "python -m cdec.sa.extract -c #{DATA_DIR}/sa.ini --online -u -S '__ADDR__'",
- :aligner_fwd => "#{CDEC_NET}/word-aligner/net_fa -f #{DATA_DIR}/a/forward.params -m #{FWD_MEAN_SRCLEN_MULT} -T #{FWD_TENSION} --sock_url '__ADDR__'",
- :aligner_back => "#{CDEC_NET}/word-aligner/net_fa -f #{DATA_DIR}/a/backward.params -m #{BACK_MEAN_SRCLEN_MULT} -T #{BACK_TENSION} --sock_url '__ADDR__'",
- :atools => "#{CDEC_NET}/utils/atools_net -c grow-diag-final-and -S '__ADDR__'",
- :dtrain => "#{CDEC_NET}/training/dtrain/dtrain_net_interface -c #{DATA_DIR}/dtrain.ini -o #{WORK_DIR}/weights.final -a '__ADDR__'"
+ :aligner_fwd => "#{CDEC}/word-aligner/net_fa -f #{DATA_DIR}/forward.params -m #{FWD_MEAN_SRCLEN_MULT} -T #{FWD_TENSION} --sock_url '__ADDR__'",
+ :aligner_back => "#{CDEC}/word-aligner/net_fa -f #{DATA_DIR}/backward.params -m #{BACK_MEAN_SRCLEN_MULT} -T #{BACK_TENSION} --sock_url '__ADDR__'",
+ :atools => "#{CDEC}/utils/atools_net -c grow-diag-final-and -S '__ADDR__'"
}
-# setup Sinatra
+# #############################################################################
+# Set-up Sinatra
+# #############################################################################
set :bind, SERVER_IP
set :port, WEB_PORT
set :allow_origin, :any
@@ -36,31 +45,40 @@ set :allow_methods, [:get, :post, :options]
set :allow_credentials, true
set :max_age, "1728000"
set :expose_headers, ['Content-Type']
+set :public_folder, File.dirname(__FILE__) + '/static'
+
+
+# #############################################################################
+# Helper functions
+# #############################################################################
+def logmsg name, msg
+ STDERR.write "[#{name}] #{msg}\n"
+end
def start_daemon cmd, name, addr
- STDERR.write "> starting #{name} daemon\n"
+ logmsg :server, "starting #{name} daemon"
cmd.gsub! '__ADDR__', addr
pid = fork do
exec cmd
end
sock = NanoMsg::PairSocket.new
sock.connect addr
- STDERR.write "< got #{sock.recv} from #{name}\n"
+ logmsg :server, "< got #{sock.recv} from #{name}"
return sock, pid
end
def stop_all_daemons
- STDERR.write "shutting down all daemons\n"
+ logmsg :server, "shutting down all daemons"
$env.each { |name,p|
- p[:socket].send "shutdown"
- STDERR.write "< #{name} is #{p[:socket].recv}\n"
+ p[:socket].send "shutdown" # every daemon shuts down after receiving this keyword
+ logmsg :server, "< #{name} is #{p[:socket].recv}"
}
end
-def update_database # FIXME: real database
+def update_database
$db['progress'] += 1
- j = JSON.generate $db
+ j = JSON.generate $db # FIXME: real database
f = WriteFile.new DB_FILE
f.write j.to_s
f.close
@@ -68,7 +86,7 @@ end
def init
# database connection
- $db = JSON.parse ReadFile.read DB_FILE
+ $db = JSON.parse ReadFile.read DB_FILE # FIXME: real database
# working directory
`mkdir -p #{WORK_DIR}/g`
# setup environment, start daemons
@@ -78,132 +96,144 @@ def init
$env[name] = { :socket => sock, :pid => pid }
port += 1
}
+ # lock
`touch #{LOCK_FILE}`
end
+def send_recv daemon, msg # simple pair communication
+ socket = $env[daemon][:socket]
+ logmsg daemon, "> sending message: '#{msg}'"
+ socket.send msg
+ logmsg daemon, "waiting ..."
+ ans = socket.recv.force_encoding("UTF-8").strip
+ logmsg daemon, "< received answer: '#{ans}'"
+
+ return ans
+end
+
+# #############################################################################
+# Run init() [just once]
+# #############################################################################
init if !FileTest.exist?(LOCK_FILE)
+# #############################################################################
+# Routes
+# #############################################################################
get '/' do
cross_origin
- "Nothing to see here."
+
+ return ""
end
-# receive post-edit, send translation
-get '/next' do
+get '/next' do # (receive post-edit, update models), send next translation
cross_origin
- return "locked" if $lock
+ # already processing request?
+ return "locked" if $lock # return
$lock = true
- key = params[:key] # FIXME: do something with it
+ key = params[:key] # FIXME: do something with it
+
+ # received post-edit -> update models
+ # 0. save raw post-edit
+ # 1. tokenize
+ # 2. truecase
+ # 3. save processed post-edit
+ # 4. update weights
+ # 5. update grammar extractor
+ # 5a. forward alignment
+ # 5b. backward alignment
+ # 5c. symmetrize alignment
+ # 5d. actual update
+ # 6. update database
if params[:example]
+ # 0. save raw post-edit
source, reference = params[:example].strip.split(" ||| ")
- # tokenize, lowercase
$db['post_edits_raw'] << reference.strip
- $env[:tokenizer][:socket].send reference
- STDERR.write "[tokenizer] waiting ...\n"
- reference = $env[:tokenizer][:socket].recv.force_encoding("UTF-8").strip
- STDERR.write "[tokenizer] < received tokenized reference: '#{reference}'\n"
- reference.downcase!
- # save post-edits
- $db['post_edits'] << reference.strip
- # update weights
- grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar"
- annotated_source = "<seg grammar=\"#{grammar}\"> #{source} </seg>"
- msg = "#{annotated_source} ||| #{reference}"
- STDERR.write "[dtrain] > sending '#{msg}' for update\n"
- $env[:dtrain][:socket].send msg
- STDERR.write "[dtrain] waiting for confirmation ...\n"
- STDERR.write "[dtrain] < says it's #{$env[:dtrain][:socket].recv}\n"
- # update grammar extractor
- # get forward alignment
- msg = "#{source} ||| #{reference}"
- STDERR.write "[aligner_fwd] > sending '#{msg}' for forced alignment\n"
- $env[:aligner_fwd][:socket].send msg
- STDERR.write "[aligner_fwd] waiting for alignment ...\n"
- a_fwd = $env[:aligner_fwd][:socket].recv.strip
- STDERR.write "[aligner_fwd] < got alignment: '#{a_fwd}'\n"
- # get backward alignment
- msg = "#{source} ||| #{reference}"
- STDERR.write "[aligner_back] > sending '#{msg}' for forced alignment\n"
- $env[:aligner_back][:socket].send msg
- STDERR.write "[aligner_back] waiting for alignment ...\n"
- a_back = $env[:aligner_back][:socket].recv.strip
- STDERR.write "[aligner_back] < got alignment: '#{a_back}'\n"
- # symmetrize alignment
- msg = "#{a_fwd} ||| #{a_back}"
- STDERR.write "[atools] > sending '#{msg}' to combine alignments\n"
- $env[:atools][:socket].send msg
- STDERR.write "[atools] waiting for alignment ...\n"
- a = $env[:atools][:socket].recv.strip
- STDERR.write "[atools] < got alignment '#{a}'\n"
- # actual extractor
- msg = "TEST ||| #{source} ||| #{reference} ||| #{a}"
- STDERR.write "[extractor] > sending '#{msg}' for learning\n"
- $env[:extractor][:socket].send "TEST ||| #{source} ||| #{reference} ||| #{a}"
- STDERR.write "[extractor] waiting for confirmation ...\n"
- STDERR.write "[extractor] < got '#{$env[:extractor][:socket].recv}'\n"
- update_database
+ # 1. tokenize
+ reference = send_recv :tokenizer, reference
+ # 2. truecase
+ reference = send_recv :truecaser, reference
+ # 3. save processed post-edits
+ logmsg "db", "saving processed post-edit"
+ $db['post_edits'] << reference.strip
+ # 4. update weights
+ grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar"
+ annotated_source = "<seg grammar=\"#{grammar}\"> #{source} </seg>"
+ send_recv :dtrain, "#{annotated_source} ||| #{reference}"
+ # 5. update grammar extractor
+ # 5a. get forward alignment
+ a_fwd = send_recv :aligner_fwd, "#{source} ||| #{reference}"
+ # 5b. get backward alignment
+ a_back = send_recv :aligner_back, "#{reference} ||| #{source}"
+ # 5c. symmetrize alignment
+ a = send_recv :atools, "#{a_fwd} ||| #{a_back}"
+ # 5d. actual extractor
+ send_recv :extractor, "- ||| #{source} ||| #{reference} ||| #{a}"
+ # 6. update database
+ logmsg "db", "updating database"
+ update_database
end
source = $db['source_segments'][$db['progress']]
raw_source = $db['raw_source_segments'][$db['progress']]
if !source # input is done -> displays 'Thank you!'
- STDERR.write ">>> end of input, sending 'fi'\n"
+ logmsg "server", "end of input, sending 'fi'"
$lock = false
- return "fi"
+ return "fi" # return
elsif !$confirmed
+ logmsg :server, "locked, re-sending last reply"
$lock = false
- return $last_reply
- else # translate next sentence
+ return $last_reply # return
+ else
+ # translate next sentence
+ # 1. generate grammar
+ # 2. translate
+ # 3. detokenize
+ # 4. reply
source.strip!
- # generate grammar for current sentence
- grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar" # FIXME: keep grammars?
- msg = "- ||| #{source} ||| #{grammar}" # FIXME: content identifier useful?
- STDERR.write "[extractor] > asking to generate grammar: '#{msg}'\n"
- $env[:extractor][:socket].send msg
- STDERR.write "[extractor] waiting for confirmation ...\n"
- STDERR.write "[extractor] < says it generated #{$env[:extractor][:socket].recv.force_encoding("UTF-8").strip}\n"
- # translation
+ # 1. generate grammar for current sentence
+ grammar = "#{WORK_DIR}/g/#{Digest::SHA256.hexdigest(source)}.grammar"
+ msg = "- ||| #{source} ||| #{grammar}"
+ send_recv :extractor, msg # FIXME: content identifier useful?
+ # 2. translation
msg = "act:translate ||| <seg grammar=\"#{grammar}\"> #{source} </seg>"
- STDERR.write "[dtrain] > asking to translate: '#{msg}'\n"
- $env[:dtrain][:socket].send msg
- STDERR.write "[dtrain] waiting for translation ...\n"
- transl = $env[:dtrain][:socket].recv.force_encoding "UTF-8"
- STDERR.write "[dtrain] < received translation: '#{transl}'\n"
- # detokenizer
- $env[:detokenizer][:socket].send transl
- STDERR.write "[detokenizer] waiting ...\n"
- transl = $env[:detokenizer][:socket].recv.force_encoding("UTF-8").strip
- STDERR.write "[detokenizer] < received final translation: '#{transl}'\n"
- # reply
+ transl = send_recv :dtrain, msg
+ # 3. detokenizer
+ transl = send_recv :detokenizer, transl
+ # 4. reply
$last_reply = "#{$db['progress']}\t#{source}\t#{transl.strip}\t#{raw_source}"
$lock = false
$confirmed = false
- STDERR.write ">>> response: '#{$last_reply}'"
- return $last_reply
+ logmsg :server, "response: '#{$last_reply}'"
+ return $last_reply # return
end
- return "oh oh" # FIXME: do something sensible
+ return "oh oh" # return FIXME: do something sensible
+end
+
+get '/debug' do # debug view
+ fn = "#{WORK_DIR}/dtrain.debug.json"
+ data = JSON.parse ReadFile.read(fn).force_encoding("UTF-8")
+
+ haml :debug, :locals => { :data => data }
end
-# client confirms received translation
-get '/confirm' do
+get '/confirm' do # client confirms received translation
cross_origin
- STDERR.write "confirmed = #{$confirmed}\n"
$confirmed = true
+ logmsg :server, "confirmed = #{$confirmed}"
return "#{$confirmed}"
end
-# stop daemons and shut down server
-get '/shutdown' do
+get '/shutdown' do # stop daemons and shut down server
+ logmsg :server, "shutting down daemons"
stop_all_daemons
- "ready to shutdown"
+ return "ready to shutdown"
end
-# reset current session
-get '/reset' do
+get '/reset' do # reset current session
return "locked" if $lock
- $db = JSON.parse ReadFile.read DB_FILE # FIXME: database ..
+ $db = JSON.parse ReadFile.read DB_FILE # FIXME: database ..
$db['post_edits'].clear
$db['post_edits_raw'].clear
update_database
@@ -213,10 +243,9 @@ get '/reset' do
return "#{$db.to_s}"
end
-# load other db file than configured
-get '/load/:name' do
+get '/load/:name' do # load other db file than configured
return "locked" if $lock
- $db = JSON.parse ReadFile.read "/fast_scratch/simianer/lfpe/example_pattr/#{params[:name]}.json.original"
+ $db = JSON.parse ReadFile.read "#{DATA_DIR}/#{params[:name]}.json.original"
$db['post_edits'].clear
$db['post_edits_raw'].clear
update_database
diff --git a/static/debug.css b/static/debug.css
new file mode 100644
index 0000000..4cc77f7
--- /dev/null
+++ b/static/debug.css
@@ -0,0 +1,7 @@
+.red { color: red }
+.green { color: green }
+th { border: 1px solid #000 }
+td { border: 1px solid #000; text-align:right }
+td.noborder { border: 0 }
+td.left { text-align: left }
+
diff --git a/de-tok.rb b/util/de-tok.rb
index 92c563f..92c563f 100755
--- a/de-tok.rb
+++ b/util/de-tok.rb
diff --git a/kill b/util/kill
index 34d7ed4..f1924b2 100755
--- a/kill
+++ b/util/kill
@@ -1,4 +1,4 @@
#!/bin/bash
-for i in {1..6}; do ps ax | grep -P "(server.rb|atools|net_fa|sa.extract|dtrain)" | grep -v vim | grep -v -P "^\s\+$" | cut -d " " -f $i | xargs kill -9; done
+for i in {1..6}; do ps ax | grep -P "(server.rb|atools|net_fa|sa.extract|dtrain)" | grep -v vim | grep -v -P "^\s\+$" | cut -d " " -f $i | xargs kill -9 &>/dev/null; done
diff --git a/util/run_server b/util/run_server
new file mode 100755
index 0000000..88dadb5
--- /dev/null
+++ b/util/run_server
@@ -0,0 +1,8 @@
+#!/bin/bash -x
+
+export LD_LIBRARY_PATH=/fast_scratch/simianer/lfpe/nanomsg-0.5-beta/lib
+export PYTHONPATH=/fast_scratch/simianer/lfpe/python
+UTIL=/fast_scratch/simianer/lfpe/lfpe/util
+DATA=/fast_scratch/simianer/lfpe/data/tiny_test
+clear;$UTIL/kill;$UTIL/kill;$UTIL/kill;rm $DATA/work/lockfile; cp $DATA/data.json.original $DATA/data.json; $UTIL/../server.rb $DATA/conf.rb
+
diff --git a/util/truecase.rb b/util/truecase.rb
new file mode 100755
index 0000000..3e97bd5
--- /dev/null
+++ b/util/truecase.rb
@@ -0,0 +1,30 @@
+#!/usr/bin/env ruby
+
+require 'nanomsg'
+require 'open3'
+require 'trollop'
+
+conf = Trollop::options do
+ opt :addr, "socket address", :short => "-S", :type => :string, :required => true
+ opt :moses, "path to moses directory", :short => "-m", :type => :string, :required => true
+ opt :model, "model file", :short => "-n", :type => :string, :required => true
+end
+
+sock = NanoMsg::PairSocket.new
+sock.bind conf[:addr]
+sock.send "hello"
+
+cmd = "#{conf[:moses]}/scripts/recaser/truecase.perl -b --model #{conf[:model]}"
+while true
+ inp = sock.recv + " " # FIXME: the appended space means inp never equals "shutdown", so the check on the next line cannot match
+ break if !inp||inp=="shutdown"
+ Open3.popen3(cmd) do |pin, pout, perr|
+ pin.write inp
+ pin.close
+ s = pout.gets.strip
+ sock.send s #pout.gets.strip
+ end
+end
+
+sock.send "off"
+
diff --git a/views/debug.haml b/views/debug.haml
new file mode 100644
index 0000000..2f90044
--- /dev/null
+++ b/views/debug.haml
@@ -0,0 +1,71 @@
+- require 'zipf'
+!!!
+%html
+ %head
+ %title debug view (for TODO)
+ %link(rel="stylesheet" type="text/css" href="debug.css")
+ %body
+ %h1 debug view (for TODO)
+ %table
+ %tr
+ %td.noborder
+ %strong source:
+ %td.left #{data["source"]}
+ %tr
+ %td.noborder
+ %strong post-edit:
+ %td.left #{data["target"]}
+ %tr
+ %td.noborder
+ %strong original mt:
+ %td.left #{data["1best"]}
+ %tr
+ %td.noborder
+ %strong best match (bleu=#{data["best_match_score"]}):
+ %td.left #{data["best_match"]}
+ %h2 meta
+ %p <strong>k:</strong> #{data["samples_size"]}
+ %p <strong>number of updates:</strong> #{data["num_up"]}
+ %p <strong>updated features:</strong> #{data["updated_features"]}
+ %p <strong>learning rate:</strong> #{data["learning_rate"]}
+ %h2 k-best
+ %p bleu | model score | original rank | translation \n features
+ %p.red update needed
+ %ol
+ - kbest = []
+ - data["kbest"].each { |i| x=splitpipe(i); kbest << [ x[0].to_f, x[1].to_f, x[2].to_i, x[3], x[4] ] }
+ - kbest.sort! { |i,j| j[0] <=> i[0] }
+ - kbest.each_with_index do |k,j|
+ - b = kbest[0,j].map { |l| l[0]>k[0] && l[1]<k[1] }.include? true
+ -if b
+ %li.red
+ %strong #{"%.2f"%(k[0].to_f*100)} | #{k[1]} | #{k[2]} | #{k[4]} <br/>
+ %pre #{k[3]}
+ - else
+ %li
+ %strong #{"%.2f"%(k[0].to_f*100)} | #{k[1]} | #{k[2]} | #{k[4]} <br/>
+ %pre #{k[3]}
+ - if [9,89].include? j
+ %hr
+ %h2 weight updates
+ %table
+ %tr
+ %th feature
+ %th before
+ %th after
+ %th diff
+ %th raw diff
+ - data["weights_after"].keys.each.sort { |a,b| a[0] <=> b[0] }.each do |k|
+ %tr
+ %td.noborder <strong> #{k} </strong>
+ %td #{"%+.3f"%data["weights_before"][k].round(4)}
+ %td #{"%+.3f"%data["weights_after"][k].round(4)}
+ - diff = data["weights_before"][k].abs-data["weights_after"][k].abs
+ - if diff < 0
+ %td.red #{"%+.3f"%(diff).round(4)}
+ - elsif diff > 0
+ %td.green #{"%+.3f"%(diff).round(4)}
+ - else
+ %td #{"%+.3f"%(diff).round(4)}
+ %td #{"%+.1f"%((data["weights_before"][k].abs-data["weights_after"][k].abs)/data["learning_rate"]).round(2)}
+