author     Patrick Simianer <p@simianer.de>    2016-07-20 10:55:28 +0200
committer  Patrick Simianer <p@simianer.de>    2016-07-20 10:55:28 +0200
commit     47b04c4d2e72a473d1b7595c1a6655bcd351f65e (patch)
tree       d4c8709f6f7b2ec0a1f917f85af06d4750995610
parent     893f1b87ff267a6afd886951c91a64573f4d9c9f (diff)
parent     fac00976168c6b3c94d01d76babede147e4a0710 (diff)
Merge branch 'master' of github.com:pks/lfpe
-rw-r--r--  inc/db.inc.php                                2
-rw-r--r--  js/interface.js                             429
-rwxr-xr-x  phrase2_extraction/phrase2_extraction.rb    10
-rwxr-xr-x  server.rb                                   154
-rwxr-xr-x  util/kill_all (renamed from util/kill)       0
-rwxr-xr-x  util/kill_session                            5
-rwxr-xr-x  util/run_all                                 2
-rw-r--r--  views/debug.haml                              5
8 files changed, 593 insertions, 14 deletions
diff --git a/inc/db.inc.php b/inc/db.inc.php
index e023c3a..bc18e16 100644
--- a/inc/db.inc.php
+++ b/inc/db.inc.php
@@ -2,7 +2,7 @@
$SESSION_DIR="/srv/postedit/sessions";
$key = $_GET["key"];
-if (preg_match('/^[a-z0-9]{1,4}$/', $key)) {
+if (preg_match('/^[a-f0-9]{1,4}$/', $key)) {
$json = file_get_contents($SESSION_DIR."/".$key."/data.json");
}
$db = json_decode($json);
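
The inc/db.inc.php fix above narrows the session-key pattern from [a-z0-9] to [a-f0-9], so only short hexadecimal keys ever reach the filesystem lookup. A minimal Ruby sketch of the same guard (the path matches the PHP file; the helper name is illustrative, not from the repository):

    SESSION_DIR = "/srv/postedit/sessions"

    # Return the session JSON only for well-formed hex keys (sketch).
    def load_session_json(key)
      # mirrors the anchored pattern /^[a-f0-9]{1,4}$/ from the diff
      return nil unless key =~ /\A[a-f0-9]{1,4}\z/
      File.read(File.join(SESSION_DIR, key, "data.json"))
    end
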
diff --git a/js/interface.js b/js/interface.js
index ec36af7..6c32063 100644
--- a/js/interface.js
+++ b/js/interface.js
@@ -8,6 +8,8 @@ var data, // global data object
var TEXT_count_click=0,
TEXT_count_kbd=0;
+var rules_orig = {};
+
/*
* cross-site request
*
@@ -282,6 +284,11 @@ var next = function ()
send_data["post_edit"] = safe_str(post_edit);
send_data['type'] = 'g';
send_data["original_svg"] = document.getElementById("original_svg").value;
+ var dx = rule_diff(rules_orig, get_simplest_rules1());
+ for (k in dx) {
+ dx[k] = safe_str(dx[k]);
+ }
+ send_data["rule_diff"] = dx;
} else {
post_edit = $.trim(target_textarea.value);
send_data["post_edit"] = safe_str(post_edit);
@@ -514,6 +521,7 @@ var request_and_process_next = function ()
var x = $.trim(JSON.parse(DE_extract_data())["target"].join(" "));
last_post_edit.value = x;
document.getElementById("original_svg").value = DE_get_raw_svg_data();
+ rules_orig = get_simplest_rules1();
}
// start timer
@@ -593,3 +601,424 @@ $().ready(function()
});
+var explore = function (o,src,tgt,s2t,t2s,done)
+{
+ if (done[o["id"]]) return;
+ var d,other_t;
+ if (o["type"] == "source") {
+ d = s2t;
+ src.push(o["id"]);
+ other_t = "target";
+ } else {
+ d = t2s;
+ tgt.push(o["id"])
+ other_t = "source";
+ }
+
+ if (!d[o["id"]]) return;
+ if (d[o["id"]].length==0) return;
+
+ done[o["id"]] = true;
+
+ for (var i=0; i < d[o["id"]].length; i++) {
+ explore({"id":d[o["id"]][i], "type":other_t}, src, tgt, s2t, t2s, done);
+ }
+
+ return;
+}
+
+var rule_diff = function (prev,now)
+{
+ var diff = {};
+ for (key in now) {
+ if (prev[key] && now[key] != prev[key]) {
+ diff[key] = now[key];
+ }
+ if (!prev[key]) {
+ diff[key] = now[key];
+ }
+ }
+
+ return diff;
+}
+
+var get_simplest_rules = function ()
+{
+ var s2t = [];
+ var t2s = [];
+ for (key in DE_connections) {
+ var a = key.split("-");
+ if (s2t.hasOwnProperty(a[0])) {
+ s2t[parseInt(a[0])].push(parseInt(a[1]));
+ } else {
+ s2t[parseInt(a[0])] = [parseInt(a[1])];
+ }
+ if (t2s.hasOwnProperty(a[1])) {
+ t2s[parseInt(a[1])].push(parseInt(a[0]));
+ } else {
+ t2s[parseInt(a[1])] = [parseInt(a[0])];
+ }
+ }
+
+ var rules = [];
+ var done = {};
+ for (var i=0; i < DE_shapes.length; i++) {
+ if (DE_shapes[i]["type_"] == "source") {
+ var id = parseInt(DE_shapes[i]["id_"]);
+ var src = [];
+ var tgt = [];
+ explore({"id":id,"type":"source"}, src, tgt, s2t, t2s, done);
+ if (src.length >0 && tgt.length>0) {
+ rules.push( {"src":src, "tgt":tgt } );
+ }
+ }
+ }
+
+ var rs = {};
+ for (r in rules) {
+ var src = "", tgt = "";
+ var prev=null
+ for (var i=0; i< rules[r]["src"].length; i++) {
+ if (prev!=null && prev < rules[r]["src"][i]-1) {
+ src += "[X] ";
+ }
+ src += DE_shapes_by_id[rules[r]["src"][i]].pair[0].textContent+" ";
+ if (rules[r]["src"][i]!=null)
+ prev = rules[r]["src"][i];
+ }
+ src += "||| ";
+ prev = null;
+ for (var i=0; i< rules[r]["tgt"].length; i++) {
+ if (!DE_shapes_by_id[rules[r]["tgt"][i]]) // unaligned source
+ continue;
+ if (prev!=null && prev < rules[r]["tgt"][i]-1) {
+ tgt += "[X] ";
+ }
+ tgt += DE_shapes_by_id[rules[r]["tgt"][i]].pair[0].textContent+" ";
+ if (rules[r]["tgt"][i])
+ prev = rules[r]["tgt"][i];
+ }
+ if (tgt.replace(/\|\|\|/g, "").trim() != "") {
+ var id = rules[r]["tgt"][0];
+ var b = false;
+ if (DE_target_shapes[0]["id_"] == id) {
+ b = true;
+ }
+ rs[rules[r]["src"]] = b+" ||| "+$.trim(src+tgt);
+ }
+ }
+
+ return rs;
+}
+
+var id2idx = function (id) { // or grid_pos
+ var i = 0;
+ for (k in DE_target_shapes) {
+ if (DE_target_shapes[k]["id_"] == id) {
+ return i;
+ }
+ i++;
+ }
+
+ return -1;
+}
+
+var idx2id = function (idx) {
+ return DE_target_shapes[idx]["id_"];
+}
+
+var amax = function (a) {
+ var max = -9999999999999;
+ for (k in a) {
+ if (a[k] > max)
+ max = a[k];
+ }
+ return max;
+}
+
+var $rules =[];
+var get_simplest_rules1 = function ()
+{
+ var s2t = [];
+ var t2s = [];
+ for (key in DE_connections) {
+ var a = key.split("-");
+ if (s2t.hasOwnProperty(a[0])) {
+ s2t[parseInt(a[0])].push(parseInt(a[1]));
+ } else {
+ s2t[parseInt(a[0])] = [parseInt(a[1])];
+ }
+ if (t2s.hasOwnProperty(a[1])) {
+ t2s[parseInt(a[1])].push(parseInt(a[0]));
+ } else {
+ t2s[parseInt(a[1])] = [parseInt(a[0])];
+ }
+ }
+
+ var rules = [];
+ var done = {};
+ for (var i=0; i < DE_shapes.length; i++) {
+ if (DE_shapes[i]["type_"] == "source") {
+ var id = parseInt(DE_shapes[i]["id_"]);
+ var src = [];
+ var tgt = [];
+ explore({"id":id,"type":"source"}, src, tgt, s2t, t2s, done);
+ if (src.length >0 && tgt.length>0) {
+ tgt.sort(function(a,b) { return id2idx(a) - id2idx(b); }); // comparator must return a number
+ rules.push( {"src":src, "tgt":tgt } );
+ }
+ }
+ }
+
+ for (var z=0; z<rules.length; z++) {
+ var src_gaps = [];
+ var tgt_gaps = [];
+ var r = rules[z];
+ var prev = null;
+
+ for (var j=0; j<r.src.length; j++) {
+ if (prev!=null && prev<(r.src[j]-1)) { // id == index == pos
+ var a = [];
+ for (var k=prev+1; k<r.src[j]; k++) {
+ a.push(k);
+ }
+ src_gaps.push(a);
+ }
+ prev = r.src[j];
+ }
+
+ prev = null;
+ for (var j=0; j<r.tgt.length; j++) {
+ if (prev!=null && prev<((id2idx(r.tgt[j]))-1)) {
+ var a = [];
+ for (var k=prev+1; k<id2idx(r.tgt[j]); k++) {
+ a.push(k);
+ }
+ tgt_gaps.push(a);
+ }
+ prev = id2idx(r.tgt[j]);
+ }
+
+ r["src_gaps"] = src_gaps;
+ r["tgt_gaps"] = tgt_gaps;
+ r["src_gaps_pos"] = []; // 0 before, 1 after
+ var src_gaps_covered = [];
+ var tgt_indexes = [];
+ var invalid = false;
+ for (k in r.tgt_gaps) { // for each target gap
+ var g = r.tgt_gaps[k]; // current gap
+ var ga = g.map(function(i) { // these are the aligned sources to this gap
+ try {
+ return t2s[idx2id(i)][0];
+ } catch(e) {
+ return null;
+ }
+ });
+ var index = -1; // behind or before
+ var b = false;
+ for (var l=ga.length-1; l>=0; l--) { // take the last one / or should I?
+ for (m in r.src_gaps) {
+ if (r.src_gaps[m].find(function(i) { return i==ga[l] })) {
+ index = m;
+ src_gaps_covered.push(m);
+ b = true;
+ break;
+ }
+ if (b) break;
+ }
+ }
+
+ if (index == -1) { // not found within
+ // try to find outside
+ var x = null;
+ for (var j=ga.length-1; j>=0; j--) { // first (from the back) aligned
+ if (ga[j]) {
+ x = ga[j];
+ break;
+ }
+ }
+ if (x == null) {
+ if (r.src_gaps.length == 1 && r.tgt_gaps.length == 1) {
+ index = 0;
+ src_gaps_covered.push(0);
+ } else {
+ invalid = true;
+ }
+ } else {
+ if (x < r.src[0]) { // before
+ r.src_gaps.unshift([x]);
+ tgt_indexes = tgt_indexes.map(function(i) { return i+1 });
+ index = 0;
+ r["src_gaps_pos"].push(0);
+ src_gaps_covered.push(-1); // doesn't matter
+ } else if (x > r.src[r.src.length-1]) { // after
+ r.src_gaps.push([x]);
+ index = Math.max(0,amax(tgt_indexes)+1);
+ r["src_gaps_pos"].push(1);
+ src_gaps_covered.push(-1); // doesn't matter
+ } else {
+ invalid = true;
+ }
+ }
+ }
+ tgt_indexes.push(parseInt(index));
+ }
+
+ r["tgt_gaps_pos"] = [];
+ if (r.src_gaps.length > src_gaps_covered.length) {
+ for (k in r.src_gaps) {
+ if (!src_gaps_covered.find(function(i){return i==k;})) { // not covered
+ try {
+ for (var l=r.src_gaps[k].length-1; l>=0; l--) {
+ if (s2t[r.src_gaps[k][l]]!=null) {
+ if (s2t[r.src_gaps[k][l]] > id2idx(r.tgt[0])) { // before
+ r["tgt_gaps_pos"].push(0);
+ } else if(s2t[r.src_gaps[k][l]] < id2idx(r.tgt[r.tgt.length-1])) { //after
+ //alert("!!");
+ r["tgt_gaps_pos"].push(1);
+ } else {
+ }
+ break;
+ }
+ }
+ } catch(e) {
+ }
+ }
+ }
+ }
+ r["tgt_indexes"] = tgt_indexes;
+ r["invalid"] = invalid;
+ }
+
+ for (var z=0; z<rules.length; z++) { // FIXME why here?
+ r = rules[z];
+ if (r.tgt_indexes.length!=r.tgt_gaps.length || r.tgt_indexes.length!=r.src_gaps.length) {
+ r.invalid = true;
+ }
+ }
+
+ var rs = {};
+ for (r in rules) {
+ if (rules[r].invalid) { // r is an index here, not the rule object
+ //alert(r);
+ continue;
+ }
+ var src = "", tgt = "";
+ var prev=null
+ var src_idx = 1;
+ var src_gaps_count = 0;
+ for (var i=0; i< rules[r]["src"].length; i++) {
+ if (prev!=null && prev < rules[r]["src"][i]-1) { // works because id == idx
+ src += "[X,"+src_idx+"] ";
+ src_idx++;
+ }
+ src += DE_shapes_by_id[rules[r]["src"][i]].pair[0].textContent+" ";
+ if (rules[r]["src"][i]!=null)
+ prev = rules[r]["src"][i];
+ }
+ if ((src_idx-1) < rules[r]["src_gaps_pos"].length) {
+ for (q in rules[r]["src_gaps_pos"]) {
+ if (rules[r]["src_gaps_pos"][q] == 0) { // before
+ // strings are immutable in JS, so renumber existing gap markers via replace()
+ src = src.replace(/\[X,(\d)\]/g, function(m0,n) { return "[X,"+(parseInt(n)+1)+"]"; });
+ src = "[X,1] "+$.trim(src);
+ src_gaps_covered.push(-1);
+ } else { // after
+ var re=/\[X,(\d)\]/g;
+ var last = 0;
+ do { m = re.exec(src); if (m) { last = parseInt(m[1]) }; } while (m);
+ src = $.trim(src);
+ src += " [X,"+(last+1)+"]";
+ src_gaps_covered.push(-1);
+ }
+ }
+ }
+ src += "||| ";
+ prev = null;
+ var tgt_idx_idx = 0;
+ for (var i=0; i< rules[r]["tgt"].length; i++) {
+ if (!DE_shapes_by_id[rules[r]["tgt"][i]]) { // unaligned source
+ continue;
+ }
+ if (prev!=null && prev < id2idx(rules[r]["tgt"][i])-1) {
+ tgt += "[X,"+(rules[r]["tgt_indexes"][tgt_idx_idx]+1)+"] " ;
+ tgt_idx_idx++;
+ }
+ tgt += DE_shapes_by_id[rules[r]["tgt"][i]].pair[0].textContent+" ";
+ if (rules[r]["tgt"][i]) {
+ prev = id2idx(rules[r]["tgt"][i]);
+ }
+ }
+ for (k in rules[r]["tgt_gaps_pos"]) {
+ if (rules[r]["tgt_gaps_pos"][k] == 0) { // before
+ // strings are immutable in JS, so renumber existing gap markers via replace()
+ tgt = tgt.replace(/\[X,(\d)\]/g, function(m0,n) { return "[X,"+(parseInt(n)+1)+"]"; });
+ tgt = "[X,1] "+$.trim(tgt);
+ } else { // after
+ var re=/\[X,(\d)\]/g;
+ var last = 0;
+ do { m = re.exec(tgt); if (m) { last = parseInt(m[1]) }; } while (m);
+ tgt = $.trim(tgt);
+ tgt += " [X,"+(last+1)+"]";
+ }
+ }
+ if (tgt.replace(/\|\|\|/g, "").trim() != "") {
+ var id = rules[r]["tgt"][0];
+ var b = false;
+ if (DE_target_shapes[0]["id_"] == id) {
+ b = true;
+ }
+ var accept = true;
+ var x = src.match(/\[X,\d\]/g);
+ var y = tgt.match(/\[X,\d\]/g);
+ if (x && y) {
+ accept = x.length==y.length;
+ var srci = src.match(/\[X,(\d)\]/g).map(function(i){return parseInt(i.split(",")[1].replace("]",""))}).sort()
+ var tgti = tgt.match(/\[X,(\d)\]/g).map(function(i){return parseInt(i.split(",")[1].replace("]",""))}).sort()
+ var prev = null;
+ var uniq = true;
+ for (k in srci) {
+ if (prev!=null && prev==srci[k]) {
+ uniq = false;
+ break;
+ }
+ prev = srci[k];
+ }
+ prev = null
+ for (k in tgti) {
+ if (prev!=null && prev==tgti[k]) {
+ uniq = false;
+ break;
+ }
+ prev = tgti[k];
+ }
+ accept = accept && uniq;
+ var same = true;
+ if (srci.length == tgti.length) {
+ for (k in srci) {
+ if (srci[k] != tgti[k]) {
+ same = false;
+ break;
+ }
+ }
+ }
+
+ accept = accept && same;
+
+ } else if (x && !y || !x && y) {
+ accept = false
+ }
+
+ if (accept) {
+ rs[rules[r]["src"]] = b+" ||| "+$.trim(src+tgt);
+ } else {
+ //alert(src+tgt+" "+rules[r]["tgt_gaps"].length+" "+src_gaps_covered.length+" --- "+String(x.length==y.length) + " " + String(uniq) + " " + String(same));
+ }
+ }
+ }
+
+ $rules = rules;
+
+ return rs;
+}
+
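
The interface.js additions above snapshot the current rules (rules_orig) when a new sentence is loaded and later send only the delta: rule_diff keeps entries that are new or whose serialized form changed. A compact Ruby sketch of that dictionary diff (names and sample rules are illustrative):

    # Keep entries of `now` that are absent from `prev` or changed (sketch of rule_diff).
    def rule_diff(prev, now)
      now.reject { |k, v| prev.key?(k) && prev[k] == v }
    end

    prev = { "0,1" => "false ||| ein Haus ||| a house" }
    now  = { "0,1" => "false ||| ein Haus ||| one house",
             "3"   => "false ||| Hund ||| dog" }
    p rule_diff(prev, now) # => both entries survive: one changed, one new
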
diff --git a/phrase2_extraction/phrase2_extraction.rb b/phrase2_extraction/phrase2_extraction.rb
index 1f268cd..b376953 100755
--- a/phrase2_extraction/phrase2_extraction.rb
+++ b/phrase2_extraction/phrase2_extraction.rb
@@ -178,6 +178,16 @@ class Rule
}
astr.strip!
+ #a = []
+ #source_string.strip.lstrip.split.each_with_index { |s,i|
+ # target_string.strip.lstrip.split.each_with_index { |t,j|
+ # if !s.match /\[X,\d+\]/ and !t.match /\[X,\d+\]/
+ # a << "#{i}-#{j}"
+ # end
+ # }
+ #}
+ #astr = a.join ' '
+
return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}"
end
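
The disabled block above would replace the extracted alignment string with a full cross-product over all plain tokens, skipping gap nonterminals of the form [X,n]. Reconstructed as a standalone Ruby sketch:

    # Align every plain source token to every plain target token (sketch).
    def cross_alignment(source_string, target_string)
      a = []
      source_string.split.each_with_index { |s,i|
        target_string.split.each_with_index { |t,j|
          next if s.match(/\[X,\d+\]/) or t.match(/\[X,\d+\]/) # skip nonterminals
          a << "#{i}-#{j}"
        }
      }
      a.join " "
    end

    puts cross_alignment("das kleine [X,1]", "the small [X,1]")
    # => "0-0 0-1 1-0 1-1"
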
diff --git a/server.rb b/server.rb
index d6cbec7..752d0d5 100755
--- a/server.rb
+++ b/server.rb
@@ -15,7 +15,10 @@ require_relative './phrase2_extraction/phrase2_extraction'
# #############################################################################
# Load configuration file and setup global variables
# #############################################################################
-require_relative "#{ARGV[0]}" # load configuration for this session
+NOLOO = nil # default; the session config loaded below may redefine it (Ruby warns on redefinition)
+OLM = nil # default; the session config loaded below may redefine it
+$olm_pipe = nil
+require_relative "#{ARGV[0]}" # load configuration for this session
$lock = false # lock if currently learning/translating
$last_reply = nil # cache last reply
$last_processed_postedit = "" # to show to the user
@@ -73,9 +76,9 @@ end
def start_daemon cmd, name, addr
logmsg :server, "starting #{name} daemon"
cmd.gsub! '__ADDR__', addr
- pid = fork do
- exec cmd
- end
+ pid = spawn(cmd)
+ Process.detach pid
+ logmsg :server, "#{name} detached"
sock = NanoMsg::PairSocket.new
sock.connect addr
logmsg :server, "< got #{sock.recv} from #{name}"
@@ -116,6 +119,11 @@ def init
# working directory
`mkdir -p #{WORK_DIR}/`
`mkdir #{WORK_DIR}/g`
+
+ if OLM
+ `mkfifo #{WORK_DIR}/refp`
+ end
+
# setup environment, start daemons
port = BEGIN_PORT_RANGE
$daemons.each { |name,cmd|
@@ -124,7 +132,19 @@ def init
port += 1
}
+ if OLM
+ logmsg :server, "writing to OLM pipe"
+ $olm_pipe = File.new "#{WORK_DIR}/refp", "w"
+ $olm_pipe.write " \n"
+ $olm_pipe.flush
+ logmsg :server, "writing to OLM pipe, done!"
+ end
+
send_recv :truecaser, "lOaD iT"
+
+ #if OLM
+ # $olm_pipe = File.new "#{WORK_DIR}/refp", "w"
+ #end
# lock file
`touch #{LOCK_FILE}`
$status = "Initialized" # status
@@ -220,6 +240,7 @@ def process_next reply
# 5b. backward alignment
# 5c. symmetrize alignment
# 5d. actual update
+# 5e. update LM
# 6. update database
if data["EDIT"]
$status = "Processing post-edit" # status
@@ -245,6 +266,10 @@ def process_next reply
f = []
data["source_raw"].each { |i| f << URI.decode(i) }
+ # no loo rules
+ no_loo_known_rules = []
+ no_loo_new_rules = []
+
if !NOGRAMMAR
# 2.5 new rule extraction
$status = "Extracting rules from post edit" # status
@@ -255,6 +280,46 @@ def process_next reply
s = splitpipe(r.to_s)[1..2].map{|i|i.strip.lstrip}.join(" ||| ")
current_grammar_ids[s] = true
}
+ # no loo rules
+ no_loo_known_rules = []
+ no_loo_new_rules = []
+ if NOLOO
+ tmp_rules = []
+ logmsg :server, "rule diff: #{data['rule_diff'].to_s}"
+ data["rule_diff"].each_key { |k|
+ x = k.split(",").map{|i|i.to_i}.sort
+ tgt_a = data["rule_diff"][k]["tgt_a"]
+ tgt_first,src,tgt = splitpipe data["rule_diff"][k]
+ tgt_first = tgt_first.lstrip.strip
+ src = src.lstrip.strip
+ tgt = tgt.lstrip.strip
+ prev = tgt[0]
+ logmsg :server, "tgt_first #{tgt_first}"
+ tgt = send_recv :truecaser, tgt
+ tgt[0] = prev if tgt_first=="false"
+ if x.first == 0
+ src[0] = data["source_value"][0]
+ end
+ tmp_rules << [src, tgt]
+ }
+ tmp_rules_new = tmp_rules.reject { |r|
+ current_grammar_ids.has_key? "#{r[0]} ||| #{r[1]}" # ids are "src ||| tgt" strings, not pairs
+ }
+ tmp_rules_known = tmp_rules - tmp_rules_new
+ tmp_rules_known.each { |i| no_loo_known_rules << "[X] ||| #{i[0]} ||| #{i[1]} ||| KnownRule=1 ||| 0-0" }
+ tmp_rules_new.each { |i|
+ a = []
+ i[0].strip.lstrip.split.each_with_index { |s,ii|
+ i[1].strip.lstrip.split.each_with_index { |t,j|
+ if !s.match /\[X,\d+\]/ and !t.match /\[X,\d+\]/
+ a << "#{ii}-#{j}"
+ end
+ }
+ }
+ no_loo_new_rules << "[X] ||| #{i[0]} ||| #{i[1]} ||| NewRule=1 ||| #{a.join ' '}"
+ }
+ end
+ # regular
new_rules = PhrasePhraseExtraction.extract_rules f, e, data["align"], true
new_rules_ids = {}
$new_rules.each { |r|
@@ -269,6 +334,7 @@ def process_next reply
current_grammar_ids.has_key?(s) || new_rules_ids.has_key?(s)
}
$new_rules += new_rules
+ $new_rules += no_loo_new_rules
$new_rules.uniq! { |rs|
splitpipe(rs)[1..2].map{|i|i.strip.lstrip}.join(" ||| ")
}
@@ -277,6 +343,7 @@ def process_next reply
f.close
logmsg :server, "# rules after filtering #{new_rules.size}"
add_known_rules = _-new_rules
+ add_known_rules += no_loo_known_rules
add_known_rules.reject! { |rs|
s = splitpipe(rs)[1..2].map{|i|i.strip.lstrip}.join(" ||| ")
new_rules_ids.has_key?(s)
@@ -333,7 +400,45 @@ def process_next reply
grammar = "#{SESSION_DIR}/g/grammar.#{$db['progress']}"
annotated_source = "<seg grammar=\"#{grammar}\"> #{source} </seg>"
$status = "Learning from post-edit" # status
- send_recv :dtrain, "#{annotated_source} ||| #{post_edit}"
+ if NOLOO
+ `cp #{grammar} #{grammar}.pass0`
+ match = {}
+ no_loo_known_rules.each { |r|
+ _,src,tgt,_,_ = splitpipe r
+ match["#{src.strip.lstrip} ||| #{tgt.strip.lstrip}".hash] = true
+ }
+ all_rules = ReadFile.readlines_strip grammar
+ all_rules.each_with_index { |r,j|
+ nt,src,tgt,f,a = splitpipe(r).map { |i| i.strip.lstrip }
+ if match["#{src} ||| #{tgt}".hash]
+ ar = "#{nt} ||| #{src} ||| #{tgt} ||| #{f} KnownRule=1 ||| #{a}"
+ logmsg :server, "replacing rule '#{r}' with '#{ar}'"
+ all_rules[j] = ar
+ end
+ }
+ if no_loo_new_rules.size > 0
+ all_rules += no_loo_new_rules
+ end
+ f = WriteFile.new(grammar)
+ f.write(all_rules.join("\n")+"\n")
+ f.close
+ logmsg :server, "adding rules and re-translate"
+ if OLM # again ..
+ $status = "Updating language model"
+ logmsg :server, "fake updating lm"
+ $olm_pipe.write " \n"
+ $olm_pipe.flush
+ end
+ `cp #{WORK_DIR}/dtrain.debug.json \
+ #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.pass0`
+ send_recv :dtrain, "act:translate_learn ||| #{annotated_source} ||| #{post_edit}"
+ `cp #{WORK_DIR}/dtrain.debug.json \
+ #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.pass1`
+ else
+ send_recv :dtrain, "act:learn ||| #{annotated_source} ||| #{post_edit}"
+ `cp #{WORK_DIR}/dtrain.debug.json \
+ #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.pass0`
+ end
# 5. update grammar extractor
if !$pregenerated_grammars
# 5a. get forward alignment
@@ -350,12 +455,26 @@ def process_next reply
msg = "default_context ||| #{source} ||| #{post_edit} ||| #{a}"
send_recv :extractor, msg
end
+ # 5e update LM
+ if OLM
+ $status = "Updating language model"
+ logmsg :server, "updating lm"
+ #`echo "#{post_edit}" >> #{WORK_DIR}/refp`
+ $olm_pipe.write "#{post_edit}\n"
+ $olm_pipe.flush
+ end
# 6. update database
$db['updated'] << true
- `cp #{WORK_DIR}/dtrain.debug.json \
- #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json`
else
+ `cp #{WORK_DIR}/dtrain.debug.json \
+ #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.nolearn`
$db['updated'] << false
+ if OLM
+ $status = "Updating language model"
+ logmsg :server, "fake updating lm"
+ $olm_pipe.write " \n"
+ $olm_pipe.flush
+ end
end
logmsg :db, "updating database"
update_database
@@ -532,19 +651,33 @@ end
get '/debug' do # debug view
data = {}
- data = JSON.parse ReadFile.read(DB_FILE).force_encoding("UTF-8")
+ s = File.binread(DB_FILE).encode('UTF-8', 'UTF-8', :invalid => :replace, :replace => "__INVALID__")
+ data = JSON.parse s
if data["durations"].size == 0
data["durations"] << -1
end
- fn = "#{WORK_DIR}/dtrain.debug.json"
+ fn = "#{WORK_DIR}/#{$db["progress"]-1}.dtrain.debug.json.pass"
+ pass = 0
+ if File.exist? fn+"1"
+ fn += "1"
+ pass = 1
+ else
+ fn += "0"
+ pass = 0
+ end
pairwise_ranking_data = {}
pairwise_ranking_data["kbest"] = []
pairwise_ranking_data["weights_before"] = {}
pairwise_ranking_data["weights_after"] = {}
pairwise_ranking_data["best_match_score"] = 0
if File.exist? fn
- pairwise_ranking_data = JSON.parse ReadFile.read(fn).force_encoding("UTF-8")
+ s = File.binread(fn).encode('UTF-8', 'UTF-8', :invalid => :replace, :replace => "__INVALID__").force_encoding("utf-8")
+ begin
+ pairwise_ranking_data = JSON.parse s
+ rescue
+ logmsg :server, s.encoding
+ end
end
admin = false
@@ -555,6 +688,7 @@ get '/debug' do # debug view
haml :debug, :locals => { :data => data,
:pairwise_ranking_data => pairwise_ranking_data, \
:progress => $db["progress"]-1,
+ :pass => pass,
:new_rules => $new_rules, \
:known_rules => $known_rules, \
:session_key => SESSION_KEY, \
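
A recurring pattern in the server.rb changes above is reading JSON that may contain invalid byte sequences (the database file and the dtrain debug dumps) via File.binread plus a lenient re-encode, instead of letting JSON.parse raise. The idiom in isolation (the helper name and fallback are illustrative):

    require 'json'

    # Scrub invalid UTF-8 before parsing; fall back to an empty object (sketch).
    def parse_json_lenient(path)
      s = File.binread(path).encode('UTF-8', 'UTF-8',
                                    :invalid => :replace, :replace => "__INVALID__")
      JSON.parse s
    rescue JSON::ParserError
      {}
    end
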
diff --git a/util/kill b/util/kill_all
index e82f822..e82f822 100755
--- a/util/kill
+++ b/util/kill_all
diff --git a/util/kill_session b/util/kill_session
new file mode 100755
index 0000000..1f57c81
--- /dev/null
+++ b/util/kill_session
@@ -0,0 +1,5 @@
+#!/usr/bin/zsh -x
+
+lsof /srv/postedit/sessions/$1/work/session.out | cut -d " " -f 2- | /srv/postedit/scripts/strips | cut -d " " -f 1 | grep -v PID | xargs kill
+lsof /srv/postedit/sessions/$1/work/session.out
+
diff --git a/util/run_all b/util/run_all
index a873e63..e82ced9 100755
--- a/util/run_all
+++ b/util/run_all
@@ -4,6 +4,6 @@
for i in `cat ../sessions/sessions | cut -f 1`; do
echo $i
./util/run_session $i &
- sleep 60
+ sleep 10
done
diff --git a/views/debug.haml b/views/debug.haml
index 17f7f86..1ab22f7 100644
--- a/views/debug.haml
+++ b/views/debug.haml
@@ -148,6 +148,7 @@
%p <strong>Duration:</strong> #{data["durations"][progress]}ms
%p <strong>Keypresses:</strong> #{data["count_kbd"][progress]}
%p <strong>Clicks:</strong> #{data["count_click"][progress]}
+ %p <strong>Pass:</strong> #{pass}
%h3 Derivation
%p
@@ -169,7 +170,7 @@
/=#########################################################################
%h2#grammar Grammar
- %p <strong>Notes:</strong> In addition to dynamically adding each source/post-edit instance to the suffix array extractor, the system additionally uses the provided phrase alignments to extract new rules. The extraction follows the original Hiero grammar extraction, but using phrases instead of words and using only a single binary feature: 'NewRule=1'. Extracted rules that already exist in a grammar are annotated with an additional feature: 'KnownRules=1'. OOVs are avoided by asking the user for translations of unknown words prior to translation. These are added to the grammars as new rules ('OOVFix=1').
+ %p <strong>Notes:</strong> <strike>In addition to dynamically adding each source/post-edit instance to the suffix array extractor,</strike> the system additionally uses the provided phrase alignments to extract new rules. The extraction follows the original Hiero grammar extraction, but using phrases instead of words and using only a single binary feature: 'NewRule=1'. Extracted rules that already exist in a grammar are annotated with an additional feature: 'KnownRules=1'. OOVs are avoided by asking the user for translations of unknown words prior to translation. These are added to the grammars as new rules ('OOVFix=1').
%h3
New Rules
@@ -225,7 +226,7 @@
%th Rate
%tbody
- if pairwise_ranking_data["update_raw"]
- - raw_update = SparseVector.from_kv(pairwise_ranking_data["update_raw"])
+ - raw_update = SparseVector.new(pairwise_ranking_data["update_raw"])
- pairwise_ranking_data["weights_before"].default = 0
- pairwise_ranking_data["weights_after"].keys.each.sort { |a,b| a <=> b }.each do |k|
- diff = pairwise_ranking_data["weights_after"][k] - pairwise_ranking_data["weights_before"][k]
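
The last hunk builds the per-feature weight table from weights_before and weights_after, with missing features defaulting to 0. A self-contained Ruby sketch of that computation (feature names and values are made up for illustration):

    weights_before = Hash.new(0) # missing features count as 0, as in the view
    weights_before.update("NewRule" => 0.1, "LanguageModel" => 0.5)
    weights_after = { "NewRule" => 0.3, "LanguageModel" => 0.45, "KnownRule" => 0.2 }

    weights_after.keys.sort.each { |k|
      diff = weights_after[k] - weights_before[k]
      puts "#{k}\t#{weights_before[k]} -> #{weights_after[k]}\t(#{diff.round(4)})"
    }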