From 259d67b738d9e7375d12ffa8b7d613ef98f0ad9f Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 15 Jul 2016 12:16:50 +0200 Subject: util/kill_session --- util/kill_session | 5 +++++ 1 file changed, 5 insertions(+) create mode 100755 util/kill_session diff --git a/util/kill_session b/util/kill_session new file mode 100755 index 0000000..1f57c81 --- /dev/null +++ b/util/kill_session @@ -0,0 +1,5 @@ +#!/usr/bin/zsh -x + +lsof /srv/postedit/sessions/$1/work/session.out | cut -d " " -f 2- | /srv/postedit/scripts/strips | cut -d " " -f 1 | grep -v PID | xargs kill +lsof /srv/postedit/sessions/$1/work/session.out + -- cgit v1.2.3 From 29ca42a8605ff29524ed287f384d5113f1ec30d7 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 15 Jul 2016 12:17:01 +0200 Subject: mv --- util/kill | 4 ---- util/kill_all | 4 ++++ 2 files changed, 4 insertions(+), 4 deletions(-) delete mode 100755 util/kill create mode 100755 util/kill_all diff --git a/util/kill b/util/kill deleted file mode 100755 index e82f822..0000000 --- a/util/kill +++ /dev/null @@ -1,4 +0,0 @@ -#!/bin/bash - -for i in {1..6}; do ps ax | grep -P "(server.rb|wrapper.rb|atools|net_fa|sa.extract|dtrain|truecase.perl)" | grep -v vim | grep -v -P "^\s\+$" | cut -d " " -f $i | xargs kill -9 &>/dev/null; done - diff --git a/util/kill_all b/util/kill_all new file mode 100755 index 0000000..e82f822 --- /dev/null +++ b/util/kill_all @@ -0,0 +1,4 @@ +#!/bin/bash + +for i in {1..6}; do ps ax | grep -P "(server.rb|wrapper.rb|atools|net_fa|sa.extract|dtrain|truecase.perl)" | grep -v vim | grep -v -P "^\s\+$" | cut -d " " -f $i | xargs kill -9 &>/dev/null; done + -- cgit v1.2.3 From d4263d1dc29fe46871caec9fde613bf40f3ed90c Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Fri, 15 Jul 2016 12:17:47 +0200 Subject: support for 'noloo' extraction and updatable lm; fixes --- inc/db.inc.php | 2 +- js/interface.js | 429 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ server.rb | 144 +++++++++++++++++-- views/debug.haml | 5 +- 4 files changed, 567 insertions(+), 13 deletions(-) diff --git a/inc/db.inc.php b/inc/db.inc.php index e023c3a..bc18e16 100644 --- a/inc/db.inc.php +++ b/inc/db.inc.php @@ -2,7 +2,7 @@ $SESSION_DIR="/srv/postedit/sessions"; $key = $_GET["key"]; -if (preg_match('/^[a-z0-9]{1,4}$/', $key)) { +if (preg_match('/^[a-f0-9]{1,4}$/', $key)) { $json = file_get_contents($SESSION_DIR."/".$key."/data.json"); } $db = json_decode($json); diff --git a/js/interface.js b/js/interface.js index ec36af7..1fd3468 100644 --- a/js/interface.js +++ b/js/interface.js @@ -8,6 +8,8 @@ var data, // global data object var TEXT_count_click=0, TEXT_count_kbd=0; +var rules_orig = {}; + /* * cross-site request * @@ -282,6 +284,11 @@ var next = function () send_data["post_edit"] = safe_str(post_edit); send_data['type'] = 'g'; send_data["original_svg"] = document.getElementById("original_svg").value; + var dx = rule_diff(rules_orig, get_simplest_rules1()); + for (k in dx) { + dx[k] = safe_str(dx[k]); + } + send_data["rule_diff"] = dx; } else { post_edit = $.trim(target_textarea.value); send_data["post_edit"] = safe_str(post_edit); @@ -514,6 +521,7 @@ var request_and_process_next = function () var x = $.trim(JSON.parse(DE_extract_data())["target"].join(" ")); last_post_edit.value = x; document.getElementById("original_svg").value = DE_get_raw_svg_data(); + rules_orig = get_simplest_rules1(); } // start timer @@ -593,3 +601,424 @@ $().ready(function() }); +var explore = function (o,src,tgt,s2t,t2s,done) +{ + if (done[o["id"]]) return; + var d,other_t; + 
if (o["type"] == "source") { + d = s2t; + src.push(o["id"]); + other_t = "target"; + } else { + d = t2s; + tgt.push(o["id"]) + other_t = "source"; + } + + if (!d[o["id"]]) return; + if (d[o["id"]].length==0) return; + + done[o["id"]] = true; + + for (var i=0; i < d[o["id"]].length; i++) { + explore({"id":d[o["id"]][i], "type":other_t}, src, tgt, s2t, t2s, done); + } + + return; +} + +var rule_diff = function (prev,now) +{ + var diff = {}; + for (key in now) { + if (prev[key] && now[key] != prev[key]) { + diff[key] = now[key]; + } + if (!prev[key]) { + diff[key] = now[key]; + } + } + + return diff; +} + +var get_simplest_rules = function () +{ + var s2t = []; + var t2s = []; + for (key in DE_connections) { + var a = key.split("-"); + if (s2t.hasOwnProperty(a[0])) { + s2t[parseInt(a[0])].push(parseInt(a[1])); + } else { + s2t[parseInt(a[0])] = [parseInt(a[1])]; + } + if (t2s.hasOwnProperty(a[1])) { + t2s[parseInt(a[1])].push(parseInt(a[0])); + } else { + t2s[parseInt(a[1])] = [parseInt(a[0])]; + } + } + + var rules = []; + var done = {}; + for (var i=0; i < DE_shapes.length; i++) { + if (DE_shapes[i]["type_"] == "source") { + var id = parseInt(DE_shapes[i]["id_"]); + var src = []; + var tgt = []; + explore({"id":id,"type":"source"}, src, tgt, s2t, t2s, done); + if (src.length >0 && tgt.length>0) { + rules.push( {"src":src, "tgt":tgt } ); + } + } + } + + rs = {} + for (r in rules) { + var src = "", tgt = ""; + var prev=null + for (var i=0; i< rules[r]["src"].length; i++) { + if (prev!=null && prev < rules[r]["src"][i]-1) { + src += "[X] "; + } + src += DE_shapes_by_id[rules[r]["src"][i]].pair[0].textContent+" "; + if (rules[r]["src"][i]!=null) + prev = rules[r]["src"][i]; + } + src += "||| "; + prev = null; + for (var i=0; i< rules[r]["tgt"].length; i++) { + if (!DE_shapes_by_id[rules[r]["tgt"][i]]) // unaligned source + continue; + if (prev && prev < rules[r]["tgt"][i]-1) { + tgt += "[X] "; + } + tgt += DE_shapes_by_id[rules[r]["tgt"][i]].pair[0].textContent+" "; + if (rules[r]["tgt"][i]) + prev = rules[r]["tgt"][i]; + } + if (tgt.replace(/\|\|\|/g, "").trim() != "") { + var id = rules[r]["tgt"][0]; + var b = false; + if (DE_target_shapes[0]["id_"] == id) { + b = true; + } + rs[rules[r]["src"]] = b+" ||| "+$.trim(src+tgt); + } + } + + return rs; +} + +var id2idx = function (id) { // or grid_pos + var i = 0; + for (k in DE_target_shapes) { + if (DE_target_shapes[k]["id_"] == id) { + return i; + } + i++; + } + + return -1; +} + +var idx2id = function (idx) { + return DE_target_shapes[idx]["id_"]; +} + +var amax = function (a) { + var max = -9999999999999; + for (k in a) { + if (a[k] > max) + max = a[k]; + } + return max; +} + +var $rules =[]; +var get_simplest_rules1 = function () +{ + var s2t = []; + var t2s = []; + for (key in DE_connections) { + var a = key.split("-"); + if (s2t.hasOwnProperty(a[0])) { + s2t[parseInt(a[0])].push(parseInt(a[1])); + } else { + s2t[parseInt(a[0])] = [parseInt(a[1])]; + } + if (t2s.hasOwnProperty(a[1])) { + t2s[parseInt(a[1])].push(parseInt(a[0])); + } else { + t2s[parseInt(a[1])] = [parseInt(a[0])]; + } + } + + var rules = []; + var done = {}; + for (var i=0; i < DE_shapes.length; i++) { + if (DE_shapes[i]["type_"] == "source") { + var id = parseInt(DE_shapes[i]["id_"]); + var src = []; + var tgt = []; + explore({"id":id,"type":"source"}, src, tgt, s2t, t2s, done); + if (src.length >0 && tgt.length>0) { + tgt.sort(function(a,b) { return id2idx(a) > id2idx(b) }); + rules.push( {"src":src, "tgt":tgt } ); + } + } + } + + for (var z=0; z=0; l--) { // take the last 
one / or should I? + for (m in r.src_gaps) { + if (r.src_gaps[m].find(function(i) { return i==ga[l] })) { + index = m; + src_gaps_covered.push(m); + b = true; + break; + } + if (b) break; + } + } + + if (index == -1) { // not found within + // try to find outside + var x = null; + for (var j=ga.length-1; j>=0; j--) { // first (from the back) aligned + if (ga[j]) { + x = ga[j]; + break; + } + } + if (x == null) { + if (r.src_gaps.length == 1 && r.tgt_gaps.length == 1) { + index = 0; + src_gaps_covered.push(0); + } else { + invalid = true; + } + } else { + if (x < r.src[0]) { // before + r.src_gaps.unshift([x]); + tgt_indexes = tgt_indexes.map(function(i) { return i+1 }); + index = 0; + r["src_gaps_pos"].push(0); + src_gaps_covered.push(-1); // doesn't matter + } else if (x > r.src[r.src.length-1]) { // after + r.src_gaps.push([x]); + index = Math.max(0,amax(tgt_indexes)+1); + r["src_gaps_pos"].push(1); + src_gaps_covered.push(-1); // doesn't matter + } else { + invalid = true; + } + } + } + tgt_indexes.push(parseInt(index)); + } + + r["tgt_gaps_pos"] = []; + if (r.src_gaps.length > src_gaps_covered.length) { + for (k in r.src_gaps) { + if (!src_gaps_covered.find(function(i){return i==k;})) { // not covered + try { + for (var l=r.src_gaps[k].length-1; l>=0; l--) { + if (s2t[r.src_gaps[k][l]]!=null) { + if (s2t[r.src_gaps[k][l]] > id2idx(r.tgt[0])) { // before + r["tgt_gaps_pos"].push(0); + } else if(s2t[r.src_gaps[k][l]] < id2idx(r.tgt[r.tgt.length-1])) { //after + alert("!!"); + r["tgt_gaps_pos"].push(1); + } else { + } + break; + } + } + } catch(e) { + } + } + } + } + r["tgt_indexes"] = tgt_indexes; + r["invalid"] = invalid; + } + + for (var z=0; z #{source} " $status = "Learning from post-edit" # status - send_recv :dtrain, "#{annotated_source} ||| #{post_edit}" + if NOLOO + `cp #{grammar} #{grammar}.pass0` + match = {} + no_loo_known_rules.each { |r| + _,src,tgt,_,_ = splitpipe r + match["#{src.strip.lstrip} ||| #{tgt.strip.lstrip}".hash] = true + } + all_rules = ReadFile.readlines_strip grammar + all_rules.each_with_index { |r,j| + nt,src,tgt,f,a = splitpipe(r).map { |i| i.strip.lstrip } + if match["#{src} ||| #{tgt}".hash] + ar = "#{nt} ||| #{src} ||| #{tgt} ||| #{f} KnownRule=1 ||| #{a}" + logmsg :server, "replacing rule '#{r}' with '#{ar}'" + all_rules[j] = ar + end + } + if no_loo_new_rules.size > 0 + all_rules += no_loo_new_rules + end + f = WriteFile.new(grammar) + f.write(all_rules.join("\n")+"\n") + f.close + logmsg :server, "adding rules and re-translate" + if OLM # again .. + $status = "Updating language model" + logmsg :server, "fake updating lm" + $olm_pipe.write " \n" + $olm_pipe.flush + end + `cp #{WORK_DIR}/dtrain.debug.json \ + #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.pass0` + send_recv :dtrain, "act:translate_learn ||| #{annotated_source} ||| #{post_edit}" + `cp #{WORK_DIR}/dtrain.debug.json \ + #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.pass1` + else + send_recv :dtrain, "act:learn ||| #{annotated_source} ||| #{post_edit}" + `cp #{WORK_DIR}/dtrain.debug.json \ + #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.pass0` + end # 5. update grammar extractor if !$pregenerated_grammars # 5a. 
get forward alignment @@ -349,13 +444,27 @@ def process_next reply $status = "Updating grammar extractor" # status msg = "default_context ||| #{source} ||| #{post_edit} ||| #{a}" send_recv :extractor, msg + end + # 5e update LM + if OLM + $status = "Updating language model" + logmsg :server, "updating lm" + #`echo "#{post_edit}" >> #{WORK_DIR}/refp` + $olm_pipe.write "#{post_edit}\n" + $olm_pipe.flush end # 6. update database $db['updated'] << true - `cp #{WORK_DIR}/dtrain.debug.json \ - #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json` else + `cp #{WORK_DIR}/dtrain.debug.json \ + #{WORK_DIR}/#{$db['progress']}.dtrain.debug.json.nolearn` $db['updated'] << false + if OLM + $status = "Updating language model" + logmsg :server, "fake updating lm" + $olm_pipe.write " \n" + $olm_pipe.flush + end end logmsg :db, "updating database" update_database @@ -532,19 +641,33 @@ end get '/debug' do # debug view data = {} - data = JSON.parse ReadFile.read(DB_FILE).force_encoding("UTF-8") + s = File.binread(DB_FILE).encode('UTF-8', 'UTF-8', :invalid => :replace, :replace => "__INVALID__") + data = JSON.parse s if data["durations"].size == 0 data["durations"] << -1 end - fn = "#{WORK_DIR}/dtrain.debug.json" + fn = "#{WORK_DIR}/#{$db["progress"]-1}.dtrain.debug.json.pass" + pass = 0 + if File.exist? fn+"1" + fn += "1" + pass = 1 + else + fn += "0" + pass = 0 + end pairwise_ranking_data = {} pairwise_ranking_data["kbest"] = [] pairwise_ranking_data["weights_before"] = {} pairwise_ranking_data["weights_after"] = {} pairwise_ranking_data["best_match_score"] = 0 if File.exist? fn - pairwise_ranking_data = JSON.parse ReadFile.read(fn).force_encoding("UTF-8") + s = File.binread(fn).encode('UTF-8', 'UTF-8', :invalid => :replace, :replace => "__INVALID__").force_encoding("utf-8") + begin + pairwise_ranking_data = JSON.parse s + rescue + logmsg :server, s.encoding + end end admin = false @@ -555,6 +678,7 @@ get '/debug' do # debug view haml :debug, :locals => { :data => data, :pairwise_ranking_data => pairwise_ranking_data, \ :progress => $db["progress"]-1, + :pass => pass, :new_rules => $new_rules, \ :known_rules => $known_rules, \ :session_key => SESSION_KEY, \ diff --git a/views/debug.haml b/views/debug.haml index 17f7f86..1ab22f7 100644 --- a/views/debug.haml +++ b/views/debug.haml @@ -148,6 +148,7 @@ %p Duration: #{data["durations"][progress]}ms %p Keypresses: #{data["count_kbd"][progress]} %p Clicks: #{data["count_click"][progress]} + %p Pass: #{pass} %h3 Derivation %p @@ -169,7 +170,7 @@ /=######################################################################### %h2#grammar Grammar - %p Notes: In addition to dynamically adding each source/post-edit instance to the suffix array extractor, the system additionally uses the provided phrase alignments to extract new rules. The extraction follows the original Hiero grammar extraction, but using phrases instead of words and using only a single binary feature: 'NewRule=1'. Extracted rules that already exist in a grammar are annotated with an additional feature: 'KnownRules=1'. OOVs are avoided by asking the user for translations of unknown words prior to translation. These are added to the grammars as new rules ('OOVFix=1'). + %p Notes: In addition to dynamically adding each source/post-edit instance to the suffix array extractor, the system additionally uses the provided phrase alignments to extract new rules. The extraction follows the original Hiero grammar extraction, but using phrases instead of words and using only a single binary feature: 'NewRule=1'. 
Extracted rules that already exist in a grammar are annotated with an additional feature: 'KnownRules=1'. OOVs are avoided by asking the user for translations of unknown words prior to translation. These are added to the grammars as new rules ('OOVFix=1'). %h3 New Rules @@ -225,7 +226,7 @@ %th Rate %tbody - if pairwise_ranking_data["update_raw"] - - raw_update = SparseVector.from_kv(pairwise_ranking_data["update_raw"]) + - raw_update = SparseVector.new(pairwise_ranking_data["update_raw"]) - pairwise_ranking_data["weights_before"].default = 0 - pairwise_ranking_data["weights_after"].keys.each.sort { |a,b| a <=> b }.each do |k| - diff = pairwise_ranking_data["weights_after"][k] - pairwise_ranking_data["weights_before"][k] -- cgit v1.2.3 From dcc5f06c86e4a8975ec52062bf66ae252c90cbea Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 18 Jul 2016 15:39:12 +0200 Subject: js/interface.js: remove alerts --- js/interface.js | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/js/interface.js b/js/interface.js index 1fd3468..6c32063 100644 --- a/js/interface.js +++ b/js/interface.js @@ -874,7 +874,7 @@ var get_simplest_rules1 = function () if (s2t[r.src_gaps[k][l]] > id2idx(r.tgt[0])) { // before r["tgt_gaps_pos"].push(0); } else if(s2t[r.src_gaps[k][l]] < id2idx(r.tgt[r.tgt.length-1])) { //after - alert("!!"); + //alert("!!"); r["tgt_gaps_pos"].push(1); } else { } @@ -900,7 +900,7 @@ var get_simplest_rules1 = function () rs = {} for (r in rules) { if (r.invalid) { - alert(r); + //alert(r); continue; } var src = "", tgt = ""; @@ -1012,7 +1012,7 @@ var get_simplest_rules1 = function () if (accept) { rs[rules[r]["src"]] = b+" ||| "+$.trim(src+tgt); } else { - alert(src+tgt+" "+rules[r]["tgt_gaps"].length+" "+src_gaps_covered.length+" --- "+String(x.length==y.length) + " " + String(uniq) + " " + String(same)); + //alert(src+tgt+" "+rules[r]["tgt_gaps"].length+" "+src_gaps_covered.length+" --- "+String(x.length==y.length) + " " + String(uniq) + " " + String(same)); } } } -- cgit v1.2.3 From 68fbe2a717f07cda33aa57668d01d8190dae9ede Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Mon, 18 Jul 2016 15:39:33 +0200 Subject: util/run_all: faster startup --- util/run_all | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/util/run_all b/util/run_all index a873e63..e82ced9 100755 --- a/util/run_all +++ b/util/run_all @@ -4,6 +4,6 @@ for i in `cat ../sessions/sessions | cut -f 1`; do echo $i ./util/run_session $i & - sleep 60 + sleep 10 done -- cgit v1.2.3 From fac00976168c6b3c94d01d76babede147e4a0710 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 20 Jul 2016 10:55:13 +0200 Subject: noloo new rules: align all --- phrase2_extraction/phrase2_extraction.rb | 10 ++++++++++ server.rb | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/phrase2_extraction/phrase2_extraction.rb b/phrase2_extraction/phrase2_extraction.rb index 1f268cd..b376953 100755 --- a/phrase2_extraction/phrase2_extraction.rb +++ b/phrase2_extraction/phrase2_extraction.rb @@ -178,6 +178,16 @@ class Rule } astr.strip! 
+ #a = [] + #source_string.strip.lstrip.split.each_with_index { |s,i| + # target_string.strip.lstrip.split.each_with_index { |t,j| + # if !s.match /\[X,\d+\]/ and !t.match /\[X,\d+\]/ + # a << "#{i}-#{j}" + # end + # } + #} + #astr = a.join ' ' + return "[X] ||| #{source_string} ||| #{target_string} ||| NewRule=1 ||| #{astr}" end diff --git a/server.rb b/server.rb index 4a3e8f6..752d0d5 100755 --- a/server.rb +++ b/server.rb @@ -307,7 +307,17 @@ def process_next reply } tmp_rules_known = tmp_rules - tmp_rules_new tmp_rules_known.each { |i| no_loo_known_rules << "[X] ||| #{i[0]} ||| #{i[1]} ||| KnownRule=1 ||| 0-0" } - tmp_rules_new.each { |i| no_loo_new_rules << "[X] ||| #{i[0]} ||| #{i[1]} ||| NewRule=1 ||| 0-0" } + tmp_rules_new.each { |i| + a = [] + i[0].strip.lstrip.split.each_with_index { |s,ii| + i[1].strip.lstrip.split.each_with_index { |t,j| + if !s.match /\[X,\d+\]/ and !t.match /\[X,\d+\]/ + a << "#{ii}-#{j}" + end + } + } + no_loo_new_rules << "[X] ||| #{i[0]} ||| #{i[1]} ||| NewRule=1 ||| #{a.join ' '}" + } end # regular new_rules = PhrasePhraseExtraction.extract_rules f, e, data["align"], true -- cgit v1.2.3
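
Note on the client-side rule diff (js/interface.js): when a hypothesis is first displayed the client stores the rules read off the alignment graph in rules_orig, and on "next" it only transmits entries that were added or whose value changed during post-editing. The same idea rendered as a small Ruby sketch, purely illustrative and not part of the repository:

    # Return only the rules that are new or changed relative to `prev`.
    def rule_diff prev, now
      now.select { |k,v| !prev.key?(k) || prev[k] != v }
    end

    before = { "3,4" => "true ||| ein haus ||| a house" }
    after  = { "3,4" => "true ||| ein haus ||| a house",
               "7"   => "false ||| hund ||| dog" }
    rule_diff(before, after)  # => { "7" => "false ||| hund ||| dog" }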
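Note on the 'noloo' (no leave-one-out) grammar update in server.rb: phrase pairs confirmed by the post-edit are hashed as "src ||| tgt" keys; any rule already present in the per-sentence grammar gets an extra KnownRule=1 feature before re-translation, and the remaining pairs are appended as NewRule=1 rules. A condensed sketch of the known-rule annotation step, using plain String#split in place of the repository's splitpipe/ReadFile/WriteFile helpers; the method and argument names are illustrative:

    # grammar_lines:   rules of the form "[X] ||| src ||| tgt ||| feats ||| align"
    # confirmed_pairs: [[src, tgt], ...] read off the user's phrase alignment
    def annotate_known_rules grammar_lines, confirmed_pairs
      match = {}
      confirmed_pairs.each { |src,tgt| match["#{src.strip} ||| #{tgt.strip}".hash] = true }
      grammar_lines.map { |r|
        nt, src, tgt, f, a = r.split(' ||| ').map { |x| x.strip }
        if match["#{src} ||| #{tgt}".hash]
          # same rewrite as the server.rb hunk: append KnownRule=1 to the feature field
          "#{nt} ||| #{src} ||| #{tgt} ||| #{f} KnownRule=1 ||| #{a}"
        else
          r
        end
      }
    end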
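Note on the final commit ("noloo new rules: align all"): the placeholder "0-0" alignment on user-derived new rules is replaced by a dense alignment in which every terminal source token is paired with every terminal target token, while non-terminal placeholders such as [X,1] stay unaligned (the same construction is kept, commented out, in phrase2_extraction.rb). A self-contained sketch of that construction; dense_alignment is an illustrative name, not a function in the repository:

    # Align every terminal source token to every terminal target token,
    # skipping non-terminal placeholders like [X,1].
    def dense_alignment source_string, target_string
      a = []
      source_string.strip.split.each_with_index { |s,i|
        target_string.strip.split.each_with_index { |t,j|
          next if s =~ /\[X,\d+\]/ || t =~ /\[X,\d+\]/
          a << "#{i}-#{j}"
        }
      }
      a.join ' '
    end

    dense_alignment("ein [X,1] haus", "a [X,1] house")  # => "0-0 0-2 2-0 2-2"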