From d4263d1dc29fe46871caec9fde613bf40f3ed90c Mon Sep 17 00:00:00 2001 From: Patrick Simianer
Date: Fri, 15 Jul 2016 12:17:47 +0200
Subject: support for 'noloo' extraction and updatable lm; fixes
---
inc/db.inc.php | 2 +-
js/interface.js | 429 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
server.rb | 144 +++++++++++++++++--
views/debug.haml | 5 +-
4 files changed, 567 insertions(+), 13 deletions(-)
diff --git a/inc/db.inc.php b/inc/db.inc.php
index e023c3a..bc18e16 100644
--- a/inc/db.inc.php
+++ b/inc/db.inc.php
@@ -2,7 +2,7 @@
$SESSION_DIR="/srv/postedit/sessions";
$key = $_GET["key"];
-if (preg_match('/^[a-z0-9]{1,4}$/', $key)) {
+if (preg_match('/^[a-f0-9]{1,4}$/', $key)) {
$json = file_get_contents($SESSION_DIR."/".$key."/data.json");
}
$db = json_decode($json);
diff --git a/js/interface.js b/js/interface.js
index ec36af7..1fd3468 100644
--- a/js/interface.js
+++ b/js/interface.js
@@ -8,6 +8,8 @@ var data, // global data object
var TEXT_count_click=0,
TEXT_count_kbd=0;
+var rules_orig = {};
+
/*
* cross-site request
*
@@ -282,6 +284,11 @@ var next = function ()
send_data["post_edit"] = safe_str(post_edit);
send_data['type'] = 'g';
send_data["original_svg"] = document.getElementById("original_svg").value;
+ var dx = rule_diff(rules_orig, get_simplest_rules1());
+ for (k in dx) {
+ dx[k] = safe_str(dx[k]);
+ }
+ send_data["rule_diff"] = dx;
} else {
post_edit = $.trim(target_textarea.value);
send_data["post_edit"] = safe_str(post_edit);
@@ -514,6 +521,7 @@ var request_and_process_next = function ()
var x = $.trim(JSON.parse(DE_extract_data())["target"].join(" "));
last_post_edit.value = x;
document.getElementById("original_svg").value = DE_get_raw_svg_data();
+ rules_orig = get_simplest_rules1();
}
// start timer
@@ -593,3 +601,424 @@ $().ready(function()
});
+/*
+ * Recursively walk the alignment graph starting at node o.
+ *
+ * o     - node descriptor {"id": <id>, "type": "source" | <anything else>};
+ *         every type other than "source" is handled as a target node
+ * src   - array that receives the ids of visited source nodes
+ * tgt   - array that receives the ids of visited target nodes
+ * s2t   - maps a source id to the list of ids connected to it
+ * t2s   - maps a target id to the list of ids connected to it
+ * done  - object used as visited set (keyed by id); a node is only
+ *         marked here if it has at least one connection
+ *
+ * Nothing is returned, results are accumulated in src, tgt and done.
+ */
+var explore = function (o,src,tgt,s2t,t2s,done)
+{
+  var node_id = o["id"];
+  if (done[node_id]) return;
+
+  var is_source = (o["type"] == "source");
+  var links = is_source ? s2t : t2s;
+  var neighbour_type = is_source ? "target" : "source";
+
+  // the node is recorded even if it turns out to be unconnected
+  (is_source ? src : tgt).push(node_id);
+
+  var neighbours = links[node_id];
+  if (!neighbours || neighbours.length==0) return;
+
+  done[node_id] = true;
+
+  var n = 0;
+  while (n < neighbours.length) {
+    explore({"id":neighbours[n], "type":neighbour_type}, src, tgt, s2t, t2s, done);
+    n++;
+  }
+}
+
+/*
+ * Compare two rule maps and collect what is new or changed.
+ *
+ * prev - map of rules before editing
+ * now  - map of rules after editing
+ *
+ * Returns a new object holding every own entry of 'now' whose key is
+ * missing from 'prev' or whose value differs from the one in 'prev'.
+ * Keys that exist only in 'prev' are not reported.
+ */
+var rule_diff = function (prev,now)
+{
+  var has = Object.prototype.hasOwnProperty;
+  var diff = {};
+  // 'var key': the loop variable must not leak into the global scope
+  for (var key in now) {
+    if (!has.call(now, key)) continue;
+    // test for presence, not truthiness, so that an unchanged falsy
+    // value (e.g. "") is not reported as a change
+    if (!has.call(prev, key) || now[key] !== prev[key]) {
+      diff[key] = now[key];
+    }
+  }
+
+  return diff;
+}
+
+/*
+ * Build one rule per connected group of aligned source/target shapes.
+ *
+ * Reads the globals DE_connections (keys of the form
+ * "<source id>-<target id>"), DE_shapes, DE_shapes_by_id and
+ * DE_target_shapes.
+ *
+ * Returns an object that maps the source ids of a group (the id array,
+ * coerced to a comma separated string) to a string of the form
+ *   "<b> ||| <source side> ||| <target side>"
+ * where <b> is "true" if the first target id of the group equals the
+ * id of DE_target_shapes[0], and "false" otherwise. Within a side,
+ * "[X]" is inserted wherever two consecutive ids are more than one
+ * apart. Groups whose target side comes out empty are left out.
+ */
+var get_simplest_rules = function ()
+{
+  // adjacency lists, indexed by numeric shape id
+  var s2t = [];
+  var t2s = [];
+  var link = function (adj, from, to) {
+    if (adj[from]) {
+      adj[from].push(to);
+    } else {
+      adj[from] = [to];
+    }
+  };
+  for (var key in DE_connections) {
+    var ends = key.split("-");
+    // parse once (base 10), use the same value for lookup and insertion
+    var s = parseInt(ends[0], 10);
+    var t = parseInt(ends[1], 10);
+    link(s2t, s, t);
+    link(t2s, t, s);
+  }
+
+  // collect the connected groups, starting from every source shape
+  var rules = [];
+  var done = {};
+  for (var i=0; i < DE_shapes.length; i++) {
+    if (DE_shapes[i]["type_"] != "source") continue;
+    var src_ids = [];
+    var tgt_ids = [];
+    explore({"id":parseInt(DE_shapes[i]["id_"], 10),"type":"source"}, src_ids, tgt_ids, s2t, t2s, done);
+    if (src_ids.length > 0 && tgt_ids.length > 0) {
+      rules.push( {"src":src_ids, "tgt":tgt_ids } );
+    }
+  }
+
+  // render every group as a rule string
+  var rs = {};
+  for (var r=0; r < rules.length; r++) {
+    var rule = rules[r];
+    var src = "", tgt = "";
+    var prev = null;
+    for (var j=0; j < rule["src"].length; j++) {
+      var sid = rule["src"][j];
+      if (prev != null && prev < sid-1) {
+        src += "[X] ";
+      }
+      src += DE_shapes_by_id[sid].pair[0].textContent+" ";
+      if (sid != null)
+        prev = sid;
+    }
+    src += "||| ";
+    prev = null;
+    for (var k=0; k < rule["tgt"].length; k++) {
+      var tid = rule["tgt"][k];
+      if (!DE_shapes_by_id[tid]) // no shape for this id (original note: unaligned source)
+        continue;
+      // compare against null like the source side does, so that an id
+      // of 0 still counts as a previous position
+      if (prev != null && prev < tid-1) {
+        tgt += "[X] ";
+      }
+      tgt += DE_shapes_by_id[tid].pair[0].textContent+" ";
+      if (tid != null)
+        prev = tid;
+    }
+    if (tgt.replace(/\|\|\|/g, "").trim() != "") {
+      var first = DE_target_shapes[0];
+      // loose comparison on purpose: "id_" is run through parseInt
+      // elsewhere in this function, so it may be stored as a string
+      var b = !!first && first["id_"] == rule["tgt"][0];
+      rs[rule["src"]] = b+" ||| "+$.trim(src+tgt);
+    }
+  }
+
+  return rs;
+}
+
+/*
+ * Find the position of the target shape with the given id.
+ *
+ * id - shape id to look for
+ *
+ * Returns the zero-based position (in enumeration order) of the first
+ * entry of DE_target_shapes whose "id_" equals id, or -1 if there is
+ * none.
+ */
+var id2idx = function (id) { // or grid_pos
+  var i = 0;
+  // 'var k': the loop variable must not leak into the global scope
+  for (var k in DE_target_shapes) {
+    // loose comparison kept on purpose: callers pass ids obtained via
+    // parseInt, while "id_" may be stored as a string
+    if (DE_target_shapes[k]["id_"] == id) {
+      return i;
+    }
+    i++;
+  }
+
+  return -1;
+}
+
+/*
+ * Return the "id_" of the entry stored at position idx in
+ * DE_target_shapes.
+ */
+var idx2id = function (idx) {
+  var shape = DE_target_shapes[idx];
+  return shape["id_"];
+}
+
+/*
+ * Return the largest value found in a.
+ *
+ * a - array or object whose enumerable values are compared with '>'
+ *
+ * Returns the maximum, or -9999999999999 if no value was found (e.g.
+ * for an empty array); that sentinel is kept for compatibility with
+ * existing callers.
+ */
+var amax = function (a) {
+  // start below every number, so values smaller than the sentinel are
+  // still found
+  var max = -Infinity;
+  // 'var k': the loop variable must not leak into the global scope
+  for (var k in a) {
+    if (a[k] > max)
+      max = a[k];
+  }
+  return max === -Infinity ? -9999999999999 : max;
+}
+
+var $rules =[];
+var get_simplest_rules1 = function ()
+{
+ var s2t = [];
+ var t2s = [];
+ for (key in DE_connections) {
+ var a = key.split("-");
+ if (s2t.hasOwnProperty(a[0])) {
+ s2t[parseInt(a[0])].push(parseInt(a[1]));
+ } else {
+ s2t[parseInt(a[0])] = [parseInt(a[1])];
+ }
+ if (t2s.hasOwnProperty(a[1])) {
+ t2s[parseInt(a[1])].push(parseInt(a[0]));
+ } else {
+ t2s[parseInt(a[1])] = [parseInt(a[0])];
+ }
+ }
+
+ var rules = [];
+ var done = {};
+ for (var i=0; i < DE_shapes.length; i++) {
+ if (DE_shapes[i]["type_"] == "source") {
+ var id = parseInt(DE_shapes[i]["id_"]);
+ var src = [];
+ var tgt = [];
+ explore({"id":id,"type":"source"}, src, tgt, s2t, t2s, done);
+ if (src.length >0 && tgt.length>0) {
+ tgt.sort(function(a,b) { return id2idx(a) > id2idx(b) });
+ rules.push( {"src":src, "tgt":tgt } );
+ }
+ }
+ }
+
+ for (var z=0; zIn addition to dynamically adding each source/post-edit instance to the suffix array extractor, the system additionally uses the provided phrase alignments to extract new rules. The extraction follows the original Hiero grammar extraction, but using phrases instead of words and using only a single binary feature: 'NewRule=1'. Extracted rules that already exist in a grammar are annotated with an additional feature: 'KnownRules=1'. OOVs are avoided by asking the user for translations of unknown words prior to translation. These are added to the grammars as new rules ('OOVFix=1').
%h3
New Rules
@@ -225,7 +226,7 @@
%th Rate
%tbody
- if pairwise_ranking_data["update_raw"]
- - raw_update = SparseVector.from_kv(pairwise_ranking_data["update_raw"])
+ - raw_update = SparseVector.new(pairwise_ranking_data["update_raw"])
- pairwise_ranking_data["weights_before"].default = 0
- pairwise_ranking_data["weights_after"].keys.each.sort { |a,b| a <=> b }.each do |k|
- diff = pairwise_ranking_data["weights_after"][k] - pairwise_ranking_data["weights_before"][k]
--
cgit v1.2.3