Merge remote-tracking branch 'upstream/master'

author: Patrick Simianer <p@simianer.de> 2014-03-16 17:48:48 +0100
committer: Patrick Simianer <p@simianer.de> 2014-03-16 17:48:48 +0100
commit: 5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (patch)
tree: f1401c1fd3eeae8671e59baf0d2169d1eb721cb7 /python/cdec/sa/online.py
parent: 3eedf96b5a08b3e3414888d328c505814b84d8db (diff)
parent: cc87bfed0697583b7c11243913254dde3c0047d4 (diff)
1 files changed, 128 insertions, 0 deletions
diff --git a/python/cdec/sa/online.py b/python/cdec/sa/online.py
new file mode 100644
index 00000000..d3f967e8
--- /dev/null
+++ b/python/cdec/sa/online.py
@@ -0,0 +1,128 @@
+from __future__ import division
+
+import collections
+import gzip
+import itertools
+
+from cdec.sa._sa import gzip_or_text
+
+# Same as Cython implementation.  Collisions with NULL in bitext?
+NULL_WORD = 'NULL'
+
+def learn_vocab(text_f):
+    vocab = set()
+    for line in gzip_or_text(text_f):
+        for word in line.strip().split():
+            vocab.add(word)
+    return vocab
+
+def write_vocab(vocab, out_f):
+    with gzip.open(out_f, 'wb') as out:
+        for word in sorted(vocab):
+            out.write('{}\n'.format(word))
+
+def read_vocab(in_f):
+    return set(line.strip() for line in gzip_or_text(in_f))
+
+class Bilex:
+
+    def __init__(self, in_f=None):
+        self.f = collections.defaultdict(int)
+        self.e = collections.defaultdict(int)
+        self.fe = collections.defaultdict(int)
+        if in_f:
+            self.read(in_f)
+
+    # Compatibility with Cython implementation
+    def get_score(self, f, e, dir):
+        if dir == 0:
+            p = self.p_fe(f, e)
+        elif dir == 1:
+            p = self.p_ef(e, f)
+        return p
+
+    def p_fe(self, f, e):
+        denom = self.f.get(f, None)
+        if not denom:
+            return None
+        num = self.fe.get((f, e), None)
+        if not num:
+            return None
+        return num / denom
+
+    def p_ef(self, e, f):
+        denom = self.e.get(e, None)
+        if not denom:
+            return None
+        num = self.fe.get((f, e), None)
+        if not num:
+            return None
+        return num / denom
+
+    # Update counts from aligned sentence
+    def update(self, f_words, e_words, links):
+        covered_f = set()
+        covered_e = set()
+        for (i, j) in links:
+            covered_f.add(i)
+            covered_e.add(j)
+            self.f[f_words[i]] += 1
+            self.e[e_words[j]] += 1
+            self.fe[(f_words[i], e_words[j])] += 1
+        # e being covered corresponds to f->e
+        for j in range(len(e_words)):
+            if j not in covered_e:
+                self.f[NULL_WORD] += 1
+                self.e[e_words[j]] += 1
+                self.fe[(NULL_WORD, e_words[j])] += 1
+        # e->f
+        for i in range(len(f_words)):
+            if i not in covered_f:
+                self.f[f_words[i]] += 1
+                self.e[NULL_WORD] += 1
+                self.fe[(f_words[i], NULL_WORD)] += 1
+
+    # Update counts from aligned bitext
+    def add_bitext(self, alignment_f, text_f, target_text_f=None):
+        # Allow one or two args for bitext
+        if target_text_f:
+            t = itertools.izip((line.strip() for line in gzip_or_text(text_f)), (line.strip() for line in gzip_or_text(target_text_f)))
+        else:
+            t = (line.strip().split(' ||| ') for line in gzip_or_text(text_f))
+        a = (line.strip() for line in gzip_or_text(alignment_f))
+        for (source, target) in t:
+            links = sorted(tuple(int(link) for link in link_str.split('-')) for link_str in a.next().split())
+            self.update(source.split(), target.split(), links)
+
+    def write(self, out_f):
+        with gzip.open(out_f, 'wb') as out:
+            for f in sorted(self.f):
+                out.write('{} {}\n'.format(f, self.f[f]))
+            out.write('\n')
+            for e in sorted(self.e):
+                out.write('{} {}\n'.format(e, self.e[e]))
+            out.write('\n')
+            for (f, e) in sorted(self.fe):
+                out.write('{} {} {}\n'.format(f, e, self.fe[(f, e)]))
+            out.write('\n')
+
+    def read(self, in_f):
+        with gzip_or_text(in_f) as inp:
+            while True:
+                line = inp.readline().strip()
+                if not line:
+                    break
+                (w, c) = line.split()
+                self.f[w] = int(c)
+            while True:
+                line = inp.readline().strip()
+                if not line:
+                    break
+                (w, c) = line.split()
+                self.e[w] = int(c)
+            while True:
+                line = inp.readline().strip()
+                if not line:
+                    break
+                (f, e, c) = line.split()
+                self.fe[(f, e)] = float(c)
author	Patrick Simianer <p@simianer.de>	2014-03-16 17:48:48 +0100
committer	Patrick Simianer <p@simianer.de>	2014-03-16 17:48:48 +0100
commit	5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (patch)
tree	f1401c1fd3eeae8671e59baf0d2169d1eb721cb7 /python/cdec/sa/online.py
parent	3eedf96b5a08b3e3414888d328c505814b84d8db (diff)
parent	cc87bfed0697583b7c11243913254dde3c0047d4 (diff)