author    Michael Denkowski <mdenkows@cs.cmu.edu>    2014-03-06 15:35:10 -0800
committer Michael Denkowski <mdenkows@cs.cmu.edu>    2014-03-06 15:35:10 -0800
commit    a49f3a5b19547e7e46a652b22fab601da8fc210f (patch)
tree      e601ada49b2751344d14175b005182ca842a29cf /python/cdec/sa/online.py
parent    18aa808143ab06da361a557350f6b3dd964717ce (diff)
Compile count-based bilex table for online grammar extraction.
Diffstat (limited to 'python/cdec/sa/online.py')
-rw-r--r--  python/cdec/sa/online.py  |  95
1 file changed, 95 insertions, 0 deletions
diff --git a/python/cdec/sa/online.py b/python/cdec/sa/online.py
new file mode 100644
index 00000000..92f6eae2
--- /dev/null
+++ b/python/cdec/sa/online.py
@@ -0,0 +1,95 @@
+from __future__ import division
+
+import collections
+import gzip
+import itertools
+
+from cdec.sa._sa import gzip_or_text
+
+# Same as Cython implementation. Collisions with NULL in bitext?
+NULL_WORD = 'NULL'
+
+def learn_vocab(text_f):
+ vocab = set()
+ for line in gzip_or_text(text_f):
+ for word in line.strip().split():
+ vocab.add(word)
+ return vocab
+
+def write_vocab(vocab, out_f):
+ with gzip.open(out_f, 'wb') as out:
+ for word in sorted(vocab):
+ out.write('{}\n'.format(word))
+
+def read_vocab(in_f):
+ return set(line.strip() for line in gzip_or_text(in_f))
+
+class Bilex:
+
+ def __init__(self, in_f=None, alignment_f=None, text_f=None, target_text_f=None):
+ self.f = collections.defaultdict(int)
+ self.e = collections.defaultdict(int)
+ self.fe = collections.defaultdict(lambda: collections.defaultdict(int))
+ self.ef = collections.defaultdict(lambda: collections.defaultdict(int))
+
+ # Read from file
+ if in_f:
+ self.read(in_f)
+ # Build from aligned bitext
+ elif alignment_f:
+ # Allow one or two args for bitext
+ if target_text_f:
+ t = itertools.izip((line.strip() for line in gzip_or_text(text_f)), (line.strip() for line in gzip_or_text(target_text_f)))
+ else:
+ t = (line.strip().split(' ||| ') for line in gzip_or_text(text_f))
+ a = (line.strip() for line in gzip_or_text(alignment_f))
+ for (source, target) in t:
+ links = sorted(tuple(int(link) for link in link_str.split('-')) for link_str in a.next().split())
+ self.update(source.split(), target.split(), links)
+
+ # Add bilex counts from new aligned sentence pair
+ def update(self, f_words, e_words, links):
+ aligned_fe = [list() for _ in range(len(f_words))]
+ aligned_ef = [list() for _ in range(len(e_words))]
+ for (i, j) in links:
+ aligned_fe[i].append(j)
+ aligned_ef[j].append(i)
+ for f_i in range(len(f_words)):
+ self.f[f_words[f_i]] += 1
+ e_i_aligned = aligned_fe[f_i]
+ lc = len(e_i_aligned)
+ if lc > 0:
+ for e_i in e_i_aligned:
+ self.fe[f_words[f_i]][e_words[e_i]] += (1 / lc)
+ else:
+ self.fe[f_words[f_i]][NULL_WORD] += 1
+ for e_i in range(len(e_words)):
+ self.e[e_words[e_i]] += 1
+ f_i_aligned = aligned_ef[e_i]
+ lc = len(f_i_aligned)
+ if lc > 0:
+ for f_i in f_i_aligned:
+ self.ef[e_words[e_i]][f_words[f_i]] += (1 / lc)
+ else:
+ self.ef[e_words[e_i]][NULL_WORD] += 1
+
+ def write(self, out_f):
+ fv = sorted(self.f)
+ ev = sorted(self.e)
+ with gzip.open(out_f, 'wb') as out:
+ for f in fv:
+ out.write('{} {}\n'.format(f, self.f[f]))
+ out.write('\n')
+ for e in ev:
+ out.write('{} {}\n'.format(e, self.e[e]))
+ out.write('\n')
+ for f in fv:
+ for (e, c) in sorted(self.fe[f].iteritems()):
+ out.write('{} {} {}\n'.format(f, e, c))
+ for e in ev:
+ for (f, c) in sorted(self.ef[e].iteritems()):
+ out.write('{} {} {}\n'.format(e, f, c))
+
+ def read(self, in_f):
+ with gzip_or_text(in_f) as inp:
+ pass