diff options
author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2014-03-06 15:35:10 -0800 |
---|---|---|
committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2014-03-06 15:35:10 -0800 |
commit | a49f3a5b19547e7e46a652b22fab601da8fc210f (patch) | |
tree | e601ada49b2751344d14175b005182ca842a29cf /python/cdec/sa/online.py | |
parent | 18aa808143ab06da361a557350f6b3dd964717ce (diff) |
Compile count-based bilex table for online grammar extraction.
Diffstat (limited to 'python/cdec/sa/online.py')
-rw-r--r-- | python/cdec/sa/online.py | 95 |
1 files changed, 95 insertions, 0 deletions
"""Vocabulary helpers and a count-based bilexical table (python/cdec/sa/online.py).

The table is compiled from a word-aligned bitext for online grammar extraction.
This module targets Python 2 (itertools.izip, dict.iteritems).
"""

from __future__ import division

import collections
import gzip
import itertools

from cdec.sa._sa import gzip_or_text

# Same as Cython implementation.
# NOTE(review): a literal 'NULL' token in the bitext would be indistinguishable
# from this unaligned-word marker -- confirm whether that can happen.
NULL_WORD = 'NULL'


def learn_vocab(text_f):
    """Return the set of whitespace-delimited tokens that occur in text_f."""
    vocab = set()
    for line in gzip_or_text(text_f):
        vocab.update(line.split())
    return vocab


def write_vocab(vocab, out_f):
    """Write vocab to out_f, gzip-compressed, one word per line in sorted order."""
    with gzip.open(out_f, 'wb') as out:
        for word in sorted(vocab):
            out.write('{}\n'.format(word))


def read_vocab(in_f):
    """Return the set of stripped lines of in_f (one vocabulary word per line)."""
    return set(line.strip() for line in gzip_or_text(in_f))


class Bilex(object):
    """Count-based bilexical table.

    Attributes (all are defaultdicts):
        f:  source word -> number of occurrences.
        e:  target word -> number of occurrences.
        fe: source word -> {target word or NULL_WORD -> accumulated count}.
        ef: target word -> {source word or NULL_WORD -> accumulated count}.
    """

    def __init__(self, in_f=None, alignment_f=None, text_f=None, target_text_f=None):
        """Create a table, optionally loading it or compiling it from a bitext.

        Args:
            in_f: file in the format produced by write(); when given, the
                bitext arguments are ignored.
            alignment_f: alignment file with one line per sentence pair; each
                line is a whitespace-separated list of 'i-j' links, where i
                indexes the source words and j the target words.
            text_f: source side of the bitext when target_text_f is given,
                otherwise a file whose lines are 'source ||| target'.
            target_text_f: optional target side of the bitext.

        Raises:
            ValueError: if alignment_f has fewer lines than the bitext.
        """
        self.f = collections.defaultdict(int)
        self.e = collections.defaultdict(int)
        self.fe = collections.defaultdict(lambda: collections.defaultdict(int))
        self.ef = collections.defaultdict(lambda: collections.defaultdict(int))

        # Read from file
        if in_f:
            self.read(in_f)
        # Build from aligned bitext
        elif alignment_f:
            # Allow one or two args for bitext
            if target_text_f:
                pairs = itertools.izip(
                    (line.strip() for line in gzip_or_text(text_f)),
                    (line.strip() for line in gzip_or_text(target_text_f)))
            else:
                pairs = (line.strip().split(' ||| ') for line in gzip_or_text(text_f))
            alignments = (line.strip() for line in gzip_or_text(alignment_f))
            for (source, target) in pairs:
                try:
                    alignment = next(alignments)
                except StopIteration:
                    # A bare StopIteration escaping from a constructor can
                    # silently terminate a caller's loop or generator.
                    raise ValueError(
                        'alignment file {} has fewer lines than the bitext'.format(alignment_f))
                links = sorted(tuple(int(index) for index in link.split('-'))
                               for link in alignment.split())
                self.update(source.split(), target.split(), links)

    def update(self, f_words, e_words, links):
        """Add bilex counts from a new aligned sentence pair.

        Args:
            f_words: source tokens.
            e_words: target tokens.
            links: iterable of (i, j) pairs linking f_words[i] to e_words[j].

        Raises:
            IndexError: if a link points outside either sentence. This is
                raised before any count is modified.
        """
        aligned_fe = [[] for _ in f_words]
        aligned_ef = [[] for _ in e_words]
        # Resolve every link first so that a bad index leaves the table untouched.
        for (i, j) in links:
            aligned_fe[i].append(j)
            aligned_ef[j].append(i)
        self._count(f_words, e_words, aligned_fe, self.f, self.fe)
        self._count(e_words, f_words, aligned_ef, self.e, self.ef)

    @staticmethod
    def _count(words, other_words, aligned, unigram, bilex):
        """Accumulate counts for one direction of a sentence pair.

        Each token adds 1 to its unigram count and a total of 1 to its bilex
        row: split evenly over the words it is aligned to, or assigned
        entirely to NULL_WORD when it is unaligned.
        """
        for (word, partners) in zip(words, aligned):
            unigram[word] += 1
            if partners:
                share = 1 / len(partners)  # true division (see __future__ import)
                for k in partners:
                    bilex[word][other_words[k]] += share
            else:
                bilex[word][NULL_WORD] += 1

    def write(self, out_f):
        """Write the table to out_f as gzip-compressed text.

        The file has four sections separated by single blank lines:
            1. 'f count'   for every source word, sorted.
            2. 'e count'   for every target word, sorted.
            3. 'f e count' for every fe entry, sorted by f then e.
            4. 'e f count' for every ef entry, sorted by e then f.

        Sections 3 and 4 have the same shape, so the blank line between them
        is what allows read() to tell them apart.
        """
        fv = sorted(self.f)
        ev = sorted(self.e)
        with gzip.open(out_f, 'wb') as out:
            for f in fv:
                out.write('{} {}\n'.format(f, self.f[f]))
            out.write('\n')
            for e in ev:
                out.write('{} {}\n'.format(e, self.e[e]))
            out.write('\n')
            for f in fv:
                for (e, c) in sorted(self.fe[f].iteritems()):
                    out.write('{} {} {}\n'.format(f, e, c))
            out.write('\n')
            for e in ev:
                for (f, c) in sorted(self.ef[e].iteritems()):
                    out.write('{} {} {}\n'.format(e, f, c))

    def read(self, in_f):
        """Load counts from a file in the four-section format written by write().

        Entries found in the file overwrite entries already in the table.
        A file lacking the blank line between the fe and ef sections cannot be
        split reliably: all of its triples would be loaded into fe.

        NOTE(review): write() always gzip-compresses its output; confirm that
        gzip_or_text() decompresses in_f for the file names used by callers.

        Raises:
            ValueError: if a line has the wrong number of fields, a count is
                not numeric, or data follows the fourth section.
        """
        section = 0
        with gzip_or_text(in_f) as inp:
            for line in inp:
                fields = line.split()
                if not fields:
                    # A blank line terminates the current section.
                    section += 1
                    continue
                if section == 0:
                    (f, c) = fields
                    self.f[f] = int(c)
                elif section == 1:
                    (e, c) = fields
                    self.e[e] = int(c)
                elif section == 2:
                    (f, e, c) = fields
                    self.fe[f][e] = float(c)
                elif section == 3:
                    (e, f, c) = fields
                    self.ef[e][f] = float(c)
                else:
                    raise ValueError(
                        'unexpected data after the last section of {}'.format(in_f))