summaryrefslogtreecommitdiff
path: root/python/cdec/sa/online.py
diff options
context:
space:
mode:
authorMichael Denkowski <mdenkows@cs.cmu.edu>2014-03-07 01:52:09 -0800
committerMichael Denkowski <mdenkows@cs.cmu.edu>2014-03-07 01:52:09 -0800
commitfcf92357806816bb65dade6f761fa08448f88f71 (patch)
tree681aa2f4319ebfe1bf9e1d6a748e504cfae72790 /python/cdec/sa/online.py
parent256070339f557e295c1e90e9a451c82731d5c223 (diff)
More online bilex updates
Diffstat (limited to 'python/cdec/sa/online.py')
-rw-r--r--python/cdec/sa/online.py72
1 files changed, 56 insertions, 16 deletions
diff --git a/python/cdec/sa/online.py b/python/cdec/sa/online.py
index 92f6eae2..008793d8 100644
--- a/python/cdec/sa/online.py
+++ b/python/cdec/sa/online.py
@@ -26,28 +26,33 @@ def read_vocab(in_f):
class Bilex:
- def __init__(self, in_f=None, alignment_f=None, text_f=None, target_text_f=None):
+ def __init__(self, in_f=None):
self.f = collections.defaultdict(int)
self.e = collections.defaultdict(int)
self.fe = collections.defaultdict(lambda: collections.defaultdict(int))
self.ef = collections.defaultdict(lambda: collections.defaultdict(int))
-
- # Read from file
if in_f:
self.read(in_f)
- # Build from aligned bitext
- elif alignment_f:
- # Allow one or two args for bitext
- if target_text_f:
- t = itertools.izip((line.strip() for line in gzip_or_text(text_f)), (line.strip() for line in gzip_or_text(target_text_f)))
- else:
- t = (line.strip().split(' ||| ') for line in gzip_or_text(text_f))
- a = (line.strip() for line in gzip_or_text(alignment_f))
- for (source, target) in t:
- links = sorted(tuple(int(link) for link in link_str.split('-')) for link_str in a.next().split())
- self.update(source.split(), target.split(), links)
- # Add bilex counts from new aligned sentence pair
+ def get_score(self, f, e, dir):
+ if dir == 0:
+ return self.p_fe(f, e)
+ if dir == 1:
+ return self.p_ef(e, f)
+
+ def p_fe(self, f, e):
+ d = self.fe.get(f, None)
+ if not d:
+ return 0
+ return d.get(e, 0) / self.f.get(f)
+
+ def p_ef(self, e, f):
+ d = self.ef.get(e, None)
+ if not d:
+ return 0
+ return d.get(f, 0) / self.e.get(e)
+
+ # Update counts from aligned sentence
def update(self, f_words, e_words, links):
aligned_fe = [list() for _ in range(len(f_words))]
aligned_ef = [list() for _ in range(len(e_words))]
@@ -73,6 +78,18 @@ class Bilex:
else:
self.ef[e_words[e_i]][NULL_WORD] += 1
+ # Update counts from alignd bitext
+ def add_bitext(self, alignment_f, text_f, target_text_f=None):
+ # Allow one or two args for bitext
+ if target_text_f:
+ t = itertools.izip((line.strip() for line in gzip_or_text(text_f)), (line.strip() for line in gzip_or_text(target_text_f)))
+ else:
+ t = (line.strip().split(' ||| ') for line in gzip_or_text(text_f))
+ a = (line.strip() for line in gzip_or_text(alignment_f))
+ for (source, target) in t:
+ links = sorted(tuple(int(link) for link in link_str.split('-')) for link_str in a.next().split())
+ self.update(source.split(), target.split(), links)
+
def write(self, out_f):
fv = sorted(self.f)
ev = sorted(self.e)
@@ -92,4 +109,27 @@ class Bilex:
def read(self, in_f):
with gzip_or_text(in_f) as inp:
- pass
+ while True:
+ line = inp.readline().strip()
+ if not line:
+ break
+ (w, c) = line.split()
+ self.f[w] = int(c)
+ while True:
+ line = inp.readline().strip()
+ if not line:
+ break
+ (w, c) = line.split()
+ self.e[w] = int(c)
+ while True:
+ line = inp.readline().strip()
+ if not line:
+ break
+ (f, e, c) = line.split()
+ self.fe[f][e] = float(c)
+ while True:
+ line = inp.readline().strip()
+ if not line:
+ break
+ (e, f, c) = line.split()
+ self.ef[e][f] = float(c)