1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
|
from __future__ import division
import math
from cdec.sa import isvar
# Cost substituted for -log10(p) by the features below when p is not positive
# (log10 is undefined there).
MAXSCORE = 99
def EgivenF(ctx):  # p(e|f) = c(e, f)/c(f)
    """Return -log10 of p(e|f) = c(e, f) / c(f).

    When ``ctx.online`` is truthy its ``paircount`` and ``fcount`` are added
    to the base counts before dividing.  A ratio that is not positive yields
    ``MAXSCORE`` rather than a math domain error, which is the same fallback
    ``EgivenFCoherent`` applies to the same numerator.
    """
    pair_total = ctx.paircount
    f_total = ctx.fcount
    if ctx.online:
        pair_total = pair_total + ctx.online.paircount
        f_total = f_total + ctx.online.fcount
    prob = pair_total / f_total
    # log10 is undefined for prob <= 0; fall back to the capped cost.
    return -math.log10(prob) if prob > 0 else MAXSCORE
def CountEF(ctx):  # c(e, f)
    """Return log10(1 + c(e, f)).

    The pair count from ``ctx.online`` is included when ``ctx.online`` is
    truthy.
    """
    total = 1 + ctx.paircount
    if ctx.online:
        total = total + ctx.online.paircount
    return math.log10(total)
def SampleCountF(ctx):  # sample c(f)
    """Return log10(1 + sample c(f)).

    The sample count from ``ctx.online`` is included when ``ctx.online`` is
    truthy.
    """
    online = ctx.online
    if online:
        total = 1 + ctx.fsample_count + online.fsample_count
    else:
        total = 1 + ctx.fsample_count
    return math.log10(total)
def EgivenFCoherent(ctx):  # c(e, f) / sample c(f)
    """Return -log10 of c(e, f) / sample c(f), or MAXSCORE if the ratio is not positive.

    Counts from ``ctx.online`` are added to numerator and denominator when
    ``ctx.online`` is truthy.
    """
    online = ctx.online
    if online:
        ratio = (ctx.paircount + online.paircount) / (ctx.fsample_count + online.fsample_count)
    else:
        ratio = ctx.paircount / ctx.fsample_count
    if ratio > 0:
        return -math.log10(ratio)
    return MAXSCORE
def CoherenceProb(ctx):  # c(f) / sample c(f)
    """Return -log10 of c(f) / sample c(f).

    Counts from ``ctx.online`` are added to numerator and denominator when
    ``ctx.online`` is truthy.
    """
    numerator = ctx.fcount
    denominator = ctx.fsample_count
    if ctx.online:
        numerator = numerator + ctx.online.fcount
        denominator = denominator + ctx.online.fsample_count
    return -math.log10(numerator / denominator)
def MaxLexEgivenF(ttable, use_online=False):
    """Build the MaxLexEgivenF feature backed by ``ttable``.

    The returned function takes a context and, for every word in
    ``ctx.ephrase.words``, takes the best ``ttable.get_score(f, e, 0)`` over
    the words of ``ctx.fphrase.words`` plus the token ``'NULL'``.  It sums
    ``-log10`` of those maxima, using ``MAXSCORE`` for a maximum that is not
    positive.

    ``use_online`` defaults to False, which reproduces the previous behaviour:
    the original condition was always true, so only the ttable-based score
    was ever returned.  When it is True and ``ctx.online`` is truthy, the
    result is the straight average of that score and one computed from the
    online ``bilex_f`` / ``bilex_fe`` tables.
    """
    def MaxLexEgivenF(ctx):
        # Copy before adding NULL so the list handed out by ctx.fphrase.words
        # is never modified for other features reading the same context.
        fwords = list(ctx.fphrase.words) + ['NULL']
        maxOffScore = 0.0
        for e in ctx.ephrase.words:
            maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        if not (use_online and ctx.online):
            return maxOffScore
        # For now, straight average of the ttable score and the online score.
        # NOTE(review): this path was unreachable before; confirm the key types
        # of bilex_f / bilex_fe against the producer before enabling it.
        maxOnScore = 0.0
        for e in ctx.ephrase:
            if not isvar(e):
                maxScore = 0.0
                for f in ctx.fphrase:
                    if not isvar(f):
                        b_f = ctx.online.bilex_f.get(f, 0)
                        if b_f:
                            # Normalise the pair count by the count of f,
                            # mirroring the "/ b_e" in MaxLexFgivenE.
                            maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_f)
                maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return (maxOffScore + maxOnScore) / 2
    return MaxLexEgivenF
def MaxLexFgivenE(ttable, use_online=False):
    """Build the MaxLexFgivenE feature backed by ``ttable``.

    The returned function takes a context and, for every word in
    ``ctx.fphrase.words``, takes the best ``ttable.get_score(f, e, 1)`` over
    the words of ``ctx.ephrase.words`` plus the token ``'NULL'``.  It sums
    ``-log10`` of those maxima, using ``MAXSCORE`` for a maximum that is not
    positive.

    ``use_online`` defaults to False, which reproduces the previous behaviour:
    the original condition was always true, so only the ttable-based score
    was ever returned.  When it is True and ``ctx.online`` is truthy, the
    result is the straight average of that score and one computed from the
    online ``bilex_e`` / ``bilex_fe`` tables.
    """
    def MaxLexFgivenE(ctx):
        # Copy before adding NULL so the list handed out by ctx.ephrase.words
        # is never modified for other features reading the same context.
        ewords = list(ctx.ephrase.words) + ['NULL']
        maxOffScore = 0.0
        for f in ctx.fphrase.words:
            maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        if not (use_online and ctx.online):
            return maxOffScore
        # For now, straight average of the ttable score and the online score.
        # NOTE(review): this path was unreachable before; confirm the key types
        # of bilex_e / bilex_fe against the producer before enabling it.
        maxOnScore = 0.0
        for f in ctx.fphrase:
            if not isvar(f):
                maxScore = 0.0
                for e in ctx.ephrase:
                    if not isvar(e):
                        b_e = ctx.online.bilex_e.get(e, 0)
                        if b_e:
                            maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e)
                maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return (maxOffScore + maxOnScore) / 2
    return MaxLexFgivenE
def IsSingletonF(ctx):
    """Return True when c(f) is within 1e-6 of exactly one.

    The count from ``ctx.online`` is added when ``ctx.online`` is truthy.
    """
    total = ctx.fcount
    if ctx.online:
        total = total + ctx.online.fcount
    # Tolerance comparison rather than equality on the summed count.
    return math.fabs(total - 1) < 1e-6
def IsSingletonFE(ctx):
    """Return True when c(e, f) equals one.

    The pair count from ``ctx.online`` is added when ``ctx.online`` is truthy.
    """
    online = ctx.online
    total = ctx.paircount + online.paircount if online else ctx.paircount
    return total == 1
def IsNotSingletonF(ctx):
    """Return True when c(f) is greater than one.

    The count from ``ctx.online`` is added when ``ctx.online`` is truthy.
    """
    online = ctx.online
    if online:
        return (ctx.fcount + online.fcount) > 1
    return ctx.fcount > 1
def IsNotSingletonFE(ctx):
    """Return True when c(e, f) is greater than one.

    The pair count from ``ctx.online`` is added when ``ctx.online`` is truthy.
    The combined count is what gets tested, so a pair seen once in the base
    data and once online is reported as a non-singleton, in agreement with
    ``IsSingletonFE`` and ``IsNotSingletonF``.
    """
    count = ctx.paircount
    if ctx.online:
        count = count + ctx.online.paircount
    # Previously this compared ctx.paircount and discarded the combined count.
    return count > 1
def IsFEGreaterThanZero(ctx):
    """Return True when c(e, f) exceeds 0.01.

    The pair count from ``ctx.online`` is added when ``ctx.online`` is truthy,
    and the combined count is what gets tested.
    """
    count = ctx.paircount
    if ctx.online:
        count = count + ctx.online.paircount
    # Previously this compared ctx.paircount and discarded the combined count.
    return count > 0.01
def IsSupportedOnline(ctx):  # Occurs in online data?
    """Return True when ``ctx.online`` is truthy and its pair count exceeds 0.01."""
    online = ctx.online
    if not online:
        return False
    return online.paircount > 0.01
def CountExceptLM(vocab):
    """Build a feature counting the words of ``ctx.ephrase.words`` absent from ``vocab``."""
    def CountExceptLM(ctx):  # Word count in bitext (inc online data) but NOT mono text
        missing = [e for e in ctx.ephrase.words if e not in vocab]
        return len(missing)
    return CountExceptLM
def CountExceptLex(ttable):
    """Build a feature counting the words of ``ctx.ephrase.words`` that ``ttable.contains_e_word`` rejects."""
    def CountExceptLex(ctx):  # Word count in online data but NOT aligned in original bitext
        # TODO: Check that online data actually contains aligned word when rulefactory TODO is addressed.
        total = 0
        for e in ctx.ephrase.words:
            if not ttable.contains_e_word(e):
                total += 1
        return total
    return CountExceptLex
|