python/cdec/sa/features.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

from __future__ import division
import math

from cdec.sa import isvar

# Score that the features in this module substitute for -log10(p) whenever p
# is not positive, so the logarithm is never taken of zero.
MAXSCORE = 99

def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
    """Return -log10 of the pair count divided by the source phrase count.

    When ``ctx.online`` is truthy, its ``paircount`` and ``fcount`` are added
    to the matching counts on ``ctx`` before the division.
    """
    online = ctx.online
    numerator = ctx.paircount
    denominator = ctx.fcount
    if online:
        numerator = numerator + online.paircount
        denominator = denominator + online.fcount
    return -math.log10(numerator / denominator)

def CountEF(ctx): # c(e, f)
    """Return log10 of one plus the pair count.

    The online pair count is added as well when ``ctx.online`` is truthy.
    """
    total = 1 + ctx.paircount
    online = ctx.online
    if online:
        total = total + online.paircount
    return math.log10(total)

def SampleCountF(ctx): # sample c(f)
    """Return log10 of one plus the sampled source phrase count.

    The online sample count is added as well when ``ctx.online`` is truthy.
    """
    online = ctx.online
    if online:
        return math.log10(1 + ctx.fsample_count + online.fsample_count)
    return math.log10(1 + ctx.fsample_count)

def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
    """Return -log10 of the pair count divided by the sampled source count.

    Online counts are added to both terms when ``ctx.online`` is truthy.
    A ratio that is not positive yields ``MAXSCORE`` instead of a logarithm.
    """
    online = ctx.online
    if online:
        pairs = ctx.paircount + online.paircount
        sampled = ctx.fsample_count + online.fsample_count
    else:
        pairs = ctx.paircount
        sampled = ctx.fsample_count
    ratio = pairs / sampled
    if ratio > 0:
        return -math.log10(ratio)
    return MAXSCORE

def CoherenceProb(ctx): # c(f) / sample c(f)
    """Return -log10 of the source phrase count over its sampled count.

    Online counts are added to both terms when ``ctx.online`` is truthy.
    """
    matched = ctx.fcount
    sampled = ctx.fsample_count
    online = ctx.online
    if online:
        matched = matched + online.fcount
        sampled = sampled + online.fsample_count
    return -math.log10(matched / sampled)

def MaxLexEgivenF(ttable):
    """Build the max-lexical e-given-f feature bound to *ttable*.

    The returned callable takes a feature context. For every target word in
    ``ctx.ephrase.words`` it picks the largest ``ttable.get_score(f, e, 0)``
    over the words of ``ctx.fphrase.words`` plus the ``'NULL'`` token, and
    returns the sum of ``-log10`` of those maxima. A target word whose best
    score is not positive contributes ``MAXSCORE`` instead.
    """
    def MaxLexEgivenF(ctx):
        # Work on a copy: appending 'NULL' directly onto the list returned by
        # ctx.fphrase.words would grow that list on every call whenever the
        # phrase object hands out the same list object each time.
        fwords = list(ctx.fphrase.words)
        fwords.append('NULL')

        maxOffScore = 0.0
        for e in ctx.ephrase.words:
            maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE

        # Always use this for now: the condition is deliberately true for any
        # ctx, so the online blend below is currently switched off.
        if not ctx.online or ctx.online:
            return maxOffScore

        # For now, straight average
        maxOnScore = 0.0
        for e in ctx.ephrase:
            if not isvar(e):
                maxScore = 0.0
                for f in ctx.fphrase:
                    if not isvar(f):
                        b_f = ctx.online.bilex_f.get(f, 0)
                        if b_f:
                            # Default a missing pair to 0 and normalise by the
                            # online count of f, matching the estimate that
                            # MaxLexFgivenE forms with bilex_e.
                            pair = ctx.online.bilex_fe.get(f, {}).get(e, 0)
                            maxScore = max(maxScore, pair / b_f)
                maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return (maxOffScore + maxOnScore) / 2
    return MaxLexEgivenF

def MaxLexFgivenE(ttable):
    """Build the max-lexical f-given-e feature bound to *ttable*.

    The returned callable takes a feature context. For every source word in
    ``ctx.fphrase.words`` it picks the largest ``ttable.get_score(f, e, 1)``
    over the words of ``ctx.ephrase.words`` plus the ``'NULL'`` token, and
    returns the sum of ``-log10`` of those maxima. A source word whose best
    score is not positive contributes ``MAXSCORE`` instead.
    """
    def MaxLexFgivenE(ctx):
        # Work on a copy: appending 'NULL' directly onto the list returned by
        # ctx.ephrase.words would grow that list on every call whenever the
        # phrase object hands out the same list object each time.
        ewords = list(ctx.ephrase.words)
        ewords.append('NULL')

        maxOffScore = 0.0
        for f in ctx.fphrase.words:
            maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE

        # Always use this for now: the condition is deliberately true for any
        # ctx, so the online blend below is currently switched off.
        if not ctx.online or ctx.online:
            return maxOffScore

        # For now, straight average
        maxOnScore = 0.0
        for f in ctx.fphrase:
            if not isvar(f):
                maxScore = 0.0
                for e in ctx.ephrase:
                    if not isvar(e):
                        b_e = ctx.online.bilex_e.get(e, 0)
                        if b_e:
                            maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e)
                maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return (maxOffScore + maxOnScore) / 2
    return MaxLexFgivenE

def IsSingletonF(ctx):
    """Tell whether the source phrase count is 1, within a 1e-6 tolerance.

    The online ``fcount`` is added first when ``ctx.online`` is truthy.
    """
    online = ctx.online
    total = ctx.fcount + online.fcount if online else ctx.fcount
    return math.fabs(total - 1) < 1e-6

def IsSingletonFE(ctx):
    """Tell whether the pair count equals exactly 1.

    The online ``paircount`` is added first when ``ctx.online`` is truthy.
    """
    if ctx.online:
        return (ctx.paircount + ctx.online.paircount) == 1
    return ctx.paircount == 1

def IsNotSingletonF(ctx):
    """Tell whether the source phrase count is greater than 1.

    The online ``fcount`` is added first when ``ctx.online`` is truthy.
    """
    total = ctx.fcount
    if ctx.online:
        total = total + ctx.online.fcount
    return total > 1

def IsNotSingletonFE(ctx):
    """Tell whether the pair count is greater than 1.

    The online ``paircount`` is added to ``ctx.paircount`` when
    ``ctx.online`` is truthy, and the comparison is made on that combined
    total, in line with IsSingletonFE and IsNotSingletonF.
    """
    if not ctx.online:
        count = ctx.paircount
    else:
        count = ctx.paircount + ctx.online.paircount
    # Compare the combined total, not ctx.paircount alone, so that pairs seen
    # in the online data are counted too.
    return (count > 1)

def IsFEGreaterThanZero(ctx):
    """Tell whether the pair count exceeds 0.01.

    The online ``paircount`` is added to ``ctx.paircount`` when
    ``ctx.online`` is truthy, and the comparison is made on that combined
    total.
    """
    if not ctx.online:
        count = ctx.paircount
    else:
        count = ctx.paircount + ctx.online.paircount
    # Compare the combined total, not ctx.paircount alone, so that a pair seen
    # only in the online data still counts as present.
    return (count > 0.01)

def IsSupportedOnline(ctx): # Occurs in online data?
    """Tell whether the online pair count exceeds 0.01.

    Returns False when ``ctx.online`` is falsy.
    """
    if not ctx.online:
        return False
    return ctx.online.paircount > 0.01

def CountExceptLM(vocab):
    """Build a feature counting target words that are not in *vocab*.

    The returned callable counts the entries of ``ctx.ephrase.words`` for
    which ``word not in vocab`` holds.
    """
    def CountExceptLM(ctx): # Word count in bitext (inc online data) but NOT mono text
        missing = 0
        for word in ctx.ephrase.words:
            if word not in vocab:
                missing += 1
        return missing
    return CountExceptLM

def CountExceptLex(ttable):
    """Build a feature counting target words that *ttable* does not contain.

    The returned callable counts the entries of ``ctx.ephrase.words`` for
    which ``ttable.contains_e_word(word)`` is falsy.
    """
    def CountExceptLex(ctx): # Word count in online data but NOT aligned in original bitext
        # TODO: Check that online data actually contains aligned word when rulefactory TODO is addressed.
        unknown = [word for word in ctx.ephrase.words
                   if not ttable.contains_e_word(word)]
        return len(unknown)
    return CountExceptLex