summaryrefslogtreecommitdiff
path: root/python/pkg/cdec/sa/features.py
blob: a89499d4b2a6b89fd12915a494c64516ba06564e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
from __future__ import division
import math

from cdec.sa import isvar

# Fallback cost used by the features below in place of -log10(p) whenever the
# probability/score p is not strictly positive (where the logarithm is undefined).
MAXSCORE = 99

def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
    """Return -log10 of the pair count divided by the source count.

    When ctx.online is truthy, its paircount and fcount are added to the
    corresponding counts on ctx before the ratio is taken.
    """
    online = ctx.online
    if online:
        numerator = ctx.paircount + online.paircount
        denominator = ctx.fcount + online.fcount
    else:
        numerator = ctx.paircount
        denominator = ctx.fcount
    return -math.log10(numerator / denominator)

def CountEF(ctx): # c(e, f)
    """Return log10 of one plus the pair count.

    When ctx.online is truthy, ctx.online.paircount is added as well.
    """
    # Start from the add-one term so the summation order matches
    # (1 + offline) + online exactly.
    total = 1 + ctx.paircount
    if ctx.online:
        total = total + ctx.online.paircount
    return math.log10(total)

def SampleCountF(ctx): # sample c(f)
    """Return log10 of one plus the sampled source count.

    When ctx.online is truthy, ctx.online.fsample_count is added as well.
    """
    # Same summation order as (1 + offline) + online.
    total = 1 + ctx.fsample_count
    if ctx.online:
        total = total + ctx.online.fsample_count
    return math.log10(total)

def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
    """Return -log10 of the pair count divided by the sampled source count.

    When ctx.online is truthy, its paircount and fsample_count are added to
    the counts on ctx first. A ratio that is not strictly positive yields
    MAXSCORE instead of a logarithm.
    """
    online = ctx.online
    if online:
        ratio = (ctx.paircount + online.paircount) / (ctx.fsample_count + online.fsample_count)
    else:
        ratio = ctx.paircount / ctx.fsample_count
    if ratio > 0:
        return -math.log10(ratio)
    return MAXSCORE

def CoherenceProb(ctx): # c(f) / sample c(f)
    """Return -log10 of the source count divided by the sampled source count.

    When ctx.online is truthy, its fcount and fsample_count are added to the
    corresponding counts on ctx before the ratio is taken.
    """
    online = ctx.online
    if online:
        numerator = ctx.fcount + online.fcount
        denominator = ctx.fsample_count + online.fsample_count
    else:
        numerator = ctx.fcount
        denominator = ctx.fsample_count
    return -math.log10(numerator / denominator)

def MaxLexEgivenF(ttable):
    """Build a feature function that scores each target word by its best source match.

    ttable must provide get_score(f, e, 0). The trailing 0 presumably selects
    the p(e|f) direction of the table -- confirm against the table implementation.
    """
    def MaxLexEgivenF(ctx):
        """Sum, over target words, -log10 of the best score against any source word.

        The source side is extended with the literal 'NULL' token. A best score
        that is not strictly positive contributes MAXSCORE. When ctx.online is
        truthy the result is the plain average of this table-based score and a
        score computed from ctx.online.bilex_fe / ctx.online.bilex_f over the
        non-variable symbols of both phrases.
        """
        # Copy before appending: appending to the list returned by the phrase
        # would add 'NULL' to the phrase's own word list if that list is shared.
        fwords = list(ctx.fphrase.words)
        fwords.append('NULL')

        # The table-based score is needed with and without online data.
        maxOffScore = 0.0
        for e in ctx.ephrase.words:
            maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        if not ctx.online:
            return maxOffScore

        # For now, straight average
        maxOnScore = 0.0
        for e in ctx.ephrase:
            if not isvar(e):
                maxScore = max((ctx.online.bilex_fe[f][e] / ctx.online.bilex_f[f]) for f in ctx.fphrase if not isvar(f))
                maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return (maxOffScore + maxOnScore) / 2
    return MaxLexEgivenF

def MaxLexFgivenE(ttable):
    """Build a feature function that scores each source word by its best target match.

    ttable must provide get_score(f, e, 1). The trailing 1 presumably selects
    the p(f|e) direction of the table -- confirm against the table implementation.
    """
    def MaxLexFgivenE(ctx):
        """Sum, over source words, -log10 of the best score against any target word.

        The target side is extended with the literal 'NULL' token. A best score
        that is not strictly positive contributes MAXSCORE. When ctx.online is
        truthy the result is the plain average of this table-based score and a
        score computed from ctx.online.bilex_fe / ctx.online.bilex_e over the
        non-variable symbols of both phrases.
        """
        # Copy before appending: appending to the list returned by the phrase
        # would add 'NULL' to the phrase's own word list if that list is shared.
        ewords = list(ctx.ephrase.words)
        ewords.append('NULL')

        # The table-based score is needed with and without online data.
        maxOffScore = 0.0
        for f in ctx.fphrase.words:
            maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
            maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        if not ctx.online:
            return maxOffScore

        # For now, straight average
        maxOnScore = 0.0
        for f in ctx.fphrase:
            if not isvar(f):
                maxScore = max((ctx.online.bilex_fe[f][e] / ctx.online.bilex_e[e]) for e in ctx.ephrase if not isvar(e))
                maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
        return (maxOffScore + maxOnScore) / 2
    return MaxLexFgivenE

def IsSingletonF(ctx):
    """Return True when the source count (plus ctx.online.fcount, if online) equals 1."""
    total = ctx.fcount + ctx.online.fcount if ctx.online else ctx.fcount
    return total == 1

def IsSingletonFE(ctx):
    """Return True when the pair count (plus ctx.online.paircount, if online) equals 1."""
    total = ctx.paircount + ctx.online.paircount if ctx.online else ctx.paircount
    return total == 1

def IsNotSingletonF(ctx):
    """Return True when the source count (plus ctx.online.fcount, if online) exceeds 1."""
    total = ctx.fcount + ctx.online.fcount if ctx.online else ctx.fcount
    return total > 1

def IsNotSingletonFE(ctx):
    """Return True when the pair count exceeds 1.

    When ctx.online is truthy, ctx.online.paircount is added to ctx.paircount
    before the comparison, mirroring IsSingletonFE.
    """
    if not ctx.online:
        count = ctx.paircount
    else:
        count = ctx.paircount + ctx.online.paircount
    # Test the combined count; testing ctx.paircount alone ignored online data.
    return (count > 1)

def IsFEGreaterThanZero(ctx):
    """Return True when the pair count is greater than 0.01.

    When ctx.online is truthy, ctx.online.paircount is added to ctx.paircount
    before the comparison.
    """
    if not ctx.online:
        count = ctx.paircount
    else:
        count = ctx.paircount + ctx.online.paircount
    # Test the combined count; testing ctx.paircount alone ignored online data.
    return (count > 0.01)

def IsSupportedOnline(ctx): # Occurs in online data?
    """Return True when online data is present and its fcount is greater than 0.01."""
    if not ctx.online:
        return False
    return ctx.online.fcount > 0.01