summaryrefslogtreecommitdiff
path: root/python/cdec/sa/features.py
blob: 8d35d8e6b5b4bcbb716341ee36aa5c1b27314479 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from __future__ import division
import math
import cdec.sa

# Score returned by the log-probability features in this module in place of
# -log10(p) when p is not positive (log10 is undefined there).
MAXSCORE = 99

def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
    """Return -log10(paircount / fcount), capped at MAXSCORE.

    paircount and fcount are presumably the joint count of the phrase pair
    and the count of the source phrase -- confirm against the caller.
    fphrase, ephrase and fsample_count are accepted only so that every
    feature shares the same signature; they are not used here.

    Returns MAXSCORE when the ratio is not positive instead of letting
    math.log10 raise ValueError, matching EgivenFCoherent.
    Raises ZeroDivisionError if fcount is zero.
    """
    prob = paircount/fcount
    # log10 is undefined for non-positive values; fall back to the cap.
    return -math.log10(prob) if prob > 0 else MAXSCORE

def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return log10(1 + paircount).

    Adding one keeps the logarithm defined when paircount is zero.
    The other arguments are unused; they are part of the signature shared
    by all feature functions in this module.
    """
    smoothed = paircount + 1
    return math.log10(smoothed)

def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return log10(1 + fsample_count).

    Adding one keeps the logarithm defined when fsample_count is zero.
    The other arguments are unused; they are part of the signature shared
    by all feature functions in this module.
    """
    smoothed = fsample_count + 1
    return math.log10(smoothed)

def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return -log10(paircount / fsample_count), or MAXSCORE if not positive.

    fphrase, ephrase and fcount are unused; they are part of the signature
    shared by all feature functions in this module.
    Raises ZeroDivisionError if fsample_count is zero.
    """
    ratio = paircount/fsample_count
    if ratio > 0:
        return -math.log10(ratio)
    # log10 is undefined for a non-positive ratio; use the cap instead.
    return MAXSCORE

def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return -log10(fcount / fsample_count).

    fphrase, ephrase and paircount are unused; they are part of the
    signature shared by all feature functions in this module.
    Raises ZeroDivisionError if fsample_count is zero and ValueError if the
    ratio is not positive (no MAXSCORE cap is applied here).
    """
    coherence = fcount/fsample_count
    return -math.log10(coherence)

def MaxLexEgivenF(ttable):
    """Build a feature that sums, per target word, its best lexical score.

    ttable must provide get_score(f, e, 0); the third argument is presumably
    a direction selector for p(e|f) -- confirm against the table class.

    The returned function has the signature shared by all features in this
    module. For every non-variable symbol of ephrase it takes the maximum
    table score over the non-variable symbols of fphrase plus the literal
    'NULL' word, and adds -log10 of that maximum (or MAXSCORE when the
    maximum is not positive). An ephrase with no such symbols yields 0.
    """
    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
        # Source-side candidates: every terminal plus the 'NULL' word, so
        # the max below always has at least one candidate.
        sources = [cdec.sa.sym_tostring(sym) for sym in fphrase
                   if not cdec.sa.sym_isvar(sym)]
        sources.append('NULL')
        total = 0
        for sym in ephrase:
            if cdec.sa.sym_isvar(sym):
                continue
            target = cdec.sa.sym_tostring(sym)
            best = max(ttable.get_score(source, target, 0) for source in sources)
            total += -math.log10(best) if best > 0 else MAXSCORE
        return total
    return feature

def MaxLexFgivenE(ttable):
    """Build a feature that sums, per source word, its best lexical score.

    ttable must provide get_score(f, e, 1); the third argument is presumably
    a direction selector for p(f|e) -- confirm against the table class.

    The returned function has the signature shared by all features in this
    module. For every non-variable symbol of fphrase it takes the maximum
    table score over the non-variable symbols of ephrase plus the literal
    'NULL' word, and adds -log10 of that maximum (or MAXSCORE when the
    maximum is not positive). An fphrase with no such symbols yields 0.
    """
    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
        # Target-side candidates: every terminal plus the 'NULL' word, so
        # the max below always has at least one candidate.
        targets = [cdec.sa.sym_tostring(sym) for sym in ephrase
                   if not cdec.sa.sym_isvar(sym)]
        targets.append('NULL')
        total = 0
        for sym in fphrase:
            if cdec.sa.sym_isvar(sym):
                continue
            source = cdec.sa.sym_tostring(sym)
            best = max(ttable.get_score(source, target, 1) for target in targets)
            total += -math.log10(best) if best > 0 else MAXSCORE
        return total
    return feature

def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when fcount equals 1, False otherwise.

    Only fcount is read; the other arguments belong to the signature shared
    by all feature functions in this module.
    """
    seen_once = fcount == 1
    return seen_once

def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when paircount equals 1, False otherwise.

    Only paircount is read; the other arguments belong to the signature
    shared by all feature functions in this module.
    """
    seen_once = paircount == 1
    return seen_once

def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when fcount is greater than 1, False otherwise.

    Only fcount is read; the other arguments belong to the signature shared
    by all feature functions in this module.
    """
    seen_repeatedly = fcount > 1
    return seen_repeatedly

def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when paircount is greater than 1, False otherwise.

    Only paircount is read; the other arguments belong to the signature
    shared by all feature functions in this module.
    """
    seen_repeatedly = paircount > 1
    return seen_repeatedly

def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
    """Return True when paircount exceeds 0.01, False otherwise.

    Only paircount is read; the other arguments belong to the signature
    shared by all feature functions in this module.
    """
    # NOTE(review): the threshold is 0.01 rather than 0 -- presumably to
    # tolerate fractional or rounded counts; confirm with the caller.
    above_threshold = paircount > 0.01
    return above_threshold