diff options
Diffstat (limited to 'python/cdec/sa/features.py')
-rw-r--r-- | python/cdec/sa/features.py | 69 |
1 files changed, 10 insertions, 59 deletions
diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py index dcc60401..1779f2f9 100644 --- a/python/cdec/sa/features.py +++ b/python/cdec/sa/features.py @@ -44,60 +44,22 @@ def MaxLexEgivenF(ttable): def MaxLexEgivenF(ctx): fwords = ctx.fphrase.words fwords.append('NULL') - # Always use this for now - if not ctx.online or ctx.online: - maxOffScore = 0.0 - for e in ctx.ephrase.words: - maxScore = max(ttable.get_score(f, e, 0) for f in fwords) - maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return maxOffScore - else: - # For now, straight average - maxOffScore = 0.0 - maxOnScore = 0.0 - for e in ctx.ephrase.words: - maxScore = max(ttable.get_score(f, e, 0) for f in fwords) - maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE - for e in ctx.ephrase: - if not isvar(e): - maxScore = 0.0 - for f in ctx.fphrase: - if not isvar(f): - b_f = ctx.online.bilex_f.get(f, 0) - if b_f: - maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e)) - maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return (maxOffScore + maxOnScore) / 2 + maxOffScore = 0.0 + for e in ctx.ephrase.words: + maxScore = max(ttable.get_score(f, e, 0) for f in fwords) + maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return maxOffScore return MaxLexEgivenF def MaxLexFgivenE(ttable): def MaxLexFgivenE(ctx): ewords = ctx.ephrase.words ewords.append('NULL') - # Always use this for now - if not ctx.online or ctx.online: - maxOffScore = 0.0 - for f in ctx.fphrase.words: - maxScore = max(ttable.get_score(f, e, 1) for e in ewords) - maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return maxOffScore - else: - # For now, straight average - maxOffScore = 0.0 - maxOnScore = 0.0 - for f in ctx.fphrase.words: - maxScore = max(ttable.get_score(f, e, 1) for e in ewords) - maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE - for f in ctx.fphrase: - if not isvar(f): - maxScore = 0.0 - for e in ctx.ephrase: - if not isvar(e): - b_e = ctx.online.bilex_e.get(e, 0) - if b_e: - maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e ) - maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return (maxOffScore + maxOnScore) / 2 + maxOffScore = 0.0 + for f in ctx.fphrase.words: + maxScore = max(ttable.get_score(f, e, 1) for e in ewords) + maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return maxOffScore return MaxLexFgivenE def IsSingletonF(ctx): @@ -140,14 +102,3 @@ def IsSupportedOnline(ctx): # Occurs in online data? return (ctx.online.paircount > 0.01) else: return False - -def CountExceptLM(vocab): - def CountExceptLM(ctx): # Word count in bitext (inc online data) but NOT mono text - return sum(1 for e in ctx.ephrase.words if e not in vocab) - return CountExceptLM - -def CountExceptLex(ttable): - def CountExceptLex(ctx): # Word count in online data but NOT aligned in original bitext - # TODO: Check that online data actually contains aligned word when rulefactory TODO is addressed. - return sum(1 for e in ctx.ephrase.words if not ttable.contains_e_word(e)) - return CountExceptLex |