From 143afb1e6210ae6105426d47d7f225cbfa695753 Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Mon, 28 Jan 2013 14:21:22 -0500 Subject: Bilexical scores for online rules --- python/pkg/cdec/sa/__init__.py | 2 +- python/pkg/cdec/sa/extractor.py | 5 +++-- python/pkg/cdec/sa/features.py | 44 +++++++++++++++++++++++++++++++++-------- 3 files changed, 40 insertions(+), 11 deletions(-) (limited to 'python/pkg/cdec') diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py index 418531d9..14ba5ecb 100644 --- a/python/pkg/cdec/sa/__init__.py +++ b/python/pkg/cdec/sa/__init__.py @@ -1,5 +1,5 @@ from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\ - encode_words, decode_words,\ + encode_words, decode_words, isvar,\ SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ HieroCachingRuleFactory, Sampler, Scorer from cdec.sa.extractor import GrammarExtractor diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index bb552c49..cd3ab899 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -61,13 +61,14 @@ class GrammarExtractor: # TODO: clean this up extended_features = [] extended_features.append(IsSupportedOnline) - #if online: - # extended_features.append(IsSupportedOnline) + if online: + extended_features.append(IsSupportedOnline) # TODO: use @cdec.sa.features decorator for standard features too # + add a mask to disable features for f in cdec.sa._SA_FEATURES: extended_features.append(f) + scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, *extended_features) diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index 49064f73..a89499d4 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -1,6 +1,8 @@ from __future__ import division import math +from cdec.sa import isvar + MAXSCORE = 99 def EgivenF(ctx): # p(e|f) = c(e, f)/c(f) @@ -42,22 +44,48 @@ def MaxLexEgivenF(ttable): def MaxLexEgivenF(ctx): fwords = ctx.fphrase.words fwords.append('NULL') - def score(): + if not ctx.online: + maxOffScore = 0.0 + for e in ctx.ephrase.words: + maxScore = max(ttable.get_score(f, e, 0) for f in fwords) + maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return maxOffScore + else: + # For now, straight average + maxOffScore = 0.0 + maxOnScore = 0.0 for e in ctx.ephrase.words: - maxScore = max(ttable.get_score(f, e, 0) for f in fwords) - yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return sum(score()) + maxScore = max(ttable.get_score(f, e, 0) for f in fwords) + maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + for e in ctx.ephrase: + if not isvar(e): + maxScore = max((ctx.online.bilex_fe[f][e] / ctx.online.bilex_f[f]) for f in ctx.fphrase if not isvar(f)) + maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return (maxOffScore + maxOnScore) / 2 return MaxLexEgivenF def MaxLexFgivenE(ttable): def MaxLexFgivenE(ctx): ewords = ctx.ephrase.words ewords.append('NULL') - def score(): + if not ctx.online: + maxOffScore = 0.0 + for f in ctx.fphrase.words: + maxScore = max(ttable.get_score(f, e, 1) for e in ewords) + maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return maxOffScore + else: + # For now, straight average + maxOffScore = 0.0 + maxOnScore = 0.0 for f in ctx.fphrase.words: - maxScore = max(ttable.get_score(f, e, 1) for e in ewords) - yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return sum(score()) + maxScore = max(ttable.get_score(f, e, 1) for e in ewords) + maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + for f in ctx.fphrase: + if not isvar(f): + maxScore = max((ctx.online.bilex_fe[f][e] / ctx.online.bilex_e[e]) for e in ctx.ephrase if not isvar(e)) + maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE + return (maxOffScore + maxOnScore) / 2 return MaxLexFgivenE def IsSingletonF(ctx): -- cgit v1.2.3