From ca3da3a815b6e85531d6ded07e7d6bec7852748c Mon Sep 17 00:00:00 2001 From: Michael Denkowski Date: Thu, 24 Jan 2013 16:55:21 -0500 Subject: Scored grammars from online extraction. Don't trust them yet. --- python/pkg/cdec/sa/extract.py | 6 ++-- python/pkg/cdec/sa/extractor.py | 15 ++++++++-- python/pkg/cdec/sa/features.py | 61 +++++++++++++++++++++++++++++++++++------ 3 files changed, 68 insertions(+), 14 deletions(-) (limited to 'python/pkg/cdec/sa') diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index 9fc37345..20eab9dd 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -12,10 +12,10 @@ extractor, prefix = None, None online = False def make_extractor(config, grammars, features): - global extractor, prefix + global extractor, prefix, online signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C load_features(features) - extractor = cdec.sa.GrammarExtractor(config) + extractor = cdec.sa.GrammarExtractor(config, online) prefix = grammars def load_features(features): @@ -53,7 +53,7 @@ def extract(inp): # Add training instance _after_ extracting grammars if online: extractor.add_instance(sentence, reference, alignment) - extractor.dump_online_stats() + #extractor.dump_online_stats() grammar_file = os.path.abspath(grammar_file) return ' {2} {3}'.format(grammar_file, i, sentence, suffix) diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index 62a251a7..5ef8041c 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -2,14 +2,15 @@ from itertools import chain import os, sys import cdec.configobj from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ - MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE + MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\ + IsSupportedOnline import cdec.sa # maximum span of a grammar rule in TEST DATA MAX_INITIAL_SIZE = 15 class GrammarExtractor: - def __init__(self, 
config, features=None): + def __init__(self, config, online=False, features=None): if isinstance(config, basestring): if not os.path.exists(config): raise IOError('cannot read configuration from {0}'.format(config)) @@ -57,11 +58,19 @@ class GrammarExtractor: # lexical weighting tables tt = cdec.sa.BiLex(from_binary=config['lex_file']) + # TODO: clean this up + extended_features = [] + #extended_features.append(IsSupportedOnline) + if online: + extended_features.append(IsSupportedOnline) + # TODO: use @cdec.sa.features decorator for standard features too # + add a mask to disable features + for f in cdec.sa._SA_FEATURES: + extended_features.append(f) scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, - *cdec.sa._SA_FEATURES) + *extended_features) fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) edarray = cdec.sa.DataArray(from_binary=config['e_file']) diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index a4ae23e8..cede5304 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -4,20 +4,39 @@ import math MAXSCORE = 99 def EgivenF(ctx): # p(e|f) = c(e, f)/c(f) - return -math.log10(ctx.paircount/ctx.fcount) + if not ctx.online: + prob = ctx.paircount/ctx.fcount + else: + prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount) + return -math.log10(prob) def CountEF(ctx): # c(e, f) - return math.log10(1 + ctx.paircount) + if not ctx.online: + count = 1 + ctx.paircount + else: + count = 1 + ctx.paircount + ctx.online.paircount + return math.log10(count) def SampleCountF(ctx): # sample c(f) - return math.log10(1 + ctx.fsample_count) + if not ctx.online: + count = 1 + ctx.fsample_count + else: + count = 1 + ctx.fsample_count + ctx.online.fcount + return math.log10(count) def EgivenFCoherent(ctx): # c(e, f) / sample c(f) - prob = ctx.paircount/ctx.fsample_count + if not ctx.online: + prob = 
def _with_online(ctx, offline_count, online_attr):
    """Return `offline_count`, plus `ctx.online.<online_attr>` when online stats exist.

    `ctx.online` is treated as "no online statistics" whenever it is falsy,
    which is the same test every feature below applied inline.
    """
    if not ctx.online:
        return offline_count
    return offline_count + getattr(ctx.online, online_attr)


def EgivenFCoherent(ctx):
    """Return -log10 of c(e, f) / sample c(f), or MAXSCORE if the ratio is not positive."""
    pairs = _with_online(ctx, ctx.paircount, 'paircount')
    # The online contribution to the denominator is online.fcount, not a
    # separate online sample count.
    samples = _with_online(ctx, ctx.fsample_count, 'fcount')
    prob = pairs / samples
    return -math.log10(prob) if prob > 0 else MAXSCORE


def CoherenceProb(ctx):
    """Return -log10 of c(f) / sample c(f)."""
    numerator = _with_online(ctx, ctx.fcount, 'fcount')
    denominator = _with_online(ctx, ctx.fsample_count, 'fcount')
    return -math.log10(numerator / denominator)


def IsSingletonF(ctx):
    """Return True when the combined (offline + online) c(f) is exactly 1."""
    return _with_online(ctx, ctx.fcount, 'fcount') == 1


def IsSingletonFE(ctx):
    """Return True when the combined (offline + online) c(e, f) is exactly 1."""
    return _with_online(ctx, ctx.paircount, 'paircount') == 1


def IsNotSingletonF(ctx):
    """Return True when the combined (offline + online) c(f) is greater than 1."""
    return _with_online(ctx, ctx.fcount, 'fcount') > 1


def IsNotSingletonFE(ctx):
    """Return True when the combined (offline + online) c(e, f) is greater than 1."""
    # Fix: the combined count used to be computed and then ignored; the
    # comparison was made against the offline ctx.paircount alone.
    return _with_online(ctx, ctx.paircount, 'paircount') > 1


def IsFEGreaterThanZero(ctx):
    """Return True when the combined (offline + online) c(e, f) exceeds 0.01."""
    # Fix: same defect as IsNotSingletonFE -- test the combined count, not
    # only the offline ctx.paircount.
    return _with_online(ctx, ctx.paircount, 'paircount') > 0.01


def IsSupportedOnline(ctx):
    """Return True when online statistics exist and their c(f) exceeds 0.01."""
    if not ctx.online:
        return False
    return ctx.online.fcount > 0.01