diff options
Diffstat (limited to 'python/pkg/cdec/sa')
| -rw-r--r-- | python/pkg/cdec/sa/__init__.py | 2 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/extract.py | 5 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/extractor.py | 19 | ||||
| -rw-r--r-- | python/pkg/cdec/sa/features.py | 117 | 
4 files changed, 119 insertions, 24 deletions
| diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py index 418531d9..14ba5ecb 100644 --- a/python/pkg/cdec/sa/__init__.py +++ b/python/pkg/cdec/sa/__init__.py @@ -1,5 +1,5 @@  from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\ -        encode_words, decode_words,\ +        encode_words, decode_words, isvar,\          SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\          HieroCachingRuleFactory, Sampler, Scorer  from cdec.sa.extractor import GrammarExtractor diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index 9fc37345..2e596bd3 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -12,10 +12,10 @@ extractor, prefix = None, None  online = False  def make_extractor(config, grammars, features): -    global extractor, prefix +    global extractor, prefix, online      signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C      load_features(features) -    extractor = cdec.sa.GrammarExtractor(config) +    extractor = cdec.sa.GrammarExtractor(config, online)      prefix = grammars  def load_features(features): @@ -53,7 +53,6 @@ def extract(inp):      # Add training instance _after_ extracting grammars      if online:          extractor.add_instance(sentence, reference, alignment) -        extractor.dump_online_stats()      grammar_file = os.path.abspath(grammar_file)      return '<seg grammar="{0}" id="{1}"> {2} </seg>{3}'.format(grammar_file, i, sentence, suffix) diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index 62a251a7..acc13cbc 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -2,14 +2,15 @@ from itertools import chain  import os, sys  import cdec.configobj  from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ -        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE +        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\ +        IsSupportedOnline  import cdec.sa  # maximum span of a grammar rule in TEST DATA  MAX_INITIAL_SIZE = 15  class GrammarExtractor: -    def __init__(self, config, features=None): +    def __init__(self, config, online=False, features=None):          if isinstance(config, basestring):              if not os.path.exists(config):                  raise IOError('cannot read configuration from {0}'.format(config)) @@ -57,11 +58,19 @@ class GrammarExtractor:          # lexical weighting tables          tt = cdec.sa.BiLex(from_binary=config['lex_file']) +        # TODO: clean this up +        extended_features = [] +        if online: +            extended_features.append(IsSupportedOnline) +                      # TODO: use @cdec.sa.features decorator for standard features too          # + add a mask to disable features +        for f in cdec.sa._SA_FEATURES: +            extended_features.append(f) +                      scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,               MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, -            *cdec.sa._SA_FEATURES) +            *extended_features)          fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])          edarray = cdec.sa.DataArray(from_binary=config['e_file']) @@ -92,4 +101,6 @@ class GrammarExtractor:      # Debugging      def dump_online_stats(self): -        self.factory.dump_online_stats()
\ No newline at end of file +        self.factory.dump_online_stats() +    def dump_online_rules(self): +        self.factory.dump_online_rules()
\ No newline at end of file diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index a4ae23e8..46412cd5 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -1,57 +1,142 @@  from __future__ import division  import math +from cdec.sa import isvar +  MAXSCORE = 99  def EgivenF(ctx): # p(e|f) = c(e, f)/c(f) -    return -math.log10(ctx.paircount/ctx.fcount) +    if not ctx.online: +        prob = ctx.paircount/ctx.fcount +    else: +        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount) +    return -math.log10(prob)  def CountEF(ctx): # c(e, f) -    return math.log10(1 + ctx.paircount) +    if not ctx.online: +        count = 1 + ctx.paircount +    else: +        count = 1 + ctx.paircount + ctx.online.paircount +    return math.log10(count)  def SampleCountF(ctx): # sample c(f) -    return math.log10(1 + ctx.fsample_count) +    if not ctx.online: +        count = 1 + ctx.fsample_count +    else: +        count = 1 + ctx.fsample_count + ctx.online.fsample_count +    return math.log10(count)  def EgivenFCoherent(ctx): # c(e, f) / sample c(f) -    prob = ctx.paircount/ctx.fsample_count +    if not ctx.online: +        prob = ctx.paircount/ctx.fsample_count +    else: +        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fsample_count + ctx.online.fsample_count)      return -math.log10(prob) if prob > 0 else MAXSCORE  def CoherenceProb(ctx): # c(f) / sample c(f) -    return -math.log10(ctx.fcount/ctx.fsample_count) +    if not ctx.online: +        prob = ctx.fcount/ctx.fsample_count +    else: +        prob = (ctx.fcount + ctx.online.fcount) / (ctx.fsample_count + ctx.online.fsample_count) +    return -math.log10(prob)  def MaxLexEgivenF(ttable):      def MaxLexEgivenF(ctx):          fwords = ctx.fphrase.words          fwords.append('NULL') -        def score(): +        # Always use this for now +        if not ctx.online or ctx.online: +            maxOffScore = 0.0 +            for e in ctx.ephrase.words: +                maxScore = max(ttable.get_score(f, e, 0) for f in fwords) +                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE +            return maxOffScore +        else: +            # For now, straight average +            maxOffScore = 0.0 +            maxOnScore = 0.0              for e in ctx.ephrase.words: -              maxScore = max(ttable.get_score(f, e, 0) for f in fwords) -              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE -        return sum(score()) +                maxScore = max(ttable.get_score(f, e, 0) for f in fwords) +                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE +            for e in ctx.ephrase: +                if not isvar(e): +                    maxScore = 0.0 +                    for f in ctx.fphrase: +                        if not isvar(f): +                            b_f = ctx.online.bilex_f.get(f, 0) +                            if b_f: +                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e)) +                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE +            return (maxOffScore + maxOnScore) / 2      return MaxLexEgivenF  def MaxLexFgivenE(ttable):      def MaxLexFgivenE(ctx):          ewords = ctx.ephrase.words          ewords.append('NULL') -        def score(): +        # Always use this for now +        if not ctx.online or ctx.online: +            maxOffScore = 0.0              for f in ctx.fphrase.words: -              maxScore = max(ttable.get_score(f, e, 1) for e in ewords) -              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE -        return sum(score()) +                maxScore = max(ttable.get_score(f, e, 1) for e in ewords) +                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE +            return maxOffScore +        else: +            # For now, straight average +            maxOffScore = 0.0 +            maxOnScore = 0.0 +            for f in ctx.fphrase.words: +                maxScore = max(ttable.get_score(f, e, 1) for e in ewords) +                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE +            for f in ctx.fphrase: +                if not isvar(f): +                    maxScore = 0.0 +                    for e in ctx.ephrase: +                        if not isvar(e): +                            b_e = ctx.online.bilex_e.get(e, 0) +                            if b_e: +                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e ) +                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE +            return (maxOffScore + maxOnScore) / 2      return MaxLexFgivenE  def IsSingletonF(ctx): -    return (ctx.fcount == 1) +    if not ctx.online: +        count = ctx.fcount +    else: +        count = ctx.fcount + ctx.online.fcount   +    return (count == 1)  def IsSingletonFE(ctx): -    return (ctx.paircount == 1) +    if not ctx.online: +        count = ctx.paircount +    else: +        count = ctx.paircount + ctx.online.paircount +    return (count == 1)  def IsNotSingletonF(ctx): -    return (ctx.fcount > 1) +    if not ctx.online: +        count = ctx.fcount +    else: +        count = ctx.fcount + ctx.online.fcount +    return (count > 1)  def IsNotSingletonFE(ctx): +    if not ctx.online: +        count = ctx.paircount +    else: +        count = ctx.paircount + ctx.online.paircount      return (ctx.paircount > 1)  def IsFEGreaterThanZero(ctx): +    if not ctx.online: +        count = ctx.paircount +    else: +        count = ctx.paircount + ctx.online.paircount      return (ctx.paircount > 0.01) + +def IsSupportedOnline(ctx): # Occurs in online data? +    if ctx.online: +        return (ctx.online.paircount > 0.01) +    else: +        return False
\ No newline at end of file | 
