diff options
Diffstat (limited to 'python/pkg/cdec/sa/extractor.py')
| -rw-r--r-- | python/pkg/cdec/sa/extractor.py | 30 | 
1 files changed, 26 insertions, 4 deletions
| diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index e09f79ea..acc13cbc 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -1,15 +1,16 @@  from itertools import chain -import os +import os, sys  import cdec.configobj  from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ -        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE +        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\ +        IsSupportedOnline  import cdec.sa  # maximum span of a grammar rule in TEST DATA  MAX_INITIAL_SIZE = 15  class GrammarExtractor: -    def __init__(self, config, features=None): +    def __init__(self, config, online=False, features=None):          if isinstance(config, basestring):              if not os.path.exists(config):                  raise IOError('cannot read configuration from {0}'.format(config)) @@ -57,11 +58,19 @@ class GrammarExtractor:          # lexical weighting tables          tt = cdec.sa.BiLex(from_binary=config['lex_file']) +        # TODO: clean this up +        extended_features = [] +        if online: +            extended_features.append(IsSupportedOnline) +                      # TODO: use @cdec.sa.features decorator for standard features too          # + add a mask to disable features +        for f in cdec.sa._SA_FEATURES: +            extended_features.append(f) +                      scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,               MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, -            *cdec.sa._SA_FEATURES) +            *extended_features)          fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])          edarray = cdec.sa.DataArray(from_binary=config['e_file']) @@ -82,3 +91,16 @@ class GrammarExtractor:          meta = cdec.sa.annotate(words)          cnet = cdec.sa.make_lattice(words)          return self.factory.input(cnet, meta) + +    # Add training instance to data +    def add_instance(self, sentence, reference, alignment): +        f_words = cdec.sa.encode_words(sentence.split()) +        e_words = cdec.sa.encode_words(reference.split()) +        al = sorted(tuple(int(i) for i in pair.split('-')) for pair in alignment.split()) +        self.factory.add_instance(f_words, e_words, al) +     +    # Debugging +    def dump_online_stats(self): +        self.factory.dump_online_stats() +    def dump_online_rules(self): +        self.factory.dump_online_rules()
\ No newline at end of file | 
