diff options
Diffstat (limited to 'python/cdec/sa')
-rw-r--r-- | python/cdec/sa/__init__.py | 2 | ||||
-rw-r--r-- | python/cdec/sa/compile.py | 4 | ||||
-rw-r--r-- | python/cdec/sa/extract.py | 3 | ||||
-rw-r--r-- | python/cdec/sa/extractor.py | 7 | ||||
-rw-r--r-- | python/cdec/sa/features.py | 11 |
5 files changed, 14 insertions, 13 deletions
diff --git a/python/cdec/sa/__init__.py b/python/cdec/sa/__init__.py index ddefa280..8645e837 100644 --- a/python/cdec/sa/__init__.py +++ b/python/cdec/sa/__init__.py @@ -1,4 +1,4 @@ -from _cdec_sa import sym_tostring, sym_isvar, sym_fromstring,\ +from _sa import sym_fromstring,\ SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ HieroCachingRuleFactory, Sampler from extractor import GrammarExtractor diff --git a/python/cdec/sa/compile.py b/python/cdec/sa/compile.py index 061cdab2..30e605a6 100644 --- a/python/cdec/sa/compile.py +++ b/python/cdec/sa/compile.py @@ -2,7 +2,7 @@ import argparse import os import logging -import configobj +import cdec.configobj import cdec.sa MAX_PHRASE_LENGTH = 4 @@ -80,7 +80,7 @@ def main(): lex.write_binary(lex_bin) # Write configuration - config = configobj.ConfigObj(args.config, unrepr=True) + config = cdec.configobj.ConfigObj(args.config, unrepr=True) config['f_sa_file'] = f_sa_bin config['e_file'] = e_bin config['a_file'] = a_bin diff --git a/python/cdec/sa/extract.py b/python/cdec/sa/extract.py index c6da5e9d..918aa3bb 100644 --- a/python/cdec/sa/extract.py +++ b/python/cdec/sa/extract.py @@ -3,7 +3,6 @@ import sys import os import argparse import logging -import configobj import cdec.sa def main(): @@ -18,7 +17,7 @@ def main(): if not os.path.exists(args.grammars): os.mkdir(args.grammars) - extractor = cdec.sa.GrammarExtractor(configobj.ConfigObj(args.config, unrepr=True)) + extractor = cdec.sa.GrammarExtractor(args.config) for i, sentence in enumerate(sys.stdin): sentence = sentence[:-1] grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i)) diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py index c97b3c6f..bb912e16 100644 --- a/python/cdec/sa/extractor.py +++ b/python/cdec/sa/extractor.py @@ -1,4 +1,6 @@ from itertools import chain +import os +import cdec.configobj from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE import cdec.sa @@ -8,7 +10,10 @@ MAX_INITIAL_SIZE = 15 class GrammarExtractor: def __init__(self, config): - # TODO if str, read config + if isinstance(config, str) or isinstance(config, unicode): + if not os.path.exists(config): + raise IOError('cannot read configuration from {0}'.format(config)) + config = cdec.configobj.ConfigObj(config, unrepr=True) alignment = cdec.sa.Alignment(from_binary=config['a_file']) self.factory = cdec.sa.HieroCachingRuleFactory( # compiled alignment object (REQUIRED) diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py index 8d35d8e6..325b9e13 100644 --- a/python/cdec/sa/features.py +++ b/python/cdec/sa/features.py @@ -1,6 +1,5 @@ from __future__ import division import math -import cdec.sa MAXSCORE = 99 @@ -22,11 +21,10 @@ def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): def MaxLexEgivenF(ttable): def feature(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = [cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w)] + fwords = fphrase.words fwords.append('NULL') - ewords = (cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w)) def score(): - for e in ewords: + for e in ephrase.words: maxScore = max(ttable.get_score(f, e, 0) for f in fwords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) @@ -34,11 +32,10 @@ def MaxLexEgivenF(ttable): def MaxLexFgivenE(ttable): def feature(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = (cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w)) - ewords = [cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w)] + ewords = ephrase.words ewords.append('NULL') def score(): - for f in fwords: + for f in fphrase.words: maxScore = max(ttable.get_score(f, e, 1) for e in ewords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) |