diff options
Diffstat (limited to 'python/pkg/cdec')
-rw-r--r-- | python/pkg/cdec/sa/__init__.py | 6 | ||||
-rw-r--r-- | python/pkg/cdec/sa/extract.py | 35 | ||||
-rw-r--r-- | python/pkg/cdec/sa/extractor.py | 5 | ||||
-rw-r--r-- | python/pkg/cdec/sa/features.py | 52 |
4 files changed, 61 insertions, 37 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py index ab8be809..cc532fb9 100644 --- a/python/pkg/cdec/sa/__init__.py +++ b/python/pkg/cdec/sa/__init__.py @@ -2,3 +2,9 @@ from cdec.sa._sa import sym_fromstring,\ SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ HieroCachingRuleFactory, Sampler, Scorer from cdec.sa.extractor import GrammarExtractor + +_SA_FEATURES = [] + +def feature(fn): + _SA_FEATURES.append(fn) + return fn diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index 39eac824..b370c4ca 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -8,12 +8,20 @@ import signal import cdec.sa extractor, prefix = None, None -def make_extractor(config, grammars): +def make_extractor(config, grammars, features): global extractor, prefix signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C + if features: load_features(features) extractor = cdec.sa.GrammarExtractor(config) prefix = grammars +def load_features(features): + logging.info('Loading additional feature definitions from %s', features) + prefix = os.path.dirname(features) + sys.path.append(prefix) + __import__(os.path.basename(features).replace('.py', '')) + sys.path.remove(prefix) + def extract(inp): global extractor, prefix i, sentence = inp @@ -25,7 +33,6 @@ def extract(inp): grammar_file = os.path.abspath(grammar_file) return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence) - def main(): logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.') @@ -37,18 +44,28 @@ def main(): help='number of parallel extractors') parser.add_argument('-s', '--chunksize', type=int, default=10, help='number of sentences / chunk') + parser.add_argument('-f', '--features', type=str, default=None, + help='additional feature definitions') args = parser.parse_args() if not os.path.exists(args.grammars): os.mkdir(args.grammars) - - logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize) - pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars)) - try: - for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize): + if not args.features.endswith('.py'): + sys.stderr.write('Error: feature definition file should be a python module\n') + sys.exit(1) + + if args.jobs > 1: + logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize) + pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars, args.features)) + try: + for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize): + print(output) + except KeyboardInterrupt: + pool.terminate() + else: + make_extractor(args.config, args.grammars, args.features) + for output in map(extract, enumerate(sys.stdin)): print(output) - except KeyboardInterrupt: - pool.terminate() if __name__ == '__main__': main() diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index 90cc4c51..89e35bf8 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -9,7 +9,7 @@ import cdec.sa MAX_INITIAL_SIZE = 15 class GrammarExtractor: - def __init__(self, config): + def __init__(self, config, features=None): if isinstance(config, str) or isinstance(config, unicode): if not os.path.exists(config): raise IOError('cannot read configuration from {0}'.format(config)) @@ -58,7 +58,8 @@ class GrammarExtractor: tt = cdec.sa.BiLex(from_binary=config['lex_file']) scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, - MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE) + MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, + *cdec.sa._SA_FEATURES) fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) edarray = cdec.sa.DataArray(from_binary=config['e_file']) diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index 8fd370cc..a4ae23e8 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -3,55 +3,55 @@ import math MAXSCORE = 99 -def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) - return -math.log10(paircount/fcount) +def EgivenF(ctx): # p(e|f) = c(e, f)/c(f) + return -math.log10(ctx.paircount/ctx.fcount) -def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + paircount) +def CountEF(ctx): # c(e, f) + return math.log10(1 + ctx.paircount) -def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + fsample_count) +def SampleCountF(ctx): # sample c(f) + return math.log10(1 + ctx.fsample_count) -def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): - prob = paircount/fsample_count +def EgivenFCoherent(ctx): # c(e, f) / sample c(f) + prob = ctx.paircount/ctx.fsample_count return -math.log10(prob) if prob > 0 else MAXSCORE -def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): - return -math.log10(fcount/fsample_count) +def CoherenceProb(ctx): # c(f) / sample c(f) + return -math.log10(ctx.fcount/ctx.fsample_count) def MaxLexEgivenF(ttable): - def MaxLexEgivenF(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = fphrase.words + def MaxLexEgivenF(ctx): + fwords = ctx.fphrase.words fwords.append('NULL') def score(): - for e in ephrase.words: + for e in ctx.ephrase.words: maxScore = max(ttable.get_score(f, e, 0) for f in fwords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) return MaxLexEgivenF def MaxLexFgivenE(ttable): - def MaxLexFgivenE(fphrase, ephrase, paircount, fcount, fsample_count): - ewords = ephrase.words + def MaxLexFgivenE(ctx): + ewords = ctx.ephrase.words ewords.append('NULL') def score(): - for f in fphrase.words: + for f in ctx.fphrase.words: maxScore = max(ttable.get_score(f, e, 1) for e in ewords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) return MaxLexFgivenE -def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount == 1) +def IsSingletonF(ctx): + return (ctx.fcount == 1) -def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount == 1) +def IsSingletonFE(ctx): + return (ctx.paircount == 1) -def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount > 1) +def IsNotSingletonF(ctx): + return (ctx.fcount > 1) -def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 1) +def IsNotSingletonFE(ctx): + return (ctx.paircount > 1) -def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 0.01) +def IsFEGreaterThanZero(ctx): + return (ctx.paircount > 0.01) |