diff options
Diffstat (limited to 'python/pkg/cdec/sa')
-rw-r--r-- | python/pkg/cdec/sa/__init__.py | 24 | ||||
-rw-r--r-- | python/pkg/cdec/sa/extract.py | 67 | ||||
-rw-r--r-- | python/pkg/cdec/sa/extractor.py | 22 | ||||
-rw-r--r-- | python/pkg/cdec/sa/features.py | 56 |
4 files changed, 119 insertions, 50 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py index fd4a4148..e0a344b7 100644 --- a/python/pkg/cdec/sa/__init__.py +++ b/python/pkg/cdec/sa/__init__.py @@ -1,4 +1,24 @@ -from cdec.sa._sa import sym_fromstring,\ +from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\ SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ - HieroCachingRuleFactory, Sampler + HieroCachingRuleFactory, Sampler, Scorer from cdec.sa.extractor import GrammarExtractor + +_SA_FEATURES = [] +_SA_ANNOTATORS = {} +_SA_CONFIGURE = [] + +def feature(fn): + _SA_FEATURES.append(fn) + return fn + +def annotator(fn): + _SA_ANNOTATORS[fn.__name__] = fn + +def annotate(sentence): + meta = {} + for name, fn in _SA_ANNOTATORS.iteritems(): + meta[name] = fn(sentence) + return meta + +def configure(fn): + _SA_CONFIGURE.append(fn) diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py index 875bf42e..10a81556 100644 --- a/python/pkg/cdec/sa/extract.py +++ b/python/pkg/cdec/sa/extract.py @@ -3,29 +3,72 @@ import sys import os import argparse import logging +import multiprocessing as mp +import signal import cdec.sa +extractor, prefix = None, None +def make_extractor(config, grammars, features): + global extractor, prefix + signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C + load_features(features) + extractor = cdec.sa.GrammarExtractor(config) + prefix = grammars + +def load_features(features): + for featdef in features: + logging.info('Loading additional feature definitions from %s', featdef) + prefix = os.path.dirname(featdef) + sys.path.append(prefix) + __import__(os.path.basename(featdef).replace('.py', '')) + sys.path.remove(prefix) + +def extract(inp): + global extractor, prefix + i, sentence = inp + sentence = sentence[:-1] + grammar_file = os.path.join(prefix, 'grammar.{0}'.format(i)) + with open(grammar_file, 'w') as output: + for rule in extractor.grammar(sentence): + output.write(str(rule)+'\n') + grammar_file = os.path.abspath(grammar_file) + return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence) + def main(): logging.basicConfig(level=logging.INFO) parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.') parser.add_argument('-c', '--config', required=True, - help='Extractor configuration') + help='extractor configuration') parser.add_argument('-g', '--grammars', required=True, - help='Grammar output path') + help='grammar output path') + parser.add_argument('-j', '--jobs', type=int, default=1, + help='number of parallel extractors') + parser.add_argument('-s', '--chunksize', type=int, default=10, + help='number of sentences / chunk') + parser.add_argument('-f', '--features', nargs='*', default=[], + help='additional feature definitions') args = parser.parse_args() if not os.path.exists(args.grammars): os.mkdir(args.grammars) - - extractor = cdec.sa.GrammarExtractor(args.config) - for i, sentence in enumerate(sys.stdin): - sentence = sentence[:-1] - grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i)) - with open(grammar_file, 'w') as output: - for rule in extractor.grammar(sentence): - output.write(str(rule)+'\n') - grammar_file = os.path.abspath(grammar_file) - print('<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence)) + for featdef in args.features: + if not featdef.endswith('.py'): + sys.stderr.write('Error: feature definition file <{0}>' + ' should be a python module\n'.format(featdef)) + sys.exit(1) + + if args.jobs > 1: + logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize) + pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars, args.features)) + try: + for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize): + print(output) + except KeyboardInterrupt: + pool.terminate() + else: + make_extractor(args.config, args.grammars, args.features) + for output in map(extract, enumerate(sys.stdin)): + print(output) if __name__ == '__main__': main() diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py index bb912e16..a5ce8a68 100644 --- a/python/pkg/cdec/sa/extractor.py +++ b/python/pkg/cdec/sa/extractor.py @@ -9,7 +9,7 @@ import cdec.sa MAX_INITIAL_SIZE = 15 class GrammarExtractor: - def __init__(self, config): + def __init__(self, config, features=None): if isinstance(config, str) or isinstance(config, unicode): if not os.path.exists(config): raise IOError('cannot read configuration from {0}'.format(config)) @@ -57,8 +57,11 @@ class GrammarExtractor: # lexical weighting tables tt = cdec.sa.BiLex(from_binary=config['lex_file']) - self.models = (EgivenFCoherent, SampleCountF, CountEF, - MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE) + # TODO: use @cdec.sa.features decorator for standard features too + # + add a mask to disable features + scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, + MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE, + *cdec.sa._SA_FEATURES) fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) edarray = cdec.sa.DataArray(from_binary=config['e_file']) @@ -67,12 +70,15 @@ class GrammarExtractor: # -1 = don't sample, use all data (VERY SLOW!) sampler = cdec.sa.Sampler(300, fsarray) - self.factory.configure(fsarray, edarray, sampler) + self.factory.configure(fsarray, edarray, sampler, scorer) + # Initialize feature definitions with configuration + for fn in cdec.sa._SA_CONFIGURE: + fn(config) def grammar(self, sentence): if isinstance(sentence, unicode): sentence = sentence.encode('utf8') - cnet = chain(('<s>',), sentence.split(), ('</s>',)) - cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet) - cnet = tuple(((word, None, 1), ) for word in cnet) - return self.factory.input(cnet, self.models) + words = tuple(chain(('<s>',), sentence.split(), ('</s>',))) + meta = cdec.sa.annotate(words) + cnet = cdec.sa.make_lattice(words) + return self.factory.input(cnet, meta) diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py index 325b9e13..a4ae23e8 100644 --- a/python/pkg/cdec/sa/features.py +++ b/python/pkg/cdec/sa/features.py @@ -3,55 +3,55 @@ import math MAXSCORE = 99 -def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) - return -math.log10(paircount/fcount) +def EgivenF(ctx): # p(e|f) = c(e, f)/c(f) + return -math.log10(ctx.paircount/ctx.fcount) -def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + paircount) +def CountEF(ctx): # c(e, f) + return math.log10(1 + ctx.paircount) -def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + fsample_count) +def SampleCountF(ctx): # sample c(f) + return math.log10(1 + ctx.fsample_count) -def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): - prob = paircount/fsample_count +def EgivenFCoherent(ctx): # c(e, f) / sample c(f) + prob = ctx.paircount/ctx.fsample_count return -math.log10(prob) if prob > 0 else MAXSCORE -def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): - return -math.log10(fcount/fsample_count) +def CoherenceProb(ctx): # c(f) / sample c(f) + return -math.log10(ctx.fcount/ctx.fsample_count) def MaxLexEgivenF(ttable): - def feature(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = fphrase.words + def MaxLexEgivenF(ctx): + fwords = ctx.fphrase.words fwords.append('NULL') def score(): - for e in ephrase.words: + for e in ctx.ephrase.words: maxScore = max(ttable.get_score(f, e, 0) for f in fwords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) - return feature + return MaxLexEgivenF def MaxLexFgivenE(ttable): - def feature(fphrase, ephrase, paircount, fcount, fsample_count): - ewords = ephrase.words + def MaxLexFgivenE(ctx): + ewords = ctx.ephrase.words ewords.append('NULL') def score(): - for f in fphrase.words: + for f in ctx.fphrase.words: maxScore = max(ttable.get_score(f, e, 1) for e in ewords) yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE return sum(score()) - return feature + return MaxLexFgivenE -def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount == 1) +def IsSingletonF(ctx): + return (ctx.fcount == 1) -def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount == 1) +def IsSingletonFE(ctx): + return (ctx.paircount == 1) -def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount > 1) +def IsNotSingletonF(ctx): + return (ctx.fcount > 1) -def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 1) +def IsNotSingletonFE(ctx): + return (ctx.paircount > 1) -def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 0.01) +def IsFEGreaterThanZero(ctx): + return (ctx.paircount > 0.01) |