summaryrefslogtreecommitdiff
path: root/python/pkg/cdec/sa
diff options
context:
space:
mode:
authorPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
committerPatrick Simianer <simianer@cl.uni-heidelberg.de>2012-11-05 15:29:46 +0100
commit6f29f345dc06c1a1033475eac1d1340781d1d603 (patch)
tree6fa4cdd7aefd7d54c9585c2c6274db61bb8b159a /python/pkg/cdec/sa
parentb510da2e562c695c90d565eb295c749569c59be8 (diff)
parentc615c37501fa8576584a510a9d2bfe2fdd5bace7 (diff)
merge upstream/master
Diffstat (limited to 'python/pkg/cdec/sa')
-rw-r--r--python/pkg/cdec/sa/__init__.py24
-rw-r--r--python/pkg/cdec/sa/extract.py67
-rw-r--r--python/pkg/cdec/sa/extractor.py22
-rw-r--r--python/pkg/cdec/sa/features.py56
4 files changed, 119 insertions, 50 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index fd4a4148..e0a344b7 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -1,4 +1,24 @@
-from cdec.sa._sa import sym_fromstring,\
+from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\
SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
- HieroCachingRuleFactory, Sampler
+ HieroCachingRuleFactory, Sampler, Scorer
from cdec.sa.extractor import GrammarExtractor
+
+_SA_FEATURES = []
+_SA_ANNOTATORS = {}
+_SA_CONFIGURE = []
+
+def feature(fn):
+ _SA_FEATURES.append(fn)
+ return fn
+
+def annotator(fn):
+ _SA_ANNOTATORS[fn.__name__] = fn
+
+def annotate(sentence):
+ meta = {}
+ for name, fn in _SA_ANNOTATORS.iteritems():
+ meta[name] = fn(sentence)
+ return meta
+
+def configure(fn):
+ _SA_CONFIGURE.append(fn)
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 875bf42e..10a81556 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -3,29 +3,72 @@ import sys
import os
import argparse
import logging
+import multiprocessing as mp
+import signal
import cdec.sa
+extractor, prefix = None, None
+def make_extractor(config, grammars, features):
+ global extractor, prefix
+ signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C
+ load_features(features)
+ extractor = cdec.sa.GrammarExtractor(config)
+ prefix = grammars
+
+def load_features(features):
+ for featdef in features:
+ logging.info('Loading additional feature definitions from %s', featdef)
+ prefix = os.path.dirname(featdef)
+ sys.path.append(prefix)
+ __import__(os.path.basename(featdef).replace('.py', ''))
+ sys.path.remove(prefix)
+
+def extract(inp):
+ global extractor, prefix
+ i, sentence = inp
+ sentence = sentence[:-1]
+ grammar_file = os.path.join(prefix, 'grammar.{0}'.format(i))
+ with open(grammar_file, 'w') as output:
+ for rule in extractor.grammar(sentence):
+ output.write(str(rule)+'\n')
+ grammar_file = os.path.abspath(grammar_file)
+ return '<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence)
+
def main():
logging.basicConfig(level=logging.INFO)
parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
parser.add_argument('-c', '--config', required=True,
- help='Extractor configuration')
+ help='extractor configuration')
parser.add_argument('-g', '--grammars', required=True,
- help='Grammar output path')
+ help='grammar output path')
+ parser.add_argument('-j', '--jobs', type=int, default=1,
+ help='number of parallel extractors')
+ parser.add_argument('-s', '--chunksize', type=int, default=10,
+ help='number of sentences / chunk')
+ parser.add_argument('-f', '--features', nargs='*', default=[],
+ help='additional feature definitions')
args = parser.parse_args()
if not os.path.exists(args.grammars):
os.mkdir(args.grammars)
-
- extractor = cdec.sa.GrammarExtractor(args.config)
- for i, sentence in enumerate(sys.stdin):
- sentence = sentence[:-1]
- grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i))
- with open(grammar_file, 'w') as output:
- for rule in extractor.grammar(sentence):
- output.write(str(rule)+'\n')
- grammar_file = os.path.abspath(grammar_file)
- print('<seg grammar="{0}" id="{1}">{2}</seg>'.format(grammar_file, i, sentence))
+ for featdef in args.features:
+ if not featdef.endswith('.py'):
+ sys.stderr.write('Error: feature definition file <{0}>'
+ ' should be a python module\n'.format(featdef))
+ sys.exit(1)
+
+ if args.jobs > 1:
+ logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
+ pool = mp.Pool(args.jobs, make_extractor, (args.config, args.grammars, args.features))
+ try:
+ for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
+ print(output)
+ except KeyboardInterrupt:
+ pool.terminate()
+ else:
+ make_extractor(args.config, args.grammars, args.features)
+ for output in map(extract, enumerate(sys.stdin)):
+ print(output)
if __name__ == '__main__':
main()
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index bb912e16..a5ce8a68 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -9,7 +9,7 @@ import cdec.sa
MAX_INITIAL_SIZE = 15
class GrammarExtractor:
- def __init__(self, config):
+ def __init__(self, config, features=None):
if isinstance(config, str) or isinstance(config, unicode):
if not os.path.exists(config):
raise IOError('cannot read configuration from {0}'.format(config))
@@ -57,8 +57,11 @@ class GrammarExtractor:
# lexical weighting tables
tt = cdec.sa.BiLex(from_binary=config['lex_file'])
- self.models = (EgivenFCoherent, SampleCountF, CountEF,
- MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
+ # TODO: use @cdec.sa.features decorator for standard features too
+ # + add a mask to disable features
+ scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,
+ MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
+ *cdec.sa._SA_FEATURES)
fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -67,12 +70,15 @@ class GrammarExtractor:
# -1 = don't sample, use all data (VERY SLOW!)
sampler = cdec.sa.Sampler(300, fsarray)
- self.factory.configure(fsarray, edarray, sampler)
+ self.factory.configure(fsarray, edarray, sampler, scorer)
+ # Initialize feature definitions with configuration
+ for fn in cdec.sa._SA_CONFIGURE:
+ fn(config)
def grammar(self, sentence):
if isinstance(sentence, unicode):
sentence = sentence.encode('utf8')
- cnet = chain(('<s>',), sentence.split(), ('</s>',))
- cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
- cnet = tuple(((word, None, 1), ) for word in cnet)
- return self.factory.input(cnet, self.models)
+ words = tuple(chain(('<s>',), sentence.split(), ('</s>',)))
+ meta = cdec.sa.annotate(words)
+ cnet = cdec.sa.make_lattice(words)
+ return self.factory.input(cnet, meta)
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
index 325b9e13..a4ae23e8 100644
--- a/python/pkg/cdec/sa/features.py
+++ b/python/pkg/cdec/sa/features.py
@@ -3,55 +3,55 @@ import math
MAXSCORE = 99
-def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
- return -math.log10(paircount/fcount)
+def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
+ return -math.log10(ctx.paircount/ctx.fcount)
-def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
- return math.log10(1 + paircount)
+def CountEF(ctx): # c(e, f)
+ return math.log10(1 + ctx.paircount)
-def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
- return math.log10(1 + fsample_count)
+def SampleCountF(ctx): # sample c(f)
+ return math.log10(1 + ctx.fsample_count)
-def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
- prob = paircount/fsample_count
+def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
+ prob = ctx.paircount/ctx.fsample_count
return -math.log10(prob) if prob > 0 else MAXSCORE
-def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
- return -math.log10(fcount/fsample_count)
+def CoherenceProb(ctx): # c(f) / sample c(f)
+ return -math.log10(ctx.fcount/ctx.fsample_count)
def MaxLexEgivenF(ttable):
- def feature(fphrase, ephrase, paircount, fcount, fsample_count):
- fwords = fphrase.words
+ def MaxLexEgivenF(ctx):
+ fwords = ctx.fphrase.words
fwords.append('NULL')
def score():
- for e in ephrase.words:
+ for e in ctx.ephrase.words:
maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
return sum(score())
- return feature
+ return MaxLexEgivenF
def MaxLexFgivenE(ttable):
- def feature(fphrase, ephrase, paircount, fcount, fsample_count):
- ewords = ephrase.words
+ def MaxLexFgivenE(ctx):
+ ewords = ctx.ephrase.words
ewords.append('NULL')
def score():
- for f in fphrase.words:
+ for f in ctx.fphrase.words:
maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
return sum(score())
- return feature
+ return MaxLexFgivenE
-def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
- return (fcount == 1)
+def IsSingletonF(ctx):
+ return (ctx.fcount == 1)
-def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
- return (paircount == 1)
+def IsSingletonFE(ctx):
+ return (ctx.paircount == 1)
-def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
- return (fcount > 1)
+def IsNotSingletonF(ctx):
+ return (ctx.fcount > 1)
-def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
- return (paircount > 1)
+def IsNotSingletonFE(ctx):
+ return (ctx.paircount > 1)
-def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
- return (paircount > 0.01)
+def IsFEGreaterThanZero(ctx):
+ return (ctx.paircount > 0.01)