Diffstat (limited to 'python/pkg/cdec/sa')
-rw-r--r--  python/pkg/cdec/sa/__init__.py  |  25 ----
-rw-r--r--  python/pkg/cdec/sa/compile.py   | 132 ----
-rw-r--r--  python/pkg/cdec/sa/extract.py   | 113 ----
-rw-r--r--  python/pkg/cdec/sa/extractor.py | 106 ----
-rw-r--r--  python/pkg/cdec/sa/features.py  | 142 ----
5 files changed, 0 insertions(+), 518 deletions(-)
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
deleted file mode 100644
index 14ba5ecb..00000000
--- a/python/pkg/cdec/sa/__init__.py
+++ /dev/null
@@ -1,25 +0,0 @@
-from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\
-        encode_words, decode_words, isvar,\
-        SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
-        HieroCachingRuleFactory, Sampler, Scorer
-from cdec.sa.extractor import GrammarExtractor
-
-_SA_FEATURES = []
-_SA_ANNOTATORS = {}
-_SA_CONFIGURE = []
-
-def feature(fn):
-    _SA_FEATURES.append(fn)
-    return fn
-
-def annotator(fn):
-    _SA_ANNOTATORS[fn.__name__] = fn
-
-def annotate(sentence):
-    meta = {}
-    for name, fn in _SA_ANNOTATORS.iteritems():
-        meta[name] = fn(sentence)
-    return meta
-
-def configure(fn):
-    _SA_CONFIGURE.append(fn)
diff --git a/python/pkg/cdec/sa/compile.py b/python/pkg/cdec/sa/compile.py
deleted file mode 100644
index d4cd8387..00000000
--- a/python/pkg/cdec/sa/compile.py
+++ /dev/null
@@ -1,132 +0,0 @@
-#!/usr/bin/env python
-import argparse
-import os
-import logging
-import cdec.configobj
-import cdec.sa
-from cdec.sa._sa import monitor_cpu
-import sys
-
-MAX_PHRASE_LENGTH = 4
-def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2, tight_phrases):
-    lcp = cdec.sa.LCP(f_sa)
-    stats = sorted(lcp.compute_stats(MAX_PHRASE_LENGTH), reverse=True)
-    precomp = cdec.sa.Precomputation(from_stats=stats,
-            fsarray=f_sa,
-            precompute_rank=rank1,
-            precompute_secondary_rank=rank2,
-            max_length=max_len,
-            max_nonterminals=max_nt,
-            train_max_initial_size=max_size,
-            train_min_gap_size=min_gap)
-    return precomp
-
-def main():
-    preprocess_start_time = monitor_cpu()
-    sys.setrecursionlimit(sys.getrecursionlimit() * 100)
-
-    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger('cdec.sa.compile')
-    parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.')
-    parser.add_argument('--maxnt', '-n', type=int, default=2,
-                        help='Maximum number of non-terminal symbols')
-    parser.add_argument('--maxlen', '-l', type=int, default=5,
-                        help='Maximum number of terminals')
-    parser.add_argument('--maxsize', '-s', type=int, default=15,
-                        help='Maximum rule span')
-    parser.add_argument('--mingap', '-g', type=int, default=1,
-                        help='Minimum gap size')
-    parser.add_argument('--rank1', '-r1', type=int, default=100,
-                        help='Number of pre-computed frequent patterns')
-    parser.add_argument('--rank2', '-r2', type=int, default=10,
-                        help='Number of pre-computed super-frequent patterns')
-    parser.add_argument('--loose', action='store_true',
-                        help='Enable loose phrase extraction (default: tight)')
-    parser.add_argument('-c', '--config', default='/dev/stdout',
-                        help='Output configuration')
-    parser.add_argument('-f', '--source',
-                        help='Source language corpus')
-    parser.add_argument('-e', '--target',
-                        help='Target language corpus')
-    parser.add_argument('-b', '--bitext',
-                        help='Parallel text (source ||| target)')
-    parser.add_argument('-a', '--alignment', required=True,
-                        help='Bitext word alignment')
-    parser.add_argument('-o', '--output', required=True,
-                        help='Output path')
-    args = parser.parse_args()
-
-    if not ((args.source and args.target) or args.bitext):
-        parser.error('a parallel corpus is required\n'
-                     '\tuse -f (source) with -e (target) or -b (bitext)')
-
-    param_names = ('max_len', 'max_nt', 'max_size', 'min_gap',
-                   'rank1', 'rank2', 'tight_phrases')
-    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap,
-              args.rank1, args.rank2, not args.loose)
-
-    if not os.path.exists(args.output):
-        os.mkdir(args.output)
-
-    f_sa_bin = os.path.join(args.output, 'f.sa.bin')
-    e_bin = os.path.join(args.output, 'e.bin')
-    precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params)
-    precomp_bin = os.path.join(args.output, precomp_file)
-    a_bin = os.path.join(args.output, 'a.bin')
-    lex_bin = os.path.join(args.output, 'lex.bin')
-
-    start_time = monitor_cpu()
-    logger.info('Compiling source suffix array')
-    if args.bitext:
-        f_sa = cdec.sa.SuffixArray(from_text=args.bitext, side='source')
-    else:
-        f_sa = cdec.sa.SuffixArray(from_text=args.source)
-    f_sa.write_binary(f_sa_bin)
-    stop_time = monitor_cpu()
-    logger.info('Compiling source suffix array took %f seconds', stop_time - start_time)
-
-    start_time = monitor_cpu()
-    logger.info('Compiling target data array')
-    if args.bitext:
-        e = cdec.sa.DataArray(from_text=args.bitext, side='target')
-    else:
-        e = cdec.sa.DataArray(from_text=args.target)
-    e.write_binary(e_bin)
-    stop_time = monitor_cpu()
-    logger.info('Compiling target data array took %f seconds', stop_time - start_time)
-
-    start_time = monitor_cpu()
-    logger.info('Precomputing frequent phrases')
-    precompute(f_sa, *params).write_binary(precomp_bin)
-    stop_time = monitor_cpu()
-    logger.info('Compiling precomputations took %f seconds', stop_time - start_time)
-
-    start_time = monitor_cpu()
-    logger.info('Compiling alignment')
-    a = cdec.sa.Alignment(from_text=args.alignment)
-    a.write_binary(a_bin)
-    stop_time = monitor_cpu()
-    logger.info('Compiling alignment took %f seconds', stop_time - start_time)
-
-    start_time = monitor_cpu()
-    logger.info('Compiling bilexical dictionary')
-    lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
-    lex.write_binary(lex_bin)
-    stop_time = monitor_cpu()
-    logger.info('Compiling bilexical dictionary took %f seconds', stop_time - start_time)
-
-    # Write configuration
-    config = cdec.configobj.ConfigObj(args.config, unrepr=True)
-    config['f_sa_file'] = os.path.abspath(f_sa_bin)
-    config['e_file'] = os.path.abspath(e_bin)
-    config['a_file'] = os.path.abspath(a_bin)
-    config['lex_file'] = os.path.abspath(lex_bin)
-    config['precompute_file'] = os.path.abspath(precomp_bin)
-    for name, value in zip(param_names, params):
-        config[name] = value
-    config.write()
-    preprocess_stop_time = monitor_cpu()
-    logger.info('Overall preprocessing step took %f seconds', preprocess_stop_time - preprocess_start_time)
-
-if __name__ == '__main__':
-    main()
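For context, a minimal sketch of what the deleted compile.py automated when driven directly through the cdec.sa classes it uses (the bitext, alignment, and output file names are illustrative assumptions, not part of this commit):

import cdec.sa

# source-side suffix array and target-side data array built from one bitext file
f_sa = cdec.sa.SuffixArray(from_text='bitext.txt', side='source')
f_sa.write_binary('sa-out/f.sa.bin')
e = cdec.sa.DataArray(from_text='bitext.txt', side='target')
e.write_binary('sa-out/e.bin')

# word alignment, and the bilexical (lexical weighting) table derived from it
a = cdec.sa.Alignment(from_text='align.txt')
a.write_binary('sa-out/a.bin')
lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
lex.write_binary('sa-out/lex.bin')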
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
deleted file mode 100644
index b6502c52..00000000
--- a/python/pkg/cdec/sa/extract.py
+++ /dev/null
@@ -1,113 +0,0 @@
-#!/usr/bin/env python
-import sys
-import os
-import re
-import gzip
-import argparse
-import logging
-import signal
-import multiprocessing as mp
-import cdec.sa
-from cdec.sa._sa import monitor_cpu
-
-extractor, prefix = None, None
-online, compress = False, False
-
-def make_extractor(args):
-    global extractor, prefix, online, compress
-    signal.signal(signal.SIGINT, signal.SIG_IGN)  # Let parent process catch Ctrl+C
-    load_features(args.features)
-    extractor = cdec.sa.GrammarExtractor(args.config, online)
-    prefix = args.grammars
-    online = args.online
-    compress = args.compress
-
-def load_features(features):
-    for featdef in features:
-        logging.info('Loading additional feature definitions from %s', featdef)
-        prefix = os.path.dirname(featdef)
-        sys.path.append(prefix)
-        __import__(os.path.basename(featdef).replace('.py', ''))
-        sys.path.remove(prefix)
-
-def extract(inp):
-    global extractor, prefix, online, compress
-    i, sentence = inp
-    sentence = sentence[:-1]
-    fields = re.split('\s*\|\|\|\s*', sentence)
-    suffix = ''
-    # 3 fields for online mode, 1 for normal
-    if online:
-        if len(fields) < 3:
-            sys.stderr.write('Error: online mode requires references and alignments.'
-                             ' Not adding sentence to training data: {}\n'.format(sentence))
-            sentence = fields[0]
-        else:
-            sentence, reference, alignment = fields[0:3]
-            if len(fields) > 3:
-                suffix = ' ||| ' + ' ||| '.join(fields[3:])
-    else:
-        if len(fields) > 1:
-            sentence = fields[0]
-            suffix = ' ||| ' + ' ||| '.join(fields[1:])
-
-    grammar_file = os.path.join(prefix, 'grammar.'+str(i))
-    if compress: grammar_file += '.gz'
-    with (gzip.open if compress else open)(grammar_file, 'w') as output:
-        for rule in extractor.grammar(sentence):
-            output.write(str(rule)+'\n')
-    # Add training instance _after_ extracting grammars
-    if online:
-        extractor.add_instance(sentence, reference, alignment)
-    grammar_file = os.path.abspath(grammar_file)
-    return '<seg grammar="{}" id="{}">{}</seg>{}'.format(grammar_file, i, sentence, suffix)
-
-def main():
-    global online
-    logging.basicConfig(level=logging.INFO)
-    parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
-    parser.add_argument('-c', '--config', required=True,
-                        help='extractor configuration')
-    parser.add_argument('-g', '--grammars', required=True,
-                        help='grammar output path')
-    parser.add_argument('-j', '--jobs', type=int, default=1,
-                        help='number of parallel extractors')
-    parser.add_argument('-s', '--chunksize', type=int, default=10,
-                        help='number of sentences / chunk')
-    parser.add_argument('-f', '--features', nargs='*', default=[],
-                        help='additional feature definitions')
-    parser.add_argument('-o', '--online', action='store_true',
-                        help='online grammar extraction')
-    parser.add_argument('-z', '--compress', action='store_true',
-                        help='compress grammars with gzip')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.grammars):
-        os.mkdir(args.grammars)
-    for featdef in args.features:
-        if not featdef.endswith('.py'):
-            sys.stderr.write('Error: feature definition file <{}>'
-                             ' should be a python module\n'.format(featdef))
-            sys.exit(1)
-
-    online = args.online
-
-    start_time = monitor_cpu()
-    if args.jobs > 1:
-        logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
-        pool = mp.Pool(args.jobs, make_extractor, (args,))
-        try:
-            for output in pool.imap(extract, enumerate(sys.stdin), args.chunksize):
-                print(output)
-        except KeyboardInterrupt:
-            pool.terminate()
-    else:
-        make_extractor(args)
-        for output in map(extract, enumerate(sys.stdin)):
-            print(output)
-
-    stop_time = monitor_cpu()
-    logging.info("Overall extraction step took %f seconds", stop_time - start_time)
-
-if __name__ == '__main__':
-    main()
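The line format extract.py consumes is easiest to see with a concrete example; a small sketch of the online case (the sentence, reference, and alignment values are made up):

# online mode input: 'source ||| reference ||| alignment'; any fields after
# the third pass through unchanged into the output <seg> line
line = 'el gato negro ||| the black cat ||| 0-0 1-2 2-1'
sentence, reference, alignment = line.split(' ||| ')[:3]
# for input id 0, extract.py writes grammars/grammar.0 (or .gz with -z) and prints:
#   <seg grammar="/abs/path/grammars/grammar.0" id="0">el gato negro</seg>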
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
deleted file mode 100644
index acc13cbc..00000000
--- a/python/pkg/cdec/sa/extractor.py
+++ /dev/null
@@ -1,106 +0,0 @@
-from itertools import chain
-import os, sys
-import cdec.configobj
-from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
-        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\
-        IsSupportedOnline
-import cdec.sa
-
-# maximum span of a grammar rule in TEST DATA
-MAX_INITIAL_SIZE = 15
-
-class GrammarExtractor:
-    def __init__(self, config, online=False, features=None):
-        if isinstance(config, basestring):
-            if not os.path.exists(config):
-                raise IOError('cannot read configuration from {0}'.format(config))
-            config = cdec.configobj.ConfigObj(config, unrepr=True)
-        alignment = cdec.sa.Alignment(from_binary=config['a_file'])
-        self.factory = cdec.sa.HieroCachingRuleFactory(
-                # compiled alignment object (REQUIRED)
-                alignment,
-                # name of generic nonterminal used by Hiero
-                category="[X]",
-                # maximum number of contiguous chunks of terminal symbols in RHS of a rule
-                max_chunks=config['max_nt']+1,
-                # maximum span of a grammar rule in TEST DATA
-                max_initial_size=MAX_INITIAL_SIZE,
-                # maximum number of symbols (both T and NT) allowed in a rule
-                max_length=config['max_len'],
-                # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
-                max_nonterminals=config['max_nt'],
-                # maximum number of contiguous chunks of terminal symbols
-                # in target-side RHS of a rule.
-                max_target_chunks=config['max_nt']+1,
-                # maximum number of target side symbols (both T and NT) allowed in a rule.
-                max_target_length=MAX_INITIAL_SIZE,
-                # minimum span of a nonterminal in the RHS of a rule in TEST DATA
-                min_gap_size=1,
-                # filename of file containing precomputed collocations
-                precompute_file=config['precompute_file'],
-                # maximum frequency rank of patterns used to compute triples (< 20)
-                precompute_secondary_rank=config['rank2'],
-                # maximum frequency rank of patterns used to compute collocations (< 300)
-                precompute_rank=config['rank1'],
-                # require extracted rules to have at least one aligned word
-                require_aligned_terminal=True,
-                # require each contiguous chunk of extracted rules
-                # to have at least one aligned word
-                require_aligned_chunks=False,
-                # maximum span of a grammar rule extracted from TRAINING DATA
-                train_max_initial_size=config['max_size'],
-                # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
-                train_min_gap_size=config['min_gap'],
-                # False if phrases should be loose (better but slower), True otherwise
-                tight_phrases=config.get('tight_phrases', True),
-                )
-
-        # lexical weighting tables
-        tt = cdec.sa.BiLex(from_binary=config['lex_file'])
-
-        # TODO: clean this up
-        extended_features = []
-        if online:
-            extended_features.append(IsSupportedOnline)
-
-        # TODO: use @cdec.sa.features decorator for standard features too
-        # + add a mask to disable features
-        for f in cdec.sa._SA_FEATURES:
-            extended_features.append(f)
-
-        scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,
-                MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
-                *extended_features)
-
-        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
-        edarray = cdec.sa.DataArray(from_binary=config['e_file'])
-
-        # lower=faster, higher=better; improvements level off above 200-300 range,
-        # -1 = don't sample, use all data (VERY SLOW!)
-        sampler = cdec.sa.Sampler(300, fsarray)
-
-        self.factory.configure(fsarray, edarray, sampler, scorer)
-        # Initialize feature definitions with configuration
-        for fn in cdec.sa._SA_CONFIGURE:
-            fn(config)
-
-    def grammar(self, sentence):
-        if isinstance(sentence, unicode):
-            sentence = sentence.encode('utf8')
-        words = tuple(chain(('<s>',), sentence.split(), ('</s>',)))
-        meta = cdec.sa.annotate(words)
-        cnet = cdec.sa.make_lattice(words)
-        return self.factory.input(cnet, meta)
-
-    # Add training instance to data
-    def add_instance(self, sentence, reference, alignment):
-        f_words = cdec.sa.encode_words(sentence.split())
-        e_words = cdec.sa.encode_words(reference.split())
-        al = sorted(tuple(int(i) for i in pair.split('-')) for pair in alignment.split())
-        self.factory.add_instance(f_words, e_words, al)
-
-    # Debugging
-    def dump_online_stats(self):
-        self.factory.dump_online_stats()
-    def dump_online_rules(self):
-        self.factory.dump_online_rules()
\ No newline at end of file
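A minimal usage sketch for the class above, assuming a configuration file written by cdec.sa.compile (the config path and input sentence are assumptions):

import cdec.sa

extractor = cdec.sa.GrammarExtractor('sa-out/extract.ini')
# per-sentence grammar: one scored SCFG rule per item, the same objects
# extract.py serializes with str(rule)
for rule in extractor.grammar('el gato negro'):
    print(rule)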
diff --git a/python/pkg/cdec/sa/features.py b/python/pkg/cdec/sa/features.py
deleted file mode 100644
index c8fc1cca..00000000
--- a/python/pkg/cdec/sa/features.py
+++ /dev/null
@@ -1,142 +0,0 @@
-from __future__ import division
-import math
-
-from cdec.sa import isvar
-
-MAXSCORE = 99
-
-def EgivenF(ctx): # p(e|f) = c(e, f)/c(f)
-    if not ctx.online:
-        prob = ctx.paircount/ctx.fcount
-    else:
-        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fcount + ctx.online.fcount)
-    return -math.log10(prob)
-
-def CountEF(ctx): # c(e, f)
-    if not ctx.online:
-        count = 1 + ctx.paircount
-    else:
-        count = 1 + ctx.paircount + ctx.online.paircount
-    return math.log10(count)
-
-def SampleCountF(ctx): # sample c(f)
-    if not ctx.online:
-        count = 1 + ctx.fsample_count
-    else:
-        count = 1 + ctx.fsample_count + ctx.online.fsample_count
-    return math.log10(count)
-
-def EgivenFCoherent(ctx): # c(e, f) / sample c(f)
-    if not ctx.online:
-        prob = ctx.paircount/ctx.fsample_count
-    else:
-        prob = (ctx.paircount + ctx.online.paircount) / (ctx.fsample_count + ctx.online.fsample_count)
-    return -math.log10(prob) if prob > 0 else MAXSCORE
-
-def CoherenceProb(ctx): # c(f) / sample c(f)
-    if not ctx.online:
-        prob = ctx.fcount/ctx.fsample_count
-    else:
-        prob = (ctx.fcount + ctx.online.fcount) / (ctx.fsample_count + ctx.online.fsample_count)
-    return -math.log10(prob)
-
-def MaxLexEgivenF(ttable):
-    def MaxLexEgivenF(ctx):
-        fwords = ctx.fphrase.words
-        fwords.append('NULL')
-        # Always use this for now
-        if not ctx.online or ctx.online:
-            maxOffScore = 0.0
-            for e in ctx.ephrase.words:
-                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
-                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-            return maxOffScore
-        else:
-            # For now, straight average
-            maxOffScore = 0.0
-            maxOnScore = 0.0
-            for e in ctx.ephrase.words:
-                maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
-                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-            for e in ctx.ephrase:
-                if not isvar(e):
-                    maxScore = 0.0
-                    for f in ctx.fphrase:
-                        if not isvar(f):
-                            b_f = ctx.online.bilex_f.get(f, 0)
-                            if b_f:
-                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_f)
-                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-            return (maxOffScore + maxOnScore) / 2
-    return MaxLexEgivenF
-
-def MaxLexFgivenE(ttable):
-    def MaxLexFgivenE(ctx):
-        ewords = ctx.ephrase.words
-        ewords.append('NULL')
-        # Always use this for now
-        if not ctx.online or ctx.online:
-            maxOffScore = 0.0
-            for f in ctx.fphrase.words:
-                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
-                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-            return maxOffScore
-        else:
-            # For now, straight average
-            maxOffScore = 0.0
-            maxOnScore = 0.0
-            for f in ctx.fphrase.words:
-                maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
-                maxOffScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-            for f in ctx.fphrase:
-                if not isvar(f):
-                    maxScore = 0.0
-                    for e in ctx.ephrase:
-                        if not isvar(e):
-                            b_e = ctx.online.bilex_e.get(e, 0)
-                            if b_e:
-                                maxScore = max(maxScore, ctx.online.bilex_fe.get(f, {}).get(e, 0) / b_e)
-                    maxOnScore += -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-            return (maxOffScore + maxOnScore) / 2
-    return MaxLexFgivenE
-
-def IsSingletonF(ctx):
-    if not ctx.online:
-        count = ctx.fcount
-    else:
-        count = ctx.fcount + ctx.online.fcount
-    return math.fabs(count - 1) < 1e-6
-
-def IsSingletonFE(ctx):
-    if not ctx.online:
-        count = ctx.paircount
-    else:
-        count = ctx.paircount + ctx.online.paircount
-    return (count == 1)
-
-def IsNotSingletonF(ctx):
-    if not ctx.online:
-        count = ctx.fcount
-    else:
-        count = ctx.fcount + ctx.online.fcount
-    return (count > 1)
-
-def IsNotSingletonFE(ctx):
-    if not ctx.online:
-        count = ctx.paircount
-    else:
-        count = ctx.paircount + ctx.online.paircount
-    return (count > 1)
-
-def IsFEGreaterThanZero(ctx):
-    if not ctx.online:
-        count = ctx.paircount
-    else:
-        count = ctx.paircount + ctx.online.paircount
-    return (count > 0.01)
-
-def IsSupportedOnline(ctx): # Occurs in online data?
-    if ctx.online:
-        return (ctx.online.paircount > 0.01)
-    else:
-        return False
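The feature functions above pair with the @cdec.sa.feature hook from the deleted __init__.py; a minimal sketch of a user-defined feature module following the same counting conventions (the feature name is hypothetical, and such a module would be loaded through extract.py's -f/--features option):

import math
import cdec.sa

@cdec.sa.feature
def PairCountLog(ctx):
    # mirrors CountEF's 1 + c(e, f) smoothing, folding in online counts when present;
    # e.g. paircount=9 with online paircount=0 scores log10(10) = 1.0
    count = 1 + ctx.paircount
    if ctx.online:
        count += ctx.online.paircount
    return math.log10(count)

Once loaded, the function sits in cdec.sa._SA_FEATURES and is appended to the Scorer that GrammarExtractor builds, so every extracted rule gets the extra score.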