diff options
Diffstat (limited to 'python/cdec/sa')
-rw-r--r-- | python/cdec/sa/__init__.py | 4 | ||||
-rw-r--r-- | python/cdec/sa/compile.py | 94 | ||||
-rw-r--r-- | python/cdec/sa/extract.py | 32 | ||||
-rw-r--r-- | python/cdec/sa/extractor.py | 73 | ||||
-rw-r--r-- | python/cdec/sa/features.py | 60 |
5 files changed, 0 insertions, 263 deletions
diff --git a/python/cdec/sa/__init__.py b/python/cdec/sa/__init__.py deleted file mode 100644 index ddefa280..00000000 --- a/python/cdec/sa/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from _cdec_sa import sym_tostring, sym_isvar, sym_fromstring,\ - SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\ - HieroCachingRuleFactory, Sampler -from extractor import GrammarExtractor diff --git a/python/cdec/sa/compile.py b/python/cdec/sa/compile.py deleted file mode 100644 index 061cdab2..00000000 --- a/python/cdec/sa/compile.py +++ /dev/null @@ -1,94 +0,0 @@ -#!/usr/bin/env python -import argparse -import os -import logging -import configobj -import cdec.sa - -MAX_PHRASE_LENGTH = 4 -def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2): - lcp = cdec.sa.LCP(f_sa) - stats = sorted(lcp.compute_stats(MAX_PHRASE_LENGTH), reverse=True) - precomp = cdec.sa.Precomputation(from_stats=stats, - fsarray=f_sa, - precompute_rank=rank1, - precompute_secondary_rank=rank2, - max_length=max_len, - max_nonterminals=max_nt, - train_max_initial_size=max_size, - train_min_gap_size=min_gap) - return precomp - -def main(): - logging.basicConfig(level=logging.INFO) - logger = logging.getLogger('cdec.sa.compile') - parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.') - parser.add_argument('--maxnt', '-n', type=int, default=2, - help='Maximum number of non-terminal symbols') - parser.add_argument('--maxlen', '-l', type=int, default=5, - help='Maximum number of terminals') - parser.add_argument('--maxsize', '-s', type=int, default=15, - help='Maximum rule span') - parser.add_argument('--mingap', '-g', type=int, default=1, - help='Minimum gap size') - parser.add_argument('--rank1', '-r1', type=int, default=100, - help='Number of pre-computed frequent patterns') - parser.add_argument('--rank2', '-r2', type=int, default=10, - help='Number of pre-computed super-frequent patterns)') - parser.add_argument('-c', '--config', default='/dev/stdout', - help='Output configuration') - parser.add_argument('-o', '--output', required=True, - help='Output path') - parser.add_argument('-f', '--source', required=True, - help='Source language corpus') - parser.add_argument('-e', '--target', required=True, - help='Target language corpus') - parser.add_argument('-a', '--alignment', required=True, - help='Bitext word alignment') - args = parser.parse_args() - - param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2") - params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2) - - if not os.path.exists(args.output): - os.mkdir(args.output) - - f_sa_bin = os.path.join(args.output, 'f.sa.bin') - e_bin = os.path.join(args.output, 'e.bin') - precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params) - precomp_bin = os.path.join(args.output, precomp_file) - a_bin = os.path.join(args.output, 'a.bin') - lex_bin = os.path.join(args.output, 'lex.bin') - - logger.info('Compiling source suffix array') - f_sa = cdec.sa.SuffixArray(from_text=args.source) - f_sa.write_binary(f_sa_bin) - - logger.info('Compiling target data array') - e = cdec.sa.DataArray(from_text=args.target) - e.write_binary(e_bin) - - logger.info('Precomputing frequent phrases') - precompute(f_sa, *params).write_binary(precomp_bin) - - logger.info('Compiling alignment') - a = cdec.sa.Alignment(from_text=args.alignment) - a.write_binary(a_bin) - - logger.info('Compiling bilexical dictionary') - lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa) - lex.write_binary(lex_bin) - - # Write configuration - config = configobj.ConfigObj(args.config, unrepr=True) - config['f_sa_file'] = f_sa_bin - config['e_file'] = e_bin - config['a_file'] = a_bin - config['lex_file'] = lex_bin - config['precompute_file'] = precomp_bin - for name, value in zip(param_names, params): - config[name] = value - config.write() - -if __name__ == '__main__': - main() diff --git a/python/cdec/sa/extract.py b/python/cdec/sa/extract.py deleted file mode 100644 index c6da5e9d..00000000 --- a/python/cdec/sa/extract.py +++ /dev/null @@ -1,32 +0,0 @@ -#!/usr/bin/env python -import sys -import os -import argparse -import logging -import configobj -import cdec.sa - -def main(): - logging.basicConfig(level=logging.INFO) - parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.') - parser.add_argument('-c', '--config', required=True, - help='Extractor configuration') - parser.add_argument('-g', '--grammars', required=True, - help='Grammar output path') - args = parser.parse_args() - - if not os.path.exists(args.grammars): - os.mkdir(args.grammars) - - extractor = cdec.sa.GrammarExtractor(configobj.ConfigObj(args.config, unrepr=True)) - for i, sentence in enumerate(sys.stdin): - sentence = sentence[:-1] - grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i)) - with open(grammar_file, 'w') as output: - for rule in extractor.grammar(sentence): - output.write(str(rule)+'\n') - grammar_file = os.path.abspath(grammar_file) - print('<seg grammar="{0}">{1}</seg>'.format(grammar_file, sentence)) - -if __name__ == '__main__': - main() diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py deleted file mode 100644 index c97b3c6f..00000000 --- a/python/cdec/sa/extractor.py +++ /dev/null @@ -1,73 +0,0 @@ -from itertools import chain -from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\ - MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE -import cdec.sa - -# maximum span of a grammar rule in TEST DATA -MAX_INITIAL_SIZE = 15 - -class GrammarExtractor: - def __init__(self, config): - # TODO if str, read config - alignment = cdec.sa.Alignment(from_binary=config['a_file']) - self.factory = cdec.sa.HieroCachingRuleFactory( - # compiled alignment object (REQUIRED) - alignment, - # name of generic nonterminal used by Hiero - category="[X]", - # maximum number of contiguous chunks of terminal symbols in RHS of a rule - max_chunks=config['max_nt']+1, - # maximum span of a grammar rule in TEST DATA - max_initial_size=MAX_INITIAL_SIZE, - # maximum number of symbols (both T and NT) allowed in a rule - max_length=config['max_len'], - # maximum number of nonterminals allowed in a rule (set >2 at your own risk) - max_nonterminals=config['max_nt'], - # maximum number of contiguous chunks of terminal symbols - # in target-side RHS of a rule. - max_target_chunks=config['max_nt']+1, - # maximum number of target side symbols (both T and NT) allowed in a rule. - max_target_length=MAX_INITIAL_SIZE, - # minimum span of a nonterminal in the RHS of a rule in TEST DATA - min_gap_size=1, - # filename of file containing precomputed collocations - precompute_file=config['precompute_file'], - # maximum frequency rank of patterns used to compute triples (< 20) - precompute_secondary_rank=config['rank2'], - # maximum frequency rank of patterns used to compute collocations (< 300) - precompute_rank=config['rank1'], - # require extracted rules to have at least one aligned word - require_aligned_terminal=True, - # require each contiguous chunk of extracted rules - # to have at least one aligned word - require_aligned_chunks=False, - # maximum span of a grammar rule extracted from TRAINING DATA - train_max_initial_size=config['max_size'], - # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA - train_min_gap_size=config['min_gap'], - # True if phrases should be tight, False otherwise (better but slower) - tight_phrases=True, - ) - - # lexical weighting tables - tt = cdec.sa.BiLex(from_binary=config['lex_file']) - - self.models = (EgivenFCoherent, SampleCountF, CountEF, - MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE) - - fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file']) - edarray = cdec.sa.DataArray(from_binary=config['e_file']) - - # lower=faster, higher=better; improvements level off above 200-300 range, - # -1 = don't sample, use all data (VERY SLOW!) - sampler = cdec.sa.Sampler(300, fsarray) - - self.factory.configure(fsarray, edarray, sampler) - - def grammar(self, sentence): - if isinstance(sentence, unicode): - sentence = sentence.encode('utf8') - cnet = chain(('<s>',), sentence.split(), ('</s>',)) - cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet) - cnet = tuple(((word, None, 1), ) for word in cnet) - return self.factory.input(cnet, self.models) diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py deleted file mode 100644 index 8d35d8e6..00000000 --- a/python/cdec/sa/features.py +++ /dev/null @@ -1,60 +0,0 @@ -from __future__ import division -import math -import cdec.sa - -MAXSCORE = 99 - -def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f) - return -math.log10(paircount/fcount) - -def CountEF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + paircount) - -def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count): - return math.log10(1 + fsample_count) - -def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count): - prob = paircount/fsample_count - return -math.log10(prob) if prob > 0 else MAXSCORE - -def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count): - return -math.log10(fcount/fsample_count) - -def MaxLexEgivenF(ttable): - def feature(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = [cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w)] - fwords.append('NULL') - ewords = (cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w)) - def score(): - for e in ewords: - maxScore = max(ttable.get_score(f, e, 0) for f in fwords) - yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return sum(score()) - return feature - -def MaxLexFgivenE(ttable): - def feature(fphrase, ephrase, paircount, fcount, fsample_count): - fwords = (cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w)) - ewords = [cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w)] - ewords.append('NULL') - def score(): - for f in fwords: - maxScore = max(ttable.get_score(f, e, 1) for e in ewords) - yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE - return sum(score()) - return feature - -def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount == 1) - -def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount == 1) - -def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count): - return (fcount > 1) - -def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 1) - -def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count): - return (paircount > 0.01) |