| author | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 23:33:45 -0400 |
|---|---|---|
| committer | Victor Chahuneau <vchahune@cs.cmu.edu> | 2012-07-27 23:33:45 -0400 |
| commit | 455ca8bc1406ec2f6554fce9be7488bb3cca75dd (patch) | |
| tree | c02312611d11b178ef131545f377757b87f4bfed /python/cdec/sa | |
| parent | 9961abf8f756279ac6d839e0b3de2b0d83431965 (diff) | |
[python] Move python files to avoid pythonpath conflicts
Diffstat (limited to 'python/cdec/sa')
| -rw-r--r-- | python/cdec/sa/__init__.py | 4 |
| -rw-r--r-- | python/cdec/sa/compile.py | 94 |
| -rw-r--r-- | python/cdec/sa/extract.py | 31 |
| -rw-r--r-- | python/cdec/sa/extractor.py | 78 |
| -rw-r--r-- | python/cdec/sa/features.py | 57 |
5 files changed, 0 insertions, 264 deletions
diff --git a/python/cdec/sa/__init__.py b/python/cdec/sa/__init__.py
deleted file mode 100644
index 8645e837..00000000
--- a/python/cdec/sa/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-from _sa import sym_fromstring,\
-        SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
-        HieroCachingRuleFactory, Sampler
-from extractor import GrammarExtractor

diff --git a/python/cdec/sa/compile.py b/python/cdec/sa/compile.py
deleted file mode 100644
index 30e605a6..00000000
--- a/python/cdec/sa/compile.py
+++ /dev/null
@@ -1,94 +0,0 @@
-#!/usr/bin/env python
-import argparse
-import os
-import logging
-import cdec.configobj
-import cdec.sa
-
-MAX_PHRASE_LENGTH = 4
-def precompute(f_sa, max_len, max_nt, max_size, min_gap, rank1, rank2):
-    lcp = cdec.sa.LCP(f_sa)
-    stats = sorted(lcp.compute_stats(MAX_PHRASE_LENGTH), reverse=True)
-    precomp = cdec.sa.Precomputation(from_stats=stats,
-            fsarray=f_sa,
-            precompute_rank=rank1,
-            precompute_secondary_rank=rank2,
-            max_length=max_len,
-            max_nonterminals=max_nt,
-            train_max_initial_size=max_size,
-            train_min_gap_size=min_gap)
-    return precomp
-
-def main():
-    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger('cdec.sa.compile')
-    parser = argparse.ArgumentParser(description='Compile a corpus into a suffix array.')
-    parser.add_argument('--maxnt', '-n', type=int, default=2,
-                        help='Maximum number of non-terminal symbols')
-    parser.add_argument('--maxlen', '-l', type=int, default=5,
-                        help='Maximum number of terminals')
-    parser.add_argument('--maxsize', '-s', type=int, default=15,
-                        help='Maximum rule span')
-    parser.add_argument('--mingap', '-g', type=int, default=1,
-                        help='Minimum gap size')
-    parser.add_argument('--rank1', '-r1', type=int, default=100,
-                        help='Number of pre-computed frequent patterns')
-    parser.add_argument('--rank2', '-r2', type=int, default=10,
-                        help='Number of pre-computed super-frequent patterns')
-    parser.add_argument('-c', '--config', default='/dev/stdout',
-                        help='Output configuration')
-    parser.add_argument('-o', '--output', required=True,
-                        help='Output path')
-    parser.add_argument('-f', '--source', required=True,
-                        help='Source language corpus')
-    parser.add_argument('-e', '--target', required=True,
-                        help='Target language corpus')
-    parser.add_argument('-a', '--alignment', required=True,
-                        help='Bitext word alignment')
-    args = parser.parse_args()
-
-    param_names = ("max_len", "max_nt", "max_size", "min_gap", "rank1", "rank2")
-    params = (args.maxlen, args.maxnt, args.maxsize, args.mingap, args.rank1, args.rank2)
-
-    if not os.path.exists(args.output):
-        os.mkdir(args.output)
-
-    f_sa_bin = os.path.join(args.output, 'f.sa.bin')
-    e_bin = os.path.join(args.output, 'e.bin')
-    precomp_file = 'precomp.{0}.{1}.{2}.{3}.{4}.{5}.bin'.format(*params)
-    precomp_bin = os.path.join(args.output, precomp_file)
-    a_bin = os.path.join(args.output, 'a.bin')
-    lex_bin = os.path.join(args.output, 'lex.bin')
-
-    logger.info('Compiling source suffix array')
-    f_sa = cdec.sa.SuffixArray(from_text=args.source)
-    f_sa.write_binary(f_sa_bin)
-
-    logger.info('Compiling target data array')
-    e = cdec.sa.DataArray(from_text=args.target)
-    e.write_binary(e_bin)
-
-    logger.info('Precomputing frequent phrases')
-    precompute(f_sa, *params).write_binary(precomp_bin)
-
-    logger.info('Compiling alignment')
-    a = cdec.sa.Alignment(from_text=args.alignment)
-    a.write_binary(a_bin)
-
-    logger.info('Compiling bilexical dictionary')
-    lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)
-    lex.write_binary(lex_bin)
-
-    # Write configuration
-    config = cdec.configobj.ConfigObj(args.config, unrepr=True)
-    config['f_sa_file'] = f_sa_bin
-    config['e_file'] = e_bin
-    config['a_file'] = a_bin
-    config['lex_file'] = lex_bin
-    config['precompute_file'] = precomp_bin
-    for name, value in zip(param_names, params):
-        config[name] = value
-    config.write()
-
-if __name__ == '__main__':
-    main()
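For context, compile.py drove the whole offline step: it built the source suffix array, target data array, word alignment, and bilexical dictionary, then recorded their locations in a config file. A minimal sketch of the equivalent programmatic calls, using only constructors that appear in the deleted file; the corpus and output paths are placeholders:

```python
# Sketch of the offline compilation step, with the same cdec.sa calls
# that compile.py's main() makes. All paths here are placeholders.
import cdec.sa

f_sa = cdec.sa.SuffixArray(from_text='corpus.fr')   # source-side suffix array
e = cdec.sa.DataArray(from_text='corpus.en')        # target-side data array
a = cdec.sa.Alignment(from_text='corpus.align')     # word alignment
lex = cdec.sa.BiLex(from_data=True, alignment=a, earray=e, fsarray=f_sa)

f_sa.write_binary('sa-out/f.sa.bin')
e.write_binary('sa-out/e.bin')
a.write_binary('sa-out/a.bin')
lex.write_binary('sa-out/lex.bin')
```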
diff --git a/python/cdec/sa/extract.py b/python/cdec/sa/extract.py
deleted file mode 100644
index 918aa3bb..00000000
--- a/python/cdec/sa/extract.py
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/usr/bin/env python
-import sys
-import os
-import argparse
-import logging
-import cdec.sa
-
-def main():
-    logging.basicConfig(level=logging.INFO)
-    parser = argparse.ArgumentParser(description='Extract grammars from a compiled corpus.')
-    parser.add_argument('-c', '--config', required=True,
-                        help='Extractor configuration')
-    parser.add_argument('-g', '--grammars', required=True,
-                        help='Grammar output path')
-    args = parser.parse_args()
-
-    if not os.path.exists(args.grammars):
-        os.mkdir(args.grammars)
-
-    extractor = cdec.sa.GrammarExtractor(args.config)
-    for i, sentence in enumerate(sys.stdin):
-        sentence = sentence[:-1]
-        grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i))
-        with open(grammar_file, 'w') as output:
-            for rule in extractor.grammar(sentence):
-                output.write(str(rule)+'\n')
-        grammar_file = os.path.abspath(grammar_file)
-        print('<seg grammar="{0}">{1}</seg>'.format(grammar_file, sentence))
-
-if __name__ == '__main__':
-    main()
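The configuration passed to extract.py via -c is the file compile.py writes: a configobj file (unrepr=True) mapping the compiled binaries and extraction parameters. A hypothetical example of its contents, assuming compile.py's default parameters and placeholder paths (the precomputation filename encodes maxlen.maxnt.maxsize.mingap.rank1.rank2):

```
f_sa_file = 'sa-out/f.sa.bin'
e_file = 'sa-out/e.bin'
a_file = 'sa-out/a.bin'
lex_file = 'sa-out/lex.bin'
precompute_file = 'sa-out/precomp.5.2.15.1.100.10.bin'
max_len = 5
max_nt = 2
max_size = 15
min_gap = 1
rank1 = 100
rank2 = 10
```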
diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py
deleted file mode 100644
index bb912e16..00000000
--- a/python/cdec/sa/extractor.py
+++ /dev/null
@@ -1,78 +0,0 @@
-from itertools import chain
-import os
-import cdec.configobj
-from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
-        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
-import cdec.sa
-
-# maximum span of a grammar rule in TEST DATA
-MAX_INITIAL_SIZE = 15
-
-class GrammarExtractor:
-    def __init__(self, config):
-        if isinstance(config, str) or isinstance(config, unicode):
-            if not os.path.exists(config):
-                raise IOError('cannot read configuration from {0}'.format(config))
-            config = cdec.configobj.ConfigObj(config, unrepr=True)
-        alignment = cdec.sa.Alignment(from_binary=config['a_file'])
-        self.factory = cdec.sa.HieroCachingRuleFactory(
-                # compiled alignment object (REQUIRED)
-                alignment,
-                # name of generic nonterminal used by Hiero
-                category="[X]",
-                # maximum number of contiguous chunks of terminal symbols in RHS of a rule
-                max_chunks=config['max_nt']+1,
-                # maximum span of a grammar rule in TEST DATA
-                max_initial_size=MAX_INITIAL_SIZE,
-                # maximum number of symbols (both T and NT) allowed in a rule
-                max_length=config['max_len'],
-                # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
-                max_nonterminals=config['max_nt'],
-                # maximum number of contiguous chunks of terminal symbols
-                # in target-side RHS of a rule.
-                max_target_chunks=config['max_nt']+1,
-                # maximum number of target side symbols (both T and NT) allowed in a rule.
-                max_target_length=MAX_INITIAL_SIZE,
-                # minimum span of a nonterminal in the RHS of a rule in TEST DATA
-                min_gap_size=1,
-                # filename of file containing precomputed collocations
-                precompute_file=config['precompute_file'],
-                # maximum frequency rank of patterns used to compute triples (< 20)
-                precompute_secondary_rank=config['rank2'],
-                # maximum frequency rank of patterns used to compute collocations (< 300)
-                precompute_rank=config['rank1'],
-                # require extracted rules to have at least one aligned word
-                require_aligned_terminal=True,
-                # require each contiguous chunk of extracted rules
-                # to have at least one aligned word
-                require_aligned_chunks=False,
-                # maximum span of a grammar rule extracted from TRAINING DATA
-                train_max_initial_size=config['max_size'],
-                # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
-                train_min_gap_size=config['min_gap'],
-                # True if phrases should be tight, False otherwise (better but slower)
-                tight_phrases=True,
-                )
-
-        # lexical weighting tables
-        tt = cdec.sa.BiLex(from_binary=config['lex_file'])
-
-        self.models = (EgivenFCoherent, SampleCountF, CountEF,
-                MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE)
-
-        fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
-        edarray = cdec.sa.DataArray(from_binary=config['e_file'])
-
-        # lower=faster, higher=better; improvements level off above 200-300 range,
-        # -1 = don't sample, use all data (VERY SLOW!)
-        sampler = cdec.sa.Sampler(300, fsarray)
-
-        self.factory.configure(fsarray, edarray, sampler)
-
-    def grammar(self, sentence):
-        if isinstance(sentence, unicode):
-            sentence = sentence.encode('utf8')
-        cnet = chain(('<s>',), sentence.split(), ('</s>',))
-        cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
-        cnet = tuple(((word, None, 1), ) for word in cnet)
-        return self.factory.input(cnet, self.models)
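GrammarExtractor ties the compiled pieces together: it loads the binaries named in the config, builds a HieroCachingRuleFactory, attaches the feature models, and exposes a grammar(sentence) method that yields rules for one input sentence. A minimal sketch of direct use, with a placeholder config path and sentence (note the class above is Python 2-era code, hence the unicode handling):

```python
# Sketch: per-sentence grammar extraction with the class above.
# 'extract.ini' stands in for a config file written by compile.py.
import cdec.sa

extractor = cdec.sa.GrammarExtractor('extract.ini')
for rule in extractor.grammar('le garcon a vu le chat'):
    print(rule)  # one SCFG rule per line, as extract.py writes them
```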
diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py
deleted file mode 100644
index 325b9e13..00000000
--- a/python/cdec/sa/features.py
+++ /dev/null
@@ -1,57 +0,0 @@
-from __future__ import division
-import math
-
-MAXSCORE = 99
-
-def EgivenF(fphrase, ephrase, paircount, fcount, fsample_count): # p(e|f)
-    return -math.log10(paircount/fcount)
-
-def CountEF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + paircount)
-
-def SampleCountF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return math.log10(1 + fsample_count)
-
-def EgivenFCoherent(fphrase, ephrase, paircount, fcount, fsample_count):
-    prob = paircount/fsample_count
-    return -math.log10(prob) if prob > 0 else MAXSCORE
-
-def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
-    return -math.log10(fcount/fsample_count)
-
-def MaxLexEgivenF(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
-        fwords = fphrase.words
-        fwords.append('NULL')
-        def score():
-            for e in ephrase.words:
-              maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
-              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
-    return feature
-
-def MaxLexFgivenE(ttable):
-    def feature(fphrase, ephrase, paircount, fcount, fsample_count):
-        ewords = ephrase.words
-        ewords.append('NULL')
-        def score():
-            for f in fphrase.words:
-              maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
-              yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
-        return sum(score())
-    return feature
-
-def IsSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount == 1)
-
-def IsSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount == 1)
-
-def IsNotSingletonF(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (fcount > 1)
-
-def IsNotSingletonFE(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 1)
-
-def IsFEGreaterThanZero(fphrase, ephrase, paircount, fcount, fsample_count):
-    return (paircount > 0.01)
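Every scorer in features.py shares one signature: feature(fphrase, ephrase, paircount, fcount, fsample_count), where paircount is the joint count of the extracted phrase pair, fcount the count of the source phrase, and fsample_count the number of samples drawn for it. A sketch of a custom feature in the same style; the name and the smoothing constants are invented for illustration:

```python
from __future__ import division
import math

MAXSCORE = 99  # same cap features.py uses when the probability is zero

# Hypothetical extra feature following the features.py signature:
# a smoothed -log10 p(e|f). The 0.5/1.0 smoothing constants are illustrative.
def EgivenFSmoothed(fphrase, ephrase, paircount, fcount, fsample_count):
    prob = (paircount + 0.5) / (fcount + 1.0)
    return -math.log10(prob) if prob > 0 else MAXSCORE
```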
