"""Per-sentence Hiero grammar extractor built on suffix arrays (Python 2)."""
import sys
import os
import re
import StringIO
from itertools import chain

# local modules of the suffix-array extraction toolkit
import clex
import rulefactory
import calignment
import csuf
import cdat
import sym
import log

from features import (EgivenFCoherent, SampleCountF, CountEF,
                      MaxLexEgivenF, MaxLexFgivenE,
                      IsSingletonF, IsSingletonFE)
from features import contextless

log.level = -1

class Output(StringIO.StringIO):
    """String buffer whose close() is a no-op, so the extracted grammar
    stays readable even after a consumer closes the stream."""
    def close(self):
        pass

    def __str__(self):
        return self.getvalue()

def get_cn(sentence):
    """Convert a tokenized sentence into the trivial confusion-network
    format the rule factory expects: <s> and </s> boundary markers are
    added and each token becomes a single (word, None, 1) arc."""
    sentence = chain(('<s>',), sentence.split(), ('</s>',))
    sentence = (sym.fromstring(word, terminal=True) for word in sentence)
    return tuple(((word, None, 1),) for word in sentence)

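# Illustrative sketch, using s(w) to abbreviate the integer id returned by
# sym.fromstring(w, terminal=True): get_cn('el gato') yields one trivial
# arc per token, including the boundary markers:
#   (((s('<s>'), None, 1),), ((s('el'), None, 1),),
#    ((s('gato'), None, 1),), ((s('</s>'), None, 1),))
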
class PhonyGrammar:
    """Do-nothing stand-in: during extraction, rules are streamed to the
    output buffer rather than collected into a grammar object."""
    def add(self, thing):
        pass

class GrammarExtractor:
    def __init__(self, cfg):
        # cfg is either a config dict or the path to a *.py config file
        # whose module-level names supply the same keys
        if isinstance(cfg, dict):
            config = cfg
        elif isinstance(cfg, str):
            cfg_file = os.path.basename(cfg)
            if not re.match(r'^\w+\.py$', cfg_file):
                raise ValueError('Config must be a *.py file')
            sys.path.append(os.path.dirname(cfg))
            config = __import__(cfg_file.replace('.py', '')).__dict__
            sys.path.pop()
        else:
            raise TypeError('cfg must be a dict or a path to a *.py file')

        alignment = calignment.Alignment(config['a_file'], from_binary=True)
        self.factory = rulefactory.HieroCachingRuleFactory(
            # compiled alignment object (REQUIRED)
            alignment=alignment,
            # name of generic nonterminal used by Hiero
            category="[X]",
            # do not change for extraction
            grammar=PhonyGrammar(),  # TODO: set to None?
            # maximum number of contiguous chunks of terminal symbols in the
            # RHS of a rule; None defaults to max_nonterminals+1
            max_chunks=None,
            # maximum span of a grammar rule in TEST DATA
            max_initial_size=15,
            # maximum number of symbols (terminal and nonterminal) in a rule
            max_length=config['max_len'],
            # maximum number of nonterminals in a rule (set >2 at your own risk)
            max_nonterminals=config['max_nt'],
            # maximum number of contiguous chunks of terminal symbols in the
            # target-side RHS of a rule; None defaults to max_nonterminals+1
            max_target_chunks=None,
            # maximum number of target-side symbols (terminal and nonterminal)
            # in a rule; None defaults to max_initial_size
            max_target_length=None,
            # minimum span of a nonterminal in the RHS of a rule in TEST DATA
            min_gap_size=1,
            # file containing precomputed collocations
            precompute_file=config['precompute_file'],
            # maximum frequency rank of patterns used to compute triples
            # (don't set higher than 20)
            precompute_secondary_rank=config['rank2'],
            # maximum frequency rank of patterns used to compute collocations
            # (no need to set higher than about 200-300)
            precompute_rank=config['rank1'],
            # require extracted rules to have at least one aligned word
            require_aligned_terminal=True,
            # require each contiguous chunk of extracted rules to have at
            # least one aligned word
            require_aligned_chunks=False,
            # generate a complete grammar for each input sentence
            per_sentence_grammar=True,
            # maximum span of a grammar rule extracted from TRAINING DATA
            train_max_initial_size=config['max_size'],
            # minimum span of an RHS nonterminal in a rule extracted from
            # TRAINING DATA
            train_min_gap_size=config['min_gap'],
            # True for tight phrases, False otherwise (False seems to give
            # better results but is slower)
            tight_phrases=True,
        )
        self.fsarray = csuf.SuffixArray(config['f_sa_file'], from_binary=True)
        self.edarray = cdat.DataArray(config['e_file'], from_binary=True)
        self.factory.registerContext(self)
        # sample size: lower = faster, higher = better; improvements level off
        # above the 200-300 range; -1 = don't sample, use all data (VERY SLOW!)
        self.sampler = rulefactory.Sampler(300)
        self.sampler.registerContext(self)
        # lexical weighting tables
        tt = clex.CLex(config['lex_file'], from_binary=True)
        self.models = (EgivenFCoherent, SampleCountF, CountEF,
                       MaxLexFgivenE(tt), MaxLexEgivenF(tt),
                       IsSingletonF, IsSingletonFE)
        self.models = tuple(contextless(feature) for feature in self.models)

    def grammar(self, sentence):
        """Extract the per-sentence grammar for one tokenized source
        sentence and return it as a string."""
        if isinstance(sentence, unicode):
            sentence = sentence.encode('utf8')
        out = Output()
        cn = get_cn(sentence)
        self.factory.input(cn, output=out)
        return str(out)

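# Library-style usage sketch (the config path is a placeholder; the config
# module must define a_file, e_file, f_sa_file, lex_file, precompute_file,
# max_len, max_nt, max_size, min_gap, rank1, and rank2):
#
#   extractor = GrammarExtractor('/path/to/config.py')
#   print extractor.grammar('el gato come pescado')
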
def main(config):
    # read one tokenized sentence from stdin and print its grammar
    extractor = GrammarExtractor(config)
    sys.stdout.write(extractor.grammar(next(sys.stdin)))

if __name__ == '__main__':
    if len(sys.argv) != 2 or not sys.argv[1].endswith('.py'):
        sys.stderr.write('Usage: %s config.py\n' % sys.argv[0])
        sys.exit(1)
    main(*sys.argv[1:])
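
# Command-line usage sketch (file names are placeholders; the script reads
# exactly one sentence from stdin and writes its grammar to stdout):
#   python extractor.py config.py < sentence.txt > grammar.out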