# This .ini file extracts grammars to a file using
# the pattern matching infrastructure.
#
# Does not do any decoding.
#
# Variables can be set using sa-system.pl
#
# Usage: decoder.py -c <this ini file> [-x <grammar file>]
#
# If the -x option is used, grammar will be written to the
# specified file, otherwise it is written to $PWD/grammar.out
#
# NOTE: all information about rules is cached, so use generous
# memory limits (the rules themselves are not cached).
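#
# Example invocation (the config and output file names here are
# illustrative, not fixed by this file):
#
#   decoder.py -c extract.ini -x /path/to/grammar.out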
import os
import manager
import clex
import context_model
import rulefactory
import calignment
import sys
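
# Note: opts, log, and add_model are not defined or imported in this file;
# they are assumed to be provided by the decoder.py environment that
# executes this config (see the usage line above).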
out_grammar_file = "grammar.out"
if opts.extra:
    out_grammar_file = opts.extra
# *** these variables written by sa-system.pl. Do not modify ***
lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz"
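# (lm_file is written by sa-system.pl but never referenced again in this
# file; extraction itself does not appear to use the language model)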
f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin"
e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin"
a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin"
lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin"
max_len = 5
max_nt = 2
max_size = 10
min_gap = 1
rank1 = 100
rank2 = 10
precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin"
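# The precomputed-collocations file name appears to encode the parameter
# values above (max_len.max_nt.max_size.min_gap.rank1.rank2, here
# 5.2.10.1.100.10); presumably it must be regenerated if those change.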
# check for path errors
if not os.path.exists(f_sa_file):
    raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file)
if not os.path.exists(e_file):
    raise Exception("Cannot find compiled target language array file %s" % e_file)
if not os.path.exists(a_file):
    raise Exception("Cannot find compiled alignment file %s" % a_file)
if not os.path.exists(lex_file):
    raise Exception("Cannot find compiled lexical weights file %s" % lex_file)
if not os.path.exists(precompute_file):
    log.writeln("Could not find precomputed collocations %s, extraction will be slower" % precompute_file)
    precompute_file = None
### Output options
mark_phrases = False # show derivation as SGML markup in output
mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!)
# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose
log.level = 1
log.file = sys.stderr
# pattern-matching stuff
class PhonyGrammar:  # saves us the cost of keeping the rules around
    def add(self, thing):
        pass

local_grammar = PhonyGrammar()
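# Because extracted rules are streamed to rule_file (set on the rule
# factory below) rather than kept for decoding, the in-memory grammar can
# be a no-op stub that simply discards whatever is added to it.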
xcat="X"
cm = manager.ContextManager(
f_sa_file,
e_file,
sampler=rulefactory.Sampler(300), # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!)
rulefactory=rulefactory.HieroCachingRuleFactory(
alignment=calignment.Alignment( # compiled alignment object (REQUIRED)
a_file,
from_binary=True
),
category="["+xcat+"]", # name of generic nonterminal used by Hiero
grammar=local_grammar, # do not change for extraction
max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
max_initial_size=15, # maximum span of a grammar rule in TEST DATA
max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule
max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA
precompute_file=precompute_file, # filename of file containing precomputed collocations
precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
require_aligned_terminal=True, # require extracted rules to have at least one aligned word
require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word
per_sentence_grammar=True, # generate a complete grammar for each input sentence
rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars)
train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA
train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower)
),
from_binary=True
)
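# The nesting above mirrors the extraction pipeline: the ContextManager
# drives lookups in the compiled suffix array (f_sa_file/e_file), the
# rule factory turns sampled phrase occurrences into Hiero rules, and the
# Alignment supplies the word links used to extract them.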
# lexical weighting tables
tt = clex.CLex(lex_file, from_binary=True)
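# tt provides the word-level lexical translation scores consumed by the
# MaxLexFgivenE/MaxLexEgivenF features below.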
# Only include features that depend on rule identity here
add_model(context_model.EgivenFCoherent(cm), 0.125)
add_model(context_model.SampleCountF(cm), 0.125)
add_model(context_model.CountEF(cm), 0.125)
add_model(context_model.MaxLexFgivenE(cm, tt), 0.125)
add_model(context_model.MaxLexEgivenF(cm, tt), 0.125)
add_model(context_model.IsSingletonF(cm), 0.125)
add_model(context_model.IsSingletonFE(cm), 0.125)
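# Since this config does no decoding, the uniform 0.125 weights are
# effectively placeholders; what matters is that the feature values are
# computed and written into the extracted grammar.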
# grammars, search parameters and all that other stuff are irrelevant