# This .ini file extracts grammars to a file using
# the pattern matching infrastructure.
#
# Does not do any decoding.
#
# Variables can be set using sa-system.pl
#
# Usage: decoder.py -c <this ini file> [-x <grammar file>]
#
# If the -x option is used, the grammar will be written to the
# specified file; otherwise it is written to $PWD/grammar.out
#
# NOTE: all information about rules is cached, so allow generous
# memory limits (the rules themselves are not cached).
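#
# Example invocation (file names are hypothetical, following the usage
# line above):
#
#   decoder.py -c extract.ini -x my-grammar.out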

import os
import manager
import clex
import context_model
import rulefactory
import calignment
import sys

out_grammar_file = "grammar.out"
if opts.extra:  # the argument of -x ends up in opts.extra
	out_grammar_file = opts.extra
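
# NOTE: names like `opts` (above), `log`, and `add_model` (below) are not
# defined in this file; presumably decoder.py injects them into the config's
# namespace when it executes this file via -c.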

# *** these variables are written by sa-system.pl.  Do not modify them. ***
lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz"
f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin"
e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin"
a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin"
lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin"
max_len = 5
max_nt = 2
max_size = 10
min_gap = 1
rank1 = 100
rank2 = 10
precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin"
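
# The precomputed-collocation filename above appears to encode the extraction
# parameters in order (max_len.max_nt.max_size.min_gap.rank1.rank2). A sketch
# of that apparent convention, assuming sa-system.pl constructs it this way:
#
#   precompute_file = (
#       "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.%d.%d.%d.%d.%d.%d.bin"
#       % (max_len, max_nt, max_size, min_gap, rank1, rank2))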

# check for path errors
if not os.path.exists(f_sa_file):
	raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file)
if not os.path.exists(e_file):
	raise Exception("Cannot find compiled target language array file %s" % e_file)
if not os.path.exists(a_file):
	raise Exception("Cannot find compiled alignment file %s" % a_file)
if not os.path.exists(lex_file):
	raise Exception("Cannot find compiled lexical weights file %s" % lex_file)
if not os.path.exists(precompute_file):
	log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file)
	precompute_file = None

### Output options
mark_phrases = False      # show derivation as SGML markup in output
mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!)

# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose
log.level = 1
log.file = sys.stderr

# pattern-matching stuff
class PhonyGrammar:  # saves us the cost of keeping the rules around
	def add(self, thing):
		pass  # discard the rule; extraction writes rules out via rule_file instead

local_grammar = PhonyGrammar()
xcat="X"

cm = manager.ContextManager(
	f_sa_file,
	e_file,
	sampler=rulefactory.Sampler(300),  # lower=faster, higher=better; improvements level off above the 200-300 range; -1 = don't sample, use all data (VERY SLOW!)
	rulefactory=rulefactory.HieroCachingRuleFactory(
		alignment=calignment.Alignment(  # compiled alignment object (REQUIRED)
			a_file, 
			from_binary=True
			),
		category="["+xcat+"]",           # name of generic nonterminal used by Hiero
		grammar=local_grammar,           # do not change for extraction
		max_chunks=None,                 # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
		max_initial_size=15,             # maximum span of a grammar rule in TEST DATA
		max_length=max_len,              # maximum number of symbols (both T and NT) allowed in a rule
		max_nonterminals=max_nt,         # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
		max_target_chunks=None,          # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
		max_target_length=None,          # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
		min_gap_size=1,                  # minimum span of a nonterminal in the RHS of a rule in TEST DATA
		precompute_file=precompute_file, # filename of file containing precomputed collocations
		precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
		precompute_rank=rank1,           # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
		require_aligned_terminal=True,   # require extracted rules to have at least one aligned word
		require_aligned_chunks=False,    # require each contiguous chunk of extracted rules to have at least one aligned word
		per_sentence_grammar=True,       # generate a complete grammar for each input sentence
		rule_file=out_grammar_file,      # grammar is written to this file (sentence id is added to file name for per sentence grammars)
		train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA
		train_min_gap_size=min_gap,      # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
		tight_phrases=True,              # True if phrases should be tight, False otherwise (False seems to give better results but is slower)
		),
	from_binary=True
	)
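
# With per_sentence_grammar=True, one grammar is written per input sentence;
# per the rule_file comment above, the sentence id is appended to the file
# name, so out_grammar_file effectively acts as a prefix (the exact naming is
# up to rulefactory.HieroCachingRuleFactory).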

# lexical weighting tables (used by the MaxLex* features below)
tt = clex.CLex(lex_file, from_binary=True)

# Only include features that depend on rule identity here
add_model(context_model.EgivenFCoherent(cm), 0.125) 
add_model(context_model.SampleCountF(cm), 0.125)
add_model(context_model.CountEF(cm), 0.125)
add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) 
add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) 
add_model(context_model.IsSingletonF(cm), 0.125)
add_model(context_model.IsSingletonFE(cm), 0.125)
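
# All seven features share a uniform weight of 0.125. Conceptually the
# weights combine log-linearly at scoring time; as a sketch (not this
# codebase's actual API):
#
#   score(rule) = sum(weight_i * feature_i(rule))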

# grammars, search parameters, and all that other decoding stuff are
# irrelevant here: this config only extracts (no decoding is done)