Diffstat (limited to 'sa-extract/extract.ini')
-rw-r--r-- | sa-extract/extract.ini | 116
1 file changed, 116 insertions, 0 deletions
diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini
new file mode 100644
index 00000000..56913245
--- /dev/null
+++ b/sa-extract/extract.ini
@@ -0,0 +1,116 @@
+# This .ini file extracts grammars to a file using
+# the pattern matching infrastructure.
+#
+# Does not do any decoding.
+#
+# Variables can be set using sa-system.pl
+#
+# Usage: decoder.py -c <this ini file> [-x <grammar file>]
+#
+# If the -x option is used, the grammar will be written to the
+# specified file; otherwise it is written to $PWD/grammar.out
+#
+# NOTE: all information about rules is cached, so use generous
+# memory limits (rules themselves are not cached).
+
+import os
+import manager
+import clex
+import context_model
+import rulefactory
+import calignment
+import sys
+
+out_grammar_file = "grammar.out"
+if opts.extra:
+    out_grammar_file = opts.extra
+
+# *** these variables written by sa-system.pl. Do not modify ***
+lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz"
+f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin"
+e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin"
+a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin"
+lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin"
+max_len = 5
+max_nt = 2
+max_size = 10
+min_gap = 1
+rank1 = 100
+rank2 = 10
+precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin"
+
+# check for path errors
+if not os.path.exists(f_sa_file):
+    raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file)
+if not os.path.exists(e_file):
+    raise Exception("Cannot find compiled target language array file %s" % e_file)
+if not os.path.exists(a_file):
+    raise Exception("Cannot find compiled alignment file %s" % a_file)
+if not os.path.exists(lex_file):
+    raise Exception("Cannot find compiled lexical weights file %s" % lex_file)
+if not os.path.exists(precompute_file):
+    log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file)
+    precompute_file = None
+
+### Output options
+mark_phrases = False       # show derivation as SGML markup in output
+mert_mark_phrases = False  # do the same when generating n-best lists (don't use this with minimum error rate training!)
+
+# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose
+log.level = 1
+log.file = sys.stderr
+
+# pattern-matching stuff
+class PhonyGrammar:  # saves us the cost of keeping the rules around
+    def add(self, thing):
+        pass
+
+local_grammar = PhonyGrammar()
+xcat = "X"
+
+cm = manager.ContextManager(
+    f_sa_file,
+    e_file,
+    sampler=rulefactory.Sampler(300),  # lower = faster, higher = better; improvements level off above the 200-300 range; -1 = don't sample, use all data (VERY SLOW!)
+    rulefactory=rulefactory.HieroCachingRuleFactory(
+        alignment=calignment.Alignment(  # compiled alignment object (REQUIRED)
+            a_file,
+            from_binary=True
+        ),
+        category="[" + xcat + "]",       # name of the generic nonterminal used by Hiero
+        grammar=local_grammar,           # do not change for extraction
+        max_chunks=None,                 # maximum number of contiguous chunks of terminal symbols in the RHS of a rule; if None, defaults to max_nonterminals+1
+        max_initial_size=15,             # maximum span of a grammar rule in TEST DATA
+        max_length=max_len,              # maximum number of symbols (both T and NT) allowed in a rule
+        max_nonterminals=max_nt,         # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
+        max_target_chunks=None,          # maximum number of contiguous chunks of terminal symbols in the target-side RHS of a rule; if None, defaults to max_nonterminals+1
+        max_target_length=None,          # maximum number of target-side symbols (both T and NT) allowed in a rule; if None, defaults to max_initial_size
+        min_gap_size=1,                  # minimum span of a nonterminal in the RHS of a rule in TEST DATA
+        precompute_file=precompute_file, # file containing precomputed collocations
+        precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20)
+        precompute_rank=rank1,           # maximum frequency rank of patterns used to compute collocations (no need to set higher than about 200-300)
+        require_aligned_terminal=True,   # require extracted rules to have at least one aligned word
+        require_aligned_chunks=False,    # require each contiguous chunk of extracted rules to have at least one aligned word
+        per_sentence_grammar=True,       # generate a complete grammar for each input sentence
+        rule_file=out_grammar_file,      # grammar is written to this file (the sentence id is appended to the file name for per-sentence grammars)
+        train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA
+        train_min_gap_size=min_gap,      # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
+        tight_phrases=True,              # True if phrases should be tight, False otherwise (False seems to give better results but is slower)
+    ),
+    from_binary=True
)
+
+# lexical weighting tables
+tt = clex.CLex(lex_file, from_binary=True)
+
+# Only include features that depend on rule identity here
+add_model(context_model.EgivenFCoherent(cm), 0.125)
+add_model(context_model.SampleCountF(cm), 0.125)
+add_model(context_model.CountEF(cm), 0.125)
+add_model(context_model.MaxLexFgivenE(cm, tt), 0.125)
+add_model(context_model.MaxLexEgivenF(cm, tt), 0.125)
+add_model(context_model.IsSingletonF(cm), 0.125)
+add_model(context_model.IsSingletonFE(cm), 0.125)
+
+# grammars, search parameters and all that other stuff are irrelevant
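
The seven add_model calls at the end register rule-local features, each with weight 0.125; names such as opts, log, and add_model are not defined in the file and are presumably provided by the decoder.py harness that loads it. Since this configuration only extracts grammars ("Does not do any decoding"), the weights are effectively placeholders: the extractor writes per-rule feature values, and a downstream decoder combines them log-linearly. A minimal sketch of that combination, assuming one weight per named feature (the function and dict interface below are illustrative, not the extractor's actual API):

def rule_score(features, weights):
    # Log-linear rule score: the sum over features of weight * value.
    # Both arguments map feature names to floats (illustrative interface).
    return sum(weights.get(name, 0.0) * value
               for name, value in features.items())

# Toy example with the feature names registered above, all at weight 0.125:
weights = dict.fromkeys(
    ("EgivenFCoherent", "SampleCountF", "CountEF", "MaxLexFgivenE",
     "MaxLexEgivenF", "IsSingletonF", "IsSingletonFE"), 0.125)
features = {"EgivenFCoherent": 0.7, "CountEF": 1.3, "IsSingletonF": 1.0}
print(rule_score(features, weights))  # 0.125 * (0.7 + 1.3 + 1.0) = 0.375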
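
With per_sentence_grammar=True, a complete grammar is written per input sentence, with the sentence id appended to rule_file. For quick inspection of such output, a small sketch that tallies rules by nonterminal count, assuming the common Hiero-style "[X] ||| source ||| target ||| features" line format; the file name and field layout are assumptions, as this config does not specify the output format:

from collections import Counter

counts = Counter()
with open("grammar.out") as f:  # out_grammar_file from the config above
    for line in f:
        fields = [part.strip() for part in line.split("|||")]
        if len(fields) < 3:
            continue  # skip anything that does not look like a rule
        source_side = fields[1]
        counts[source_side.count("[X,")] += 1  # NT slots look like [X,1], [X,2]

for nts, n in sorted(counts.items()):
    print("%d nonterminal(s): %d rules" % (nts, n))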