diff options
Diffstat (limited to 'sa-extract/extract.ini')
-rw-r--r-- | sa-extract/extract.ini | 116 |
1 files changed, 0 insertions, 116 deletions
diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini deleted file mode 100644 index 56913245..00000000 --- a/sa-extract/extract.ini +++ /dev/null @@ -1,116 +0,0 @@ -# This .ini file extracts grammars to a file using -# the pattern matching infrastructure. -# -# Does not do any decoding. -# -# Variables can be set using sa-system.pl -# -# Usage: decoder.py -c <this ini file> [-x <grammar file>] -# -# If the -x option is used, grammar will be written to the -# specified file, otherwise it is written to $PWD/grammar.out -# -# NOTE: all information about rules is cached, so use generous -# memory limits (rules themselves are not cached.) - -import os -import manager -import clex -import context_model -import rulefactory -import calignment -import sys - -out_grammar_file = "grammar.out" -if opts.extra: - out_grammar_file = opts.extra - -# *** these variables written by sa-system.pl. Do not modify *** -lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz" -f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin" -e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin" -a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin" -lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin" -max_len = 5 -max_nt = 2 -max_size=10 -min_gap=1 -rank1 = 100 -rank2 = 10 -precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin" - -# check for path errors -if not os.path.exists(f_sa_file): - raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file) -if not os.path.exists(e_file): - raise Exception("Cannot find compiled target language array file %s" % e_file) -if not os.path.exists(a_file): - raise Exception("Cannot find compiled alignment file %s" % a_file) -if not os.path.exists(lex_file): - raise Exception("Cannot find compiled lexical weights file %s" % lex_file) -if not os.path.exists(precompute_file): - log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file) - precompute_file = None - -### Output options -mark_phrases = False # show derivation as SGML markup in output -mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!) - -# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose -log.level = 1 -log.file = sys.stderr - -# pattern-matching stuff -class PhonyGrammar: # saves us the cost of keeping the rules around - def add(self, thing): - pass - -local_grammar = PhonyGrammar() -xcat="X" - -cm = manager.ContextManager( - f_sa_file, - e_file, - sampler=rulefactory.Sampler(300), # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!) - rulefactory=rulefactory.HieroCachingRuleFactory( - alignment=calignment.Alignment( # compiled alignment object (REQUIRED) - a_file, - from_binary=True - ), - category="["+xcat+"]", # name of generic nonterminal used by Hiero - grammar=local_grammar, # do not change for extraction - max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1 - max_initial_size=15, # maximum span of a grammar rule in TEST DATA - max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule - max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk) - max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1 - max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size - min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA - precompute_file=precompute_file, # filename of file containing precomputed collocations - precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20). - precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300) - require_aligned_terminal=True, # require extracted rules to have at least one aligned word - require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word - per_sentence_grammar=True, # generate a complete grammar for each input sentence - rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars) - train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA - train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA - tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower) - ), - from_binary=True - ) - -# lexical weighting tables -tt = clex.CLex(lex_file, from_binary=True) - -# Only include features that depend on rule identity here -add_model(context_model.EgivenFCoherent(cm), 0.125) -add_model(context_model.SampleCountF(cm), 0.125) -add_model(context_model.CountEF(cm), 0.125) -add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) -add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) -add_model(context_model.IsSingletonF(cm), 0.125) -add_model(context_model.IsSingletonFE(cm), 0.125) - -# grammars, search parameters and all that other stuff are irrelevant - |