author    Chris Dyer <prguest11@taipan.cs>    2012-02-02 06:29:50 +0000
committer Chris Dyer <prguest11@taipan.cs>    2012-02-02 06:29:50 +0000
commit    8e5fad9bcbadf36bbab3c1c5b053e3c8f7dddbce (patch)
tree      9c812b3f267aa1975cdf8b7af928c4b20eb36f93 /sa-extract/extract.ini
parent    ff496d3089e84846c8562c574155d8df1e4d911c (diff)
Lopez suffix array extractor with copyrighted David Chiang code excised
Diffstat (limited to 'sa-extract/extract.ini')
-rw-r--r--  sa-extract/extract.ini | 116
1 file changed, 116 insertions(+), 0 deletions(-)
diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini
new file mode 100644
index 00000000..56913245
--- /dev/null
+++ b/sa-extract/extract.ini
@@ -0,0 +1,116 @@
+# This .ini file configures grammar extraction to a file using
+# the pattern-matching infrastructure.
+#
+# Does not do any decoding.
+#
+# Variables can be set using sa-system.pl
+#
+# Usage: decoder.py -c <this ini file> [-x <grammar file>]
+#
+# If the -x option is used, the grammar will be written to the
+# specified file; otherwise it is written to $PWD/grammar.out.
+#
+# NOTE: all information about rules is cached, so use generous
+# memory limits (the rules themselves are not cached).
+
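
Since the file is plain Python rather than classic INI syntax, the extractor can load it by simply executing it in a namespace that supplies names like opts, log, and add_model. The loader below is a minimal sketch of that idea, not cdec's actual code; load_config and the namespace contents are assumptions:

    # Hypothetical sketch of exec-based config loading; the real
    # loader lives in decoder.py and may differ.
    def load_config(path, namespace):
        with open(path) as f:
            source = f.read()
        # run the config; its assignments land in `namespace`
        exec(compile(source, path, 'exec'), namespace)
        return namespace

An invocation matching the usage line above would then be: decoder.py -c extract.ini -x /path/to/grammar.out
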
+import os
+import manager
+import clex
+import context_model
+import rulefactory
+import calignment
+import sys
+
+out_grammar_file = "grammar.out"
+if opts.extra:
+ out_grammar_file = opts.extra
+
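
How -x becomes opts.extra is decoder.py's business; a hedged guess using the era-appropriate optparse module might look like the sketch below. The dest names and help strings are assumptions; only the -c and -x flags come from the usage line above:

    # Hypothetical option parsing; the real flags are defined in decoder.py.
    from optparse import OptionParser
    parser = OptionParser()
    parser.add_option("-c", dest="config", help="Python-syntax ini file")
    parser.add_option("-x", dest="extra", default=None,
                      help="grammar output file")
    opts, args = parser.parse_args()
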
+# *** These variables are written by sa-system.pl. Do not modify. ***
+lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz"
+f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin"
+e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin"
+a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin"
+lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin"
+max_len = 5
+max_nt = 2
+max_size = 10
+min_gap = 1
+rank1 = 100
+rank2 = 10
+precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin"
+
+# check for path errors
+if not os.path.exists(f_sa_file):
+ raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file)
+if not os.path.exists(e_file):
+ raise Exception("Cannot find compiled target language array file %s" % e_file)
+if not os.path.exists(a_file):
+ raise Exception("Cannot find compiled alignment file %s" % a_file)
+if not os.path.exists(lex_file):
+ raise Exception("Cannot find compiled lexical weights file %s" % lex_file)
+if not os.path.exists(precompute_file):
+ log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file)
+ precompute_file = None
+
+### Output options
+mark_phrases = False # show derivation as SGML markup in output
+mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!)
+
+# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose
+log.level = 1
+log.file = sys.stderr
+
+# pattern-matching stuff
+class PhonyGrammar: # saves us the cost of keeping the rules around
+ def add(self, thing):
+ pass
+
+local_grammar = PhonyGrammar()
+xcat="X"
+
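
PhonyGrammar is a null object: the rule factory only expects something with an add() method, and since every extracted rule is streamed to rule_file there is no reason to also keep rules in memory. For contrast, a real in-memory sink would look like this sketch (hypothetical, and exactly what this config avoids):

    # Hypothetical in-memory counterpart to PhonyGrammar; holding every
    # extracted rule like this is the cost the null object saves.
    class ListGrammar:
        def __init__(self):
            self.rules = []
        def add(self, rule):
            self.rules.append(rule)
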
+cm = manager.ContextManager(
+ f_sa_file,
+ e_file,
+	sampler=rulefactory.Sampler(300), # lower = faster, higher = better; improvements level off above the 200-300 range; -1 = don't sample, use all data (VERY SLOW!)
+ rulefactory=rulefactory.HieroCachingRuleFactory(
+ alignment=calignment.Alignment( # compiled alignment object (REQUIRED)
+ a_file,
+ from_binary=True
+ ),
+ category="["+xcat+"]", # name of generic nonterminal used by Hiero
+ grammar=local_grammar, # do not change for extraction
+ max_chunks=None, # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
+ max_initial_size=15, # maximum span of a grammar rule in TEST DATA
+ max_length=max_len, # maximum number of symbols (both T and NT) allowed in a rule
+ max_nonterminals=max_nt, # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
+ max_target_chunks=None, # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
+ max_target_length=None, # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
+ min_gap_size=1, # minimum span of a nonterminal in the RHS of a rule in TEST DATA
+ precompute_file=precompute_file, # filename of file containing precomputed collocations
+ precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
+ precompute_rank=rank1, # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
+ require_aligned_terminal=True, # require extracted rules to have at least one aligned word
+ require_aligned_chunks=False, # require each contiguous chunk of extracted rules to have at least one aligned word
+ per_sentence_grammar=True, # generate a complete grammar for each input sentence
+ rule_file=out_grammar_file, # grammar is written to this file (sentence id is added to file name for per sentence grammars)
+ train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA
+ train_min_gap_size=min_gap, # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
+ tight_phrases=True, # True if phrases should be tight, False otherwise (False seems to give better results but is slower)
+ ),
+ from_binary=True
+ )
+
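
What happens next is up to decoder.py: each test sentence is fed through the manager, which matches patterns against the suffix array, samples occurrences, extracts rules, and (because per_sentence_grammar=True) writes one grammar file per sentence. The loop below is purely illustrative; process_sentence is a made-up name, not part of the sa-extract API:

    # Purely illustrative driver loop; `process_sentence` is hypothetical.
    def extract_all(cm, sentences):
        for sid, sentence in enumerate(sentences):
            # with per_sentence_grammar=True, the rule factory appends
            # the sentence id to rule_file's name for each output grammar
            cm.process_sentence(sentence, sid)  # hypothetical call
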
+# lexical weighting tables
+tt = clex.CLex(lex_file, from_binary=True)
+
+# Only include features that depend on rule identity here
+add_model(context_model.EgivenFCoherent(cm), 0.125)
+add_model(context_model.SampleCountF(cm), 0.125)
+add_model(context_model.CountEF(cm), 0.125)
+add_model(context_model.MaxLexFgivenE(cm, tt), 0.125)
+add_model(context_model.MaxLexEgivenF(cm, tt), 0.125)
+add_model(context_model.IsSingletonF(cm), 0.125)
+add_model(context_model.IsSingletonFE(cm), 0.125)
+
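
Each of the seven features above gets a uniform weight of 0.125. In the standard log-linear setup these weights scale per-rule feature values into a single score; the sketch below shows that combination in isolation (a generic illustration, assuming add_model pairs each model with its weight, not cdec's actual scoring code):

    # Generic log-linear combination, for illustration only.
    def score_rule(weighted_models, rule):
        # each entry is a (weight, feature_function) pair
        return sum(w * f(rule) for w, f in weighted_models)

The uniform 0.125 values are presumably placeholders to be tuned later, e.g. with minimum error rate training, which the mert_mark_phrases comment above alludes to.
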
+# grammars, search parameters and all that other stuff are irrelevant
+