Merge branch 'master' of github.com:redpony/cdec

author: Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
committer: Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
commit: 122f46c31102b683eaab3ad81a3a98accbc694bb (patch)
tree: 8d499d789b159ebed25bb23b6983813d064a6296 /sa-extract/extract.ini
parent: ac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff)
parent: 193d137056c3c4f73d66f8db84691d63307de894 (diff)
1 files changed, 0 insertions, 116 deletions
diff --git a/sa-extract/extract.ini b/sa-extract/extract.ini
deleted file mode 100644
index 56913245..00000000
--- a/sa-extract/extract.ini
+++ /dev/null
@@ -1,116 +0,0 @@
-# This .ini file extracts grammars to a file using
-# the pattern matching infrastructure.
-#
-# Does not do any decoding.
-#
-# Variables can be set using sa-system.pl
-#
-# Usage: decoder.py -c <this ini file> [-x <grammar file>]
-#
-# If the -x option is used, grammar will be written to the
-# specified file, otherwise it is written to $PWD/grammar.out
-# 
-# NOTE: all information about rules is cached, so use generous
-# memory limits (rules themselves are not cached.)
-
-import os
-import manager
-import clex
-import context_model
-import rulefactory
-import calignment
-import sys
-
-out_grammar_file = "grammar.out"
-if opts.extra:
-	out_grammar_file = opts.extra
-
-# *** these variables written by sa-system.pl.  Do not modify ***
-lm_file = "/tmp/sa-redpony/de-en/lm/lm/lm.gz"
-f_sa_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/f.sa.bin"
-e_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/e.bin"
-a_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/a.bin"
-lex_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/a/gdfa/lex.bin"
-max_len = 5
-max_nt = 2
-max_size=10
-min_gap=1
-rank1 = 100
-rank2 = 10
-precompute_file = "/tmp/sa-redpony/de-en/bitext/soseos.vc/precomp.5.2.10.1.100.10.bin"
-
-# check for path errors
-if not os.path.exists(f_sa_file):
-	raise Exception("Cannot find compiled source language suffix array file %s" % f_sa_file)
-if not os.path.exists(e_file):
-	raise Exception("Cannot find compiled target language array file %s" % e_file)
-if not os.path.exists(a_file):
-	raise Exception("Cannot find compiled alignment file %s" % a_file)
-if not os.path.exists(lex_file):
-	raise Exception("Cannot find compiled lexical weights file %s" % lex_file)
-if not os.path.exists(precompute_file):
-	log.writeln("Could not find precomputed collocations %s, decoding will be slower" % precompute_file)
-	precompute_file = None
-
-### Output options
-mark_phrases = False      # show derivation as SGML markup in output
-mert_mark_phrases = False # do the same when generating n-best lists (don't use this with minimum error rate training!)
-
-# Verbosity. 0 = silent, 1 = normal, 2-5 = verbose
-log.level = 1
-log.file = sys.stderr
-
-# pattern-matching stuff
-class PhonyGrammar:  # saves us the cost of keeping the rules around
-	def add(self, thing):
-		pass
-
-local_grammar = PhonyGrammar()
-xcat="X"
-
-cm = manager.ContextManager(
-	f_sa_file,
-	e_file,
-	sampler=rulefactory.Sampler(300),  # lower=faster, higher=better; improvements level off above 200-300 range, -1 = don't sample, use all data (VERY SLOW!)
-	rulefactory=rulefactory.HieroCachingRuleFactory(
-		alignment=calignment.Alignment(  # compiled alignment object (REQUIRED)
-			a_file, 
-			from_binary=True
-			),
-		category="["+xcat+"]",           # name of generic nonterminal used by Hiero
-		grammar=local_grammar,           # do not change for extraction
-		max_chunks=None,                 # maximum number of contiguous chunks of terminal symbols in RHS of a rule. If None, defaults to max_nonterminals+1
-		max_initial_size=15,             # maximum span of a grammar rule in TEST DATA
-		max_length=max_len,              # maximum number of symbols (both T and NT) allowed in a rule
-		max_nonterminals=max_nt,         # maximum number of nonterminals allowed in a rule (set >2 at your own risk)
-		max_target_chunks=None,          # maximum number of contiguous chunks of terminal symbols in target-side RHS of a rule. If None, defaults to max_nonterminals+1
-		max_target_length=None,          # maximum number of target side symbols (both T and NT) allowed in a rule. If None, defaults to max_initial_size
-		min_gap_size=1,                  # minimum span of a nonterminal in the RHS of a rule in TEST DATA
-		precompute_file=precompute_file, # filename of file containing precomputed collocations
-		precompute_secondary_rank=rank2, # maximum frequency rank of patterns used to compute triples (don't set higher than 20).
-		precompute_rank=rank1,           # maximum frequency rank of patterns used to compute collocations (no need to set higher than maybe 200-300)
-		require_aligned_terminal=True,   # require extracted rules to have at least one aligned word
-		require_aligned_chunks=False,    # require each contiguous chunk of extracted rules to have at least one aligned word
-		per_sentence_grammar=True,       # generate a complete grammar for each input sentence
-		rule_file=out_grammar_file,      # grammar is written to this file (sentence id is added to file name for per sentence grammars)
-		train_max_initial_size=max_size, # maximum span of a grammar rule extracted from TRAINING DATA
-		train_min_gap_size=min_gap,      # minimum span of an RHS nonterminal in a rule extracted from TRAINING DATA
-		tight_phrases=True,              # True if phrases should be tight, False otherwise (False seems to give better results but is slower)
-		),
-	from_binary=True
-	)
-
-# lexical weighting tables
-tt = clex.CLex(lex_file, from_binary=True)
-
-# Only include features that depend on rule identity here
-add_model(context_model.EgivenFCoherent(cm), 0.125) 
-add_model(context_model.SampleCountF(cm), 0.125)
-add_model(context_model.CountEF(cm), 0.125)
-add_model(context_model.MaxLexFgivenE(cm, tt), 0.125) 
-add_model(context_model.MaxLexEgivenF(cm, tt), 0.125) 
-add_model(context_model.IsSingletonF(cm), 0.125)
-add_model(context_model.IsSingletonFE(cm), 0.125)
-
-# grammars, search parameters and all that other stuff are irrelevant
-
author	Kenneth Heafield <github@kheafield.com>	2012-08-03 07:46:54 -0400
committer	Kenneth Heafield <github@kheafield.com>	2012-08-03 07:46:54 -0400
commit	122f46c31102b683eaab3ad81a3a98accbc694bb (patch)
tree	8d499d789b159ebed25bb23b6983813d064a6296 /sa-extract/extract.ini
parent	ac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff)
parent	193d137056c3c4f73d66f8db84691d63307de894 (diff)