summaryrefslogtreecommitdiff
path: root/python/pkg/cdec
diff options
context:
space:
mode:
Diffstat (limited to 'python/pkg/cdec')
-rw-r--r--python/pkg/cdec/sa/__init__.py16
-rw-r--r--python/pkg/cdec/sa/extract.py23
-rw-r--r--python/pkg/cdec/sa/extractor.py13
3 files changed, 37 insertions, 15 deletions
diff --git a/python/pkg/cdec/sa/__init__.py b/python/pkg/cdec/sa/__init__.py
index cc532fb9..e0a344b7 100644
--- a/python/pkg/cdec/sa/__init__.py
+++ b/python/pkg/cdec/sa/__init__.py
@@ -1,10 +1,24 @@
-from cdec.sa._sa import sym_fromstring,\
+from cdec.sa._sa import make_lattice, decode_lattice, decode_sentence,\
SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
HieroCachingRuleFactory, Sampler, Scorer
from cdec.sa.extractor import GrammarExtractor
_SA_FEATURES = []
+_SA_ANNOTATORS = {}
+_SA_CONFIGURE = []
def feature(fn):
_SA_FEATURES.append(fn)
return fn
+
+def annotator(fn):
+ _SA_ANNOTATORS[fn.__name__] = fn
+
+def annotate(sentence):
+ meta = {}
+ for name, fn in _SA_ANNOTATORS.iteritems():
+ meta[name] = fn(sentence)
+ return meta
+
+def configure(fn):
+ _SA_CONFIGURE.append(fn)
diff --git a/python/pkg/cdec/sa/extract.py b/python/pkg/cdec/sa/extract.py
index 472f128b..10a81556 100644
--- a/python/pkg/cdec/sa/extract.py
+++ b/python/pkg/cdec/sa/extract.py
@@ -11,16 +11,17 @@ extractor, prefix = None, None
def make_extractor(config, grammars, features):
global extractor, prefix
signal.signal(signal.SIGINT, signal.SIG_IGN) # Let parent process catch Ctrl+C
- if features: load_features(features)
+ load_features(features)
extractor = cdec.sa.GrammarExtractor(config)
prefix = grammars
def load_features(features):
- logging.info('Loading additional feature definitions from %s', features)
- prefix = os.path.dirname(features)
- sys.path.append(prefix)
- __import__(os.path.basename(features).replace('.py', ''))
- sys.path.remove(prefix)
+ for featdef in features:
+ logging.info('Loading additional feature definitions from %s', featdef)
+ prefix = os.path.dirname(featdef)
+ sys.path.append(prefix)
+ __import__(os.path.basename(featdef).replace('.py', ''))
+ sys.path.remove(prefix)
def extract(inp):
global extractor, prefix
@@ -44,15 +45,17 @@ def main():
help='number of parallel extractors')
parser.add_argument('-s', '--chunksize', type=int, default=10,
help='number of sentences / chunk')
- parser.add_argument('-f', '--features', type=str, default=None,
+ parser.add_argument('-f', '--features', nargs='*', default=[],
help='additional feature definitions')
args = parser.parse_args()
if not os.path.exists(args.grammars):
os.mkdir(args.grammars)
- if not (args.features is None or args.features.endswith('.py')):
- sys.stderr.write('Error: feature definition file should be a python module\n')
- sys.exit(1)
+ for featdef in args.features:
+ if not featdef.endswith('.py'):
+ sys.stderr.write('Error: feature definition file <{0}>'
+ ' should be a python module\n'.format(featdef))
+ sys.exit(1)
if args.jobs > 1:
logging.info('Starting %d workers; chunk size: %d', args.jobs, args.chunksize)
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index 89e35bf8..a5ce8a68 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -57,6 +57,8 @@ class GrammarExtractor:
# lexical weighting tables
tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+ # TODO: use @cdec.sa.features decorator for standard features too
+ # + add a mask to disable features
scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF,
MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
*cdec.sa._SA_FEATURES)
@@ -69,11 +71,14 @@ class GrammarExtractor:
sampler = cdec.sa.Sampler(300, fsarray)
self.factory.configure(fsarray, edarray, sampler, scorer)
+ # Initialize feature definitions with configuration
+ for fn in cdec.sa._SA_CONFIGURE:
+ fn(config)
def grammar(self, sentence):
if isinstance(sentence, unicode):
sentence = sentence.encode('utf8')
- cnet = chain(('<s>',), sentence.split(), ('</s>',))
- cnet = (cdec.sa.sym_fromstring(word, terminal=True) for word in cnet)
- cnet = tuple(((word, None, 1), ) for word in cnet)
- return self.factory.input(cnet)
+ words = tuple(chain(('<s>',), sentence.split(), ('</s>',)))
+ meta = cdec.sa.annotate(words)
+ cnet = cdec.sa.make_lattice(words)
+ return self.factory.input(cnet, meta)