summaryrefslogtreecommitdiff
path: root/python/cdec/sa/extractor.py
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-03-16 17:48:48 +0100
committerPatrick Simianer <p@simianer.de>2014-03-16 17:48:48 +0100
commit5250fd67a4b8f242068cff87f0a6a4211f8b0fcf (patch)
treef1401c1fd3eeae8671e59baf0d2169d1eb721cb7 /python/cdec/sa/extractor.py
parent3eedf96b5a08b3e3414888d328c505814b84d8db (diff)
parentcc87bfed0697583b7c11243913254dde3c0047d4 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'python/cdec/sa/extractor.py')
-rw-r--r--python/cdec/sa/extractor.py33
1 files changed, 23 insertions, 10 deletions
diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py
index 244f84be..777f5afd 100644
--- a/python/cdec/sa/extractor.py
+++ b/python/cdec/sa/extractor.py
@@ -1,25 +1,44 @@
from itertools import chain
-import os, sys
+import logging
+import os
+import sys
import cdec.configobj
from cdec.sa._sa import gzip_or_text
from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\
- IsSupportedOnline, CountExceptLM, CountExceptLex
+ IsSupportedOnline
import cdec.sa
# maximum span of a grammar rule in TEST DATA
MAX_INITIAL_SIZE = 15
class GrammarExtractor:
- def __init__(self, config, online=False, vocab=None, features=None):
+ def __init__(self, config, online=False, features=None):
+
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger('cdec.sa')
+
if isinstance(config, basestring):
if not os.path.exists(config):
raise IOError('cannot read configuration from {0}'.format(config))
config = cdec.configobj.ConfigObj(config, unrepr=True)
+
+ logger.info('Loading alignment...')
alignment = cdec.sa.Alignment(from_binary=config['a_file'])
+
+ # lexical weighting tables
+ if not online:
+ logger.info('Loading bilexical dictionary...')
+ tt = cdec.sa.BiLex(from_binary=config['lex_file'])
+ else:
+ logger.info('Loading online bilexical dictionary...')
+ tt = cdec.sa.online.Bilex(config['bilex_file'])
+
self.factory = cdec.sa.HieroCachingRuleFactory(
# compiled alignment object (REQUIRED)
alignment,
+ # bilexical dictionary if online
+ bilex=tt if online else None,
# name of generic nonterminal used by Hiero
category="[X]",
# maximum number of contiguous chunks of terminal symbols in RHS of a rule
@@ -56,17 +75,11 @@ class GrammarExtractor:
tight_phrases=config.get('tight_phrases', True),
)
- # lexical weighting tables
- tt = cdec.sa.BiLex(from_binary=config['lex_file'])
-
# TODO: clean this up
+ # Load data and add features for online grammar extraction
extended_features = []
if online:
extended_features.append(IsSupportedOnline)
- if vocab:
- vcb_set = set(line.strip() for line in gzip_or_text(vocab))
- extended_features.append(CountExceptLM(vcb_set))
- extended_features.append(CountExceptLex(tt))
# TODO: use @cdec.sa.features decorator for standard features too
# + add a mask to disable features