Merge branch 'master' of https://github.com/pauldb89/cdec

author: Paul Baltescu <pauldb89@gmail.com> 2013-02-21 14:13:55 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-02-21 14:13:55 +0000
commit: b5491898549c61bd799d199aa9178a8394a1ef69 (patch)
tree: fb2686a2aae03ff07bcdf4cd47e8c3191eff8d1e /python/pkg/cdec/sa/extractor.py
parent: 0187447a643c3ea262b13b3052cb1531990eafe6 (diff)
parent: c17d9c23d023a5c08656376944f636180f0a437b (diff)
1 files changed, 26 insertions, 4 deletions
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index e09f79ea..acc13cbc 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -1,15 +1,16 @@
 from itertools import chain
-import os
+import os, sys
 import cdec.configobj
 from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
-        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
+        MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE,\
+        IsSupportedOnline
 import cdec.sa
 
 # maximum span of a grammar rule in TEST DATA
 MAX_INITIAL_SIZE = 15
 
 class GrammarExtractor:
-    def __init__(self, config, features=None):
+    def __init__(self, config, online=False, features=None):
         if isinstance(config, basestring):
             if not os.path.exists(config):
                 raise IOError('cannot read configuration from {0}'.format(config))
@@ -57,11 +58,19 @@ class GrammarExtractor:
         # lexical weighting tables
         tt = cdec.sa.BiLex(from_binary=config['lex_file'])
 
+        # TODO: clean this up -- IsSupportedOnline (only when online) is placed ahead of the registered cdec.sa._SA_FEATURES
+        extended_features = []
+        if online:
+            extended_features.append(IsSupportedOnline)
+            
         # TODO: use @cdec.sa.features decorator for standard features too
         # + add a mask to disable features
+        for f in cdec.sa._SA_FEATURES:
+            extended_features.append(f)
+            
         scorer = cdec.sa.Scorer(EgivenFCoherent, SampleCountF, CountEF, 
             MaxLexFgivenE(tt), MaxLexEgivenF(tt), IsSingletonF, IsSingletonFE,
-            *cdec.sa._SA_FEATURES)
+            *extended_features)
 
         fsarray = cdec.sa.SuffixArray(from_binary=config['f_sa_file'])
         edarray = cdec.sa.DataArray(from_binary=config['e_file'])
@@ -82,3 +91,16 @@ class GrammarExtractor:
         meta = cdec.sa.annotate(words)
         cnet = cdec.sa.make_lattice(words)
         return self.factory.input(cnet, meta)
+
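+    # Add a training instance to the data: whitespace-separated source sentence and reference, plus alignment as space-separated "i-j" integer pairs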
+    def add_instance(self, sentence, reference, alignment):
+        f_words = cdec.sa.encode_words(sentence.split())
+        e_words = cdec.sa.encode_words(reference.split())
+        al = sorted(tuple(int(i) for i in pair.split('-')) for pair in alignment.split())
+        self.factory.add_instance(f_words, e_words, al)
+    
+    # Debugging helpers: thin wrappers that delegate to the factory's dump_online_stats / dump_online_rules
+    def dump_online_stats(self):
+        self.factory.dump_online_stats()
+    def dump_online_rules(self):
+        self.factory.dump_online_rules()
+\ No newline at end of file
author	Paul Baltescu <pauldb89@gmail.com>	2013-02-21 14:13:55 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-02-21 14:13:55 +0000
commit	b5491898549c61bd799d199aa9178a8394a1ef69 (patch)
tree	fb2686a2aae03ff07bcdf4cd47e8c3191eff8d1e /python/pkg/cdec/sa/extractor.py
parent	0187447a643c3ea262b13b3052cb1531990eafe6 (diff)
parent	c17d9c23d023a5c08656376944f636180f0a437b (diff)