summary | refs | log | tree | commit | diff
path: root/python/pkg/cdec/sa/extractor.py
diff options
context:
space:
mode:
author Michael Denkowski <michael.j.denkowski@gmail.com> 2012-12-27 19:38:56 -0500
committer Michael Denkowski <michael.j.denkowski@gmail.com> 2012-12-27 19:38:56 -0500
commit d4cb5aac6e7e083572dfb4d3393ceceb0dbad99c (patch)
tree 4e492573497741aac37b1607be2c37458525aa68 /python/pkg/cdec/sa/extractor.py
parent dd708f2931ff416eae781e29f27ddb61c1632c9e (diff)
Hooks for online grammar extraction
Diffstat (limited to 'python/pkg/cdec/sa/extractor.py')
-rw-r--r-- python/pkg/cdec/sa/extractor.py | 9
1 files changed, 8 insertions, 1 deletions
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index e09f79ea..f3a86d9d 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -1,5 +1,5 @@
from itertools import chain
-import os
+import os, sys
import cdec.configobj
from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
@@ -82,3 +82,10 @@ class GrammarExtractor:
meta = cdec.sa.annotate(words)
cnet = cdec.sa.make_lattice(words)
return self.factory.input(cnet, meta)
+
+ # Add training instance to data
+ def add_instance(self, sentence, reference, alignment):
+ f_words = cdec.sa.encode_words(sentence.split())
+ e_words = cdec.sa.encode_words(reference.split())
+ al = sorted(tuple(int(i) for i in pair.split('-')) for pair in alignment.split())
+ self.factory.add_instance(f_words, e_words, al)
\ No newline at end of file