diff options
author | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-27 19:38:56 -0500 |
---|---|---|
committer | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-27 19:38:56 -0500 |
commit | d4cb5aac6e7e083572dfb4d3393ceceb0dbad99c (patch) | |
tree | 4e492573497741aac37b1607be2c37458525aa68 /python/pkg/cdec/sa/extractor.py | |
parent | dd708f2931ff416eae781e29f27ddb61c1632c9e (diff) |
Hooks for online grammar extraction
Diffstat (limited to 'python/pkg/cdec/sa/extractor.py')
-rw-r--r-- | python/pkg/cdec/sa/extractor.py | 9 |
1 file changed, 8 insertions, 1 deletion
diff --git a/python/pkg/cdec/sa/extractor.py b/python/pkg/cdec/sa/extractor.py
index e09f79ea..f3a86d9d 100644
--- a/python/pkg/cdec/sa/extractor.py
+++ b/python/pkg/cdec/sa/extractor.py
@@ -1,5 +1,5 @@
 from itertools import chain
-import os
+import os, sys
 import cdec.configobj
 from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
         MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
@@ -82,3 +82,10 @@ class GrammarExtractor:
         meta = cdec.sa.annotate(words)
         cnet = cdec.sa.make_lattice(words)
         return self.factory.input(cnet, meta)
+
+    # Add training instance to data
+    def add_instance(self, sentence, reference, alignment):
+        f_words = cdec.sa.encode_words(sentence.split())
+        e_words = cdec.sa.encode_words(reference.split())
+        al = sorted(tuple(int(i) for i in pair.split('-')) for pair in alignment.split())
+        self.factory.add_instance(f_words, e_words, al)
\ No newline at end of file |