summary refs log tree commit diff
path: root/python/cdec/sa
diff options
context:
space:
mode:
Diffstat (limited to 'python/cdec/sa')
-rw-r--r-- python/cdec/sa/__init__.py 2
-rw-r--r-- python/cdec/sa/compile.py 4
-rw-r--r-- python/cdec/sa/extract.py 3
-rw-r--r-- python/cdec/sa/extractor.py 7
-rw-r--r-- python/cdec/sa/features.py 11
5 files changed, 14 insertions, 13 deletions
diff --git a/python/cdec/sa/__init__.py b/python/cdec/sa/__init__.py
index ddefa280..8645e837 100644
--- a/python/cdec/sa/__init__.py
+++ b/python/cdec/sa/__init__.py
@@ -1,4 +1,4 @@
-from _cdec_sa import sym_tostring, sym_isvar, sym_fromstring,\
+from _sa import sym_fromstring,\
SuffixArray, DataArray, LCP, Precomputation, Alignment, BiLex,\
HieroCachingRuleFactory, Sampler
from extractor import GrammarExtractor
diff --git a/python/cdec/sa/compile.py b/python/cdec/sa/compile.py
index 061cdab2..30e605a6 100644
--- a/python/cdec/sa/compile.py
+++ b/python/cdec/sa/compile.py
@@ -2,7 +2,7 @@
import argparse
import os
import logging
-import configobj
+import cdec.configobj
import cdec.sa
MAX_PHRASE_LENGTH = 4
@@ -80,7 +80,7 @@ def main():
lex.write_binary(lex_bin)
# Write configuration
- config = configobj.ConfigObj(args.config, unrepr=True)
+ config = cdec.configobj.ConfigObj(args.config, unrepr=True)
config['f_sa_file'] = f_sa_bin
config['e_file'] = e_bin
config['a_file'] = a_bin
diff --git a/python/cdec/sa/extract.py b/python/cdec/sa/extract.py
index c6da5e9d..918aa3bb 100644
--- a/python/cdec/sa/extract.py
+++ b/python/cdec/sa/extract.py
@@ -3,7 +3,6 @@ import sys
import os
import argparse
import logging
-import configobj
import cdec.sa
def main():
@@ -18,7 +17,7 @@ def main():
if not os.path.exists(args.grammars):
os.mkdir(args.grammars)
- extractor = cdec.sa.GrammarExtractor(configobj.ConfigObj(args.config, unrepr=True))
+ extractor = cdec.sa.GrammarExtractor(args.config)
for i, sentence in enumerate(sys.stdin):
sentence = sentence[:-1]
grammar_file = os.path.join(args.grammars, 'grammar.{0}'.format(i))
diff --git a/python/cdec/sa/extractor.py b/python/cdec/sa/extractor.py
index c97b3c6f..bb912e16 100644
--- a/python/cdec/sa/extractor.py
+++ b/python/cdec/sa/extractor.py
@@ -1,4 +1,6 @@
from itertools import chain
+import os
+import cdec.configobj
from cdec.sa.features import EgivenFCoherent, SampleCountF, CountEF,\
MaxLexEgivenF, MaxLexFgivenE, IsSingletonF, IsSingletonFE
import cdec.sa
@@ -8,7 +10,10 @@ MAX_INITIAL_SIZE = 15
class GrammarExtractor:
def __init__(self, config):
- # TODO if str, read config
+ if isinstance(config, str) or isinstance(config, unicode):
+ if not os.path.exists(config):
+ raise IOError('cannot read configuration from {0}'.format(config))
+ config = cdec.configobj.ConfigObj(config, unrepr=True)
alignment = cdec.sa.Alignment(from_binary=config['a_file'])
self.factory = cdec.sa.HieroCachingRuleFactory(
# compiled alignment object (REQUIRED)
diff --git a/python/cdec/sa/features.py b/python/cdec/sa/features.py
index 8d35d8e6..325b9e13 100644
--- a/python/cdec/sa/features.py
+++ b/python/cdec/sa/features.py
@@ -1,6 +1,5 @@
from __future__ import division
import math
-import cdec.sa
MAXSCORE = 99
@@ -22,11 +21,10 @@ def CoherenceProb(fphrase, ephrase, paircount, fcount, fsample_count):
def MaxLexEgivenF(ttable):
def feature(fphrase, ephrase, paircount, fcount, fsample_count):
- fwords = [cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w)]
+ fwords = fphrase.words
fwords.append('NULL')
- ewords = (cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w))
def score():
- for e in ewords:
+ for e in ephrase.words:
maxScore = max(ttable.get_score(f, e, 0) for f in fwords)
yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
return sum(score())
@@ -34,11 +32,10 @@ def MaxLexEgivenF(ttable):
def MaxLexFgivenE(ttable):
def feature(fphrase, ephrase, paircount, fcount, fsample_count):
- fwords = (cdec.sa.sym_tostring(w) for w in fphrase if not cdec.sa.sym_isvar(w))
- ewords = [cdec.sa.sym_tostring(w) for w in ephrase if not cdec.sa.sym_isvar(w)]
+ ewords = ephrase.words
ewords.append('NULL')
def score():
- for f in fwords:
+ for f in fphrase.words:
maxScore = max(ttable.get_score(f, e, 1) for e in ewords)
yield -math.log10(maxScore) if maxScore > 0 else MAXSCORE
return sum(score())