summary | refs | log | tree | commit | diff
path: root/python/src/sa/data_array.pxi
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2012-07-28 18:35:20 -0400
committer Chris Dyer <cdyer@cs.cmu.edu> 2012-07-28 18:35:20 -0400
commit 15a653ab0d684fae36602665e82f3ef23ac5a028 (patch)
tree 4f5e5f0acd0106f32346e7183342fcb210fd5fd5 /python/src/sa/data_array.pxi
parent 16d0e2a34df3e32e8992aeda3ad2de7a6e525f14 (diff)
parent 0b0616c6f7400ce52d07350f7a7054a2513d9813 (diff)
Merge branch 'master' of github.com:redpony/cdec
Diffstat (limited to 'python/src/sa/data_array.pxi')
-rw-r--r-- python/src/sa/data_array.pxi 38
1 files changed, 25 insertions, 13 deletions
diff --git a/python/src/sa/data_array.pxi b/python/src/sa/data_array.pxi
index 1c044694..7a102a7e 100644
--- a/python/src/sa/data_array.pxi
+++ b/python/src/sa/data_array.pxi
@@ -14,7 +14,7 @@ cdef class DataArray:
cdef IntList sent_index
cdef bint use_sent_id
- def __cinit__(self, from_binary=None, from_text=None, bint use_sent_id=False):
+ def __cinit__(self, from_binary=None, from_text=None, side=None, bint use_sent_id=False):
self.word2id = {"END_OF_FILE":0, "END_OF_LINE":1}
self.id2word = ["END_OF_FILE", "END_OF_LINE"]
self.data = IntList(1000,1000)
@@ -24,7 +24,10 @@ cdef class DataArray:
if from_binary:
self.read_binary(from_binary)
elif from_text:
- self.read_text(from_text)
+ if side:
+ self.read_bitext(from_text, (0 if side == 'source' else 1))
+ else:
+ self.read_text(from_text)
def __len__(self):
return len(self.data)
@@ -62,21 +65,30 @@ cdef class DataArray:
f.write("\n")
def read_text(self, char* filename):
- cdef int word_count = 0
with gzip_or_text(filename) as fp:
- for line_num, line in enumerate(fp):
- self.sent_index.append(word_count)
- for word in line.split():
- self.data.append(self.get_id(word))
- if self.use_sent_id:
- self.sent_id.append(line_num)
- word_count = word_count + 1
- self.data.append(1)
+ self.read_text_data(fp)
+
+ def read_bitext(self, char* filename, int side):
+ with gzip_or_text(filename) as fp:
+ data = (line.split(' ||| ')[side] for line in fp)
+ self.read_text_data(data)
+
+ def read_text_data(self, data):
+ cdef int word_count = 0
+ for line_num, line in enumerate(data):
+ self.sent_index.append(word_count)
+ for word in line.split():
+ self.data.append(self.get_id(word))
if self.use_sent_id:
self.sent_id.append(line_num)
word_count = word_count + 1
- self.data.append(0)
- self.sent_index.append(word_count)
+ self.data.append(1)
+ if self.use_sent_id:
+ self.sent_id.append(line_num)
+ word_count = word_count + 1
+ self.data.append(0)
+ self.sent_index.append(word_count)
+
def read_binary(self, char* filename):
cdef FILE* f