Merge branch 'master' of github.com:redpony/cdec

author: Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
committer: Kenneth Heafield <github@kheafield.com> 2012-08-03 07:46:54 -0400
commit: 122f46c31102b683eaab3ad81a3a98accbc694bb (patch)
tree: 8d499d789b159ebed25bb23b6983813d064a6296 /sa-extract/cn.py
parent: ac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff)
parent: 193d137056c3c4f73d66f8db84691d63307de894 (diff)
1 files changed, 0 insertions, 164 deletions
diff --git a/sa-extract/cn.py b/sa-extract/cn.py
deleted file mode 100644
index e534783f..00000000
--- a/sa-extract/cn.py
+++ /dev/null
@@ -1,164 +0,0 @@
-# cn.py
-# Chris Dyer <redpony@umd.edu>
-# Copyright (c) 2006 University of Maryland.
-
-# vim:tabstop=4:autoindent:expandtab
-
-import sys
-import math
-import sym
-import log
-import sgml
-
-epsilon = sym.fromstring('*EPS*');
-
-class CNStats(object):
-    def __init__(self):
-      self.read = 0
-      self.colls = 0
-      self.words = 0
-
-    def collect(self, cn):
-      self.read += 1
-      self.colls += cn.get_length()
-      for col in cn.columns:
-        self.words += len(col)
-
-    def __str__(self):
-      return "confusion net statistics:\n succ. read: %d\n columns:    %d\n words:      %d\n avg. words/column:\t%f\n avg. cols/sent:\t%f\n\n" % (self.read, self.colls, self.words, float(self.words)/float(self.colls), float(self.colls)/float(self.read))
-
-class ConfusionNet(object):
-    def __init__(self, sent):
-        object.__init__(self)
-        if (len(sent.words) == 0):
-            self.columns = ()
-            return # empty line, it happens
-        line = sent.words[0]
-        if (line.startswith("(((")):
-            if (len(sent.words) > 1):
-                log.write("Bad sentence: %s\n" % (line))
-            assert(len(sent.words) == 1) # make sure there are no spaces in your confusion nets!
-            line =  "((('<s>',1.0,1),),"+line[1:len(line)-1]+"(('</s>',1.0,1),))"
-            cols = eval(line)
-            res = []
-            for col in cols:
-               x = []
-               for alt in col:
-                   costs = alt[1]
-                   if (type(costs) != type((1,2))):
-                       costs=(float(costs),)
-                   j=[]
-                   for c in costs:
-                       j.append(float(c))
-                   cost = tuple(j)
-                   spanlen = 1
-                   if (len(alt) == 3):
-                       spanlen = alt[2]
-                   x.append((sym.fromstring(alt[0],terminal=True), None, spanlen))
-               res.append(tuple(x))
-            self.columns = tuple(res)
-        else:  # convert a string of input into a CN
-            res = [];
-            res.append(((sym.fromstring('<s>',terminal=True), None, 1), ))
-            for word in sent.words:
-               res.append(((sym.fromstring(word,terminal=True), None, 1), ));  # (alt=word, cost=0.0)
-            res.append(((sym.fromstring('</s>',terminal=True), None, 1), ))
-            self.columns = tuple(res)
-
-    def is_epsilon(self, position):
-        x = self.columns[position[0]][position[1]][0]
-        return x == epsilon
-
-    def compute_epsilon_run_length(self, cn_path):
-        if (len(cn_path) == 0):
-            return 0
-        x = len(cn_path) - 1
-        res = 0
-        ''' -1 denotes a non-terminal '''
-        while (x >= 0 and cn_path[x][0] >= 0 and self.is_epsilon(cn_path[x])):
-            res += 1
-            x -= 1
-        return res
-
-    def compute_cn_cost(self, cn_path):
-       c = None
-       for (col, row) in cn_path:
-           if (col >= 0):
-               if c is None:
-                   c = self.columns[col][row][1].clone()
-               else:
-                   c += self.columns[col][row][1]
-       return c                                        
-
-    def get_column(self, col):
-        return self.columns[col]
-
-    def get_length(self):
-        return len(self.columns)
-
-    def __str__(self):
-        r = "conf net: %d\n" % (len(self.columns),)
-        i = 0
-        for col in self.columns:
-            r += "%d -- " % i
-            i += 1
-            for alternative in col:
-                r += "(%s, %s, %s) " % (sym.tostring(alternative[0]), alternative[1], alternative[2])
-            r += "\n"
-        return r
-    
-    def listdown(_columns, col = 0):
-        # output all the possible sentences out of the self lattice
-        # will be used by the "dumb" adaptation of lattice decoding with suffix array
-        result = []
-        for entry in _columns[col]:
-            if col+entry[2]+1<=len(_columns) :
-                for suffix in self.listdown(_columns,col+entry[2]):
-                    result.append(entry[0]+' '+suffix)
-                    #result.append(entry[0]+' '+suffix)
-            else:
-                result.append(entry[0])
-                #result.append(entry[0])
-        return result
-    
-    def next(self,_columns,curr_idx, min_dist=1):
-        # can be used only when prev_id is defined
-        result = []
-        #print "curr_idx=%i\n" % curr_idx
-        if curr_idx+min_dist >= len(_columns): 
-            return result
-        for alt_idx in xrange(len(_columns[curr_idx])):
-            alt = _columns[curr_idx][alt_idx]
-            #print "checking %i alternative : " % alt_idx
-            #print "%s %f %i\n" % (alt[0],alt[1],alt[2])
-            #print alt
-            if alt[2]<min_dist:
-                #print "recursive next(%i, %i, %i)\n" % (curr_idx,alt_idx,min_dist-alt[2])
-                result.extend(self.next(_columns,curr_idx+alt[2],min_dist-alt[2]))
-            elif curr_idx+alt[2]<len(_columns):
-                #print "adding because the skip %i doesn't go beyong the length\n" % alt[2]
-                result.append(curr_idx+alt[2])
-        return set(result)
-                
-    
-
-
-#file = open(sys.argv[1], "rb")
-#sent = sgml.process_sgml_line(file.read())
-#print sent
-#cn = ConfusionNet(sent)
-#print cn
-#results = cn.listdown()
-#for result in results:
-#    print sym.tostring(result)
-#print cn.next(0);
-#print cn.next(1);
-#print cn.next(2);
-#print cn.next(3);
-#print cn
-#cn = ConfusionNet()
-#k = 0
-#while (cn.read(file)):
-#  print cn
-  
-#print cn.stats
author	Kenneth Heafield <github@kheafield.com>	2012-08-03 07:46:54 -0400
committer	Kenneth Heafield <github@kheafield.com>	2012-08-03 07:46:54 -0400
commit	122f46c31102b683eaab3ad81a3a98accbc694bb (patch)
tree	8d499d789b159ebed25bb23b6983813d064a6296 /sa-extract/cn.py
parent	ac664bdb0e481539cf77098a7dd0e1ec8d937ba0 (diff)
parent	193d137056c3c4f73d66f8db84691d63307de894 (diff)