diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 20:34:00 +0000 |
commit | 2f2ba42a1453f4a3a08f9c1ecfc53c1b1c83d550 (patch) | |
tree | 646e81b6325280f64a72771b5eeadf5118e465a9 /gi/pyp-topics/scripts/extract_contexts.py | |
parent | 2f2e36ca3060e7e9853c3d611f6cc5e112a76ddd (diff) |
Initial ci of gi dir
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@5 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi/pyp-topics/scripts/extract_contexts.py')
-rwxr-xr-x | gi/pyp-topics/scripts/extract_contexts.py | 144 |
1 file changed, 144 insertions, 0 deletions
diff --git a/gi/pyp-topics/scripts/extract_contexts.py b/gi/pyp-topics/scripts/extract_contexts.py new file mode 100755 index 00000000..b2723f2a --- /dev/null +++ b/gi/pyp-topics/scripts/extract_contexts.py @@ -0,0 +1,144 @@ +#!/usr/bin/python + +import sys,collections + +def extract_backoff(context_list, order): + assert len(context_list) == (2*order) + backoffs = [] + for i in range(1,order+1): + if i == order: + backoffs.append(([context_list[i-1]+"|"], ["|"+context_list[i]])) + else: + right_limit = 2*order-i + core = context_list[i:right_limit] + left = [context_list[i-1]+"|"*(order-i+1)] + right = ["|"*(order-i+1)+context_list[right_limit]] + backoffs.append((core, left, right)) +# print context_list, backoffs + return backoffs + +def tuple_to_str(t): + s="" + for i,x in enumerate(t): + if i > 0: s += "|" + s += str(x) + return s + +if len(sys.argv) < 3: + print "Usage: extract-contexts.py output_filename order cutoff lowercase" + exit(1) + +output_filename = sys.argv[1] +order = int(sys.argv[2]) +cutoff = 0 +if len(sys.argv) > 3: + cutoff = int(sys.argv[3]) +lowercase = False +if len(sys.argv) > 4: + lowercase = bool(sys.argv[4]) + +contexts_dict={} +contexts_list=[] +contexts_freq=collections.defaultdict(int) +contexts_backoff={} + +token_dict={} +token_list=[] +documents_dict=collections.defaultdict(dict) + +contexts_at_order = [i for i in range(order+1)] + +prefix = ["<s%d>|<s>"%i for i in range(order)] +suffix = ["</s%d>|</s>"%i for i in range(order)] + +for line in sys.stdin: + tokens = list(prefix) + tokens.extend(line.split()) + tokens.extend(suffix) + if lowercase: + tokens = map(lambda x: x.lower(), tokens) + + for i in range(order, len(tokens)-order): + context_list = [] + term="" + for j in range(i-order, i+order+1): + token,tag = tokens[j].rsplit('|',2) + if j != i: + context_list.append(token) + else: + if token not in token_dict: + token_dict[token] = len(token_dict) + token_list.append(token) + term = token_dict[token] + + context = 
tuple_to_str(tuple(context_list)) + + if context not in contexts_dict: + context_index = len(contexts_dict) + contexts_dict[context] = context_index + contexts_list.append(context) + contexts_at_order[0] += 1 + + # handle backoff + backoff_contexts = extract_backoff(context_list, order) + bo_indexes=[(context_index,)] +# bo_indexes=[(context,)] + for i,bo in enumerate(backoff_contexts): + factor_indexes=[] + for factor in bo: + bo_tuple = tuple_to_str(tuple(factor)) + if bo_tuple not in contexts_dict: + contexts_dict[bo_tuple] = len(contexts_dict) + contexts_list.append(bo_tuple) + contexts_at_order[i+1] += 1 +# factor_indexes.append(bo_tuple) + factor_indexes.append(contexts_dict[bo_tuple]) + bo_indexes.append(tuple(factor_indexes)) + + for i in range(len(bo_indexes)-1): + contexts_backoff[bo_indexes[i][0]] = bo_indexes[i+1] + + context_index = contexts_dict[context] + contexts_freq[context_index] += 1 + + if context_index not in documents_dict[term]: + documents_dict[term][context_index] = 1 + else: + documents_dict[term][context_index] += 1 + +term_file = open(output_filename+".terms",'w') +for t in token_list: print >>term_file, t +term_file.close() + +contexts_file = open(output_filename+".contexts",'w') +for c in contexts_list: + print >>contexts_file, c +contexts_file.close() + +data_file = open(output_filename+".data",'w') +for t in range(len(token_list)): + line="" + num_active=0 + for c in documents_dict[t]: + count = documents_dict[t][c] + if contexts_freq[c] >= cutoff: + line += (' ' + str(c) + ':' + str(count)) + num_active += 1 + if num_active > 0: + print >>data_file, "%d%s" % (num_active,line) +data_file.close() + +contexts_backoff_file = open(output_filename+".contexts_backoff",'w') +print >>contexts_backoff_file, len(contexts_list), order, +#for x in contexts_at_order: +# print >>contexts_backoff_file, x, +#print >>contexts_backoff_file +for x in range(order-1): + print >>contexts_backoff_file, 3, +print >>contexts_backoff_file, 2 + +for x in 
contexts_backoff: + print >>contexts_backoff_file, x, + for y in contexts_backoff[x]: print >>contexts_backoff_file, y, + print >>contexts_backoff_file +contexts_backoff_file.close() |