Diffstat (limited to 'gi/pyp-topics/scripts')
-rwxr-xr-x  gi/pyp-topics/scripts/contexts2documents.py    |  37
-rwxr-xr-x  gi/pyp-topics/scripts/extract_contexts.py      | 144
-rwxr-xr-x  gi/pyp-topics/scripts/extract_contexts_test.py |  72
-rwxr-xr-x  gi/pyp-topics/scripts/extract_leaves.py        |  49
-rwxr-xr-x  gi/pyp-topics/scripts/map-documents.py         |  20
-rwxr-xr-x  gi/pyp-topics/scripts/map-terms.py             |  20
-rw-r--r--  gi/pyp-topics/scripts/run.sh                   |  13
-rwxr-xr-x  gi/pyp-topics/scripts/score-mkcls.py           |  61
-rwxr-xr-x  gi/pyp-topics/scripts/score-topics.py          |  64
-rwxr-xr-x  gi/pyp-topics/scripts/spans2labels.py          | 137
-rwxr-xr-x  gi/pyp-topics/scripts/tokens2classes.py        |  27
-rwxr-xr-x  gi/pyp-topics/scripts/topics.py                |  20
12 files changed, 0 insertions, 664 deletions
diff --git a/gi/pyp-topics/scripts/contexts2documents.py b/gi/pyp-topics/scripts/contexts2documents.py
deleted file mode 100755
index 9be4ebbb..00000000
--- a/gi/pyp-topics/scripts/contexts2documents.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from operator import itemgetter
-
-if len(sys.argv) > 3:
-  print "Usage: contexts2documents.py [contexts_index_out] [phrases_index_out]"
-  exit(1)
-
-context_index = {}
-phrase_index = {}
-for line in sys.stdin:
-  phrase, line_tail = line.split('\t')
-
-  raw_contexts = line_tail.split('|||')
-  contexts = [c.strip() for x,c in enumerate(raw_contexts) if x%2 == 0]
-  counts   = [int(c.split('=')[1].strip()) for x,c in enumerate(raw_contexts) if x%2 != 0]
-  phrase_index.setdefault(phrase, len(phrase_index))
-  print len(contexts),
-  for context,count in zip(contexts,counts):
-    c = context_index.setdefault(context, len(context_index))
-    print "%d:%d" % (c,count),
-  print
-if 1 < len(sys.argv) < 4:
-  contexts_out = open(sys.argv[1],'w')
-  contexts = context_index.items()
-  contexts.sort(key = itemgetter(1))
-  for context in contexts:
-    print >>contexts_out, context[0]
-  contexts_out.close()
-if len(sys.argv) == 3:
-  phrases_out = open(sys.argv[2],'w')
-  phrases = phrase_index.items()
-  phrases.sort(key = itemgetter(1))
-  for phrase in phrases:
-    print >>phrases_out, phrase[0]
-  phrases_out.close()
diff --git a/gi/pyp-topics/scripts/extract_contexts.py b/gi/pyp-topics/scripts/extract_contexts.py
deleted file mode 100755
index b2723f2a..00000000
--- a/gi/pyp-topics/scripts/extract_contexts.py
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/usr/bin/python
-
-import sys,collections
-
-def extract_backoff(context_list, order):
-  assert len(context_list) == (2*order)
-  backoffs = []
-  for i in range(1,order+1):
-    if i == order:
-      backoffs.append(([context_list[i-1]+"|"], ["|"+context_list[i]]))
-    else:
-      right_limit = 2*order-i
-      core = context_list[i:right_limit]
-      left = [context_list[i-1]+"|"*(order-i+1)]
-      right = ["|"*(order-i+1)+context_list[right_limit]]
-      backoffs.append((core, left, right))
-# print context_list, backoffs
-  return backoffs
-
-def tuple_to_str(t):
-  s=""
-  for i,x in enumerate(t):
-    if i > 0: s += "|"
-    s += str(x)
-  return s
-
-if len(sys.argv) < 3:
-  print "Usage: extract-contexts.py output_filename order cutoff lowercase"
-  exit(1)
-
-output_filename = sys.argv[1]
-order = int(sys.argv[2])
-cutoff = 0
-if len(sys.argv) > 3:
-  cutoff = int(sys.argv[3])
-lowercase = False
-if len(sys.argv) > 4:
-  lowercase = bool(sys.argv[4])
-
-contexts_dict={}
-contexts_list=[]
-contexts_freq=collections.defaultdict(int)
-contexts_backoff={}
-
-token_dict={}
-token_list=[]
-documents_dict=collections.defaultdict(dict)
-
-contexts_at_order = [i for i in range(order+1)]
-
-prefix = ["<s%d>|<s>"%i for i in range(order)]
-suffix = ["</s%d>|</s>"%i for i in range(order)]
-
-for line in sys.stdin:
-  tokens = list(prefix)
-  tokens.extend(line.split())
-  tokens.extend(suffix)
-  if lowercase:
-    tokens = map(lambda x: x.lower(), tokens)
-
-  for i in range(order, len(tokens)-order):
-    context_list = []
-    term=""
-    for j in range(i-order, i+order+1):
-      token,tag = tokens[j].rsplit('|',2)
-      if j != i:
-        context_list.append(token)
-      else:
-        if token not in token_dict:
-          token_dict[token] = len(token_dict)
-          token_list.append(token)
-        term = token_dict[token]
-
-    context = tuple_to_str(tuple(context_list))
-
-    if context not in contexts_dict:
-      context_index = len(contexts_dict)
-      contexts_dict[context] = context_index
-      contexts_list.append(context)
-      contexts_at_order[0] += 1
-
-      # handle backoff
-      backoff_contexts = extract_backoff(context_list, order)
-      bo_indexes=[(context_index,)]
-#     bo_indexes=[(context,)]
-      for i,bo in enumerate(backoff_contexts):
-        factor_indexes=[]
-        for factor in bo:
-          bo_tuple = tuple_to_str(tuple(factor))
-          if bo_tuple not in contexts_dict:
-            contexts_dict[bo_tuple] = len(contexts_dict)
-            contexts_list.append(bo_tuple)
-            contexts_at_order[i+1] += 1
-#         factor_indexes.append(bo_tuple)
-          factor_indexes.append(contexts_dict[bo_tuple])
-        bo_indexes.append(tuple(factor_indexes))
-
-      for i in range(len(bo_indexes)-1):
-        contexts_backoff[bo_indexes[i][0]] = bo_indexes[i+1]
-
-    context_index = contexts_dict[context]
-    contexts_freq[context_index] += 1
-
-    if context_index not in documents_dict[term]:
-      documents_dict[term][context_index] = 1
-    else:
-      documents_dict[term][context_index] += 1
-
-term_file = open(output_filename+".terms",'w')
-for t in token_list: print >>term_file, t
-term_file.close()
-
-contexts_file = open(output_filename+".contexts",'w')
-for c in contexts_list:
-  print >>contexts_file, c
-contexts_file.close()
-
-data_file = open(output_filename+".data",'w')
-for t in range(len(token_list)):
-  line=""
-  num_active=0
-  for c in documents_dict[t]:
-    count = documents_dict[t][c]
-    if contexts_freq[c] >= cutoff:
-      line += (' ' + str(c) + ':' + str(count))
-      num_active += 1
-  if num_active > 0:
-    print >>data_file, "%d%s" % (num_active,line)
-data_file.close()
-
-contexts_backoff_file = open(output_filename+".contexts_backoff",'w')
-print >>contexts_backoff_file, len(contexts_list), order,
-#for x in contexts_at_order:
-#  print >>contexts_backoff_file, x,
-#print >>contexts_backoff_file
-for x in range(order-1):
-  print >>contexts_backoff_file, 3,
-print >>contexts_backoff_file, 2
-
-for x in contexts_backoff:
-  print >>contexts_backoff_file, x,
-  for y in contexts_backoff[x]: print >>contexts_backoff_file, y,
-  print >>contexts_backoff_file
-contexts_backoff_file.close()
diff --git a/gi/pyp-topics/scripts/extract_contexts_test.py b/gi/pyp-topics/scripts/extract_contexts_test.py
deleted file mode 100755
index 693b6e0b..00000000
--- a/gi/pyp-topics/scripts/extract_contexts_test.py
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/usr/bin/python
-
-import sys,collections
-
-def tuple_to_str(t):
-  s=""
-  for i,x in enumerate(t):
-    if i > 0: s += "|"
-    s += str(x)
-  return s
-
-if len(sys.argv) < 5:
-  print "Usage: extract-contexts_test.py output_filename vocab contexts order lowercase"
-  exit(1)
-
-output_filename = sys.argv[1]
-output = open(output_filename+".test_data",'w')
-
-unk_term="-UNK-"
-vocab_dict={}
-for i,x in enumerate(file(sys.argv[2], 'r').readlines()):
-  vocab_dict[x.strip()]=i
-
-contexts_dict={}
-contexts_list=[]
-for i,x in enumerate(file(sys.argv[3], 'r').readlines()):
-  contexts_dict[x.strip()]=i
-  contexts_list.append(x.strip())
-
-order = int(sys.argv[4])
-
-lowercase = False
-if len(sys.argv) > 5:
-  lowercase = bool(sys.argv[5])
-if lowercase: unk_term = unk_term.lower()
-
-prefix = ["<s%d>|<s>"%i for i in range(order)]
-suffix = ["</s%d>|</s>"%i for i in range(order)]
-
-assert unk_term in vocab_dict
-for line in sys.stdin:
-  tokens = list(prefix)
-  tokens.extend(line.split())
-  tokens.extend(suffix)
-  if lowercase:
-    tokens = map(lambda x: x.lower(), tokens)
-
-  for i in range(order, len(tokens)-order):
-    context_list=[]
-    term=""
-    for j in range(i-order, i+order+1):
-      token,tag = tokens[j].rsplit('|',2)
-      if j != i:
-        context_list.append(token)
-      else:
-        if token not in vocab_dict:
-          term = vocab_dict[unk_term]
-        else:
-          term = vocab_dict[token]
-    context = tuple_to_str(context_list)
-    if context not in contexts_dict:
-      contexts_dict[context] = len(contexts_dict)
-      contexts_list.append(context)
-    context_index = contexts_dict[context]
-    print >>output, "%d:%d" % (term,context_index),
-  print >>output
-output.close()
-
-contexts_file = open(output_filename+".test_contexts",'w')
-for c in contexts_list:
-  print >>contexts_file, c
-contexts_file.close()
diff --git a/gi/pyp-topics/scripts/extract_leaves.py b/gi/pyp-topics/scripts/extract_leaves.py
deleted file mode 100755
index 14783b36..00000000
--- a/gi/pyp-topics/scripts/extract_leaves.py
+++ /dev/null
@@ -1,49 +0,0 @@
-#!/usr/bin/python
-
-import nltk
-import nltk.probability
-import sys
-import getopt
-
-lexicalise=False
-rm_traces=False
-cutoff=100
-length_cutoff=10000
-try:
-  opts, args = getopt.getopt(sys.argv[1:], "hs:c:l", ["help", "lexicalise", "cutoff","sentence-length","remove-traces"])
-except getopt.GetoptError:
-  print "Usage: extract_leaves.py [-lsc]"
-  sys.exit(2)
-for opt, arg in opts:
-  if opt in ("-h", "--help"):
-    print "Usage: extract_leaves.py [-lsc]"
-    sys.exit()
-  elif opt in ("-l", "--lexicalise"):
-    lexicalise = True
-  elif opt in ("-c", "--cutoff"):
-    cutoff = int(arg)
-  elif opt in ("-s", "--sentence-length"):
-    length_cutoff = int(arg)
-  elif opt in ("--remove-traces"):
-    rm_traces = True
-
-token_freq = nltk.probability.FreqDist()
-lines = []
-for line in sys.stdin:
-  t = nltk.Tree.parse(line)
-  pos = t.pos()
-  if len(pos) <= length_cutoff:
-    lines.append(pos)
-    for token, tag in pos:
-      token_freq.inc(token)
-
-for line in lines:
-  for token,tag in line:
-    if not (rm_traces and tag == "-NONE-"):
-      if lexicalise:
-        if token_freq[token] < cutoff:
-          token = '-UNK-'
-        print '%s|%s' % (token,tag),
-      else:
-        print '%s' % tag,
-  print
diff --git a/gi/pyp-topics/scripts/map-documents.py b/gi/pyp-topics/scripts/map-documents.py
deleted file mode 100755
index 703de312..00000000
--- a/gi/pyp-topics/scripts/map-documents.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 2:
-  print "Usage: map-documents.py vocab-file"
-  exit(1)
-
-vocab = file(sys.argv[1], 'r').readlines()
-term_dict = map(lambda x: x.strip(), vocab)
-
-for line in sys.stdin:
-  tokens = line.split()
-  for token in tokens:
-    elements = token.split(':')
-    if len(elements) == 1:
-      print "%s" % (term_dict[int(elements[0])]),
-    else:
-      print "%s:%s" % (term_dict[int(elements[0])], elements[1]),
-  print
diff --git a/gi/pyp-topics/scripts/map-terms.py b/gi/pyp-topics/scripts/map-terms.py
deleted file mode 100755
index eb0298d7..00000000
--- a/gi/pyp-topics/scripts/map-terms.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 2:
-  print "Usage: map-terms.py vocab-file"
-  exit(1)
-
-vocab = file(sys.argv[1], 'r').readlines()
-term_dict = map(lambda x: x.strip().replace(' ','_'), vocab)
-
-for line in sys.stdin:
-  tokens = line.split()
-  for token in tokens:
-    elements = token.split(':')
-    if len(elements) == 1:
-      print "%s" % (term_dict[int(elements[0])]),
-    else:
-      print "%s:%s" % (term_dict[int(elements[0])], elements[1]),
-  print
diff --git a/gi/pyp-topics/scripts/run.sh b/gi/pyp-topics/scripts/run.sh
deleted file mode 100644
index 19e625b1..00000000
--- a/gi/pyp-topics/scripts/run.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-
-
-./simple-extract-context.sh ~/workspace/clsp2010/jhuws2010/data/btec/split.zh-en.al 1 | ~/workspace/pyp-topics/scripts/contexts2documents.py > split.zh-en.data
-
-~/workspace/pyp-topics/bin/pyp-topics-train -d split.zh-en.data -t 50 -s 100 -o split.zh-en.documents.gz -w split.zh-en.topics.gz
-gunzip split.zh-en.documents.gz
-
-~/workspace/cdec/extools/extractor -i ../jhuws2010/data/btec/split.zh-en.al -S 1 -c 500000 -L 12 --base_phrase_spans | ~/workspace/pyp-topics/scripts/spans2labels.py split.zh-en.phrases split.zh-en.contexts split.zh-en.documents > corpus.zh-en.labelled_spans
-
-paste -d " " ~/workspace/clsp2010/jhuws2010/data/btec/split.zh-en.al corpus.labelled_spans > split.zh-en.labelled_spans
-
-./simple-extract.sh ~/workspace/clsp2010/scratch/split.zh-en.labelled_spans
diff --git a/gi/pyp-topics/scripts/score-mkcls.py b/gi/pyp-topics/scripts/score-mkcls.py
deleted file mode 100755
index 6bd33fc5..00000000
--- a/gi/pyp-topics/scripts/score-mkcls.py
+++ /dev/null
@@ -1,61 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from collections import defaultdict
-
-def dict_max(d):
-  max_val=-1
-  max_key=None
-  for k in d:
-    if d[k] > max_val:
-      max_val = d[k]
-      max_key = k
-  assert max_key
-  return max_key
-
-if len(sys.argv) != 3:
-  print "Usage: score-mkcls.py gold classes"
-  exit(1)
-
-gold_file=open(sys.argv[1],'r')
-
-term_to_topics = {}
-for line in open(sys.argv[2],'r'):
-  term,cls = line.split()
-  term_to_topics[term] = cls
-
-gold_to_topics = defaultdict(dict)
-topics_to_gold = defaultdict(dict)
-
-for gold_line in gold_file:
-  gold_tokens = gold_line.split()
-  for gold_token in gold_tokens:
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-    pred_token = term_to_topics[gold_term]
-    gold_to_topics[gold_tag][pred_token] \
-      = gold_to_topics[gold_tag].get(pred_token, 0) + 1
-    topics_to_gold[pred_token][gold_tag] \
-      = topics_to_gold[pred_token].get(gold_tag, 0) + 1
-
-pred=0
-correct=0
-gold_file=open(sys.argv[1],'r')
-for gold_line in gold_file:
-  gold_tokens = gold_line.split()
-
-  for gold_token in gold_tokens:
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-    pred_token = term_to_topics[gold_term]
-    print "%s|%s|%s" % (gold_token, pred_token, dict_max(topics_to_gold[pred_token])),
-    pred += 1
-    if gold_tag == dict_max(topics_to_gold[pred_token]):
-      correct += 1
-  print
-print >>sys.stderr, "Many-to-One Accuracy = %f" % (float(correct) / pred)
-#for x in gold_to_topics:
-#  print x,dict_max(gold_to_topics[x])
-#print "###################################################"
-#for x in range(len(topics_to_gold)):
-#  print x,dict_max(topics_to_gold[str(x)])
-#  print x,topics_to_gold[str(x)]
-#print term_to_topics
diff --git a/gi/pyp-topics/scripts/score-topics.py b/gi/pyp-topics/scripts/score-topics.py
deleted file mode 100755
index 1d8a1fcd..00000000
--- a/gi/pyp-topics/scripts/score-topics.py
+++ /dev/null
@@ -1,64 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from collections import defaultdict
-
-def dict_max(d):
-  max_val=-1
-  max_key=None
-  for k in d:
-    if d[k] > max_val:
-      max_val = d[k]
-      max_key = k
-  assert max_key
-  return max_key
-
-if len(sys.argv) != 3:
-  print "Usage: score-topics.py gold pred"
-  exit(1)
-
-gold_file=open(sys.argv[1],'r')
-pred_file=open(sys.argv[2],'r')
-
-gold_to_topics = defaultdict(dict)
-topics_to_gold = defaultdict(dict)
-term_to_topics = defaultdict(dict)
-
-for gold_line,pred_line in zip(gold_file,pred_file):
-  gold_tokens = gold_line.split()
-  pred_tokens = pred_line.split()
-  assert len(gold_tokens) == len(pred_tokens)
-
-  for gold_token,pred_token in zip(gold_tokens,pred_tokens):
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-    gold_to_topics[gold_tag][pred_token] \
-      = gold_to_topics[gold_tag].get(pred_token, 0) + 1
-    term_to_topics[gold_term][pred_token] \
-      = term_to_topics[gold_term].get(pred_token, 0) + 1
-    topics_to_gold[pred_token][gold_tag] \
-      = topics_to_gold[pred_token].get(gold_tag, 0) + 1
-
-pred=0
-correct=0
-gold_file=open(sys.argv[1],'r')
-pred_file=open(sys.argv[2],'r')
-for gold_line,pred_line in zip(gold_file,pred_file):
-  gold_tokens = gold_line.split()
-  pred_tokens = pred_line.split()
-
-  for gold_token,pred_token in zip(gold_tokens,pred_tokens):
-    gold_term,gold_tag = gold_token.rsplit('|',1)
-#   print "%s|%s" % (gold_token, dict_max(gold_to_topics[gold_tag])),
-    print "%s|%s|%s" % (gold_token, pred_token, dict_max(topics_to_gold[pred_token])),
-    pred += 1
-    if gold_tag == dict_max(topics_to_gold[pred_token]):
-      correct += 1
-  print
-print >>sys.stderr, "Many-to-One Accuracy = %f" % (float(correct) / pred)
-#for x in gold_to_topics:
-#  print x,dict_max(gold_to_topics[x])
-#print "###################################################"
-#for x in range(len(topics_to_gold)):
-#  print x,dict_max(topics_to_gold[str(x)])
-#  print x,topics_to_gold[str(x)]
-#print term_to_topics
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
deleted file mode 100755
index 50fa8106..00000000
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ /dev/null
@@ -1,137 +0,0 @@
-#!/usr/bin/python
-
-import sys
-from operator import itemgetter
-
-if len(sys.argv) <= 2:
-  print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}] [type={tag,tok,both},{tag,tok,both}]"
-  exit(1)
-
-order=1
-threshold = 0
-cutoff_cat = "<UNK>"
-if len(sys.argv) > 2:
-  order = int(sys.argv[2])
-if len(sys.argv) > 3:
-  threshold = float(sys.argv[3])
-phr=ctx='t'
-if len(sys.argv) > 4:
-  phr, ctx = sys.argv[4]
-  assert phr in 'stb'
-  assert ctx in 'stb'
-phr_typ = ctx_typ = 'both'
-if len(sys.argv) > 5:
-  phr_typ, ctx_typ = sys.argv[5].split(',')
-  assert phr_typ in ('tag', 'tok', 'both')
-  assert ctx_typ in ('tag', 'tok', 'both')
-
-#print >>sys.stderr, "Loading phrase index"
-phrase_context_index = {}
-for line in file(sys.argv[1], 'r'):
-  phrase,tail= line.split('\t')
-  contexts = tail.split(" ||| ")
-  try: # remove Phil's bizarre integer pair
-       x,y = contexts[0].split()
-       x=int(x); y=int(y)
-       contexts = contexts[1:]
-  except:
-       pass
-  if len(contexts) == 1: continue
-  assert len(contexts) % 2 == 0
-  for i in range(0, len(contexts), 2):
-    #parse contexts[i+1] = " C=1 P=0.8 ... "
-    features=dict([ keyval.split('=') for keyval in contexts[i+1].split()])
-    category = features['C']
-    if features.has_key('P') and float(features['P']) < threshold:
-	category = cutoff_cat
-
-    phrase_context_index[(phrase,contexts[i])] = category
-    #print (phrase,contexts[i]), category
-
-#print >>sys.stderr, "Labelling spans"
-for line in sys.stdin:
-  #print >>sys.stderr, "line", line.strip()
-  line_segments = line.split(' ||| ')
-  assert len(line_segments) >= 3
-  source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)]
-  target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]
-  phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()]
-
-  if phr_typ != 'both' or ctx_typ != 'both':
-    if phr in 'tb' or ctx in 'tb':
-        target_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[1].split()) + ['</s>' for x in range(order)]
-        target_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[1].split()) + ['</s>' for x in range(order)]
-
-        if phr in 'tb':
-            if phr_typ == 'tok':
-                targetP = target_toks
-            elif phr_typ == 'tag':
-                targetP = target_tags
-        if ctx in 'tb':
-            if ctx_typ == 'tok':
-                targetC = target_toks
-            elif ctx_typ == 'tag':
-                targetC = target_tags
-
-    if phr in 'sb' or ctx in 'sb':
-        source_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[0].split()) + ['</s>' for x in range(order)]
-        source_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[0].split()) + ['</s>' for x in range(order)]
-
-        if phr in 'sb':
-            if phr_typ == 'tok':
-                sourceP = source_toks
-            elif phr_typ == 'tag':
-                sourceP = source_tags
-        if ctx in 'sb':
-            if ctx_typ == 'tok':
-                sourceC = source_toks
-            elif ctx_typ == 'tag':
-                sourceC = source_tags
-  else:
-    sourceP = sourceC = source
-    targetP = targetC = target
-
-  #print >>sys.stderr, "line", source, '---', target, 'phrases', phrases
-
-  print "|||",
-
-  for s1,s2,t1,t2 in phrases:
-    s1 += order
-    s2 += order
-    t1 += order
-    t2 += order
-
-    phraset = phrases = contextt = contexts = ''
-    if phr in 'tb':
-        phraset = reduce(lambda x, y: x+y+" ", targetP[t1:t2], "").strip()
-    if phr in 'sb':
-        phrases = reduce(lambda x, y: x+y+" ", sourceP[s1:s2], "").strip()
-
-    if ctx in 'tb':
-        left_context = reduce(lambda x, y: x+y+" ", targetC[t1-order:t1], "")
-        right_context = reduce(lambda x, y: x+y+" ", targetC[t2:t2+order], "").strip()
-        contextt = "%s<PHRASE> %s" % (left_context, right_context)
-    if ctx in 'sb':
-        left_context = reduce(lambda x, y: x+y+" ", sourceC[s1-order:s1], "")
-        right_context = reduce(lambda x, y: x+y+" ", sourceC[s2:s2+order], "").strip()
-        contexts = "%s<PHRASE> %s" % (left_context, right_context)
-
-    if phr == 'b':
-        phrase = phraset + ' <SPLIT> ' + phrases
-    elif phr == 's':
-        phrase = phrases
-    else:
-        phrase = phraset
-
-    if ctx == 'b':
-        context = contextt + ' <SPLIT> ' + contexts
-    elif ctx == 's':
-        context = contexts
-    else:
-        context = contextt
-
-    #print "%d-%d-%d-%d looking up" % (s1-order,s2-order,t1-order,t2-order), (phrase, context)
-    label = phrase_context_index.get((phrase,context), cutoff_cat)
-    if label != cutoff_cat: #cutoff'd spans are left unlabelled
-      print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),
-  print
diff --git a/gi/pyp-topics/scripts/tokens2classes.py b/gi/pyp-topics/scripts/tokens2classes.py
deleted file mode 100755
index 33df255f..00000000
--- a/gi/pyp-topics/scripts/tokens2classes.py
+++ /dev/null
@@ -1,27 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 3:
-  print "Usage: tokens2classes.py source_classes target_classes"
-  exit(1)
-
-source_to_topics = {}
-for line in open(sys.argv[1],'r'):
-  term,cls = line.split()
-  source_to_topics[term] = cls
-
-target_to_topics = {}
-for line in open(sys.argv[2],'r'):
-  term,cls = line.split()
-  target_to_topics[term] = cls
-
-for line in sys.stdin:
-  source, target, tail = line.split(" ||| ")
-
-  for token in source.split():
-    print source_to_topics[token],
-  print "|||",
-  for token in target.split():
-    print target_to_topics[token],
-  print "|||", tail,
diff --git a/gi/pyp-topics/scripts/topics.py b/gi/pyp-topics/scripts/topics.py
deleted file mode 100755
index 0db1af71..00000000
--- a/gi/pyp-topics/scripts/topics.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/python
-
-import sys
-
-if len(sys.argv) != 2:
-  print "Usage: topics.py words-per-topic"
-  exit(1)
-
-for t,line in enumerate(sys.stdin):
-  tokens = line.split()
-  terms = []
-  for token in tokens:
-    elements = token.rsplit(':',1)
-    terms.append((int(elements[1]),elements[0]))
-  terms.sort()
-  terms.reverse()
-
-  print "Topic %d:" % t
-  map(lambda (x,y) : sys.stdout.write("   %s:%s\n" % (y,x)), terms[:int(sys.argv[1])])
-  print
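Note for anyone resurrecting these scripts from history: they are Python 2 only (print statements, the file() builtin, dict.has_key, reduce as a builtin, and the tuple-unpacking lambda in topics.py, which is a syntax error under Python 3). As a minimal sketch of what a port would involve — hypothetical, not part of this commit — here is topics.py under Python 3, preserving the original's input format and output:

#!/usr/bin/env python3
# Hypothetical Python 3 port of the deleted topics.py; not part of this commit.
import sys

if len(sys.argv) != 2:
    print("Usage: topics.py words-per-topic")
    sys.exit(1)

words_per_topic = int(sys.argv[1])
for t, line in enumerate(sys.stdin):
    # Each input line holds whitespace-separated "term:count" pairs for one topic.
    terms = []
    for token in line.split():
        term, count = token.rsplit(':', 1)
        terms.append((int(count), term))
    terms.sort(reverse=True)  # highest-count terms first, as sort()+reverse() did

    print("Topic %d:" % t)
    for count, term in terms[:words_per_topic]:
        print("   %s:%s" % (term, count))
    print()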
