diff options
author | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-29 17:06:03 +0000 |
---|---|---|
committer | philblunsom@gmail.com <philblunsom@gmail.com@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-29 17:06:03 +0000 |
commit | c12944507581de67e720b10320148f6dda2d267d (patch) | |
tree | b8d97423efdb5a6788689da0704801b9b79cc851 /gi | |
parent | 3d5480c15e35885f6668b90ed00d41e75ccd153f (diff) |
Added different executable for working with context models, changed input/output processing.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@54 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 3 | ||||
-rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 47 |
2 files changed, 25 insertions, 25 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index af83beb8..be91f9ad 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -138,7 +138,8 @@ sub label_spans_with_topics { print STDERR "$OUT_SPANS exists, reusing...\n"; } else { safesystem("$ZCAT $IN_CLUSTERS > $OUTPUT/clusters.txt") or die "Failed to unzip"; - safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans"; +# safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/phrases.index $OUTPUT/contexts.index $OUTPUT/clusters.txt > $OUT_SPANS") or die "Failed to label spans"; + safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $OUTPUT/clusters.txt $CONTEXT_SIZE > $OUT_SPANS") or die "Failed to label spans"; unlink("$OUTPUT/clusters.txt") or warn "Failed to remove $OUTPUT/clusters.txt"; safesystem("paste -d ' ' $CORPUS $OUT_SPANS > $OUTPUT/corpus.src_trg_al") or die "Couldn't paste"; } diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index 409fda92..3221dbf0 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -3,44 +3,43 @@ import sys from operator import itemgetter -if len(sys.argv) != 4: - print "Usage: spans2labels.py phrase_index context_index phrase_context_index" +if len(sys.argv) <= 2: + print "Usage: spans2labels.py phrase_context_index [order]" exit(1) -phrase_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[1], 'r').readlines()))) -context_index = dict(map(lambda x: (x[1].strip(),x[0]), enumerate(file(sys.argv[2], 'r').readlines()))) +order=1 +if len(sys.argv) > 2: + order = int(sys.argv[2]) phrase_context_index = {} -for i,line in enumerate(file(sys.argv[3], 'r').readlines()): - for c,l in map(lambda x: x.split(':'), line.split()[1:]): - phrase_context_index[(int(i),int(c))] = l +for line in file(sys.argv[1], 'r'): + phrase,tail= line.split('\t') + contexts = tail.split(" ||| ") + assert len(contexts) % 2 == 0 + for i in range(0, len(contexts), 2): + category = contexts[i+1].split("=")[1].strip() + phrase_context_index[(phrase,contexts[i])] = category +# print (phrase,contexts[i]), category for line in sys.stdin: line_segments = line.split('|||') - source = ['<s>'] + line_segments[0].split() + ['</s>'] - target = ['<s>'] + line_segments[1].split() + ['</s>'] + source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)] + target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)] phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()] -# for x in source[1:-1]: -# print x, -# print "|||", -# for x in target[1:-1]: -# print x, print "|||", for s1,s2,t1,t2 in phrases: - s1 += 1 - s2 += 1 - t1 += 1 - t2 += 1 + s1 += order + s2 += order + t1 += order + t2 += order phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() - context = "%s <PHRASE> %s" % (target[t1-1], target[t2]) + left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") + right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() + context = "%s<PHRASE> %s" % (left_context, right_context) - pi = phrase_index[phrase] - ci = context_index[context] - label = phrase_context_index[(pi,ci)] + label = phrase_context_index[(phrase,context)] print "%s-%s:X%s" % (t1-1,t2-1,label), -# print phrase, pi, context, ci -# print phrase_context_index[(pi,ci)] print |