diff options
author | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-15 00:34:58 +0000 |
---|---|---|
committer | trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-07-15 00:34:58 +0000 |
commit | 40ee5446b84b5cdbc4e4a613e4c1aa19231c42d3 (patch) | |
tree | deb10ba93df13ec5cce90aa59d5fb8fe5a678a55 /gi | |
parent | 2775fc13d1e8d3ad45c8ddf94226397403e0e373 (diff) |
Massacred the pipeline to support source language phrases and contexts.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 18 |
-rw-r--r-- | gi/posterior-regularisation/prjava/src/phrase/Corpus.java | 4 |
-rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 41 |
3 files changed, 50 insertions, 13 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index 0d6c553c..cb411c1b 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -19,7 +19,8 @@ my $NUM_SAMPLES = 1000; my $CONTEXT_SIZE = 1; my $BIDIR = 0; my $TOPICS_CONFIG = "pyp-topics.conf"; -my $LABEL_THRESHOLD = 0; +my $LANGUAGE = "target"; + my $MODEL = "pyp"; my $NUM_EM_ITERS = 100; my $NUM_PR_ITERS = 0; @@ -71,6 +72,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'pr-scale-context=f' => \$PR_SCALE_C, 'pr-threads=i' => \$PR_THREADS, 'tagged_corpus=s' => \$TAGGED_CORPUS, + 'language=s' => \$LANGUAGE, ); usage() unless scalar @ARGV == 1; @@ -166,7 +168,7 @@ sub setup_data { } sub context_dir { - return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE"; + return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE.l$LANGUAGE"; } sub cluster_dir { @@ -231,10 +233,10 @@ sub extract_context { if (-e $OUT_CONTEXTS) { print STDERR "$OUT_CONTEXTS exists, reusing...\n"; } else { - my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; + my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; if ($COMPLETE_CACHE) { print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; - $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; + $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; } safesystem($cmd) or die "Failed to extract contexts."; } @@ -270,8 +272,14 @@ sub label_spans_with_topics { if (-e $OUT_SPANS) { print STDERR "$OUT_SPANS exists, reusing...\n"; } else { 
+ my $l = "tt"; + if ($LANGUAGE eq "source") { + $l = "ss"; + } elsif ($LANGUAGE eq "both") { + $l = "bb"; + } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" }; safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; - safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD > $OUT_SPANS") or die "Failed to label spans"; + safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans"; unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt"; safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste"; } diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java index 81264ab9..d57f3c04 100644 --- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java +++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java @@ -151,7 +151,7 @@ public class Corpus for (int i = 0; i < c.size(); ++i) { if (i > 0) b.append(" "); - if (i == c.size() / 2) b.append("<PHRASE> "); + //if (i == c.size() / 2) b.append("<PHRASE> "); b.append(wordLexicon.lookup(c.get(i))); } return b.toString(); @@ -198,7 +198,7 @@ public class Corpus while (ctxStrtok.hasMoreTokens()) { String token = ctxStrtok.nextToken(); - if (!token.equals("<PHRASE>")) + //if (!token.equals("<PHRASE>")) ctx.add(c.wordLexicon.insert(token)); } int contextId = c.contextLexicon.insert(ctx); diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index f990582e..3dc60835 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py 
@@ -4,7 +4,7 @@ import sys from operator import itemgetter if len(sys.argv) <= 2: - print "Usage: spans2labels.py phrase_context_index [order] [threshold]" + print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]" exit(1) order=1 @@ -14,6 +14,11 @@ if len(sys.argv) > 2: order = int(sys.argv[2]) if len(sys.argv) > 3: threshold = float(sys.argv[3]) +phr=ctx='t' +if len(sys.argv) > 4: + phr, ctx = sys.argv[4] + assert phr in 'stb' + assert ctx in 'stb' phrase_context_index = {} for line in file(sys.argv[1], 'r'): @@ -52,11 +57,35 @@ for line in sys.stdin: t1 += order t2 += order - phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() - left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") - right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() - context = "%s<PHRASE> %s" % (left_context, right_context) + phraset = phrases = contextt = contexts = '' + if phr in 'tb': + phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() + if phr in 'sb': + phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip() + + if ctx in 'tb': + left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") + right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() + contextt = "%s<PHRASE> %s" % (left_context, right_context) + if ctx in 'sb': + left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "") + right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip() + contexts = "%s<PHRASE> %s" % (left_context, right_context) + + if phr == 'b': + phrase = phraset + ' <SPLIT> ' + phrases + elif phr == 's': + phrase = phrases + else: + phrase = phraset + + if ctx == 'b': + context = contextt + ' <SPLIT> ' + contexts + elif ctx == 's': + context = contexts + else: + context = contextt label = phrase_context_index.get((phrase,context), "<UNK>") - print "%s-%s:X%s" % (t1-order,t2-order,label), + print "%d-%d-%d-%d:X%s" % 
(s1-order,s2-order,t1-order,t2-order,label), print |