summary | refs | log | tree | commit | diff
path: root/gi
diff options
context:
space:
mode:
author trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 00:34:58 +0000
committer trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f> 2010-07-15 00:34:58 +0000
commit 40ee5446b84b5cdbc4e4a613e4c1aa19231c42d3 (patch)
tree deb10ba93df13ec5cce90aa59d5fb8fe5a678a55 /gi
parent 2775fc13d1e8d3ad45c8ddf94226397403e0e373 (diff)
Massacred the pipeline to support source language phrases and contexts.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@255 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'gi')
-rwxr-xr-x gi/pipeline/local-gi-pipeline.pl 18
-rw-r--r-- gi/posterior-regularisation/prjava/src/phrase/Corpus.java 4
-rwxr-xr-x gi/pyp-topics/scripts/spans2labels.py 41
3 files changed, 50 insertions, 13 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index 0d6c553c..cb411c1b 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -19,7 +19,8 @@ my $NUM_SAMPLES = 1000;
my $CONTEXT_SIZE = 1;
my $BIDIR = 0;
my $TOPICS_CONFIG = "pyp-topics.conf";
-my $LABEL_THRESHOLD = 0;
+my $LANGUAGE = "target";
+
my $MODEL = "pyp";
my $NUM_EM_ITERS = 100;
my $NUM_PR_ITERS = 0;
@@ -71,6 +72,7 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
'pr-scale-context=f' => \$PR_SCALE_C,
'pr-threads=i' => \$PR_THREADS,
'tagged_corpus=s' => \$TAGGED_CORPUS,
+ 'language=s' => \$LANGUAGE,
);
usage() unless scalar @ARGV == 1;
@@ -166,7 +168,7 @@ sub setup_data {
}
sub context_dir {
- return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE";
+ return "ct${CONTEXT_SIZE}s0.L$BASE_PHRASE_MAX_SIZE.l$LANGUAGE";
}
sub cluster_dir {
@@ -231,10 +233,10 @@ sub extract_context {
if (-e $OUT_CONTEXTS) {
print STDERR "$OUT_CONTEXTS exists, reusing...\n";
} else {
- my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+ my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
if ($COMPLETE_CACHE) {
print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
- $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+ $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
}
safesystem($cmd) or die "Failed to extract contexts.";
}
@@ -270,8 +272,14 @@ sub label_spans_with_topics {
if (-e $OUT_SPANS) {
print STDERR "$OUT_SPANS exists, reusing...\n";
} else {
+ my $l = "tt";
+ if ($LANGUAGE eq "source") {
+ $l = "ss";
+ } elsif ($LANGUAGE eq "both") {
+ $l = "bb";
+ } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" };
safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip";
- safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD > $OUT_SPANS") or die "Failed to label spans";
+ safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans";
unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt";
safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste";
}
diff --git a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
index 81264ab9..d57f3c04 100644
--- a/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
+++ b/gi/posterior-regularisation/prjava/src/phrase/Corpus.java
@@ -151,7 +151,7 @@ public class Corpus
for (int i = 0; i < c.size(); ++i)
{
if (i > 0) b.append(" ");
- if (i == c.size() / 2) b.append("<PHRASE> ");
+ //if (i == c.size() / 2) b.append("<PHRASE> ");
b.append(wordLexicon.lookup(c.get(i)));
}
return b.toString();
@@ -198,7 +198,7 @@ public class Corpus
while (ctxStrtok.hasMoreTokens())
{
String token = ctxStrtok.nextToken();
- if (!token.equals("<PHRASE>"))
+ //if (!token.equals("<PHRASE>"))
ctx.add(c.wordLexicon.insert(token));
}
int contextId = c.contextLexicon.insert(ctx);
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index f990582e..3dc60835 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -4,7 +4,7 @@ import sys
from operator import itemgetter
if len(sys.argv) <= 2:
- print "Usage: spans2labels.py phrase_context_index [order] [threshold]"
+ print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]"
exit(1)
order=1
@@ -14,6 +14,11 @@ if len(sys.argv) > 2:
order = int(sys.argv[2])
if len(sys.argv) > 3:
threshold = float(sys.argv[3])
+phr=ctx='t'
+if len(sys.argv) > 4:
+ phr, ctx = sys.argv[4]
+ assert phr in 'stb'
+ assert ctx in 'stb'
phrase_context_index = {}
for line in file(sys.argv[1], 'r'):
@@ -52,11 +57,35 @@ for line in sys.stdin:
t1 += order
t2 += order
- phrase = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
- left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
- right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
- context = "%s<PHRASE> %s" % (left_context, right_context)
+ phraset = phrases = contextt = contexts = ''
+ if phr in 'tb':
+ phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
+ if phr in 'sb':
+ phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip()
+
+ if ctx in 'tb':
+ left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
+ right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
+ contextt = "%s<PHRASE> %s" % (left_context, right_context)
+ if ctx in 'sb':
+ left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "")
+ right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip()
+ contexts = "%s<PHRASE> %s" % (left_context, right_context)
+
+ if phr == 'b':
+ phrase = phraset + ' <SPLIT> ' + phrases
+ elif phr == 's':
+ phrase = phrases
+ else:
+ phrase = phraset
+
+ if ctx == 'b':
+ context = contextt + ' <SPLIT> ' + contexts
+ elif ctx == 's':
+ context = contexts
+ else:
+ context = contextt
label = phrase_context_index.get((phrase,context), "<UNK>")
- print "%s-%s:X%s" % (t1-order,t2-order,label),
+ print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),
print