author    trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-23 16:39:41 +0000
committer trevor.cohn <trevor.cohn@ec762483-ff6d-05da-a07a-a48fb63a330f>  2010-07-23 16:39:41 +0000
commit    7d0cad292c444baddd70c3b76540304364d454d9 (patch)
tree      b93b34d81dc3681a401ff811be61cca218d9a8eb
parent    e0bca5fea3b0267819186d0fc34c036e6b77679c (diff)
Pipeline code for running the clustering with mixed tokens and tags.
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@377 ec762483-ff6d-05da-a07a-a48fb63a330f
-rwxr-xr-x  gi/pipeline/local-gi-pipeline.pl                   30
-rwxr-xr-x  gi/pipeline/scripts/patch-corpus.pl                31
-rwxr-xr-x  gi/pipeline/scripts/remove-tags-from-contexts.pl   53
-rwxr-xr-x  gi/pipeline/scripts/remove-tags-from-corpus.pl     51
-rwxr-xr-x  gi/pyp-topics/scripts/spans2labels.py              63
5 files changed, 172 insertions(+), 56 deletions(-)
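
Taken together, the changes thread a word_TAG representation through the local pipeline: patch-corpus.pl -a appends a tag to every token, the two remove-tags-from-* filters strip one half back off, and spans2labels.py can label spans from either the token or the tag view. A minimal Python sketch of that round trip (helper names here are illustrative, not from the repository); splitting happens on the last underscore only, so tokens that themselves contain underscores survive:

def mix(tokens, tags):
    # Mirror patch-corpus.pl -a: glue each tag onto its token.
    assert len(tokens) == len(tags)
    return [w + '_' + t for w, t in zip(tokens, tags)]

def token(mixed):
    # Everything before the last underscore.
    return mixed.rsplit('_', 1)[0]

def tag(mixed):
    # Everything after the last underscore.
    return mixed.rsplit('_', 1)[-1]

mixed = mix(['the', 'new_york', 'times'], ['DT', 'NNP', 'NNP'])
print(mixed)                      # ['the_DT', 'new_york_NNP', 'times_NNP']
print([token(m) for m in mixed])  # ['the', 'new_york', 'times']
print([tag(m) for m in mixed])    # ['DT', 'NNP', 'NNP']
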
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl
index db2969c7..e832e556 100755
--- a/gi/pipeline/local-gi-pipeline.pl
+++ b/gi/pipeline/local-gi-pipeline.pl
@@ -21,6 +21,7 @@ my $BIDIR = 0;
my $TOPICS_CONFIG = "pyp-topics.conf";
my $LANGUAGE = "target";
my $LABEL_THRESHOLD = 0;
+my $PRESERVE_PHRASES;
my $MODEL = "pyp";
my $NUM_ITERS = 100;
@@ -45,11 +46,13 @@ my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh";
my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";
my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl";
+my $REMOVE_TAGS_CORPUS = "$SCRIPT_DIR/scripts/remove-tags-from-corpus.pl";
+my $REMOVE_TAGS_CONTEXT = "$SCRIPT_DIR/scripts/remove-tags-from-contexts.pl";
my $EXTRACTOR = "$EXTOOLS/extractor";
my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";
assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR,
- $S2L, $C2D, $TOPIC_TRAIN, $SPLIT);
+ $S2L, $C2D, $TOPIC_TRAIN, $SPLIT, $REMOVE_TAGS_CONTEXT, $REMOVE_TAGS_CORPUS);
my $BACKOFF_GRAMMAR;
my $DEFAULT_CAT;
@@ -77,7 +80,8 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,
'pr-flags=s' => \$PR_FLAGS,
'tagged_corpus=s' => \$TAGGED_CORPUS,
'language=s' => \$LANGUAGE,
- 'get_name_only' => \$NAME_SHORTCUT
+ 'get_name_only' => \$NAME_SHORTCUT,
+ 'preserve_phrases' => \$PRESERVE_PHRASES,
);
if ($NAME_SHORTCUT) {
$NUM_TOPICS = $NUM_TOPICS_FINE;
@@ -185,6 +189,7 @@ sub setup_data {
die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS;
my $opt="";
$opt = "-s" if ($LANGUAGE eq "source");
+ $opt = "-a" if ($PRESERVE_PHRASES);
my $cmd="$PATCH_CORPUS $opt $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER";
safesystem($cmd) or die "Failed to extract contexts.";
} else {
@@ -260,11 +265,19 @@ sub extract_context {
if (-e $OUT_CONTEXTS) {
print STDERR "$OUT_CONTEXTS exists, reusing...\n";
} else {
- my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS";
+ my $ccopt = "-c $ITEMS_IN_MEMORY";
+ my $pipe = "| $REDUCER ";
if ($COMPLETE_CACHE) {
print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n";
- $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS";
+ $ccopt = "-c 0";
+ $pipe = "";
}
+
+ if ($PRESERVE_PHRASES) {
+ $pipe = "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " . $pipe;
+ }
+
+ my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS $pipe | $GZIP > $OUT_CONTEXTS";
safesystem($cmd) or die "Failed to extract contexts.";
}
}
@@ -331,14 +344,15 @@ sub label_spans_with_topics {
if (-e $OUT_SPANS) {
print STDERR "$OUT_SPANS exists, reusing...\n";
} else {
- my $l = "tt";
+ my $extra = "tt";
if ($LANGUAGE eq "source") {
- $l = "ss";
+ $extra = "ss";
} elsif ($LANGUAGE eq "both") {
- $l = "bb";
+ $extra = "bb";
} else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" };
+ $extra = $extra . " tok,tag" if ($PRESERVE_PHRASES);
safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip";
- safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans";
+ safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $extra > $OUT_SPANS") or die "Failed to label spans";
unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt";
safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste";
}
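
The rewritten extract_context above composes the extractor command from two pieces ($ccopt for the cache size, $pipe for the reducer stage), and --preserve_phrases splices the new tag-stripping filter into the pipe so phrases keep their surface tokens while contexts keep their tags. A hedged Python sketch of the same composition (the flags follow the hunk above; program names and default values are placeholders, not the pipeline's real paths):

def extract_context_cmd(preserve_phrases=False, complete_cache=False,
                        extractor='extractor', corpus='corpus.cluster',
                        items_in_memory=10000000, max_phrase=10, context_size=1,
                        language='target', sort_keys='sort-by-key.sh',
                        reducer='reducer', remove_tags='remove-tags-from-contexts.pl',
                        gzip='gzip', out='contexts.txt.gz'):
    # -c 0 lifts the in-memory cache limit when COMPLETE_CACHE is set,
    # and in that case the reducer stage is dropped, as in the Perl code.
    ccopt = '-c 0' if complete_cache else '-c %d' % items_in_memory
    pipe = '' if complete_cache else '| %s ' % reducer
    if preserve_phrases:
        # Keep tokens on phrases, tags on contexts.
        pipe = '| %s --phrase=tok --context=tag %s' % (remove_tags, pipe)
    return ('%s -i %s %s -L %d -C -S %d --phrase_language %s --context_language %s'
            ' | %s %s | %s > %s' % (extractor, corpus, ccopt, max_phrase,
                                    context_size, language, language,
                                    sort_keys, pipe, gzip, out))

print(extract_context_cmd(preserve_phrases=True))
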
diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl
index 200022bc..c0eec43e 100755
--- a/gi/pipeline/scripts/patch-corpus.pl
+++ b/gi/pipeline/scripts/patch-corpus.pl
@@ -3,12 +3,17 @@ use strict;
my $PATCH = shift @ARGV;
my $TGT = 1;
-if ($PATCH eq "-s") {
- undef $TGT;
+my $APPEND;
+while ($PATCH eq "-s" || $PATCH eq "-a") {
+ if ($PATCH eq "-s") {
+ undef $TGT;
+ } else {
+ $APPEND = 1;
+ }
$PATCH = shift @ARGV;
}
-die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
+die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;
open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";
my $first=<P>; close P;
@@ -33,11 +38,25 @@ while(my $pline = <P>) {
if ($TGT) {
my @lwords = split /\s+/, $fields[1];
die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- $fields[1] = $pline;
- } else {
+ if ($APPEND) {
+ foreach my $i (0..(scalar @pwords-1)) {
+ $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+ }
+ $fields[1] = join ' ', @lwords;
+ } else {
+ $fields[1] = $pline;
+ }
+ } else { # source side
my @lwords = split /\s+/, $fields[0];
die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords);
- $fields[0] = $pline;
+ if ($APPEND) {
+ foreach my $i (0..(scalar @pwords-1)) {
+ $lwords[$i] = $lwords[$i] . '_' . $pwords[$i];
+ }
+ $fields[0] = join ' ', @lwords;
+ } else {
+ $fields[0] = $pline;
+ }
}
print join ' ||| ', @fields;
print "\n";
diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl
new file mode 100755
index 00000000..20698816
--- /dev/null
+++ b/gi/pipeline/scripts/remove-tags-from-contexts.pl
@@ -0,0 +1,53 @@
+#!/usr/bin/perl -w
+use strict;
+
+use Getopt::Long "GetOptions";
+
+my $PHRASE = 'tok';
+my $CONTEXT = 'tag';
+
+die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"
+ unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
+
+my $lno = 0;
+while(my $line = <>) {
+ $lno++;
+ chomp $line;
+ my @top = split /\t/, $line;
+ die unless (scalar @top == 2);
+
+ my @pwords = split /\s+/, $top[0];
+ foreach my $token (@pwords) {
+ #print $token . "\n";
+ my @parts = split /_(?!.*_)/, $token;
+ die unless (scalar @parts == 2);
+ if ($PHRASE eq "tok") {
+ $token = $parts[0]
+ } elsif ($PHRASE eq "tag") {
+ $token = $parts[1]
+ }
+ }
+
+ my @fields = split / \|\|\| /, $top[1];
+ foreach my $i (0..((scalar @fields) / 2 - 1)) {
+ #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
+ my @cwords = split /\s+/, $fields[2*$i];
+ foreach my $token (@cwords) {
+ #print $i . ": " . $token . "\n";
+ my @parts = split /_(?!.*_)/, $token;
+ if (scalar @parts == 2) {
+ if ($CONTEXT eq "tok") {
+ $token = $parts[0]
+ } elsif ($CONTEXT eq "tag") {
+ $token = $parts[1]
+ }
+ }
+ }
+ $fields[2*$i] = join ' ', @cwords;
+ }
+
+ print join ' ', @pwords;
+ print "\t";
+ print join ' ||| ', @fields;
+ print "\n";
+}
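
remove-tags-from-contexts.pl reads the extractor's tab-separated records (a phrase, then its ' ||| '-joined list of contexts and counts) and, for every word_TAG item, keeps either the token or the tag half. The split /_(?!.*_)/ matches the last underscore only, so multiword tokens joined with internal underscores stay intact; roughly equivalent Python for the per-item choice:

import re

# The Perl split /_(?!.*_)/ divides a token at its last underscore only:
# 'new_york_NNP' -> ('new_york', 'NNP').
LAST_UNDERSCORE = re.compile(r'_(?!.*_)')

def keep(item, which):
    # Return the 'tok' or 'tag' half of a word_TAG item; leave untagged items alone.
    parts = LAST_UNDERSCORE.split(item)
    if len(parts) != 2:
        return item
    return parts[0] if which == 'tok' else parts[1]

print(keep('new_york_NNP', 'tok'))  # new_york
print(keep('new_york_NNP', 'tag'))  # NNP
print(keep('<PHRASE>', 'tag'))      # <PHRASE> (no tag attached, unchanged)
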
diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl
index 5460db95..be3e97c0 100755
--- a/gi/pipeline/scripts/remove-tags-from-corpus.pl
+++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl
@@ -3,51 +3,42 @@ use strict;
use Getopt::Long "GetOptions";
-my $PHRASE = 'tok';
-my $CONTEXT = 'tag';
-
-die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"
- unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT);
+my $LANGUAGE = shift @ARGV;
+$LANGUAGE = 'target' unless ($LANGUAGE);
my $lno = 0;
while(my $line = <>) {
$lno++;
chomp $line;
- my @top = split /\t/, $line;
- die unless (scalar @top == 2);
- my @pwords = split /\s+/, $top[0];
- foreach my $token (@pwords) {
- #print $token . "\n";
- my @parts = split /_(?!_)/, $token;
- die unless (scalar @parts == 2);
- if ($PHRASE eq "tok") {
- $token = $parts[0]
- } elsif ($PHRASE eq "tag") {
- $token = $parts[1]
+ my @fields = split / \|\|\| /, $line;
+
+ if ($LANGUAGE eq "source" or $LANGUAGE eq "both") {
+ my @cwords = split /\s+/, $fields[0];
+ foreach my $token (@cwords) {
+ my @parts = split /_(?!.*_)/, $token;
+ if (scalar @parts == 2) {
+ $token = $parts[0]
+ } else {
+ print STDERR "WARNING: invalid tagged token $token\n";
+ }
}
+ $fields[0] = join ' ', @cwords;
}
- my @fields = split / \|\|\| /, $top[1];
- foreach my $i (0..((scalar @fields) / 2 - 1)) {
- #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n";
- my @cwords = split /\s+/, $fields[2*$i];
+ if ($LANGUAGE eq "target" or $LANGUAGE eq "both") {
+ my @cwords = split /\s+/, $fields[1];
foreach my $token (@cwords) {
- #print $i . ": " . $token . "\n";
- my @parts = split /_/, $token;
+ my @parts = split /_(?!.*_)/, $token;
if (scalar @parts == 2) {
- if ($CONTEXT eq "tok") {
- $token = $parts[0]
- } elsif ($CONTEXT eq "tag") {
- $token = $parts[1]
- }
+ $token = $parts[1]
+ } else {
+ print STDERR "WARNING: invalid tagged token $token\n";
}
}
- $fields[2*$i] = join ' ', @cwords;
+ $fields[0] = join ' ', @cwords;
}
- print join ' ', @pwords;
- print "\t";
print join ' ||| ', @fields;
print "\n";
}
diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py
index 73ea20f2..50fa8106 100755
--- a/gi/pyp-topics/scripts/spans2labels.py
+++ b/gi/pyp-topics/scripts/spans2labels.py
@@ -4,7 +4,7 @@ import sys
from operator import itemgetter
if len(sys.argv) <= 2:
- print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]"
+ print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}] [type={tag,tok,both},{tag,tok,both}]"
exit(1)
order=1
@@ -19,8 +19,13 @@ if len(sys.argv) > 4:
phr, ctx = sys.argv[4]
assert phr in 'stb'
assert ctx in 'stb'
+phr_typ = ctx_typ = 'both'
+if len(sys.argv) > 5:
+ phr_typ, ctx_typ = sys.argv[5].split(',')
+ assert phr_typ in ('tag', 'tok', 'both')
+ assert ctx_typ in ('tag', 'tok', 'both')
-print >>sys.stderr, "Loading phrase index"
+#print >>sys.stderr, "Loading phrase index"
phrase_context_index = {}
for line in file(sys.argv[1], 'r'):
phrase,tail= line.split('\t')
@@ -43,13 +48,49 @@ for line in file(sys.argv[1], 'r'):
phrase_context_index[(phrase,contexts[i])] = category
#print (phrase,contexts[i]), category
-print >>sys.stderr, "Labelling spans"
+#print >>sys.stderr, "Labelling spans"
for line in sys.stdin:
- line_segments = line.split('|||')
+ #print >>sys.stderr, "line", line.strip()
+ line_segments = line.split(' ||| ')
+ assert len(line_segments) >= 3
source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)]
target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]
phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()]
+ if phr_typ != 'both' or ctx_typ != 'both':
+ if phr in 'tb' or ctx in 'tb':
+ target_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[1].split()) + ['</s>' for x in range(order)]
+ target_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[1].split()) + ['</s>' for x in range(order)]
+
+ if phr in 'tb':
+ if phr_typ == 'tok':
+ targetP = target_toks
+ elif phr_typ == 'tag':
+ targetP = target_tags
+ if ctx in 'tb':
+ if ctx_typ == 'tok':
+ targetC = target_toks
+ elif ctx_typ == 'tag':
+ targetC = target_tags
+
+ if phr in 'sb' or ctx in 'sb':
+ source_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[0].split()) + ['</s>' for x in range(order)]
+ source_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[0].split()) + ['</s>' for x in range(order)]
+
+ if phr in 'sb':
+ if phr_typ == 'tok':
+ sourceP = source_toks
+ elif phr_typ == 'tag':
+ sourceP = source_tags
+ if ctx in 'sb':
+ if ctx_typ == 'tok':
+ sourceC = source_toks
+ elif ctx_typ == 'tag':
+ sourceC = source_tags
+ else:
+ sourceP = sourceC = source
+ targetP = targetC = target
+
#print >>sys.stderr, "line", source, '---', target, 'phrases', phrases
print "|||",
@@ -62,17 +103,17 @@ for line in sys.stdin:
phraset = phrases = contextt = contexts = ''
if phr in 'tb':
- phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip()
+ phraset = reduce(lambda x, y: x+y+" ", targetP[t1:t2], "").strip()
if phr in 'sb':
- phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip()
+ phrases = reduce(lambda x, y: x+y+" ", sourceP[s1:s2], "").strip()
if ctx in 'tb':
- left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "")
- right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip()
+ left_context = reduce(lambda x, y: x+y+" ", targetC[t1-order:t1], "")
+ right_context = reduce(lambda x, y: x+y+" ", targetC[t2:t2+order], "").strip()
contextt = "%s<PHRASE> %s" % (left_context, right_context)
if ctx in 'sb':
- left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "")
- right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip()
+ left_context = reduce(lambda x, y: x+y+" ", sourceC[s1-order:s1], "")
+ right_context = reduce(lambda x, y: x+y+" ", sourceC[s2:s2+order], "").strip()
contexts = "%s<PHRASE> %s" % (left_context, right_context)
if phr == 'b':
@@ -94,5 +135,3 @@ for line in sys.stdin:
if label != cutoff_cat: #cutoff'd spans are left unlabelled
print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),
print
-
-
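
spans2labels.py now accepts a fifth argument selecting whether phrases and contexts are drawn from the token view, the tag view, or the mixed word_TAG strings, and it builds those views with rsplit('_', 1). A minimal sketch of just that view selection (the span lookup itself is unchanged; the function name is illustrative):

def views(words, phr_typ='both', ctx_typ='both', order=1):
    # 'tok' keeps what precedes the last underscore, 'tag' what follows it,
    # 'both' keeps the mixed word_TAG string; pad with <s>/</s> as the script does.
    pad = lambda ws: ['<s>'] * order + ws + ['</s>'] * order
    toks = [w.rsplit('_', 1)[0] for w in words]
    tags = [w.rsplit('_', 1)[-1] for w in words]
    pick = {'tok': toks, 'tag': tags, 'both': words}
    return pad(pick[phr_typ]), pad(pick[ctx_typ])

phrase_view, context_view = views(['the_DT', 'cat_NN'], 'tok', 'tag')
print(phrase_view)   # ['<s>', 'the', 'cat', '</s>']
print(context_view)  # ['<s>', 'DT', 'NN', '</s>']
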