From 7d0cad292c444baddd70c3b76540304364d454d9 Mon Sep 17 00:00:00 2001 From: "trevor.cohn" Date: Fri, 23 Jul 2010 16:39:41 +0000 Subject: Pipeline code for running with mixing tokens and tags in the clustering. git-svn-id: https://ws10smt.googlecode.com/svn/trunk@377 ec762483-ff6d-05da-a07a-a48fb63a330f --- gi/pipeline/local-gi-pipeline.pl | 30 ++++++++--- gi/pipeline/scripts/patch-corpus.pl | 31 +++++++++--- gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 ++++++++++++++++++++ gi/pipeline/scripts/remove-tags-from-corpus.pl | 51 ++++++++----------- gi/pyp-topics/scripts/spans2labels.py | 63 +++++++++++++++++++----- 5 files changed, 172 insertions(+), 56 deletions(-) create mode 100755 gi/pipeline/scripts/remove-tags-from-contexts.pl diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index db2969c7..e832e556 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -21,6 +21,7 @@ my $BIDIR = 0; my $TOPICS_CONFIG = "pyp-topics.conf"; my $LANGUAGE = "target"; my $LABEL_THRESHOLD = 0; +my $PRESERVE_PHRASES; my $MODEL = "pyp"; my $NUM_ITERS = 100; @@ -45,11 +46,13 @@ my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh"; my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh"; my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl"; +my $REMOVE_TAGS_CORPUS = "$SCRIPT_DIR/scripts/remove-tags-from-corpus.pl"; +my $REMOVE_TAGS_CONTEXT = "$SCRIPT_DIR/scripts/remove-tags-from-contexts.pl"; my $EXTRACTOR = "$EXTOOLS/extractor"; my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train"; assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, - $S2L, $C2D, $TOPIC_TRAIN, $SPLIT); + $S2L, $C2D, $TOPIC_TRAIN, $SPLIT, $REMOVE_TAGS_CONTEXT, $REMOVE_TAGS_CORPUS); my $BACKOFF_GRAMMAR; my $DEFAULT_CAT; @@ -77,7 +80,8 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE, 'pr-flags=s' => \$PR_FLAGS, 'tagged_corpus=s' => \$TAGGED_CORPUS, 'language=s' => \$LANGUAGE, - 'get_name_only' => \$NAME_SHORTCUT + 
'get_name_only' => \$NAME_SHORTCUT, + 'preserve_phrases' => \$PRESERVE_PHRASES, ); if ($NAME_SHORTCUT) { $NUM_TOPICS = $NUM_TOPICS_FINE; @@ -185,6 +189,7 @@ sub setup_data { die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS; my $opt=""; $opt = "-s" if ($LANGUAGE eq "source"); + $opt = "-a" if ($PRESERVE_PHRASES); my $cmd="$PATCH_CORPUS $opt $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER"; safesystem($cmd) or die "Failed to extract contexts."; } else { @@ -260,11 +265,19 @@ sub extract_context { if (-e $OUT_CONTEXTS) { print STDERR "$OUT_CONTEXTS exists, reusing...\n"; } else { - my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; + my $ccopt = "-c $ITEMS_IN_MEMORY"; + my $pipe = "| $REDUCER "; if ($COMPLETE_CACHE) { print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; - $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; + $ccopt = "-c 0"; + $pipe = ""; } + + if ($PRESERVE_PHRASES) { + $pipe = "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " . $pipe; + } + + my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS $pipe | $GZIP > $OUT_CONTEXTS"; safesystem($cmd) or die "Failed to extract contexts."; } } @@ -331,14 +344,15 @@ sub label_spans_with_topics { if (-e $OUT_SPANS) { print STDERR "$OUT_SPANS exists, reusing...\n"; } else { - my $l = "tt"; + my $extra = "tt"; if ($LANGUAGE eq "source") { - $l = "ss"; + $extra = "ss"; } elsif ($LANGUAGE eq "both") { - $l = "bb"; + $extra = "bb"; } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" }; + $extra = $extra . 
" tok,tag" if ($PRESERVE_PHRASES); safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; - safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans"; + safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $extra > $OUT_SPANS") or die "Failed to label spans"; unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt"; safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste"; } diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl index 200022bc..c0eec43e 100755 --- a/gi/pipeline/scripts/patch-corpus.pl +++ b/gi/pipeline/scripts/patch-corpus.pl @@ -3,12 +3,17 @@ use strict; my $PATCH = shift @ARGV; my $TGT = 1; -if ($PATCH eq "-s") { - undef $TGT; +my $APPEND; +while ($PATCH eq "-s" || $PATCH eq "-a") { + if ($PATCH eq "-s") { + undef $TGT; + } else { + $APPEND = 1; + } $PATCH = shift @ARGV; } -die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; +die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!"; my $first=

<P>; close P; @@ -33,11 +38,25 @@ while(my $pline = <P>
) { if ($TGT) { my @lwords = split /\s+/, $fields[1]; die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - $fields[1] = $pline; - } else { + if ($APPEND) { + foreach my $i (0..(scalar @pwords-1)) { + $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; + } + $fields[1] = join ' ', @lwords; + } else { + $fields[1] = $pline; + } + } else { # source side my @lwords = split /\s+/, $fields[0]; die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); - $fields[0] = $pline; + if ($APPEND) { + foreach my $i (0..(scalar @pwords-1)) { + $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; + } + $fields[0] = join ' ', @lwords; + } else { + $fields[0] = $pline; + } } print join ' ||| ', @fields; print "\n"; diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl new file mode 100755 index 00000000..20698816 --- /dev/null +++ b/gi/pipeline/scripts/remove-tags-from-contexts.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl -w +use strict; + +use Getopt::Long "GetOptions"; + +my $PHRASE = 'tok'; +my $CONTEXT = 'tag'; + +die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" + unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); + +my $lno = 0; +while(my $line = <>) { + $lno++; + chomp $line; + my @top = split /\t/, $line; + die unless (scalar @top == 2); + + my @pwords = split /\s+/, $top[0]; + foreach my $token (@pwords) { + #print $token . "\n"; + my @parts = split /_(?!.*_)/, $token; + die unless (scalar @parts == 2); + if ($PHRASE eq "tok") { + $token = $parts[0] + } elsif ($PHRASE eq "tag") { + $token = $parts[1] + } + } + + my @fields = split / \|\|\| /, $top[1]; + foreach my $i (0..((scalar @fields) / 2 - 1)) { + #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; + my @cwords = split /\s+/, $fields[2*$i]; + foreach my $token (@cwords) { + #print $i . ": " . $token . 
"\n"; + my @parts = split /_(?!.*_)/, $token; + if (scalar @parts == 2) { + if ($CONTEXT eq "tok") { + $token = $parts[0] + } elsif ($CONTEXT eq "tag") { + $token = $parts[1] + } + } + } + $fields[2*$i] = join ' ', @cwords; + } + + print join ' ', @pwords; + print "\t"; + print join ' ||| ', @fields; + print "\n"; +} diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl index 5460db95..be3e97c0 100755 --- a/gi/pipeline/scripts/remove-tags-from-corpus.pl +++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl @@ -3,51 +3,42 @@ use strict; use Getopt::Long "GetOptions"; -my $PHRASE = 'tok'; -my $CONTEXT = 'tag'; - -die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus" - unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); +my $LANGUAGE = shift @ARGV; +$LANGUAGE = 'target' unless ($LANGUAGE); my $lno = 0; while(my $line = <>) { $lno++; chomp $line; - my @top = split /\t/, $line; - die unless (scalar @top == 2); - my @pwords = split /\s+/, $top[0]; - foreach my $token (@pwords) { - #print $token . "\n"; - my @parts = split /_(?!_)/, $token; - die unless (scalar @parts == 2); - if ($PHRASE eq "tok") { - $token = $parts[0] - } elsif ($PHRASE eq "tag") { - $token = $parts[1] + my @fields = split / \|\|\| /, $line; + + if ($LANGUAGE eq "source" or $LANGUAGE eq "both") { + my @cwords = split /\s+/, $fields[0]; + foreach my $token (@cwords) { + my @parts = split /_(?!.*_)/, $token; + if (scalar @parts == 2) { + $token = $parts[0] + } else { + print STDERR "WARNING: invalid tagged token $token\n"; + } } + $fields[0] = join ' ', @cwords; } - my @fields = split / \|\|\| /, $top[1]; - foreach my $i (0..((scalar @fields) / 2 - 1)) { - #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; - my @cwords = split /\s+/, $fields[2*$i]; + if ($LANGUAGE eq "target" or $LANGUAGE eq "both") { + my @cwords = split /\s+/, $fields[1]; foreach my $token (@cwords) { - #print $i . ": " . $token . 
"\n"; - my @parts = split /_/, $token; + my @parts = split /_(?!.*_)/, $token; if (scalar @parts == 2) { - if ($CONTEXT eq "tok") { - $token = $parts[0] - } elsif ($CONTEXT eq "tag") { - $token = $parts[1] - } + $token = $parts[1] + } else { + print STDERR "WARNING: invalid tagged token $token\n"; } } - $fields[2*$i] = join ' ', @cwords; + $fields[0] = join ' ', @cwords; } - print join ' ', @pwords; - print "\t"; print join ' ||| ', @fields; print "\n"; } diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index 73ea20f2..50fa8106 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -4,7 +4,7 @@ import sys from operator import itemgetter if len(sys.argv) <= 2: - print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]" + print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}] [type={tag,tok,both},{tag,tok,both}]" exit(1) order=1 @@ -19,8 +19,13 @@ if len(sys.argv) > 4: phr, ctx = sys.argv[4] assert phr in 'stb' assert ctx in 'stb' +phr_typ = ctx_typ = 'both' +if len(sys.argv) > 5: + phr_typ, ctx_typ = sys.argv[5].split(',') + assert phr_typ in ('tag', 'tok', 'both') + assert ctx_typ in ('tag', 'tok', 'both') -print >>sys.stderr, "Loading phrase index" +#print >>sys.stderr, "Loading phrase index" phrase_context_index = {} for line in file(sys.argv[1], 'r'): phrase,tail= line.split('\t') @@ -43,13 +48,49 @@ for line in file(sys.argv[1], 'r'): phrase_context_index[(phrase,contexts[i])] = category #print (phrase,contexts[i]), category -print >>sys.stderr, "Labelling spans" +#print >>sys.stderr, "Labelling spans" for line in sys.stdin: - line_segments = line.split('|||') + #print >>sys.stderr, "line", line.strip() + line_segments = line.split(' ||| ') + assert len(line_segments) >= 3 source = ['' for x in range(order)] + line_segments[0].split() + ['' for x in range(order)] target = ['' for x in 
range(order)] + line_segments[1].split() + ['' for x in range(order)] phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()] + if phr_typ != 'both' or ctx_typ != 'both': + if phr in 'tb' or ctx in 'tb': + target_toks = ['' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[1].split()) + ['' for x in range(order)] + target_tags = ['' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[1].split()) + ['' for x in range(order)] + + if phr in 'tb': + if phr_typ == 'tok': + targetP = target_toks + elif phr_typ == 'tag': + targetP = target_tags + if ctx in 'tb': + if ctx_typ == 'tok': + targetC = target_toks + elif ctx_typ == 'tag': + targetC = target_tags + + if phr in 'sb' or ctx in 'sb': + source_toks = ['' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[0].split()) + ['' for x in range(order)] + source_tags = ['' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[0].split()) + ['' for x in range(order)] + + if phr in 'sb': + if phr_typ == 'tok': + sourceP = source_toks + elif phr_typ == 'tag': + sourceP = source_tags + if ctx in 'sb': + if ctx_typ == 'tok': + sourceC = source_toks + elif ctx_typ == 'tag': + sourceC = source_tags + else: + sourceP = sourceC = source + targetP = targetC = target + #print >>sys.stderr, "line", source, '---', target, 'phrases', phrases print "|||", @@ -62,17 +103,17 @@ for line in sys.stdin: phraset = phrases = contextt = contexts = '' if phr in 'tb': - phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() + phraset = reduce(lambda x, y: x+y+" ", targetP[t1:t2], "").strip() if phr in 'sb': - phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip() + phrases = reduce(lambda x, y: x+y+" ", sourceP[s1:s2], "").strip() if ctx in 'tb': - left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") - right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() + left_context = 
reduce(lambda x, y: x+y+" ", targetC[t1-order:t1], "") + right_context = reduce(lambda x, y: x+y+" ", targetC[t2:t2+order], "").strip() contextt = "%s %s" % (left_context, right_context) if ctx in 'sb': - left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "") - right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip() + left_context = reduce(lambda x, y: x+y+" ", sourceC[s1-order:s1], "") + right_context = reduce(lambda x, y: x+y+" ", sourceC[s2:s2+order], "").strip() contexts = "%s %s" % (left_context, right_context) if phr == 'b': @@ -94,5 +135,3 @@ for line in sys.stdin: if label != cutoff_cat: #cutoff'd spans are left unlabelled print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label), print - - -- cgit v1.2.3