diff options
Diffstat (limited to 'gi')
| -rwxr-xr-x | gi/pipeline/local-gi-pipeline.pl | 30 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/patch-corpus.pl | 31 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-contexts.pl | 53 | ||||
| -rwxr-xr-x | gi/pipeline/scripts/remove-tags-from-corpus.pl | 51 | ||||
| -rwxr-xr-x | gi/pyp-topics/scripts/spans2labels.py | 63 | 
5 files changed, 172 insertions, 56 deletions
diff --git a/gi/pipeline/local-gi-pipeline.pl b/gi/pipeline/local-gi-pipeline.pl index db2969c7..e832e556 100755 --- a/gi/pipeline/local-gi-pipeline.pl +++ b/gi/pipeline/local-gi-pipeline.pl @@ -21,6 +21,7 @@ my $BIDIR = 0;  my $TOPICS_CONFIG = "pyp-topics.conf";  my $LANGUAGE = "target";  my $LABEL_THRESHOLD = 0; +my $PRESERVE_PHRASES;  my $MODEL = "pyp";  my $NUM_ITERS = 100; @@ -45,11 +46,13 @@ my $PREM_TRAIN="$PRTOOLS/prjava/train-PR-cluster.sh";  my $SORT_KEYS = "$SCRIPT_DIR/scripts/sort-by-key.sh";  my $PATCH_CORPUS = "$SCRIPT_DIR/scripts/patch-corpus.pl"; +my $REMOVE_TAGS_CORPUS = "$SCRIPT_DIR/scripts/remove-tags-from-corpus.pl"; +my $REMOVE_TAGS_CONTEXT = "$SCRIPT_DIR/scripts/remove-tags-from-contexts.pl";  my $EXTRACTOR = "$EXTOOLS/extractor";  my $TOPIC_TRAIN = "$PYPTOOLS/pyp-contexts-train";  assert_exec($PATCH_CORPUS, $SORT_KEYS, $REDUCER, $EXTRACTOR, -            $S2L, $C2D, $TOPIC_TRAIN, $SPLIT); +            $S2L, $C2D, $TOPIC_TRAIN, $SPLIT, $REMOVE_TAGS_CONTEXT, $REMOVE_TAGS_CORPUS);  my $BACKOFF_GRAMMAR;  my $DEFAULT_CAT; @@ -77,7 +80,8 @@ usage() unless &GetOptions('base_phrase_max_size=i' => \$BASE_PHRASE_MAX_SIZE,                             'pr-flags=s' => \$PR_FLAGS,                             'tagged_corpus=s' => \$TAGGED_CORPUS,                             'language=s' => \$LANGUAGE, -                           'get_name_only' => \$NAME_SHORTCUT +                           'get_name_only' => \$NAME_SHORTCUT, +                           'preserve_phrases' => \$PRESERVE_PHRASES,                            );  if ($NAME_SHORTCUT) {    $NUM_TOPICS = $NUM_TOPICS_FINE; @@ -185,6 +189,7 @@ sub setup_data {      die "Can't find $TAGGED_CORPUS" unless -f $TAGGED_CORPUS;      my $opt="";      $opt = "-s" if ($LANGUAGE eq "source"); +    $opt = "-a" if ($PRESERVE_PHRASES);      my $cmd="$PATCH_CORPUS $opt $TAGGED_CORPUS $CORPUS_LEX > $CORPUS_CLUSTER";      safesystem($cmd) or die "Failed to extract contexts.";    } else { @@ -260,11 +265,19 @@ sub extract_context {   if (-e $OUT_CONTEXTS) {     print STDERR "$OUT_CONTEXTS exists, reusing...\n";   } else { -   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS | $REDUCER | $GZIP > $OUT_CONTEXTS"; +   my $ccopt = "-c $ITEMS_IN_MEMORY"; +   my $pipe = "| $REDUCER ";     if ($COMPLETE_CACHE) {       print STDERR "COMPLETE_CACHE is set: removing memory limits on cache.\n"; -     $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER -c 0 -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE  --phrase_language $LANGUAGE --context_language $LANGUAGE  | $SORT_KEYS | $GZIP > $OUT_CONTEXTS"; +     $ccopt = "-c 0"; +     $pipe = "";     } + +   if ($PRESERVE_PHRASES) { +    $pipe = "| $REMOVE_TAGS_CONTEXT --phrase=tok --context=tag " . $pipe; +   } + +   my $cmd = "$EXTRACTOR -i $CORPUS_CLUSTER $ccopt -L $BASE_PHRASE_MAX_SIZE -C -S $CONTEXT_SIZE --phrase_language $LANGUAGE --context_language $LANGUAGE | $SORT_KEYS $pipe | $GZIP > $OUT_CONTEXTS";     safesystem($cmd) or die "Failed to extract contexts.";    }  } @@ -331,14 +344,15 @@ sub label_spans_with_topics {    if (-e $OUT_SPANS) {      print STDERR "$OUT_SPANS exists, reusing...\n";    } else { -    my $l = "tt"; +    my $extra = "tt";      if ($LANGUAGE eq "source") { -        $l = "ss"; +        $extra = "ss";      } elsif ($LANGUAGE eq "both") { -        $l = "bb"; +        $extra = "bb";      } else { die "Invalid language specifier $LANGUAGE\n" unless $LANGUAGE eq "target" }; +    $extra = $extra . " tok,tag" if ($PRESERVE_PHRASES);      safesystem("$ZCAT $IN_CLUSTERS > $CLUSTER_DIR/clusters.txt") or die "Failed to unzip"; -    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $l > $OUT_SPANS") or die "Failed to label spans"; +    safesystem("$EXTRACTOR --base_phrase_spans -i $CORPUS_CLUSTER -c $ITEMS_IN_MEMORY -L $BASE_PHRASE_MAX_SIZE -S $CONTEXT_SIZE | $S2L $CLUSTER_DIR/clusters.txt $CONTEXT_SIZE $LABEL_THRESHOLD $extra > $OUT_SPANS") or die "Failed to label spans";      unlink("$CLUSTER_DIR/clusters.txt") or warn "Failed to remove $CLUSTER_DIR/clusters.txt";      safesystem("paste -d ' ' $CORPUS_LEX $OUT_SPANS > $LABELED_DIR/corpus.src_trg_al_label") or die "Couldn't paste";    } diff --git a/gi/pipeline/scripts/patch-corpus.pl b/gi/pipeline/scripts/patch-corpus.pl index 200022bc..c0eec43e 100755 --- a/gi/pipeline/scripts/patch-corpus.pl +++ b/gi/pipeline/scripts/patch-corpus.pl @@ -3,12 +3,17 @@ use strict;  my $PATCH = shift @ARGV;  my $TGT = 1; -if ($PATCH eq "-s") { -    undef $TGT; +my $APPEND; +while ($PATCH eq "-s" || $PATCH eq "-a") { +    if ($PATCH eq "-s") { +        undef $TGT; +    } else { +        $APPEND = 1; +    }      $PATCH = shift @ARGV;  } -die "Usage: $0 [-s] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH; +die "Usage: $0 [-s] [-a] tagged.en[_fr] < lexical.en_fr_al[_...]\n" unless $PATCH;  open P, "<$PATCH" or die "Can't read tagged corpus $PATCH: $!";  my $first=<P>; close P; @@ -33,11 +38,25 @@ while(my $pline = <P>) {    if ($TGT) {        my @lwords = split /\s+/, $fields[1];        die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      $fields[1] = $pline; -  } else { +      if ($APPEND) { +          foreach my $i (0..(scalar @pwords-1)) { +              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; +          } +          $fields[1] = join ' ', @lwords; +      } else { +          $fields[1] = $pline; +      } +  } else { # source side        my @lwords = split /\s+/, $fields[0];        die "Length mismatch in line $line!\n" unless (scalar @pwords == scalar @lwords); -      $fields[0] = $pline; +      if ($APPEND) { +          foreach my $i (0..(scalar @pwords-1)) { +              $lwords[$i] = $lwords[$i] . '_' . $pwords[$i]; +          } +          $fields[0] = join ' ', @lwords; +      } else { +          $fields[0] = $pline; +      }    }    print join ' ||| ', @fields;    print "\n"; diff --git a/gi/pipeline/scripts/remove-tags-from-contexts.pl b/gi/pipeline/scripts/remove-tags-from-contexts.pl new file mode 100755 index 00000000..20698816 --- /dev/null +++ b/gi/pipeline/scripts/remove-tags-from-contexts.pl @@ -0,0 +1,53 @@ +#!/usr/bin/perl -w +use strict; + +use Getopt::Long "GetOptions"; + +my $PHRASE = 'tok'; +my $CONTEXT = 'tag'; + +die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"  +    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); + +my $lno = 0; +while(my $line = <>) { +    $lno++; +    chomp $line; +    my @top = split /\t/, $line; +    die unless (scalar @top == 2);  + +    my @pwords = split /\s+/, $top[0]; +    foreach my $token (@pwords) { +        #print $token . "\n"; +        my @parts = split /_(?!.*_)/, $token; +        die unless (scalar @parts == 2);  +        if ($PHRASE eq "tok") { +            $token = $parts[0] +        } elsif ($PHRASE eq "tag") { +            $token = $parts[1] +        } +    } + +    my @fields = split / \|\|\| /, $top[1]; +    foreach my $i (0..((scalar @fields) / 2 - 1)) { +        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; +        my @cwords = split /\s+/, $fields[2*$i]; +        foreach my $token (@cwords) { +            #print $i . ": " . $token . "\n"; +            my @parts = split /_(?!.*_)/, $token; +            if (scalar @parts == 2) { +                if ($CONTEXT eq "tok") { +                    $token = $parts[0] +                } elsif ($CONTEXT eq "tag") { +                    $token = $parts[1] +                } +            } +        } +        $fields[2*$i] = join ' ', @cwords; +    } + +    print join ' ', @pwords; +    print "\t"; +    print join ' ||| ', @fields; +    print "\n"; +} diff --git a/gi/pipeline/scripts/remove-tags-from-corpus.pl b/gi/pipeline/scripts/remove-tags-from-corpus.pl index 5460db95..be3e97c0 100755 --- a/gi/pipeline/scripts/remove-tags-from-corpus.pl +++ b/gi/pipeline/scripts/remove-tags-from-corpus.pl @@ -3,51 +3,42 @@ use strict;  use Getopt::Long "GetOptions"; -my $PHRASE = 'tok'; -my $CONTEXT = 'tag'; - -die "Usage: $0 [--phrase=tok|tag] [--context=tok|tag] < corpus"  -    unless &GetOptions('phrase=s' => \$PHRASE, 'context=s' => \$CONTEXT); +my $LANGUAGE = shift @ARGV; +$LANGUAGE = 'target' unless ($LANGUAGE);  my $lno = 0;  while(my $line = <>) {      $lno++;      chomp $line; -    my @top = split /\t/, $line; -    die unless (scalar @top == 2);  -    my @pwords = split /\s+/, $top[0]; -    foreach my $token (@pwords) { -        #print $token . "\n"; -        my @parts = split /_(?!_)/, $token; -        die unless (scalar @parts == 2);  -        if ($PHRASE eq "tok") { -            $token = $parts[0] -        } elsif ($PHRASE eq "tag") { -            $token = $parts[1] +    my @fields = split / \|\|\| /, $line; + +    if ($LANGUAGE eq "source" or $LANGUAGE eq "both") { +        my @cwords = split /\s+/, $fields[0]; +        foreach my $token (@cwords) { +            my @parts = split /_(?!.*_)/, $token; +            if (scalar @parts == 2) { +                $token = $parts[0] +            } else { +                print STDERR "WARNING: invalid tagged token $token\n"; +            }          } +        $fields[0] = join ' ', @cwords;      } -    my @fields = split / \|\|\| /, $top[1]; -    foreach my $i (0..((scalar @fields) / 2 - 1)) { -        #print $i . ": " . $fields[2*$i] . " of " . (scalar @fields) . "\n"; -        my @cwords = split /\s+/, $fields[2*$i]; +    if ($LANGUAGE eq "target" or $LANGUAGE eq "both") { +        my @cwords = split /\s+/, $fields[1];          foreach my $token (@cwords) { -            #print $i . ": " . $token . "\n"; -            my @parts = split /_/, $token; +            my @parts = split /_(?!.*_)/, $token;              if (scalar @parts == 2) { -                if ($CONTEXT eq "tok") { -                    $token = $parts[0] -                } elsif ($CONTEXT eq "tag") { -                    $token = $parts[1] -                } +                $token = $parts[1] +            } else { +                print STDERR "WARNING: invalid tagged token $token\n";              }          } -        $fields[2*$i] = join ' ', @cwords; +        $fields[0] = join ' ', @cwords;      } -    print join ' ', @pwords; -    print "\t";      print join ' ||| ', @fields;      print "\n";  } diff --git a/gi/pyp-topics/scripts/spans2labels.py b/gi/pyp-topics/scripts/spans2labels.py index 73ea20f2..50fa8106 100755 --- a/gi/pyp-topics/scripts/spans2labels.py +++ b/gi/pyp-topics/scripts/spans2labels.py @@ -4,7 +4,7 @@ import sys  from operator import itemgetter  if len(sys.argv) <= 2: -  print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}]" +  print "Usage: spans2labels.py phrase_context_index [order] [threshold] [languages={s,t,b}{s,t,b}] [type={tag,tok,both},{tag,tok,both}]"    exit(1)  order=1 @@ -19,8 +19,13 @@ if len(sys.argv) > 4:    phr, ctx = sys.argv[4]    assert phr in 'stb'    assert ctx in 'stb' +phr_typ = ctx_typ = 'both' +if len(sys.argv) > 5: +  phr_typ, ctx_typ = sys.argv[5].split(',') +  assert phr_typ in ('tag', 'tok', 'both') +  assert ctx_typ in ('tag', 'tok', 'both') -print >>sys.stderr, "Loading phrase index" +#print >>sys.stderr, "Loading phrase index"  phrase_context_index = {}  for line in file(sys.argv[1], 'r'):    phrase,tail= line.split('\t') @@ -43,13 +48,49 @@ for line in file(sys.argv[1], 'r'):      phrase_context_index[(phrase,contexts[i])] = category       #print (phrase,contexts[i]), category -print >>sys.stderr, "Labelling spans" +#print >>sys.stderr, "Labelling spans"  for line in sys.stdin: -  line_segments = line.split('|||') +  #print >>sys.stderr, "line", line.strip() +  line_segments = line.split(' ||| ') +  assert len(line_segments) >= 3    source = ['<s>' for x in range(order)] + line_segments[0].split() + ['</s>' for x in range(order)]    target = ['<s>' for x in range(order)] + line_segments[1].split() + ['</s>' for x in range(order)]    phrases = [ [int(i) for i in x.split('-')] for x in line_segments[2].split()] +  if phr_typ != 'both' or ctx_typ != 'both': +    if phr in 'tb' or ctx in 'tb': +        target_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[1].split()) + ['</s>' for x in range(order)] +        target_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[1].split()) + ['</s>' for x in range(order)] + +        if phr in 'tb': +            if phr_typ == 'tok': +                targetP = target_toks +            elif phr_typ == 'tag': +                targetP = target_tags +        if ctx in 'tb': +            if ctx_typ == 'tok': +                targetC = target_toks +            elif ctx_typ == 'tag': +                targetC = target_tags + +    if phr in 'sb' or ctx in 'sb': +        source_toks = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[0], line_segments[0].split()) + ['</s>' for x in range(order)] +        source_tags = ['<s>' for x in range(order)] + map(lambda x: x.rsplit('_', 1)[-1], line_segments[0].split()) + ['</s>' for x in range(order)] + +        if phr in 'sb': +            if phr_typ == 'tok': +                sourceP = source_toks +            elif phr_typ == 'tag': +                sourceP = source_tags +        if ctx in 'sb': +            if ctx_typ == 'tok': +                sourceC = source_toks +            elif ctx_typ == 'tag': +                sourceC = source_tags +  else: +    sourceP = sourceC = source +    targetP = targetC = target +    #print >>sys.stderr, "line", source, '---', target, 'phrases', phrases    print "|||", @@ -62,17 +103,17 @@ for line in sys.stdin:      phraset = phrases = contextt = contexts = ''      if phr in 'tb': -        phraset = reduce(lambda x, y: x+y+" ", target[t1:t2], "").strip() +        phraset = reduce(lambda x, y: x+y+" ", targetP[t1:t2], "").strip()      if phr in 'sb': -        phrases = reduce(lambda x, y: x+y+" ", source[s1:s2], "").strip() +        phrases = reduce(lambda x, y: x+y+" ", sourceP[s1:s2], "").strip()      if ctx in 'tb': -        left_context = reduce(lambda x, y: x+y+" ", target[t1-order:t1], "") -        right_context = reduce(lambda x, y: x+y+" ", target[t2:t2+order], "").strip() +        left_context = reduce(lambda x, y: x+y+" ", targetC[t1-order:t1], "") +        right_context = reduce(lambda x, y: x+y+" ", targetC[t2:t2+order], "").strip()          contextt = "%s<PHRASE> %s" % (left_context, right_context)      if ctx in 'sb': -        left_context = reduce(lambda x, y: x+y+" ", source[s1-order:s1], "") -        right_context = reduce(lambda x, y: x+y+" ", source[s2:s2+order], "").strip() +        left_context = reduce(lambda x, y: x+y+" ", sourceC[s1-order:s1], "") +        right_context = reduce(lambda x, y: x+y+" ", sourceC[s2:s2+order], "").strip()          contexts = "%s<PHRASE> %s" % (left_context, right_context)      if phr == 'b': @@ -94,5 +135,3 @@ for line in sys.stdin:      if label != cutoff_cat: #cutoff'd spans are left unlabelled        print "%d-%d-%d-%d:X%s" % (s1-order,s2-order,t1-order,t2-order,label),    print - -  | 
