diff options
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/conll2cdec.pl | 42 | ||||
| -rwxr-xr-x | corpus/tokenize-anything.sh | 7 | 
2 files changed, 49 insertions, 0 deletions
diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl
new file mode 100755
index 00000000..ee4e07db
--- /dev/null
+++ b/corpus/conll2cdec.pl
@@ -0,0 +1,70 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Converts a CoNLL-formatted labeled sequence file into cdec's <seg> format.
+#
+# Input: one token per line as whitespace-separated columns
+#     WORD FEAT_1 ... FEAT_n LABEL
+# where a blank line ends each sentence.  Every token line must carry the
+# same number of feature columns (n may be 0).
+#
+# Output (stdout): one line per non-empty sentence
+#     <seg id="I" feat1="..." ... featn="..."> WORDS ||| LABELS </seg>
+# where I counts sentences from 0 and featK holds the K-th feature column of
+# every token, space-separated, with double quotes replaced by single quotes.
+
+die "Usage: $0 file.conll\n\n  Converts a CoNLL formatted labeled sequence into cdec's format.\n\n" unless scalar @ARGV == 1;
+# Three-argument open with a lexical handle: the file name is never parsed
+# for mode characters such as a leading '>' or a trailing '|'.
+open my $in, '<', $ARGV[0] or die "Can't read $ARGV[0]: $!\n";
+
+my @xx;           # words of the sentence being read
+my @yy;           # labels of the sentence being read
+my @os;           # one array ref of feature columns per token
+my $sec = undef;  # feature columns per token, fixed by the first token line
+my $i = 0;        # id of the next <seg> to print
+
+# Prints the buffered sentence as one <seg> line and empties the buffers.
+# Does nothing when no tokens are buffered, so leading or repeated blank
+# lines never produce empty segments.
+sub flush_seg {
+  return unless @xx;
+  print "<seg id=\"$i\"";
+  $i++;
+  for my $j (0 .. $sec - 1) {
+    my @oo;
+    for my $fs (@os) {
+      my $sym = $fs->[$j];
+      $sym =~ s/"/'/g;  # a double quote would end the attribute value early
+      push @oo, $sym;
+    }
+    my $zz = $j + 1;    # attribute names are 1-based
+    print " feat$zz=\"@oo\"";
+  }
+  print "> @xx ||| @yy </seg>\n";
+  @xx = ();
+  @yy = ();
+  @os = ();
+}
+
+while (my $line = <$in>) {
+  chomp $line;
+  if ($line =~ /^\s*$/) {
+    flush_seg();
+    next;
+  }
+  # split ' ' skips leading whitespace, so an indented line cannot yield an
+  # empty word the way split /\s+/ would.
+  my ($x, @fs) = split ' ', $line;
+  die "$ARGV[0] line $.: need at least a word and a label\n" unless @fs;
+  my $y = pop @fs;
+  $sec = scalar @fs unless defined $sec;
+  die "$ARGV[0] line $.: expected $sec feature column(s), found " . scalar(@fs) . "\n"
+    unless $sec == scalar @fs;
+  push @xx, $x;
+  push @yy, $y;
+  push @os, \@fs;
+}
+# The final sentence is often not followed by a blank line; emit it too.
+flush_seg();
+close $in;
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index bca954d1..c580e88b 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -7,6 +7,13 @@ if [[ $# == 1 && $1 == '-u' ]] ; then
     NORMARGS="--batchline"
     SEDFLAGS="-u"
 else
+    if [[ $# != 0 ]] ; then
+        echo "Usage: $(basename "$0") [-u] < file.in > file.out" 1>&2
+        echo 1>&2
+        echo Tokenizes text in a reasonable way in most languages. 1>&2
+        echo 1>&2
+        exit 1
+    fi
     NORMARGS=""
     SEDFLAGS=""
 fi
