diff options
author | Wu, Ke <wuke@cs.umd.edu> | 2014-12-17 16:11:38 -0500 |
---|---|---|
committer | Wu, Ke <wuke@cs.umd.edu> | 2014-12-17 16:11:38 -0500 |
commit | 1613f1fc44ca67820afd7e7b21eb54b316c8ce55 (patch) | |
tree | e02b77084f28a18df6b854f87a986124db44d717 /corpus | |
parent | bd9308e22b5434aa220cc57d82ee867464a011f1 (diff) | |
parent | 796768086a687d3f1856fef6489c34fe4d373642 (diff) |
Merge with upstream
Diffstat (limited to 'corpus')
-rwxr-xr-x | corpus/conll2cdec.pl | 42 |
-rwxr-xr-x | corpus/tokenize-anything.sh | 7 |
2 files changed, 49 insertions, 0 deletions
```diff
diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl
new file mode 100755
index 00000000..ee4e07db
--- /dev/null
+++ b/corpus/conll2cdec.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 file.conll\n\n Converts a CoNLL formatted labeled sequence into cdec's format.\n\n" unless scalar @ARGV == 1;
+open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!\n";
+
+my @xx;
+my @yy;
+my @os;
+my $sec = undef;
+my $i = 0;
+while(<F>) {
+  chomp;
+  if (/^\s*$/) {
+    print "<seg id=\"$i\"";
+    $i++;
+    for (my $j = 0; $j < $sec; $j++) {
+      my @oo = ();
+      for (my $k = 0; $k < scalar @xx; $k++) {
+        my $sym = $os[$k]->[$j];
+        $sym =~ s/"/'/g;
+        push @oo, $sym;
+      }
+      my $zz = $j + 1;
+      print " feat$zz=\"@oo\"";
+    }
+
+    print "> @xx ||| @yy </seg>\n";
+    @xx = ();
+    @yy = ();
+    @os = ();
+  } else {
+    my ($x, @fs) = split /\s+/;
+    my $y = pop @fs;
+    if (!defined $sec) { $sec = scalar @fs; }
+    die unless $sec == scalar @fs;
+    push @xx, $x;
+    push @yy, $y;
+    push @os, \@fs;
+  }
+}
+
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index bca954d1..c580e88b 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -7,6 +7,13 @@ if [[ $# == 1 && $1 == '-u' ]] ; then
   NORMARGS="--batchline"
   SEDFLAGS="-u"
 else
+  if [[ $# != 0 ]] ; then
+    echo Usage: `basename $0` [-u] \< file.in \> file.out 1>&2
+    echo 1>&2
+    echo Tokenizes text in a reasonable way in most languages. 1>&2
+    echo 1>&2
+    exit 1
+  fi
   NORMARGS=""
   SEDFLAGS=""
 fi
```