summaryrefslogtreecommitdiff
path: root/corpus
diff options
context:
space:
mode:
authorWu, Ke <wuke@cs.umd.edu>2014-12-17 16:11:38 -0500
committerWu, Ke <wuke@cs.umd.edu>2014-12-17 16:11:38 -0500
commit7468e8d85e99b4619442c7afaf4a0d92870111bb (patch)
treea6f17da7c69048c8900260b5490bb9d8611be3bb /corpus
parentb6dd5a683db9dda2d634dd2fdb76606819594901 (diff)
parent1a79175f9a101d46cf27ca921213d5dd9300518f (diff)
Merge with upstream
Diffstat (limited to 'corpus')
-rwxr-xr-xcorpus/conll2cdec.pl42
-rwxr-xr-xcorpus/tokenize-anything.sh7
2 files changed, 49 insertions, 0 deletions
diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl
new file mode 100755
index 00000000..ee4e07db
--- /dev/null
+++ b/corpus/conll2cdec.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/perl -w
+use strict;
+
+die "Usage: $0 file.conll\n\n Converts a CoNLL formatted labeled sequence into cdec's format.\n\n" unless scalar @ARGV == 1;
+open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!\n";
+
+my @xx;
+my @yy;
+my @os;
+my $sec = undef;
+my $i = 0;
+while(<F>) {
+ chomp;
+ if (/^\s*$/) {
+ print "<seg id=\"$i\"";
+ $i++;
+ for (my $j = 0; $j < $sec; $j++) {
+ my @oo = ();
+ for (my $k = 0; $k < scalar @xx; $k++) {
+ my $sym = $os[$k]->[$j];
+ $sym =~ s/"/'/g;
+ push @oo, $sym;
+ }
+ my $zz = $j + 1;
+ print " feat$zz=\"@oo\"";
+ }
+
+ print "> @xx ||| @yy </seg>\n";
+ @xx = ();
+ @yy = ();
+ @os = ();
+ } else {
+ my ($x, @fs) = split /\s+/;
+ my $y = pop @fs;
+ if (!defined $sec) { $sec = scalar @fs; }
+ die unless $sec == scalar @fs;
+ push @xx, $x;
+ push @yy, $y;
+ push @os, \@fs;
+ }
+}
+
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index bca954d1..c580e88b 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -7,6 +7,13 @@ if [[ $# == 1 && $1 == '-u' ]] ; then
NORMARGS="--batchline"
SEDFLAGS="-u"
else
+ if [[ $# != 0 ]] ; then
+ echo Usage: `basename $0` [-u] \< file.in \> file.out 1>&2
+ echo 1>&2
+ echo Tokenizes text in a reasonable way in most languages. 1>&2
+ echo 1>&2
+ exit 1
+ fi
NORMARGS=""
SEDFLAGS=""
fi