From dfad5ee42fd1f5fa7447280ac82822486a029b9f Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sun, 28 Sep 2014 16:03:32 -0400 Subject: add error message --- corpus/tokenize-anything.sh | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'corpus') diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh index bca954d1..c580e88b 100755 --- a/corpus/tokenize-anything.sh +++ b/corpus/tokenize-anything.sh @@ -7,6 +7,13 @@ if [[ $# == 1 && $1 == '-u' ]] ; then NORMARGS="--batchline" SEDFLAGS="-u" else + if [[ $# != 0 ]] ; then + echo Usage: `basename $0` [-u] \< file.in \> file.out 1>&2 + echo 1>&2 + echo Tokenizes text in a reasonable way in most languages. 1>&2 + echo 1>&2 + exit 1 + fi NORMARGS="" SEDFLAGS="" fi -- cgit v1.2.3 From 0968a3e56e5fb8d3afee2b1c4904d76160b0fb17 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 24 Oct 2014 15:31:16 -0400 Subject: conll2cdec conversion --- corpus/conll2cdec.pl | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 corpus/conll2cdec.pl (limited to 'corpus') diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl new file mode 100755 index 00000000..f65b86f8 --- /dev/null +++ b/corpus/conll2cdec.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl -w +use strict; + +my @xx; +my @yy; +my @os; +my $sec = undef; +my $i = 0; +while(<>) { + chomp; + if (/^\s*$/) { + print "[$j]; + $sym =~ s/"/'/g; + push @oo, $sym; + } + my $zz = $j + 1; + print " feat$zz=\"@oo\""; + } + + print "> @xx ||| @yy \n"; + @xx = (); + @yy = (); + @os = (); + } else { + my ($x, @fs) = split /\s+/; + my $y = pop @fs; + if (!defined $sec) { $sec = scalar @fs; } + die unless $sec == scalar @fs; + push @xx, $x; + push @yy, $y; + push @os, \@fs; + } +} + -- cgit v1.2.3 From 328f3808a62975d5bdc2d1bc56f0e4867364fe7a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 25 Oct 2014 15:01:49 -0400 Subject: bit more info --- corpus/conll2cdec.pl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 
'corpus') diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl index f65b86f8..ee4e07db 100755 --- a/corpus/conll2cdec.pl +++ b/corpus/conll2cdec.pl @@ -1,12 +1,15 @@ #!/usr/bin/perl -w use strict; +die "Usage: $0 file.conll\n\n Converts a CoNLL formatted labeled sequence into cdec's format.\n\n" unless scalar @ARGV == 1; +open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!\n"; + my @xx; my @yy; my @os; my $sec = undef; my $i = 0; -while(<>) { +while(<F>) { chomp; if (/^\s*$/) { print "