diff options
Diffstat (limited to 'corpus')
-rwxr-xr-x | corpus/filter-length.pl | 8 | ||||
-rwxr-xr-x | corpus/support/tokenizer.pl | 2 | ||||
-rwxr-xr-x | corpus/tokenize-anything.sh | 7 |
3 files changed, 13 insertions, 4 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 2e257cda..8b73a1c8 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -52,8 +52,10 @@ while(<F>) {
 }
 next;
 }
- my @fs = split /\s+/, $sf;
- my @es = split /\s+/, $se;
+ my @fs = ();
+ my @es = ();
+ if (defined $sf && length($sf) > 0) { @fs = split /\s+/, $sf; }
+ if (defined $se && length($se) > 0) { @es = split /\s+/, $se; }
 my $flen = scalar @fs;
 my $elen = scalar @es;
 if ($flen == 0) {
@@ -114,7 +116,7 @@ while(<F>) {
 if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
 elsif ($lines % 2500 == 0) { print STDERR "."; }
 my ($sf, $se, @d) = split / \|\|\| /;
- if (scalar @d != 0 or !defined $se) { next; }
+ if (!defined $se) { next; }
 my @fs = split /\s+/, $sf;
 my @es = split /\s+/, $se;
 my $flen = scalar @fs;
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index f57bc87a..aa285be4 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -386,7 +386,7 @@ sub deep_proc_token {
 }
 
 ##### step 1: separate by punct T2 on the boundary
- my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;';
+ my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;|●|○';
 if($line =~ s/^(($t2)+)/$1 /){
 return proc_line($line);
 }
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index bca954d1..c580e88b 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -7,6 +7,13 @@ if [[ $# == 1 && $1 == '-u' ]] ; then
 NORMARGS="--batchline"
 SEDFLAGS="-u"
 else
+ if [[ $# != 0 ]] ; then
+ echo Usage: `basename $0` [-u] \< file.in \> file.out 1>&2
+ echo 1>&2
+ echo Tokenizes text in a reasonable way in most languages. 1>&2
+ echo 1>&2
+ exit 1
+ fi
 NORMARGS=""
 SEDFLAGS=""
 fi