summary refs log tree commit diff
path: root/corpus
diff options
context:
space:
mode:
author Patrick Simianer <p@simianer.de> 2014-10-13 19:03:48 +0100
committer Patrick Simianer <p@simianer.de> 2014-10-13 19:03:48 +0100
commit cb9fb7088dde35881516c088db402abe747d49fa (patch)
tree a91e4935a7941f1b261f76d88ab41fa3078a1891 /corpus
parent 0a00e57e921c8eca8e02364db7d2e6607bfdcebc (diff)
parent b1ed81ef3216b212295afa76c5d20a56fb647204 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/filter-length.pl 8
-rwxr-xr-x corpus/support/tokenizer.pl 2
-rwxr-xr-x corpus/tokenize-anything.sh 7
3 files changed, 13 insertions, 4 deletions
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 2e257cda..8b73a1c8 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -52,8 +52,10 @@ while(<F>) {
}
next;
}
- my @fs = split /\s+/, $sf;
- my @es = split /\s+/, $se;
+ my @fs = ();
+ my @es = ();
+ if (defined $sf && length($sf) > 0) { @fs = split /\s+/, $sf; }
+ if (defined $se && length($se) > 0) { @es = split /\s+/, $se; }
my $flen = scalar @fs;
my $elen = scalar @es;
if ($flen == 0) {
@@ -114,7 +116,7 @@ while(<F>) {
if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
elsif ($lines % 2500 == 0) { print STDERR "."; }
my ($sf, $se, @d) = split / \|\|\| /;
- if (scalar @d != 0 or !defined $se) { next; }
+ if (!defined $se) { next; }
my @fs = split /\s+/, $sf;
my @es = split /\s+/, $se;
my $flen = scalar @fs;
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index f57bc87a..aa285be4 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -386,7 +386,7 @@ sub deep_proc_token {
}
##### step 1: separate by punct T2 on the boundary
- my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;';
+ my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;|●|○';
if($line =~ s/^(($t2)+)/$1 /){
return proc_line($line);
}
diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index bca954d1..c580e88b 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -7,6 +7,13 @@ if [[ $# == 1 && $1 == '-u' ]] ; then
NORMARGS="--batchline"
SEDFLAGS="-u"
else
+ if [[ $# != 0 ]] ; then
+ echo Usage: `basename $0` [-u] \< file.in \> file.out 1>&2
+ echo 1>&2
+ echo Tokenizes text in a reasonable way in most languages. 1>&2
+ echo 1>&2
+ exit 1
+ fi
NORMARGS=""
SEDFLAGS=""
fi