From 3822a2063e36b6ced948e5c22910a373c6c691b2 Mon Sep 17 00:00:00 2001
From: Chris Dyer <cdyer@allegro.clab.cs.cmu.edu>
Date: Mon, 15 Sep 2014 23:00:01 -0400
Subject: migrate to new Cython version

---
 corpus/filter-length.pl     | 8 +++++---
 corpus/support/tokenizer.pl | 2 +-
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'corpus')
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 2e257cda..8b73a1c8 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -52,8 +52,10 @@ while(<F>) {
     }
     next;
   }
-  my @fs = split /\s+/, $sf;
-  my @es = split /\s+/, $se;
+  my @fs = ();
+  my @es = ();
+  if (defined $sf && length($sf) > 0) { @fs = split /\s+/, $sf; }
+  if (defined $se && length($se) > 0) { @es = split /\s+/, $se; }
   my $flen = scalar @fs;
   my $elen = scalar @es;
   if ($flen == 0) {
@@ -114,7 +116,7 @@ while(<F>) {
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
   my ($sf, $se, @d) = split / \|\|\| /;
-  if (scalar @d != 0 or !defined $se) { next; }
+  if (!defined $se) { next; }
   my @fs = split /\s+/, $sf;
   my @es = split /\s+/, $se;
   my $flen = scalar @fs;
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index f57bc87a..aa285be4 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -386,7 +386,7 @@ sub deep_proc_token {
     }
 
     ##### step 1: separate by punct T2 on the boundary
-    my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;';
+    my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;|●|○';
     if($line =~ s/^(($t2)+)/$1 /){
 	return proc_line($line);
     }
-- 
cgit v1.2.3


From dfad5ee42fd1f5fa7447280ac82822486a029b9f Mon Sep 17 00:00:00 2001
From: Chris Dyer <redpony@gmail.com>
Date: Sun, 28 Sep 2014 16:03:32 -0400
Subject: add error message

---
 corpus/tokenize-anything.sh | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'corpus')

diff --git a/corpus/tokenize-anything.sh b/corpus/tokenize-anything.sh
index bca954d1..c580e88b 100755
--- a/corpus/tokenize-anything.sh
+++ b/corpus/tokenize-anything.sh
@@ -7,6 +7,13 @@ if [[ $# == 1 && $1 == '-u' ]] ; then
     NORMARGS="--batchline"
     SEDFLAGS="-u"
 else
+    if [[ $# != 0 ]] ; then  # arguments other than a lone -u: print usage and fail
+        echo "Usage: $(basename "$0") [-u] < file.in > file.out" 1>&2  # quoted so [-u] is never glob-expanded
+        echo 1>&2
+        echo "Tokenizes text in a reasonable way in most languages." 1>&2
+        echo 1>&2
+        exit 1
+    fi
     NORMARGS=""
     SEDFLAGS=""
 fi
-- 
cgit v1.2.3