diff options
| author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-09-15 23:00:01 -0400 | 
|---|---|---|
| committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2014-09-15 23:00:01 -0400 | 
| commit | 118013befdcdcaf96c64657439c441f0108fbdcc (patch) | |
| tree | f19b3a0c9d91f6eadbfbca21740abe00ad93aa30 /corpus | |
| parent | 604b4464e7e3f8b90145c7039d5672e6638c0dd3 (diff) | |
migrate to new Cython version
Diffstat (limited to 'corpus')
| -rwxr-xr-x | corpus/filter-length.pl | 8 | ||||
| -rwxr-xr-x | corpus/support/tokenizer.pl | 2 | 
2 files changed, 6 insertions, 4 deletions
```diff
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 2e257cda..8b73a1c8 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -52,8 +52,10 @@ while(<F>) {
     }
     next;
   }
-  my @fs = split /\s+/, $sf;
-  my @es = split /\s+/, $se;
+  my @fs = ();
+  my @es = ();
+  if (defined $sf && length($sf) > 0) { @fs = split /\s+/, $sf; }
+  if (defined $se && length($se) > 0) { @es = split /\s+/, $se; }
   my $flen = scalar @fs;
   my $elen = scalar @es;
   if ($flen == 0) {
@@ -114,7 +116,7 @@ while(<F>) {
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
   my ($sf, $se, @d) = split / \|\|\| /;
-  if (scalar @d != 0 or !defined $se) { next; }
+  if (!defined $se) { next; }
   my @fs = split /\s+/, $sf;
   my @es = split /\s+/, $se;
   my $flen = scalar @fs;
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index f57bc87a..aa285be4 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -386,7 +386,7 @@ sub deep_proc_token {
     }
 
     ##### step 1: separate by punct T2 on the boundary
-    my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;';
+    my $t2 = '\`|\!|\@|\+|\=|\[|\]|\<|\>|\||\(|\)|\{|\}|\?|\"|;|●|○';
     if($line =~ s/^(($t2)+)/$1 /){
 	return proc_line($line);
     }
```
