summary | refs | log | tree | commit | diff
path: root/corpus/support/tokenizer.pl
diff options
context:
space:
mode:
author Michael Denkowski <michael.j.denkowski@gmail.com> 2012-12-22 16:01:23 -0500
committer Michael Denkowski <michael.j.denkowski@gmail.com> 2012-12-22 16:01:23 -0500
commit 778a4cec55f82bcc66b3f52de7cc871e8daaeb92 (patch)
tree 2a5bccaa85965855104c4e8ac3738b2e1c77f164 /corpus/support/tokenizer.pl
parent 57fff9eea5ba0e71fb958fdb4f32d17f2fe31108 (diff)
parent d21491daa5e50b4456c7c5f9c2e51d25afd2a757 (diff)
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'corpus/support/tokenizer.pl')
-rwxr-xr-x corpus/support/tokenizer.pl 9
1 files changed, 0 insertions, 9 deletions
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 23be00a5..e9c3a37d 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -107,24 +107,15 @@ my $orig_token_total = 0;
my $deep_proc_token_total = 0;
my $new_token_total = 0;
-my $line_total = 0;
-my $content_line_total = 0;
-
while(<STDIN>){
chomp();
- $line_total ++;
- if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; }
- elsif ($line_total % 2500 == 0) { print STDERR "."; }
-
if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
## markup
print STDOUT "$_\n";
next;
}
- $content_line_total ++;
-
my $orig_num = 0;
my $deep_proc_num = 0;