diff options
author | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-12-05 20:27:30 -0500 |
---|---|---|
committer | Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> | 2012-12-05 20:27:30 -0500 |
commit | 8bc9e5d9cfe634191c83bd735af4571525133cfe (patch) | |
tree | 9cedab6a92b0e8e7bac0a930f23ba0850b152aa7 | |
parent | d7b60765395a4fe6da23992f45afd18dfc093aeb (diff) |
remove logging, you should be using pv
-rwxr-xr-x | corpus/support/quote-norm.pl | 7 | ||||
-rwxr-xr-x | corpus/support/tokenizer.pl | 9 |
2 files changed, 6 insertions, 10 deletions
```diff
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0c5b9c26..72b0064d 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -18,13 +18,18 @@ while(<STDIN>) {
   s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
   s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
   s/&\s*#45\s*;/--/g;
+  s/�c/--/g;
   s/ ,,/ "/g;
   s/``/"/g;
   s/''/"/g;
+  s/[「」]/"/g;
   s/〃/"/g;
   s/¨/"/g;
   s/¡/ ¡ /g;
   s/¿/ ¿ /g;
+  # â<U+0080><U+0099>
+  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
+  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
   s/ˇ/'/g;
   s/´/'/g;
   s/`/'/g;
@@ -39,7 +44,7 @@ while(<STDIN>) {
   s/»/"/g;
   tr/!-~/!-~/;
   s/、/,/g;
-  s/。/./g;
+  # s/。/./g;
   s/…/.../g;
   s/―/--/g;
   s/–/--/g;
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 23be00a5..e9c3a37d 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -107,24 +107,15 @@ my $orig_token_total = 0;
 my $deep_proc_token_total = 0;
 my $new_token_total = 0;
 
-my $line_total = 0;
-my $content_line_total = 0;
-
 while(<STDIN>){
     chomp();
 
-    $line_total ++;
-    if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; }
-    elsif ($line_total % 2500 == 0) { print STDERR "."; }
-
     if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/)
     { ## markup
         print STDOUT "$_\n";
         next;
     }
 
-    $content_line_total ++;
-
     my $orig_num = 0;
     my $deep_proc_num = 0;
 
```