diff options
| author | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 | 
|---|---|---|
| committer | Michael Denkowski <michael.j.denkowski@gmail.com> | 2012-12-22 16:01:23 -0500 | 
| commit | 597d89c11db53e91bc011eab70fd613bbe6453e8 (patch) | |
| tree | 83c87c07d1ff6d3ee4e3b1626f7eddd49c61095b /corpus/support | |
| parent | 65e958ff2678a41c22be7171456a63f002ef370b (diff) | |
| parent | 201af2acd394415a05072fbd53d42584875aa4b4 (diff) | |
Merge branch 'master' of git://github.com/redpony/cdec
Diffstat (limited to 'corpus/support')
| -rwxr-xr-x | corpus/support/quote-norm.pl | 7 |
| -rw-r--r-- | corpus/support/token_patterns | 2 |
| -rwxr-xr-x | corpus/support/tokenizer.pl | 9 |
3 files changed, 7 insertions, 11 deletions
```diff
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0c5b9c26..72b0064d 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -18,13 +18,18 @@ while(<STDIN>) {
   s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
   s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
   s/&\s*#45\s*;/--/g;
+  s/�c/--/g;
   s/ ,,/ "/g;
   s/``/"/g;
   s/''/"/g;
+  s/[「」]/"/g;
   s/〃/"/g;
   s/¨/"/g;
   s/¡/ ¡ /g;
   s/¿/ ¿ /g;
+  # â<U+0080><U+0099>
+  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
+  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
   s/ˇ/'/g;
   s/´/'/g;
   s/`/'/g;
@@ -39,7 +44,7 @@ while(<STDIN>) {
   s/»/"/g;
   tr/!-~/!-~/;
   s/、/,/g;
-  s/。/./g;
+  # s/。/./g;
   s/…/.../g;
   s/―/--/g;
   s/–/--/g;
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index c0e6fe1a..8e69432b 100644
--- a/corpus/support/token_patterns
+++ b/corpus/support/token_patterns
@@ -1,3 +1,3 @@
 /^(al|el|ul|e)\-[a-z]+$/
-/^(\d+)\.$/
+/^(\d|\d\d|\d\d\d)\.$/
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 23be00a5..e9c3a37d 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -107,24 +107,15 @@ my $orig_token_total = 0;
 my $deep_proc_token_total = 0;
 my $new_token_total = 0;
-my $line_total = 0;
-my $content_line_total = 0;
-
 while(<STDIN>){
     chomp();
-    $line_total ++;
-    if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; }
-    elsif ($line_total % 2500 == 0) { print STDERR "."; }
-
     if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
 	## markup
 	print STDOUT "$_\n";
 	next;
     }
-    $content_line_total ++;
-
     my $orig_num = 0;
     my $deep_proc_num = 0;
```
