summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
Diffstat (limited to 'corpus')
-rwxr-xr-x  corpus/cut-corpus.pl           29
-rwxr-xr-x  corpus/support/quote-norm.pl    7
-rw-r--r--  corpus/support/token_patterns   2
-rwxr-xr-x  corpus/support/tokenizer.pl     9
4 files changed, 31 insertions, 16 deletions
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
index fc9cce3b..7daac0e2 100755
--- a/corpus/cut-corpus.pl
+++ b/corpus/cut-corpus.pl
@@ -3,14 +3,33 @@ use strict;
die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0;
my $x = shift @ARGV;
-die "N must be numeric" unless $x =~ /^\d+$/;
-$x--;
+my @ind = split /,/, $x;
+my @o = ();
+for my $ff (@ind) {
+ if ($ff =~ /^\d+$/) {
+ push @o, $ff - 1;
+ } elsif ($ff =~ /^(\d+)-(\d+)$/) {
+ my $a = $1;
+ my $b = $2;
+ die "$a-$b is a bad range in input: $x\n" unless $b > $a;
+ for (my $i=$a; $i <= $b; $i++) {
+ push @o, $i - 1;
+ }
+ } else {
+ die "Bad input: $x\n";
+ }
+}
while(<>) {
chomp;
my @fields = split / \|\|\| /;
- my $y = $fields[$x];
- if (!defined $y) { $y= ''; }
- print "$y\n";
+ my @sf;
+ for my $i (@o) {
+ my $y = $fields[$i];
+ if (!defined $y) { $y= ''; }
+ push @sf, $y;
+ }
+ print join(' ||| ', @sf) . "\n";
}
+
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0c5b9c26..72b0064d 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -18,13 +18,18 @@ while(<STDIN>) {
s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
s/&\s*#45\s*;/--/g;
+ s/�c/--/g;
s/ ,,/ "/g;
s/``/"/g;
s/''/"/g;
+ s/[「」]/"/g;
s/〃/"/g;
s/¨/"/g;
s/¡/ ¡ /g;
s/¿/ ¿ /g;
+ # â<U+0080><U+0099>
+ s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
+ s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
s/ˇ/'/g;
s/´/'/g;
s/`/'/g;
@@ -39,7 +44,7 @@ while(<STDIN>) {
s/»/"/g;
tr/!-~/!-~/;
s/、/,/g;
- s/。/./g;
+ # s/。/./g;
s/…/.../g;
s/―/--/g;
s/–/--/g;
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index c0e6fe1a..8e69432b 100644
--- a/corpus/support/token_patterns
+++ b/corpus/support/token_patterns
@@ -1,3 +1,3 @@
/^(al|el|ul|e)\-[a-z]+$/
-/^(\d+)\.$/
+/^(\d|\d\d|\d\d\d)\.$/
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 23be00a5..e9c3a37d 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -107,24 +107,15 @@ my $orig_token_total = 0;
my $deep_proc_token_total = 0;
my $new_token_total = 0;
-my $line_total = 0;
-my $content_line_total = 0;
-
while(<STDIN>){
chomp();
- $line_total ++;
- if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; }
- elsif ($line_total % 2500 == 0) { print STDERR "."; }
-
if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
## markup
print STDOUT "$_\n";
next;
}
- $content_line_total ++;
-
my $orig_num = 0;
my $deep_proc_num = 0;