From c81b0dc240f2233c3e5ecccd8982218115476f9a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 4 Dec 2012 22:01:32 -0500 Subject: more flexible corpus cutting --- corpus/cut-corpus.pl | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) (limited to 'corpus') diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl index fc9cce3b..7daac0e2 100755 --- a/corpus/cut-corpus.pl +++ b/corpus/cut-corpus.pl @@ -3,14 +3,33 @@ use strict; die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0; my $x = shift @ARGV; -die "N must be numeric" unless $x =~ /^\d+$/; -$x--; +my @ind = split /,/, $x; +my @o = (); +for my $ff (@ind) { + if ($ff =~ /^\d+$/) { + push @o, $ff - 1; + } elsif ($ff =~ /^(\d+)-(\d+)$/) { + my $a = $1; + my $b = $2; + die "$a-$b is a bad range in input: $x\n" unless $b > $a; + for (my $i=$a; $i <= $b; $i++) { + push @o, $i - 1; + } + } else { + die "Bad input: $x\n"; + } +} while(<>) { chomp; my @fields = split / \|\|\| /; - my $y = $fields[$x]; - if (!defined $y) { $y= ''; } - print "$y\n"; + my @sf; + for my $i (@o) { + my $y = $fields[$i]; + if (!defined $y) { $y= ''; } + push @sf, $y; + } + print join(' ||| ', @sf) . "\n"; } + -- cgit v1.2.3 From a86a37cbe2fb6ffdcf4374f180010a118fed1063 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Wed, 5 Dec 2012 20:27:30 -0500 Subject: remove logging, you should be using pv --- corpus/support/quote-norm.pl | 7 ++++++- corpus/support/tokenizer.pl | 9 --------- 2 files changed, 6 insertions(+), 10 deletions(-) (limited to 'corpus') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 0c5b9c26..72b0064d 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -18,13 +18,18 @@ while() { s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi; s/&\s*#45\s*;\s*&\s*#45\s*;/--/g; s/&\s*#45\s*;/--/g; + s/�c/--/g; s/ ,,/ "/g; s/``/"/g; s/''/"/g; + s/[「」]/"/g; s/〃/"/g; s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + # â + s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; + s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; s/ˇ/'/g; s/´/'/g; s/`/'/g; @@ -39,7 +44,7 @@ while() { s/»/"/g; tr/!-~/!-~/; s/、/,/g; - s/。/./g; + # s/。/./g; s/…/.../g; s/―/--/g; s/–/--/g; diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index 23be00a5..e9c3a37d 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -107,24 +107,15 @@ my $orig_token_total = 0; my $deep_proc_token_total = 0; my $new_token_total = 0; -my $line_total = 0; -my $content_line_total = 0; - while(){ chomp(); - $line_total ++; - if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; } - elsif ($line_total % 2500 == 0) { print STDERR "."; } - if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^ Date: Wed, 5 Dec 2012 20:57:20 -0500 Subject: slight tokenization bug fix --- corpus/support/token_patterns | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'corpus') diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index c0e6fe1a..8e69432b 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,3 +1,3 @@ /^(al|el|ul|e)\-[a-z]+$/ -/^(\d+)\.$/ +/^(\d|\d\d|\d\d\d)\.$/ -- cgit v1.2.3