Merge branch 'master' of git://github.com/redpony/cdec

author: Michael Denkowski <michael.j.denkowski@gmail.com> 2012-12-22 16:01:23 -0500
committer: Michael Denkowski <michael.j.denkowski@gmail.com> 2012-12-22 16:01:23 -0500
commit: 597d89c11db53e91bc011eab70fd613bbe6453e8 (patch)
tree: 83c87c07d1ff6d3ee4e3b1626f7eddd49c61095b /corpus
parent: 65e958ff2678a41c22be7171456a63f002ef370b (diff)
parent: 201af2acd394415a05072fbd53d42584875aa4b4 (diff)
4 files changed, 31 insertions, 16 deletions
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
index fc9cce3b..7daac0e2 100755
--- a/corpus/cut-corpus.pl
+++ b/corpus/cut-corpus.pl
@@ -3,14 +3,33 @@ use strict;
 die "Usage: $0 N\nSplits a corpus separated by ||| symbols and returns the Nth field\n" unless scalar @ARGV > 0;
 
 my $x = shift @ARGV;
-die "N must be numeric" unless $x =~ /^\d+$/;
-$x--;
+my @ind = split /,/, $x;
+my @o = ();
+for my $ff (@ind) {
+  if ($ff =~ /^\d+$/) {
+    push @o, $ff - 1;
+  } elsif ($ff =~ /^(\d+)-(\d+)$/) {
+    my $a = $1;
+    my $b = $2;
+    die "$a-$b is a bad range in input: $x\n" unless $b > $a;
+    for (my $i=$a; $i <= $b; $i++) {
+      push @o, $i - 1;
+    }
+  } else {
+    die "Bad input: $x\n";
+  }
+}
 
 while(<>) {
   chomp;
   my @fields = split / \|\|\| /;
-  my $y = $fields[$x];
-  if (!defined $y) { $y= ''; }
-  print "$y\n";
+  my @sf;
+  for my $i (@o) {
+    my $y = $fields[$i];
+    if (!defined $y) { $y= ''; }
+    push @sf, $y;
+  }
+  print join(' ||| ', @sf) . "\n";
 }
 
+
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0c5b9c26..72b0064d 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -18,13 +18,18 @@ while(<STDIN>) {
   s/(\W)(euro?)(\d*\.\d+|\d+)/$1EUR $3/gi;
   s/&\s*#45\s*;\s*&\s*#45\s*;/--/g;
   s/&\s*#45\s*;/--/g;
+  s/ï¿½c/--/g;
   s/ ,,/ "/g;
   s/``/"/g;
   s/''/"/g;
+  s/[「」]/"/g;
   s/〃/"/g;
   s/¨/"/g;
   s/¡/ ¡ /g;
   s/¿/ ¿ /g;
+  # â<U+0080><U+0099>
+  s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
+  s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
   s/ˇ/'/g;
   s/´/'/g;
   s/`/'/g;
@@ -39,7 +44,7 @@ while(<STDIN>) {
   s/»/"/g;
   tr/！-～/!-~/;
   s/、/,/g;
-  s/。/./g;
+  # s/。/./g;
   s/…/.../g;
   s/―/--/g;
   s/–/--/g;
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index c0e6fe1a..8e69432b 100644
--- a/corpus/support/token_patterns
+++ b/corpus/support/token_patterns
@@ -1,3 +1,3 @@
 /^(al|el|ul|e)\-[a-z]+$/
-/^(\d+)\.$/
+/^(\d|\d\d|\d\d\d)\.$/
 
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 23be00a5..e9c3a37d 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -107,24 +107,15 @@ my $orig_token_total = 0;
 my $deep_proc_token_total = 0;
 my $new_token_total = 0;
 
-my $line_total = 0;
-my $content_line_total = 0;
-
 while(<STDIN>){
     chomp();
 
-    $line_total ++;
-    if ($line_total % 100000 == 0) { print STDERR " [$line_total]\n"; }
-    elsif ($line_total % 2500 == 0) { print STDERR "."; }
-
     if(/^(\[b\s+|\]b|\]f|\[f\s+)/ || (/^\[[bf]$/) || (/^\s*$/) || /^<DOC/ || /^<\/DOC/) {
 	## markup
 	print STDOUT "$_\n";
 	next;
     }
 
-    $content_line_total ++;
-
     my $orig_num = 0;
     my $deep_proc_num = 0;
author	Michael Denkowski <michael.j.denkowski@gmail.com>	2012-12-22 16:01:23 -0500
committer	Michael Denkowski <michael.j.denkowski@gmail.com>	2012-12-22 16:01:23 -0500
commit	597d89c11db53e91bc011eab70fd613bbe6453e8 (patch)
tree	83c87c07d1ff6d3ee4e3b1626f7eddd49c61095b /corpus
parent	65e958ff2678a41c22be7171456a63f002ef370b (diff)
parent	201af2acd394415a05072fbd53d42584875aa4b4 (diff)