Merge branch 'master' of https://github.com/pauldb89/cdec

author: Paul Baltescu <pauldb89@gmail.com> 2013-02-21 14:13:55 +0000
committer: Paul Baltescu <pauldb89@gmail.com> 2013-02-21 14:13:55 +0000
commit: bca26d953a774b8efca12f30407390b3f5eef9d0 (patch)
tree: fe922de5c89b1844f677d550dcc24e87edd67a55 /corpus
parent: 54a1c0e2bde259e3acc9c0a8ec8da3c7704e80ca (diff)
parent: 95c364f2cb002241c4a62bedb1c5ef6f1e9a7f22 (diff)
8 files changed, 130 insertions, 9 deletions
diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl
index 153bc454..d707ce29 100755
--- a/corpus/add-self-translations.pl
+++ b/corpus/add-self-translations.pl
@@ -6,7 +6,7 @@ use strict;
 my %df;
 my %def;
 while(<>) {
-  print;
+#  print;
   chomp;
   my ($sf, $se) = split / \|\|\| /;
   die "Format error: $_\n" unless defined $sf && defined $se;
diff --git a/corpus/cut-corpus.pl b/corpus/cut-corpus.pl
index 7daac0e2..0af3b23c 100755
--- a/corpus/cut-corpus.pl
+++ b/corpus/cut-corpus.pl
@@ -22,7 +22,7 @@ for my $ff (@ind) {
 
 while(<>) {
   chomp;
-  my @fields = split / \|\|\| /;
+  my @fields = split /\s*\|\|\|\s*/;
   my @sf;
   for my $i (@o) {
     my $y = $fields[$i];
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 70032ca7..2e257cda 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -3,20 +3,30 @@ use strict;
 use utf8;
 
 ##### EDIT THESE SETTINGS ####################################################
-my $MAX_LENGTH = 99;  # discard a sentence if it is longer than this
-my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include
 my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
 ##############################################################################
 
-die "Usage: $0 corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than $MAX_LENGTH words\n  or whose log length ratios are $MAX_ZSCORE stddevs away from the mean log ratio.\n\n" unless scalar @ARGV == 1;
+die "Usage: $0 [-NNN] corpus.fr-en\n\n  Filter sentence pairs containing sentences longer than NNN words (where NNN\n  is 150 by default) or whose log length ratios are $MAX_ZSCORE stddevs away from the\n  mean log ratio.\n\n" unless scalar @ARGV == 1 || scalar @ARGV == 2;
 binmode(STDOUT,":utf8");
 binmode(STDERR,":utf8");
 
+my $MAX_LENGTH = 150;  # discard a sentence if it is longer than this
+if (scalar @ARGV == 2) {
+  my $fp = shift @ARGV;
+  die "Expected -NNN for first parameter, but got $fp\n" unless $fp =~ /^-(\d+)$/;
+  $MAX_LENGTH=$1;
+}
+
 my $corpus = shift @ARGV;
+
 die "Cannot read from STDIN\n" if $corpus eq '-';
 my $ff = "<$corpus";
 $ff = "gunzip -c $corpus|" if $ff =~ /\.gz$/;
 
+print STDERR "Max line length (monolingual): $MAX_LENGTH\n";
+print STDERR "              Parallel corpus: $corpus\n";
+
 open F,$ff or die "Can't read $corpus: $!";
 binmode(F,":utf8");
 
@@ -34,11 +44,11 @@ while(<F>) {
   $lines++;
   if ($lines % 100000 == 0) { print STDERR " [$lines]\n"; }
   elsif ($lines % 2500 == 0) { print STDERR "."; }
-  my ($sf, $se, @d) = split / \|\|\| /;
+  my ($sf, $se, @d) = split /\s*\|\|\|\s*/;
   if (scalar @d != 0 or !defined $se) {
     $bad_format++;
     if ($bad_format > 100 && ($bad_format / $lines) > 0.02) {
-      die "Corpus appears to be incorretly formatted, example: $_";
+      die "$bad_format / $lines : Corpus appears to be incorretly formatted, example: $_";
     }
     next;
   }
@@ -128,6 +138,8 @@ while(<F>) {
       next;
     }
     print;
+  } else {
+    print;
   }
   $to++;
 }
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
index 24c70599..4cb424ad 100755
--- a/corpus/paste-files.pl
+++ b/corpus/paste-files.pl
@@ -17,6 +17,7 @@ for my $file (@ARGV) {
 binmode(STDOUT,":utf8");
 binmode(STDERR,":utf8");
 
+my $bad = 0;
 my $lc = 0;
 my $done = 0;
 my $fl = 0;
@@ -34,7 +35,15 @@ while(1) {
       last;
     }
     chomp $r;
-    die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+    if ($r =~ /\|\|\|/) {
+      $r = '';
+      $bad++;
+    }
+    warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+    $r =~ s/\|\|\|/ /g;
+    $r =~ s/\s+/ /g;
+    $r =~ s/^ +//;
+    $r =~ s/ +$//;
     $anum++;
     push @line, $r;
   }
@@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) {
   my $r = <$fh>;
   die "Mismatched number of lines.\n" if defined $r;
 }
+print STDERR "Number of lines containing ||| was: $bad\n" if $bad > 0;
 
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 72b0064d..e4e5055e 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -27,6 +27,20 @@ while(<STDIN>) {
   s/¨/"/g;
   s/¡/ ¡ /g;
   s/¿/ ¿ /g;
+
+  s/ п. п. / п.п. /g;
+  s/ ст. л. / ст.л. /g;
+  s/ т. е. / т.е. /g;
+  s/ т. к. / т.к. /g;
+  s/ т. ч. / т.ч. /g;
+  s/ т. д. / т.д. /g;
+  s/ т. п. / т.п. /g;
+  s/ и. о. / и.о. /g;
+  s/ с. г. / с.г. /g;
+  s/ г. р. / г.р. /g;
+  s/ т. н. / т.н. /g;
+  s/ т. ч. / т.ч. /g;
+  s/ н. э. / н.э. /g;
   # â<U+0080><U+0099>
   s/â(\x{80}\x{99}|\x{80}\x{98})/'/g;
   s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g;
diff --git a/corpus/support/token_list b/corpus/support/token_list
index 28eb4396..d470cb22 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -210,3 +210,85 @@ W.
 X.
 Y.
 Z.
+А.
+Б.
+В.
+Г.
+Д.
+Е.
+Ё.
+Ж.
+З.
+И.
+Й.
+К.
+Л.
+М.
+Н.
+О.
+П.
+Р.
+С.
+Т.
+У.
+Ф.
+Х.
+Ц.
+Ч.
+Ш.
+Щ.
+Ъ.
+Ы.
+Ь.
+Э.
+Ю.
+Я.
+л.
+г.
+обл.
+гг.
+в.
+вв.
+мин.
+ч.
+тыс.
+млн.
+млрд.
+трлн.
+кв.
+куб.
+руб.
+коп.
+долл.
+Прим.
+прим.
+чел.
+грн.
+мин.
+им.
+проф.
+акад.
+ред.
+авт.
+корр.
+соб.
+спец.
+см.
+тж.
+др.
+пр.
+букв.
+# Two-letter abbreviations - can be written with space
+п.п.
+ст.л.
+т.е.
+т.к.
+т.ч.
+т.д.
+т.п.
+и.о.
+с.г.
+г.р.
+т.н.
+т.ч.
+н.э.
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index 8e69432b..de64fb2a 100644
--- a/corpus/support/token_patterns
+++ b/corpus/support/token_patterns
@@ -1,3 +1,5 @@
 /^(al|el|ul|e)\-[a-z]+$/
+/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/
+/^\p{Cyrillic}\.\p{Cyrillic}\.$/
 /^(\d|\d\d|\d\d\d)\.$/
 
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index e9c3a37d..0350a894 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -73,6 +73,7 @@ my $dict_file = "$workdir/token_list";
 my $word_patt_file = "$workdir/token_patterns";
 
 open(my $dict_fp, "$dict_file") or die;
+binmode($dict_fp, ":utf8");
 
 # read in the list of words that should not be segmented, 
 ##  e.g.,"I.B.M.", co-operation.
@@ -89,6 +90,7 @@ while(<$dict_fp>){
 }
 
 open(my $patt_fp, "$word_patt_file") or die;
+binmode($patt_fp, ":utf8");
 my @word_patts = ();
 my $word_patt_num = 0;
 while(<$patt_fp>){
@@ -147,7 +149,6 @@ while(<STDIN>){
 
     print STDOUT " $new_line\n";
 }
-print STDERR "\n";
 
 ########################################################################
author	Paul Baltescu <pauldb89@gmail.com>	2013-02-21 14:13:55 +0000
committer	Paul Baltescu <pauldb89@gmail.com>	2013-02-21 14:13:55 +0000
commit	bca26d953a774b8efca12f30407390b3f5eef9d0 (patch)
tree	fe922de5c89b1844f677d550dcc24e87edd67a55 /corpus
parent	54a1c0e2bde259e3acc9c0a8ec8da3c7704e80ca (diff)
parent	95c364f2cb002241c4a62bedb1c5ef6f1e9a7f22 (diff)