10 files changed, 375 insertions, 6 deletions
diff --git a/README.md b/README.md
index fd42922..1e4bb01 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,6 @@
-a number of NLP related scripts. Some scripts require my zipf gem, see [1]
-
-\*.perl taken from the moses [2] toolkit
-
-mem\_usage taken from [3]
+A number of NLP related scripts. Some scripts require my zipf gem, see [1].
+\*.perl taken from the moses [2] toolkit.
+mem\_usage taken from [3].
 
 
 [1] https://github.com/pks/zipf
diff --git a/detruecase.perl b/detruecase.perl
new file mode 100755
index 0000000..012c143
--- /dev/null
+++ b/detruecase.perl
@@ -0,0 +1,88 @@
+#!/usr/bin/perl -w
+# Capitalises sentence-initial words (and, with --headline, headline words) of each line read from --in or STDIN.
+use strict;
+use Getopt::Long "GetOptions";
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+# --headline FILE: file with <hl> and <seg> markup; --in FILE: input file (default STDIN); -b/--unbuffered
+my ($SRC,$INFILE,$UNBUFFERED);
+die("detruecase.perl < in > out")
+    unless &GetOptions('headline=s' => \$SRC,
+		       'in=s' => \$INFILE,
+                       'b|unbuffered' => \$UNBUFFERED);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } # autoflush output
+# a SENTENCE_END token makes the next word sentence-initial; a DELAYED_SENTENCE_START token keeps the current state
+my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
+my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);
+
+# lowercase even in headline (process() looks words up by exact key, so "al-.+" / "el-.+" act as literal strings)
+my %ALWAYS_LOWER;
+foreach ("a","after","against","al-.+","and","any","as","at","be","because","between","by","during","el-.+","for","from","his","in","is","its","last","not","of","off","on","than","the","their","this","to","was","were","which","will","with") { $ALWAYS_LOWER{$_} = 1; }
+
+# find out about the headlines: one flag per <seg> line, true while inside <hl> ... </hl>
+my @HEADLINE;
+if (defined($SRC)) {
+    # fail loudly: an unreadable --headline file would otherwise just disable headline casing
+    open(SRC,$SRC) || die("ERROR: could not open headline file '$SRC'");
+    my $headline_flag = 0;
+    while(<SRC>) {
+        $headline_flag = 1 if /<hl>/;
+        $headline_flag = 0 if /<.hl>/;
+        next unless /^<seg/;
+        push @HEADLINE, $headline_flag;
+    }
+}
+
+# Main loop: hand every input line to process() together with its zero-based line number.
+my $sentence = 0;
+my $input = \*STDIN;   # source used when --in is not given
+if ($INFILE) {
+  # --in FILE: read from that file instead, decoded as UTF-8
+  open(IN,$INFILE) || die("ERROR: could not open file '$INFILE'");
+  binmode(IN, ":utf8");
+  $input = \*IN;
+}
+while(<$input>) {
+  # post-increment: process() receives the current number, then the counter moves on
+  &process($_,$sentence++);
+}
+close(IN) if $INFILE;
+
+# Detruecase one tokenized line and print it: capitalise the first word of each
+# sentence and, when --headline marks this line as a headline, every word that is
+# not a key of %ALWAYS_LOWER (exact-string lookup).
+#   $_[0]  raw input line
+#   $_[1]  zero-based index of the line, used to look up @HEADLINE
+sub process {
+    # Take the index from the argument: the file-level $sentence has already been
+    # advanced by the caller's $sentence++, so reading it here would be off by one.
+    my ($line,$sentence) = @_;
+    chomp($line);
+    $line =~ s/^\s+//;
+    $line =~ s/\s+$//;
+    my @WORD  = split(/\s+/,$line);
+
+    # uppercase at sentence start
+    my $sentence_start = 1;
+    for(my $i=0;$i<scalar(@WORD);$i++) {
+      &uppercase(\$WORD[$i]) if $sentence_start;
+      if (defined($SENTENCE_END{ $WORD[$i] })) { $sentence_start = 1; }
+      elsif (!defined($DELAYED_SENTENCE_START{$WORD[$i] })) { $sentence_start = 0; }
+    }
+
+    # uppercase headlines
+    if (defined($SRC) && $HEADLINE[$sentence]) {
+        foreach (@WORD) {
+            &uppercase(\$_) unless $ALWAYS_LOWER{$_};
+        }
+    }
+
+    # output: the words joined by single spaces, one line per input line
+    print join(" ",@WORD),"\n";
+}
+
+sub uppercase {
+    my $word_ref = shift;   # reference to the word; its first character is uppercased in place
+    ${$word_ref} = uc(substr(${$word_ref},0,1)) . substr(${$word_ref},1);
+}
diff --git a/fix-utf-8-pua b/fix-utf-8-pua
new file mode 100755
index 0000000..674d424
--- /dev/null
+++ b/fix-utf-8-pua
@@ -0,0 +1,10 @@
+#!/usr/bin/env ruby
+
+# remove private use area chars
+
+while line = STDIN.gets
+  line.strip!
+  line.gsub! /[\u{e000}-\u{f8ff}]/, " "
+  puts line
+end
+
diff --git a/ltok b/ltok
new file mode 100755
index 0000000..c90823e
--- /dev/null
+++ b/ltok
@@ -0,0 +1,9 @@
+#!/usr/bin/ruby
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+while line = STDIN.gets
+  puts line.strip.split(/\s/).size
+end
+
diff --git a/norm_hyphens b/norm_hyphens
new file mode 100755
index 0000000..4a152a1
--- /dev/null
+++ b/norm_hyphens
@@ -0,0 +1,4 @@
+#!/bin/zsh -x
+
+sed "s|[ \t]\+\xc2\xad[ \t]\+||g"
+
diff --git a/normchr b/normchr
new file mode 100755
index 0000000..f8e5798
--- /dev/null
+++ b/normchr
@@ -0,0 +1,35 @@
+#!/usr/bin/env ruby
+# Normalizes each line of STDIN (BOM, control and private-use characters, exotic spaces, soft hyphens), then decodes HTML entities.
+# http://www.utf8-chartable.de/unicode-utf8-table.pl?start=8192&number=128&utf8=string-literal
+# https://www.cs.tut.fi/~jkorpela/chars/spaces.html
+
+require 'htmlentities'
+
+STDIN.set_encoding 'utf-8'
+STDOUT.set_encoding 'utf-8'
+
+coder = HTMLEntities.new
+
+while line = STDIN.gets
+  line.sub! "\xef\xbb\xbf", ""          # BOM (bytes of U+FEFF); first occurrence only
+  line.strip!                           # superfluous whitespace at both ends
+  line.lstrip!                          #   NOTE(review): looks redundant after strip! -- confirm
+  line.gsub! /[[:cntrl:]]+/, " "        # each run of control characters -> one space
+  line.gsub! /\u{00a0}/, " "            # misc whitespace: NO-BREAK SPACE
+  line.gsub! /\u{1680}/, " "            # ^ OGHAM SPACE MARK
+  line.gsub! /\u{180e}/, " "            # ^ MONGOLIAN VOWEL SEPARATOR
+  line.gsub! /\u{3000}/, " "            # ^ IDEOGRAPHIC SPACE
+  line.gsub! /\u{feff}/, " "            # ^ ZERO WIDTH NO-BREAK SPACE
+  line = line.scan(/[[:print:]]/).join  # only printable characters
+  line.gsub! /[\u{e000}-\u{f8ff}]/, " " # private use area (BMP)
+  line.gsub! /[\u{f0000}-\u{ffffd}]/, " "   # supplementary private use area A
+  line.gsub! /[\u{100000}-\u{10fffd}]/, " " # supplementary private use area B
+  line.gsub! "\r", " "                  # carriage return; NOTE(review): presumably already gone after the [[:cntrl:]] pass
+  line.gsub! /[\u{2000}-\u{200f}]/, " " #                   EN QUAD -- RIGHT-TO-LEFT MARK
+  line.gsub! /[\u{2028}-\u{202f}]/, " " #            LINE SEPARATOR -- NARROW NO-BREAK SPACE
+  line.gsub! /[\u{205f}-\u{206f}]/, " " # MEDIUM MATHEMATICAL SPACE -- NOMINAL DIGIT SHAPES
+  line.gsub! /\s*\xc2\xad\s*/, ""       # remove soft hyphens (bytes c2 ad) and the whitespace around them
+  line.gsub! /[[:space:]]+/, " "        # collapse space
+  puts coder.decode(line)               # entities are decoded last; NOTE(review): decoded text is not normalized again
+end
+
diff --git a/preprocess_no_lower b/preprocess_no_lower
new file mode 100755
index 0000000..3a4d358
--- /dev/null
+++ b/preprocess_no_lower
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+pushd `dirname $0` > /dev/null
+P=`pwd -P`
+popd > /dev/null
+
+LANG=$1
+$P/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | $P/htmlentities 2>htmlentities.$LANG.err | $P/normalize_punctuation 2>normalize-punctuation.$LANG.err | $P/tokenizer-no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err
+
diff --git a/sample b/sample
index 8dbd26d..8d3a322 100755
--- a/sample
+++ b/sample
@@ -50,7 +50,7 @@ while idx = sample.shift
     puts idx
   else
     if opts[:output_index]
-      puts "#{idx} #{input[idx]}"
+      puts "#{idx}\t#{input[idx]}"
     else
       puts "#{input[idx]}"
     end
diff --git a/train-truecaser.perl b/train-truecaser.perl
new file mode 100755
index 0000000..59a83ec
--- /dev/null
+++ b/train-truecaser.perl
@@ -0,0 +1,112 @@
+#!/usr/bin/perl -w
+# Counts the casing variants of the tokens in a cased corpus and writes them to a truecasing model file.
+# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+
+#
+# Options:
+# --corpus FILE : cased text to read (required);  --model FILE : model file to create (required)
+# --possiblyUseFirstToken : boolean option; the default behaviour (when this option is not provided) is that the first token of a sentence is ignored, on the basis that the first word of a sentence is always capitalized; if this option is provided then: a) if a sentence-initial token is *not* capitalized, then it is counted, and b) if a capitalized sentence-initial token is the only token of the segment, then it is counted, but with only 10% of the weight of a normal token.
+#
+
+use strict;
+use Getopt::Long "GetOptions";
+
+# apply switches; die with the usage line unless parsing succeeds and both --corpus and --model are given
+my ($MODEL,$CORPUS);
+die("train-truecaser.perl --model truecaser --corpus cased [--possiblyUseFirstToken]")
+    unless &GetOptions('corpus=s' => \$CORPUS,
+                       'model=s' => \$MODEL,
+                       'possiblyUseFirstToken' => \(my $possiblyUseFirstToken = 0))
+    && defined($CORPUS) && defined($MODEL);
+# %CASING{lowercased token}{surface form} = weighted number of times that surface form was counted
+my %CASING;
+my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
+my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+open(CORPUS,$CORPUS) || die("ERROR: could not open '$CORPUS'");
+binmode(CORPUS, ":utf8");
+while(<CORPUS>) {
+  chomp;  # not chop: a final line without a trailing newline must keep its last character
+  my ($WORD) = split_xml($_);  # the markup half of the result is not needed for counting
+  # leading quotes/brackets are skipped, so the token after them is the sentence-initial one
+  my $start = 0;
+  while($start<=$#$WORD && defined($DELAYED_SENTENCE_START{$$WORD[$start]})) { $start++; }
+  my $firstWordOfSentence = 1;
+  for(my $i=$start;$i<=$#$WORD;$i++) {
+    my $currentWord = $$WORD[$i];
+    # a token right after sentence-final punctuation starts a new sentence
+    $firstWordOfSentence = 1 if ! $firstWordOfSentence && defined($SENTENCE_END{$$WORD[$i-1]});
+
+    my $currentWordWeight = 0;
+    if (! $firstWordOfSentence) {
+      $currentWordWeight = 1;
+    } elsif ($possiblyUseFirstToken) {
+      # gated special handling of first word of sentence
+      my $firstChar = substr($currentWord, 0, 1);
+      if (lc($firstChar) eq $firstChar) {
+        # if the first character is not upper case, count the token as full evidence (because if it's not capitalized, then there's no reason to be wary that the given casing is only due to being sentence-initial)
+        $currentWordWeight = 1;
+      } elsif (scalar(@$WORD) == 1) {
+        # if the first character is upper case, but the current token is the only token of the segment, then count the token as partial evidence (because the segment is presumably not a sentence and the token is therefore not the first word of a sentence and is possibly in its natural case)
+        $currentWordWeight = 0.1;
+      }
+    }
+    if ($currentWordWeight > 0) {
+      $CASING{ lc($currentWord) }{ $currentWord } += $currentWordWeight;
+    }
+    $firstWordOfSentence = 0;
+  }
+}
+close(CORPUS);
+
+# Write the model: one line per lowercased token, "BEST (count/total)" first,
+# then " OTHER (count)" for every other casing that was counted.
+# 3-arg open: the file name is taken literally, never parsed for a mode prefix.
+open(MODEL,'>',$MODEL) || die("ERROR: could not create '$MODEL'");
+binmode(MODEL, ":utf8");
+foreach my $type (keys %CASING) {
+  my ($score,$total,$best) = (-1,0,"");
+  foreach my $word (keys %{$CASING{$type}}) {
+    my $count = $CASING{$type}{$word};
+    $total += $count;
+    ($best,$score) = ($word,$count) if $count > $score;  # strictly greater: the first casing seen wins a tie
+  }
+  print MODEL "$best ($score/$total)";
+  foreach my $word (keys %{$CASING{$type}}) {
+    print MODEL " $word ($CASING{$type}{$word})" unless $word eq $best;
+  }
+  print MODEL "\n";
+}
+close(MODEL) || die("ERROR: could not write '$MODEL'");  # buffered write errors surface at close
+
+
+# store away xml markup
+sub split_xml {
+  # Separates a line into its words and the XML tags between them.
+  # Returns (\@WORD,\@MARKUP), where $MARKUP[$k] collects the tags found before
+  # word $k (each followed by a space) and the last entry the tags after the final
+  # word; @MARKUP therefore always has one entry more than @WORD.
+  my ($line) = @_;
+  my @WORD;
+  my @MARKUP = ("");
+  while($line =~ /\S/) {
+    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
+      # XML tag: append it to the slot of the word that comes next
+      $MARKUP[-1] .= $1." ";
+      $line = $2;
+    }
+    elsif ($line =~ /^\s*([^\s<>]+)(.*)$/ || $line =~ /^\s*(\S+)(.*)$/) {
+      # non-XML text, or (second pattern) a token in which '<' or '>' occurs
+      # without forming an XML tag: store the word and open a fresh markup slot
+      push @WORD, $1;
+      push @MARKUP, "";
+      $line = $2;
+    }
+    else {
+      die("ERROR: huh? $line\n");
+    }
+  }
+  # remove the final character of the last markup slot, i.e. the space appended
+  # after a trailing tag
+  chop($MARKUP[$#MARKUP]);
+  return (\@WORD,\@MARKUP);
+}
diff --git a/truecase.perl b/truecase.perl
new file mode 100755
index 0000000..0a4d366
--- /dev/null
+++ b/truecase.perl
@@ -0,0 +1,104 @@
+#!/usr/bin/perl -w
+# Truecases text read from STDIN with the casing model given by --model and prints the result to STDOUT.
+# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+use strict;
+use Getopt::Long "GetOptions";
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# apply switches: --model FILE is mandatory; -b/--unbuffered turns on autoflush
+my ($MODEL, $UNBUFFERED);
+die("truecase.perl --model MODEL [-b] < in > out")
+    unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
+    && defined($MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; } # autoflush output
+
+# %BEST maps a lowercased token to its preferred casing; %KNOWN holds every casing the model lists.
+my (%BEST,%KNOWN);
+open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
+binmode(MODEL, ":utf8");
+while(<MODEL>) {
+  # fields of a model line: PREFERRED (count/total) OTHER (count) OTHER (count) ...
+  my ($preferred,@rest) = split;
+  $BEST{ lc($preferred) } = $preferred;
+  # the other casings sit at the odd positions of @rest, each followed by its count
+  $KNOWN{$_} = 1 for ($preferred, @rest[ grep { $_ % 2 } 1 .. $#rest-1 ]);
+}
+close(MODEL);
+
+# Tokens after which the next word starts a sentence, and tokens (quotes, brackets)
+# that leave the sentence-start state untouched.
+my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
+my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+
+while(<STDIN>) {
+  chomp;  # not chop: a final line without a trailing newline must keep its last character
+  my ($WORD,$MARKUP) = split_xml($_);
+  my $sentence_start = 1;
+  for(my $i=0;$i<=$#$WORD;$i++) {
+    # markup stored before this word is re-emitted; words without markup are separated by a space
+    # NOTE(review): when markup precedes a word, no space is printed in front of the markup -- confirm intended
+    print " " if $i && $$MARKUP[$i] eq '';
+    print $$MARKUP[$i];
+
+    # only the first factor (text before the first '|') is truecased; the rest is copied through
+    my ($word,$otherfactors) = ($$WORD[$i],"");
+    if ($$WORD[$i] =~ /^([^\|]+)(.*)/) {
+      ($word,$otherfactors) = ($1,$2);
+    }
+
+    # sentence-initial words take the model's preferred casing; elsewhere a casing the model lists is kept
+    if ($sentence_start && defined($BEST{lc($word)})) {
+      print $BEST{lc($word)}; # truecase sentence start
+    }
+    elsif (defined($KNOWN{$word})) {
+      print $word; # don't change known words
+    }
+    elsif (defined($BEST{lc($word)})) {
+      print $BEST{lc($word)}; # truecase otherwise unknown words
+    }
+    else {
+      print $word; # unknown, nothing to do
+    }
+    print $otherfactors;
+
+    # update the sentence-start state for the next word
+    if    ( defined($SENTENCE_END{ $word }))           { $sentence_start = 1; }
+    elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; }
+  }
+  print $$MARKUP[$#$MARKUP];  # markup that follows the last word
+  print "\n";
+}
+
+# store away xml markup
+sub split_xml {
+  # Separates a line into its words and the XML tags between them.
+  # Returns (\@WORD,\@MARKUP), where $MARKUP[$k] collects the tags found before
+  # word $k (each followed by a space) and the last entry the tags after the final
+  # word; @MARKUP therefore always has one entry more than @WORD.
+  my ($line) = @_;
+  my @WORD;
+  my @MARKUP = ("");
+  while($line =~ /\S/) {
+    if ($line =~ /^\s*(<\S[^>]*>)(.*)$/) {
+      # XML tag: append it to the slot of the word that comes next
+      $MARKUP[-1] .= $1." ";
+      $line = $2;
+    }
+    elsif ($line =~ /^\s*([^\s<>]+)(.*)$/ || $line =~ /^\s*(\S+)(.*)$/) {
+      # non-XML text, or (second pattern) a token in which '<' or '>' occurs
+      # without forming an XML tag: store the word and open a fresh markup slot
+      push @WORD, $1;
+      push @MARKUP, "";
+      $line = $2;
+    }
+    else {
+      die("ERROR: huh? $line\n");
+    }
+  }
+  # remove the final character of the last markup slot, i.e. the space appended
+  # after a trailing tag
+  chop($MARKUP[$#MARKUP]);
+  return (\@WORD,\@MARKUP);
+}