summaryrefslogtreecommitdiff
path: root/external
diff options
context:
space:
mode:
Diffstat (limited to 'external')
-rw-r--r--external/README3
-rwxr-xr-xexternal/detokenizer.perl363
-rwxr-xr-xexternal/lowercase.perl10
-rw-r--r--external/nonbreaking_prefixes/README.txt5
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.ca75
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.cs390
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.de325
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.el2
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.en107
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.es118
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.fr153
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.is251
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.it180
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.nl115
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.pl283
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.pt210
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.ro38
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.ru259
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.sk474
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.sl78
-rw-r--r--external/nonbreaking_prefixes/nonbreaking_prefix.sv46
-rwxr-xr-xexternal/tokenizer-no-escape.perl348
-rwxr-xr-xexternal/truecase.perl104
23 files changed, 3937 insertions, 0 deletions
diff --git a/external/README b/external/README
new file mode 100644
index 0000000..a2e40bd
--- /dev/null
+++ b/external/README
@@ -0,0 +1,3 @@
+github.com/pks/scripts @e137509a77e3a8c12af32852ebba893dacb53f85
+moses @7b02017da1e2a09486627b543446ec78e51541a7
+
diff --git a/external/detokenizer.perl b/external/detokenizer.perl
new file mode 100755
index 0000000..a8de7e8
--- /dev/null
+++ b/external/detokenizer.perl
@@ -0,0 +1,363 @@
+#!/usr/bin/perl -w
+
+# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $
+# Sample De-Tokenizer
+# written by Josh Schroeder, based on code by Philipp Koehn
+# further modifications by Ondrej Bojar
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+use strict;
+use utf8; # tell perl this script file is in UTF-8 (see all funny punct below)
+
+# Command-line settings and their defaults.
+my $language = "en";      # -l LANG : language whose detokenization rules are applied
+my $QUIET = 0;            # -q      : do not print the revision/language banner on STDERR
+my $HELP = 0;             # -h      : print usage and exit
+my $UPPERCASE_SENT = 0;   # -u      : uppercase the first alphabetic character of each output line
+my $PENN = 0;             # -penn   : input was tokenized with tokenizer.perl's -penn option
+
+# Parse the command line. Arguments that match none of the switches below are
+# consumed and ignored.
+while (@ARGV) {
+ $_ = shift;
+ /^-b$/ && ($| = 1, next);
+ /^-l$/ && ($language = shift, next);
+ /^-q$/ && ($QUIET = 1, next);
+ /^-h$/ && ($HELP = 1, next);
+ /^-u$/ && ($UPPERCASE_SENT = 1, next);
+ /^-penn$/ && ($PENN = 1, next);
+}
+
+if ($HELP) {
+ print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n";
+ print "Options:\n";
+ print " -u ... uppercase the first char in the final sentence.\n";
+ print " -q ... don't report detokenizer revision.\n";
+ print " -b ... disable Perl buffering.\n";
+ print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n";
+ exit;
+}
+
+# Only these languages have dedicated branches in detokenize(); any other
+# language still runs, using the language-independent rules only.
+if ($language !~ /^(cs|en|fr|it)$/) {
+ print STDERR "Warning: No built-in rules for language $language.\n";
+}
+
+if ($PENN && $language ne "en") {
+ print STDERR "Error: -penn option only supported for English text.\n";
+ # Exit with a failure status so callers and pipelines can detect the error
+ # (a bare "exit" would report success).
+ exit 1;
+}
+
+if (!$QUIET) {
+ print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n";
+ print STDERR "Language: $language\n";
+}
+
+# Main loop: detokenize STDIN line by line and write the result to STDOUT.
+while(<STDIN>) {
+ if (/^<.+>$/ || /^\s*$/) {
+ #don't try to detokenize XML/HTML tag lines (blank lines are passed through too)
+ print $_;
+ } elsif ($PENN) {
+ print &detokenize_penn($_);
+ } else {
+ print &detokenize($_);
+ }
+}
+
+
+sub ucsecondarg {
+    # Return the first argument unchanged, immediately followed by the
+    # uppercased second argument.
+    my ($prefix, $tail) = @_;
+    my $upper_tail = uc($tail);
+    return $prefix.$upper_tail;
+}
+
+sub deescape {
+    # Turn the escaped entities back into the literal characters they stand for.
+    my ($text) = @_;
+
+    # Applied strictly in this order; "&amp;" must stay last so that the
+    # ampersand it produces is never re-read as the start of another entity.
+    my @entity_table = (
+        [ '&bar;',  '|'  ],    # factor separator (legacy)
+        [ '&#124;', '|'  ],    # factor separator
+        [ '&lt;',   '<'  ],    # xml
+        [ '&gt;',   '>'  ],    # xml
+        [ '&bra;',  '['  ],    # syntax non-terminal (legacy)
+        [ '&ket;',  ']'  ],    # syntax non-terminal (legacy)
+        [ '&quot;', '"'  ],    # xml
+        [ '&apos;', "'"  ],    # xml
+        [ '&#91;',  '['  ],    # syntax non-terminal
+        [ '&#93;',  ']'  ],    # syntax non-terminal
+        [ '&amp;',  '&'  ],    # escape escape
+    );
+
+    foreach my $entry (@entity_table) {
+        my ($entity, $literal) = @{$entry};
+        $text =~ s/\Q$entity\E/$literal/g;
+    }
+    return $text;
+}
+
+# Detokenize one line of space-separated tokens.
+#
+# Argument: $text - a tokenized line (a trailing newline is chomped).
+# Returns:  the detokenized line, always terminated by "\n". When the file-level
+#           $UPPERCASE_SENT flag is set, the first alphabetic character of the
+#           line is uppercased.
+#
+# Language-specific branches are selected by the file-level $language setting.
+# The loop carries $prependSpace, the separator emitted *before* the next token:
+#   "right shift" = glue the token to the one that follows (emit it, then set
+#                   $prependSpace to "");
+#   "left shift"  = glue the token to the preceding text (emit it without
+#                   $prependSpace).
+sub detokenize {
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";
+ # " @-@ " marks a hyphen that was split off by the tokenizer
+ $text =~ s/ \@\-\@ /-/g;
+ $text = &deescape($text);
+
+ my $i;
+ # NOTE: split drops trailing empty fields, so the last element of @words is
+ # the last real token (the leading space does yield an empty first element).
+ my @words = split(/ /,$text);
+ $text = "";
+ # number of quote tokens seen so far, per (normalized) quote string;
+ # an even count means the next quote is treated as an opening quote
+ my %quoteCount = ("\'"=>0,"\""=>0);
+ my $prependSpace = " ";
+ for ($i=0;$i<(scalar(@words));$i++) {
+ if (&startsWithCJKChar($words[$i])) {
+ if ($i > 0 && &endsWithCJKChar($words[$i-1])) {
+ # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word
+ $text=$text.$words[$i];
+ } else {
+ # ... but do nothing special if this is a CJK word that doesn't follow a CJK word
+ $text=$text.$prependSpace.$words[$i];
+ }
+ $prependSpace = " ";
+ } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+ #perform right shift on currency and other random punctuation items
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = "";
+ } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+ if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) {
+ #these punctuations are prefixed with a non-breakable space in french
+ $text .= " "; }
+ #perform left shift on punctuation items
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+ #left-shift the contraction for English
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) {
+ #left-shift floats in Czech
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) {
+ #right-shift the contraction for French and Italian
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = "";
+ } elsif (($language eq "cs") && ($i<(scalar(@words)-2))
+ && ($words[$i] =~ /[\p{IsAlpha}]$/)
+ && ($words[$i+1] =~ /^[-–]$/)
+ && ($words[$i+2] =~ /^li$|^mail.*/i)
+ ) {
+ # The bound only has to guarantee that $words[$i+2] exists; the previous
+ # "-3" skipped the rule when "li"/"mail" was the last token of the line.
+ #right-shift "-li" in Czech and a few Czech dashed words (e-mail)
+ $text = $text.$prependSpace.$words[$i].$words[$i+1];
+ $i++; # advance over the dash
+ $prependSpace = "";
+ } elsif ($words[$i] =~ /^[\'\"„“`]+$/) {
+ #combine punctuation smartly
+ # NOTE(review): ” is normalized below but is not in the character class of
+ # the elsif above, so a lone ” token never reaches this branch - confirm
+ # whether that is intended.
+ my $normalized_quo = $words[$i];
+ $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/;
+ $quoteCount{$normalized_quo} = 0
+ if !defined $quoteCount{$normalized_quo};
+ if ($language eq "cs" && $words[$i] eq "„") {
+ # this is always the starting quote in Czech
+ $quoteCount{$normalized_quo} = 0;
+ }
+ if ($language eq "cs" && $words[$i] eq "“") {
+ # this is usually the ending quote in Czech
+ $quoteCount{$normalized_quo} = 1;
+ }
+ if (($quoteCount{$normalized_quo} % 2) == 0) {
+ if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) {
+ #single quote for possessives ending in s... "The Jones' house"
+ #left shift
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } else {
+ #right shift
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = "";
+ $quoteCount{$normalized_quo} ++;
+
+ }
+ } else {
+ #left shift
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ $quoteCount{$normalized_quo} ++;
+
+ }
+
+ } else {
+ # ordinary token: emit it after the pending separator
+ $text=$text.$prependSpace.$words[$i];
+ $prependSpace = " ";
+ }
+ }
+
+ # clean up spaces at head and tail of each line as well as any double-spacing
+ $text =~ s/ +/ /g;
+ $text =~ s/\n /\n/g;
+ $text =~ s/ \n/\n/g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #add trailing break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ # uppercase the first letter, skipping any leading punctuation/whitespace
+ $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
+ return $text;
+}
+
+# Detokenize one line of English text that uses Penn-Treebank-style tokens
+# (`` '' quotes, -LRB-/-RRB- style bracket placeholders, split contractions).
+#
+# Argument: $text - a tokenized line (a trailing newline is chomped).
+# Returns:  the detokenized line, always terminated by "\n". When the file-level
+#           $UPPERCASE_SENT flag is set, the first alphabetic character of the
+#           line is uppercased.
+#
+# $prependSpace is the separator emitted before the next token: a "right shift"
+# emits the token and clears it (gluing the token to what follows), a "left
+# shift" emits the token without it (gluing the token to what precedes).
+sub detokenize_penn {
+ my($text) = @_;
+
+ chomp($text);
+ # pad with spaces so the " token " patterns below also match at the line edges
+ $text = " $text ";
+ # " @-@ " and " @/@ " mark a hyphen / slash that was split off as its own token
+ $text =~ s/ \@\-\@ /-/g;
+ $text =~ s/ \@\/\@ /\//g;
+ $text = &deescape($text);
+
+ # merge de-contracted forms except where the second word begins with an
+ # apostrophe (those are handled later)
+ $text =~ s/ n't /n't /g;
+ $text =~ s/ N'T /N'T /g;
+ $text =~ s/ ([Cc])an not / $1annot /g;
+ $text =~ s/ ([Dd])' ye / $1'ye /g;
+ $text =~ s/ ([Gg])im me / $1imme /g;
+ $text =~ s/ ([Gg])on na / $1onna /g;
+ $text =~ s/ ([Gg])ot ta / $1otta /g;
+ $text =~ s/ ([Ll])em me / $1emme /g;
+ $text =~ s/ '([Tt]) is / '$1is /g;
+ $text =~ s/ '([Tt]) was / '$1was /g;
+ $text =~ s/ ([Ww])an na / $1anna /g;
+
+ # restore brackets
+ $text =~ s/-LRB-/\(/g;
+ $text =~ s/-RRB-/\)/g;
+ $text =~ s/-LSB-/\[/g;
+ $text =~ s/-RSB-/\]/g;
+ $text =~ s/-LCB-/{/g;
+ $text =~ s/-RCB-/}/g;
+
+ my $i;
+ my @words = split(/ /,$text);
+ $text = "";
+ my $prependSpace = " ";
+ # rebuild the line token by token; the first matching rule decides the spacing
+ for ($i=0;$i<(scalar(@words));$i++) {
+ if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) {
+ # perform right shift on currency and other random punctuation items
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = "";
+ } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){
+ # perform left shift on punctuation items
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) {
+ # left-shift the contraction
+ $text=$text.$words[$i];
+ $prependSpace = " ";
+ } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only
+ # opening single quote: convert to straight quote and right-shift
+ $text = $text.$prependSpace."\'";
+ $prependSpace = "";
+ } elsif ($words[$i] eq "``") {
+ # opening double quote: convert to straight quote and right-shift
+ $text = $text.$prependSpace."\"";
+ $prependSpace = "";
+ } elsif ($words[$i] eq "\'") {
+ # closing single quote: convert to straight quote and left shift
+ $text = $text."\'";
+ $prependSpace = " ";
+ } elsif ($words[$i] eq "\'\'") {
+ # closing double quote: convert to straight quote and left shift
+ $text = $text."\"";
+ $prependSpace = " ";
+ } else {
+ # ordinary token: emit it after the pending separator
+ $text = $text.$prependSpace.$words[$i];
+ $prependSpace = " ";
+ }
+ }
+
+ # clean up spaces at head and tail of each line as well as any double-spacing
+ $text =~ s/ +/ /g;
+ $text =~ s/\n /\n/g;
+ $text =~ s/ \n/\n/g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ # add trailing break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ # uppercase the first letter, skipping any leading punctuation/whitespace
+ $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT;
+
+ return $text;
+}
+
+# Returns 0 for an empty string; otherwise returns whatever charIsCJK reports
+# for the first character of the string.
+sub startsWithCJKChar {
+    my ($str) = @_;
+    return (length($str) == 0) ? 0 : &charIsCJK(substr($str, 0, 1));
+}
+
+# Returns 0 for an empty string; otherwise returns whatever charIsCJK reports
+# for the last character of the string.
+sub endsWithCJKChar {
+    my ($str) = @_;
+    return (length($str) == 0) ? 0 : &charIsCJK(substr($str, -1));
+}
+
+# Given a string consisting of one character, returns 1 if the character is a
+# CJK (Chinese/Japanese/Korean) character and 0 otherwise.
+sub charIsCJK {
+    my ($char) = @_;
+    # $char should be a string of length 1
+    my $codepoint = &codepoint_dec($char);
+
+    # Inclusive code point ranges (hex), checked in this order. Based on
+    # http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane
+    my @cjk_ranges = (
+        # Hangul Jamo (1100–11FF)
+        [ '1100', '11FF' ],
+
+        # One contiguous run covering:
+        #   CJK Radicals Supplement (2E80–2EFF)
+        #   Kangxi Radicals (2F00–2FDF)
+        #   Ideographic Description Characters (2FF0–2FFF)
+        #   CJK Symbols and Punctuation (3000–303F)
+        #   Hiragana (3040–309F)
+        #   Katakana (30A0–30FF)
+        #   Bopomofo (3100–312F)
+        #   Hangul Compatibility Jamo (3130–318F)
+        #   Kanbun (3190–319F)
+        #   Bopomofo Extended (31A0–31BF)
+        #   CJK Strokes (31C0–31EF)
+        #   Katakana Phonetic Extensions (31F0–31FF)
+        #   Enclosed CJK Letters and Months (3200–32FF)
+        #   CJK Compatibility (3300–33FF)
+        #   CJK Unified Ideographs Extension A (3400–4DBF)
+        #   Yijing Hexagram Symbols (4DC0–4DFF)
+        #   CJK Unified Ideographs (4E00–9FFF)
+        #   Yi Syllables (A000–A48F)
+        #   Yi Radicals (A490–A4CF)
+        [ '2E80', 'A4CF' ],
+
+        # Phags-pa (A840–A87F)
+        [ 'A840', 'A87F' ],
+
+        # Hangul Syllables (AC00–D7AF)
+        [ 'AC00', 'D7AF' ],
+
+        # CJK Compatibility Ideographs (F900–FAFF)
+        [ 'F900', 'FAFF' ],
+
+        # CJK Compatibility Forms (FE30–FE4F)
+        [ 'FE30', 'FE4F' ],
+
+        # Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters
+        [ 'FF65', 'FFDC' ],
+
+        # Supplementary Ideographic Plane 20000–2FFFF
+        [ '20000', '2FFFF' ],
+    );
+
+    foreach my $range (@cjk_ranges) {
+        return 1 if (&between_hexes($codepoint, $range->[0], $range->[1]));
+    }
+
+    return 0;
+}
+
+# Returns the code point of a Unicode char, represented as a decimal number.
+# In scalar context the result is the code point of the first character of the
+# argument. Returns nothing (undef in scalar context) when the argument is
+# undefined or empty.
+sub codepoint_dec {
+    my ($char) = @_;
+    # Test for "defined and non-empty" rather than truthiness: the string "0"
+    # is false in Perl, so a truthiness test skips the digit zero.
+    return unless defined($char) && length($char);
+    return unpack('U0U*', $char);
+}
+
+# True when $num lies within the inclusive range whose bounds are given as
+# hexadecimal strings ($left <= $num <= $right).
+sub between_hexes {
+    my ($num, $left, $right) = @_;
+    return ($num >= hex($left)) && (hex($right) >= $num);
+}
diff --git a/external/lowercase.perl b/external/lowercase.perl
new file mode 100755
index 0000000..c30e029
--- /dev/null
+++ b/external/lowercase.perl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+# Read and write text as UTF-8 so that lc() operates on characters.
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# Copy STDIN to STDOUT one line at a time, lowercasing each line.
+while (defined(my $line = <STDIN>)) {
+    print lc($line);
+}
diff --git a/external/nonbreaking_prefixes/README.txt b/external/nonbreaking_prefixes/README.txt
new file mode 100644
index 0000000..02cdfcc
--- /dev/null
+++ b/external/nonbreaking_prefixes/README.txt
@@ -0,0 +1,5 @@
+The language suffix can be found here:
+
+http://www.loc.gov/standards/iso639-2/php/code_list.php
+
+
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.ca b/external/nonbreaking_prefixes/nonbreaking_prefix.ca
new file mode 100644
index 0000000..2f4fdfc
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.ca
@@ -0,0 +1,75 @@
+Dr
+Dra
+pàg
+p
+c
+av
+Sr
+Sra
+adm
+esq
+Prof
+S.A
+S.L
+p.e
+ptes
+Sta
+St
+pl
+màx
+cast
+dir
+nre
+fra
+admdora
+Emm
+Excma
+espf
+dc
+admdor
+tel
+angl
+aprox
+ca
+dept
+dj
+dl
+dt
+ds
+dg
+dv
+ed
+entl
+al
+i.e
+maj
+smin
+n
+núm
+pta
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.cs b/external/nonbreaking_prefixes/nonbreaking_prefix.cs
new file mode 100644
index 0000000..dce6167
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.cs
@@ -0,0 +1,390 @@
+Bc
+BcA
+Ing
+Ing.arch
+MUDr
+MVDr
+MgA
+Mgr
+JUDr
+PhDr
+RNDr
+PharmDr
+ThLic
+ThDr
+Ph.D
+Th.D
+prof
+doc
+CSc
+DrSc
+dr. h. c
+PaedDr
+Dr
+PhMr
+DiS
+abt
+ad
+a.i
+aj
+angl
+anon
+apod
+atd
+atp
+aut
+bd
+biogr
+b.m
+b.p
+b.r
+cca
+cit
+cizojaz
+c.k
+col
+čes
+čín
+čj
+ed
+facs
+fasc
+fol
+fot
+franc
+h.c
+hist
+hl
+hrsg
+ibid
+il
+ind
+inv.č
+jap
+jhdt
+jv
+koed
+kol
+korej
+kl
+krit
+lat
+lit
+m.a
+maď
+mj
+mp
+násl
+např
+nepubl
+něm
+no
+nr
+n.s
+okr
+odd
+odp
+obr
+opr
+orig
+phil
+pl
+pokrač
+pol
+port
+pozn
+př.kr
+př.n.l
+přel
+přeprac
+příl
+pseud
+pt
+red
+repr
+resp
+revid
+rkp
+roč
+roz
+rozš
+samost
+sect
+sest
+seš
+sign
+sl
+srv
+stol
+sv
+šk
+šk.ro
+špan
+tab
+t.č
+tis
+tj
+tř
+tzv
+univ
+uspoř
+vol
+vl.jm
+vs
+vyd
+vyobr
+zal
+zejm
+zkr
+zprac
+zvl
+n.p
+např
+než
+MUDr
+abl
+absol
+adj
+adv
+ak
+ak. sl
+akt
+alch
+amer
+anat
+angl
+anglosas
+arab
+arch
+archit
+arg
+astr
+astrol
+att
+bás
+belg
+bibl
+biol
+boh
+bot
+bulh
+círk
+csl
+čas
+čes
+dat
+děj
+dep
+dět
+dial
+dór
+dopr
+dosl
+ekon
+epic
+etnonym
+eufem
+f
+fam
+fem
+fil
+film
+form
+fot
+fr
+fut
+fyz
+gen
+geogr
+geol
+geom
+germ
+gram
+hebr
+herald
+hist
+hl
+hovor
+hud
+hut
+chcsl
+chem
+ie
+imp
+impf
+ind
+indoevr
+inf
+instr
+interj
+ión
+iron
+it
+kanad
+katalán
+klas
+kniž
+komp
+konj
+
+konkr
+kř
+kuch
+lat
+lék
+les
+lid
+lit
+liturg
+lok
+log
+m
+mat
+meteor
+metr
+mod
+ms
+mysl
+n
+náb
+námoř
+neklas
+něm
+nesklon
+nom
+ob
+obch
+obyč
+ojed
+opt
+part
+pas
+pejor
+pers
+pf
+pl
+plpf
+
+práv
+prep
+předl
+přivl
+r
+rcsl
+refl
+reg
+rkp
+řec
+s
+samohl
+sg
+sl
+souhl
+spec
+srov
+stfr
+střv
+stsl
+subj
+subst
+superl
+sv
+sz
+táz
+tech
+telev
+teol
+trans
+typogr
+var
+vedl
+verb
+vl. jm
+voj
+vok
+vůb
+vulg
+výtv
+vztaž
+zahr
+zájm
+zast
+zejm
+
+zeměd
+zkr
+zř
+mj
+dl
+atp
+sport
+Mgr
+horn
+MVDr
+JUDr
+RSDr
+Bc
+PhDr
+ThDr
+Ing
+aj
+apod
+PharmDr
+pomn
+ev
+slang
+nprap
+odp
+dop
+pol
+st
+stol
+p. n. l
+před n. l
+n. l
+př. Kr
+po Kr
+př. n. l
+odd
+RNDr
+tzv
+atd
+tzn
+resp
+tj
+p
+br
+č. j
+čj
+č. p
+čp
+a. s
+s. r. o
+spol. s r. o
+p. o
+s. p
+v. o. s
+k. s
+o. p. s
+o. s
+v. r
+v z
+ml
+vč
+kr
+mld
+hod
+popř
+ap
+event
+rus
+slov
+rum
+švýc
+P. T
+zvl
+hor
+dol
+S.O.S
\ No newline at end of file
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.de b/external/nonbreaking_prefixes/nonbreaking_prefix.de
new file mode 100644
index 0000000..35fdf5e
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.el b/external/nonbreaking_prefixes/nonbreaking_prefix.el
new file mode 100644
index 0000000..0470f91
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.el
@@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.en b/external/nonbreaking_prefixes/nonbreaking_prefix.en
new file mode 100644
index 0000000..e1a3733
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.en
@@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or,
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.es b/external/nonbreaking_prefixes/nonbreaking_prefix.es
new file mode 100644
index 0000000..d8b2755
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.es
@@ -0,0 +1,118 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
+
+A.C
+Apdo
+Av
+Bco
+CC.AA
+Da
+Dep
+Dn
+Dr
+Dra
+EE.UU
+Excmo
+FF.CC
+Fil
+Gral
+J.C
+Let
+Lic
+N.B
+P.D
+P.V.P
+Prof
+Pts
+Rte
+S.A
+S.A.R
+S.E
+S.L
+S.R.C
+Sr
+Sra
+Srta
+Sta
+Sto
+T.V.E
+Tel
+Ud
+Uds
+V.B
+V.E
+Vd
+Vds
+a/c
+adj
+admón
+afmo
+apdo
+av
+c
+c.f
+c.g
+cap
+cm
+cta
+dcha
+doc
+ej
+entlo
+esq
+etc
+f.c
+gr
+grs
+izq
+kg
+km
+mg
+mm
+núm
+núm
+p
+p.a
+p.ej
+ptas
+pág
+págs
+pág
+págs
+q.e.g.e
+q.e.s.m
+s
+s.s.s
+vid
+vol
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.fr b/external/nonbreaking_prefixes/nonbreaking_prefix.fr
new file mode 100644
index 0000000..28126fa
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.fr
@@ -0,0 +1,153 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#
+#any single upper case letter followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+#no French words end in single lower-case letters, so we throw those in too?
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+# Period-final abbreviation list for French
+A.C.N
+A.M
+art
+ann
+apr
+av
+auj
+lib
+B.P
+boul
+ca
+c.-à-d
+cf
+ch.-l
+chap
+contr
+C.P.I
+C.Q.F.D
+C.N
+C.N.S
+C.S
+dir
+éd
+e.g
+env
+al
+etc
+E.V
+ex
+fasc
+fém
+fig
+fr
+hab
+ibid
+id
+i.e
+inf
+LL.AA
+LL.AA.II
+LL.AA.RR
+LL.AA.SS
+L.D
+LL.EE
+LL.MM
+LL.MM.II.RR
+loc.cit
+masc
+MM
+ms
+N.B
+N.D.A
+N.D.L.R
+N.D.T
+n/réf
+NN.SS
+N.S
+N.D
+N.P.A.I
+p.c.c
+pl
+pp
+p.ex
+p.j
+P.S
+R.A.S
+R.-V
+R.P
+R.I.P
+SS
+S.S
+S.A
+S.A.I
+S.A.R
+S.A.S
+S.E
+sec
+sect
+sing
+S.M
+S.M.I.R
+sq
+sqq
+suiv
+sup
+suppl
+tél
+T.S.V.P
+vb
+vol
+vs
+X.O
+Z.I
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.is b/external/nonbreaking_prefixes/nonbreaking_prefix.is
new file mode 100644
index 0000000..5b8a710
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.is
@@ -0,0 +1,251 @@
+no #NUMERIC_ONLY#
+No #NUMERIC_ONLY#
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+nR #NUMERIC_ONLY#
+NR #NUMERIC_ONLY#
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+^
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+ab.fn
+a.fn
+afs
+al
+alm
+alg
+andh
+ath
+aths
+atr
+ao
+au
+aukaf
+áfn
+áhrl.s
+áhrs
+ákv.gr
+ákv
+bh
+bls
+dr
+e.Kr
+et
+ef
+efn
+ennfr
+eink
+end
+e.st
+erl
+fél
+fskj
+fh
+f.hl
+físl
+fl
+fn
+fo
+forl
+frb
+frl
+frh
+frt
+fsl
+fsh
+fs
+fsk
+fst
+f.Kr
+ft
+fv
+fyrrn
+fyrrv
+germ
+gm
+gr
+hdl
+hdr
+hf
+hl
+hlsk
+hljsk
+hljv
+hljóðv
+hr
+hv
+hvk
+holl
+Hos
+höf
+hk
+hrl
+ísl
+kaf
+kap
+Khöfn
+kk
+kg
+kk
+km
+kl
+klst
+kr
+kt
+kgúrsk
+kvk
+leturbr
+lh
+lh.nt
+lh.þt
+lo
+ltr
+mlja
+mljó
+millj
+mm
+mms
+m.fl
+miðm
+mgr
+mst
+mín
+nf
+nh
+nhm
+nl
+nk
+nmgr
+no
+núv
+nt
+o.áfr
+o.m.fl
+ohf
+o.fl
+o.s.frv
+ófn
+ób
+óákv.gr
+óákv
+pfn
+PR
+pr
+Ritstj
+Rvík
+Rvk
+samb
+samhlj
+samn
+samn
+sbr
+sek
+sérn
+sf
+sfn
+sh
+sfn
+sh
+s.hl
+sk
+skv
+sl
+sn
+so
+ss.us
+s.st
+samþ
+sbr
+shlj
+sign
+skál
+st
+st.s
+stk
+sþ
+teg
+tbl
+tfn
+tl
+tvíhlj
+tvt
+till
+to
+umr
+uh
+us
+uppl
+útg
+vb
+Vf
+vh
+vkf
+Vl
+vl
+vlf
+vmf
+8vo
+vsk
+vth
+þt
+þf
+þjs
+þgf
+þlt
+þolm
+þm
+þml
+þýð
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.it b/external/nonbreaking_prefixes/nonbreaking_prefix.it
new file mode 100644
index 0000000..992b9ec
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.it
@@ -0,0 +1,180 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Amn
+Arch
+Asst
+Avv
+Bart
+Bcc
+Bldg
+Brig
+Bros
+C.A.P
+C.P
+Capt
+Cc
+Cmdr
+Co
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dott
+Dr
+Drs
+Egr
+Ens
+Gen
+Geom
+Gov
+Hon
+Hosp
+Hr
+Id
+Ing
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mo
+Mons
+Mr
+Mrs
+Ms
+Msgr
+N.B
+Op
+Ord
+P.S
+P.T
+Pfc
+Ph
+Prof
+Pvt
+RP
+RSVP
+Rag
+Rep
+Reps
+Res
+Rev
+Rif
+Rt
+S.A
+S.B.F
+S.P.M
+S.p.A
+S.r.l
+Sen
+Sens
+Sfc
+Sgt
+Sig
+Sigg
+Soc
+Spett
+Sr
+St
+Supt
+Surg
+V.P
+
+# other
+a.c
+acc
+all
+banc
+c.a
+c.c.p
+c.m
+c.p
+c.s
+c.v
+corr
+dott
+e.p.c
+ecc
+es
+fatt
+gg
+int
+lett
+ogg
+on
+p.c
+p.c.c
+p.es
+p.f
+p.r
+p.v
+post
+pp
+racc
+ric
+s.n.c
+seg
+sgg
+ss
+tel
+u.s
+v.r
+v.s
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or,
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.nl b/external/nonbreaking_prefixes/nonbreaking_prefix.nl
new file mode 100644
index 0000000..c80c417
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.nl
@@ -0,0 +1,115 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
+# http://nl.wikipedia.org/wiki/Aanspreekvorm
+# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+bacc
+bc
+bgen
+c.i
+dhr
+dr
+dr.h.c
+drs
+drs
+ds
+eint
+fa
+Fa
+fam
+gen
+genm
+ing
+ir
+jhr
+jkvr
+jr
+kand
+kol
+lgen
+lkol
+Lt
+maj
+Mej
+mevr
+Mme
+mr
+mr
+Mw
+o.b.s
+plv
+prof
+ritm
+tint
+Vz
+Z.D
+Z.D.H
+Z.E
+Z.Em
+Z.H
+Z.K.H
+Z.K.M
+Z.M
+z.v
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+#we seem to have a lot of these in Dutch, e.g. i.p.v - in plaats van (instead of) - which never ends a sentence
+a.g.v
+bijv
+bijz
+bv
+d.w.z
+e.c
+e.g
+e.k
+ev
+i.p.v
+i.s.m
+i.t.t
+i.v.m
+m.a.w
+m.b.t
+m.b.v
+m.h.o
+m.i
+m.i.v
+v.w.t
+
+#Numbers only. These are treated as non-breaking prefixes only when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or,
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY#
+Nrs
+nrs
+nr #NUMERIC_ONLY#
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.pl b/external/nonbreaking_prefixes/nonbreaking_prefix.pl
new file mode 100644
index 0000000..6b7c106
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.pl
@@ -0,0 +1,283 @@
+adw
+afr
+akad
+al
+Al
+am
+amer
+arch
+art
+Art
+artyst
+astr
+austr
+bałt
+bdb
+bł
+bm
+br
+bryg
+bryt
+centr
+ces
+chem
+chiń
+chir
+c.k
+c.o
+cyg
+cyw
+cyt
+czes
+czw
+cd
+Cd
+czyt
+ćw
+ćwicz
+daw
+dcn
+dekl
+demokr
+det
+diec
+dł
+dn
+dot
+dol
+dop
+dost
+dosł
+h.c
+ds
+dst
+duszp
+dypl
+egz
+ekol
+ekon
+elektr
+em
+ew
+fab
+farm
+fot
+fr
+gat
+gastr
+geogr
+geol
+gimn
+głęb
+gm
+godz
+górn
+gosp
+gr
+gram
+hist
+hiszp
+hr
+Hr
+hot
+id
+in
+im
+iron
+jn
+kard
+kat
+katol
+k.k
+kk
+kol
+kl
+k.p.a
+kpc
+k.p.c
+kpt
+kr
+k.r
+krak
+k.r.o
+kryt
+kult
+laic
+łac
+niem
+woj
+nb
+np
+Nb
+Np
+pol
+pow
+m.in
+pt
+ps
+Pt
+Ps
+cdn
+jw
+ryc
+rys
+Ryc
+Rys
+tj
+tzw
+Tzw
+tzn
+zob
+ang
+ub
+ul
+pw
+pn
+pl
+al
+k
+n
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+ww
+wł
+ur
+zm
+żyd
+żarg
+żyw
+wył
+bp
+bp
+wyst
+tow
+Tow
+o
+sp
+Sp
+st
+spółdz
+Spółdz
+społ
+spółgł
+stoł
+stow
+Stoł
+Stow
+zn
+zew
+zewn
+zdr
+zazw
+zast
+zaw
+zał
+zal
+zam
+zak
+zakł
+zagr
+zach
+adw
+Adw
+lek
+Lek
+med
+mec
+Mec
+doc
+Doc
+dyw
+dyr
+Dyw
+Dyr
+inż
+Inż
+mgr
+Mgr
+dh
+dr
+Dh
+Dr
+p
+P
+red
+Red
+prof
+prok
+Prof
+Prok
+hab
+płk
+Płk
+nadkom
+Nadkom
+podkom
+Podkom
+ks
+Ks
+gen
+Gen
+por
+Por
+reż
+Reż
+przyp
+Przyp
+śp
+św
+śW
+Śp
+Św
+ŚW
+szer
+Szer
+pkt #NUMERIC_ONLY#
+str #NUMERIC_ONLY#
+tab #NUMERIC_ONLY#
+Tab #NUMERIC_ONLY#
+tel
+ust #NUMERIC_ONLY#
+par #NUMERIC_ONLY#
+poz
+pok
+oo
+oO
+Oo
+OO
+r #NUMERIC_ONLY#
+l #NUMERIC_ONLY#
+s #NUMERIC_ONLY#
+najśw
+Najśw
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Dz
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.pt b/external/nonbreaking_prefixes/nonbreaking_prefix.pt
new file mode 100644
index 0000000..5d65bf2
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.pt
@@ -0,0 +1,210 @@
+#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Art
+Ca
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+DRA
+Dr
+Dra
+Dras
+Drs
+Eng
+Enga
+Engas
+Engos
+Ex
+Exo
+Exmo
+Fig
+Gen
+Hosp
+Insp
+Lda
+MM
+MR
+MRS
+MS
+Maj
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+Sra
+Sras
+Srs
+Sto
+Supt
+Surg
+adj
+adm
+adv
+art
+cit
+col
+con
+corp
+cpl
+dr
+dra
+dras
+drs
+eng
+enga
+engas
+engos
+ex
+exo
+exmo
+fig
+op
+prof
+sr
+sra
+sras
+srs
+sto
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+p #NUMERIC_ONLY#
+pp #NUMERIC_ONLY#
+
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.ro b/external/nonbreaking_prefixes/nonbreaking_prefix.ro
new file mode 100644
index 0000000..d489f46
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.ro
@@ -0,0 +1,38 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+dpdv
+etc
+șamd
+M.Ap.N
+dl
+Dl
+d-na
+D-na
+dvs
+Dvs
+pt
+Pt
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.ru b/external/nonbreaking_prefixes/nonbreaking_prefix.ru
new file mode 100644
index 0000000..444465b
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.ru
@@ -0,0 +1,259 @@
+#TBD: Russian uppercase alphabet [А-Я]
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+0гг
+1гг
+2гг
+3гг
+4гг
+5гг
+6гг
+7гг
+8гг
+9гг
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+Xвв
+Vвв
+Iвв
+Lвв
+Mвв
+Cвв
+Xв
+Vв
+Iв
+Lв
+Mв
+Cв
+0м
+1м
+2м
+3м
+4м
+5м
+6м
+7м
+8м
+9м
+0мм
+1мм
+2мм
+3мм
+4мм
+5мм
+6мм
+7мм
+8мм
+9мм
+0см
+1см
+2см
+3см
+4см
+5см
+6см
+7см
+8см
+9см
+0дм
+1дм
+2дм
+3дм
+4дм
+5дм
+6дм
+7дм
+8дм
+9дм
+0л
+1л
+2л
+3л
+4л
+5л
+6л
+7л
+8л
+9л
+0км
+1км
+2км
+3км
+4км
+5км
+6км
+7км
+8км
+9км
+0га
+1га
+2га
+3га
+4га
+5га
+6га
+7га
+8га
+9га
+0кг
+1кг
+2кг
+3кг
+4кг
+5кг
+6кг
+7кг
+8кг
+9кг
+0т
+1т
+2т
+3т
+4т
+5т
+6т
+7т
+8т
+9т
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+0мг
+1мг
+2мг
+3мг
+4мг
+5мг
+6мг
+7мг
+8мг
+9мг
+бульв
+вв
+га
+гг
+гл
+гос
+дм
+доп
+др
+ед
+ед
+зам
+инд
+исп
+Исп
+кап
+кг
+кв
+кл
+км
+кол
+комн
+коп
+куб
+лиц
+лл
+макс
+мг
+мин
+мл
+млн
+млрд
+мм
+наб
+нач
+неуд
+ном
+обл
+обр
+общ
+ок
+ост
+отл
+п
+пер
+перераб
+пл
+пос
+пр
+просп
+проф
+ред
+руб
+сб
+св
+см
+соч
+ср
+ст
+стр
+тел
+Тел
+тех
+тт
+туп
+тыс
+уд
+ул
+уч
+физ
+хор
+чел
+шт
+экз
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.sk b/external/nonbreaking_prefixes/nonbreaking_prefix.sk
new file mode 100644
index 0000000..1198d48
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.sk
@@ -0,0 +1,474 @@
+Bc
+Mgr
+RNDr
+PharmDr
+PhDr
+JUDr
+PaedDr
+ThDr
+Ing
+MUDr
+MDDr
+MVDr
+Dr
+ThLic
+PhD
+ArtD
+ThDr
+Dr
+DrSc
+CSs
+prof
+obr
+Obr
+absol
+adj
+admin
+adr
+Adr
+adv
+advok
+afr
+ak
+akad
+akc
+akuz
+et
+al
+alch
+amer
+anat
+angl
+Angl
+anglosas
+anorg
+ap
+apod
+arch
+archeol
+archit
+arg
+art
+astr
+astrol
+astron
+atp
+atď
+austr
+Austr
+aut
+belg
+Belg
+bibl
+Bibl
+biol
+bot
+bud
+bás
+býv
+cest
+chem
+cirk
+csl
+čs
+Čs
+dat
+dep
+det
+dial
+diaľ
+dipl
+distrib
+dokl
+dosl
+dopr
+dram
+duš
+dv
+dvojčl
+dór
+ekol
+ekon
+el
+elektr
+elektrotech
+energet
+epic
+est
+etc
+etonym
+eufem
+európ
+Európ
+ev
+evid
+expr
+fa
+fam
+farm
+fem
+feud
+fil
+filat
+filoz
+fi
+fon
+form
+fot
+fr
+Fr
+franc
+Franc
+fraz
+fut
+fyz
+fyziol
+garb
+gen
+genet
+genpor
+geod
+geogr
+geol
+geom
+germ
+gr
+Gr
+gréc
+Gréc
+gréckokat
+hebr
+herald
+hist
+hlav
+hosp
+hromad
+hud
+hypok
+ident
+i.e
+ident
+imp
+impf
+indoeur
+inf
+inform
+instr
+int
+interj
+inšt
+inštr
+iron
+jap
+Jap
+jaz
+jedn
+juhoamer
+juhových
+juhozáp
+juž
+kanad
+Kanad
+kanc
+kapit
+kpt
+kart
+katastr
+knih
+kniž
+komp
+konj
+konkr
+kozmet
+krajč
+kresť
+kt
+kuch
+lat
+latinskoamer
+lek
+lex
+lingv
+lit
+litur
+log
+lok
+max
+Max
+maď
+Maď
+medzinár
+mest
+metr
+mil
+Mil
+min
+Min
+miner
+ml
+mld
+mn
+mod
+mytol
+napr
+nar
+Nar
+nasl
+nedok
+neg
+negat
+neklas
+nem
+Nem
+neodb
+neos
+neskl
+nesklon
+nespis
+nespráv
+neved
+než
+niekt
+niž
+nom
+náb
+nákl
+námor
+nár
+obch
+obj
+obv
+obyč
+obč
+občian
+odb
+odd
+ods
+ojed
+okr
+Okr
+opt
+opyt
+org
+os
+osob
+ot
+ovoc
+par
+part
+pejor
+pers
+pf
+Pf
+P.f
+p.f
+pl
+Plk
+pod
+podst
+pokl
+polit
+politol
+polygr
+pomn
+popl
+por
+porad
+porov
+posch
+potrav
+použ
+poz
+pozit
+poľ
+poľno
+poľnohosp
+poľov
+pošt
+pož
+prac
+predl
+pren
+prep
+preuk
+priezv
+Priezv
+privl
+prof
+práv
+príd
+príj
+prík
+príp
+prír
+prísl
+príslov
+príč
+psych
+publ
+pís
+písm
+pôv
+refl
+reg
+rep
+resp
+rozk
+rozlič
+rozpráv
+roč
+Roč
+ryb
+rádiotech
+rím
+samohl
+semest
+sev
+severoamer
+severových
+severozáp
+sg
+skr
+skup
+sl
+Sloven
+soc
+soch
+sociol
+sp
+spol
+Spol
+spoloč
+spoluhl
+správ
+spôs
+st
+star
+starogréc
+starorím
+s.r.o
+stol
+stor
+str
+stredoamer
+stredoškol
+subj
+subst
+superl
+sv
+sz
+súkr
+súp
+súvzť
+tal
+Tal
+tech
+tel
+Tel
+telef
+teles
+telev
+teol
+trans
+turist
+tuzem
+typogr
+tzn
+tzv
+ukaz
+ul
+Ul
+umel
+univ
+ust
+ved
+vedľ
+verb
+veter
+vin
+viď
+vl
+vod
+vodohosp
+pnl
+vulg
+vyj
+vys
+vysokoškol
+vzťaž
+vôb
+vých
+výd
+výrob
+výsk
+výsl
+výtv
+výtvar
+význ
+včel
+vš
+všeob
+zahr
+zar
+zariad
+zast
+zastar
+zastaráv
+zb
+zdravot
+združ
+zjemn
+zlat
+zn
+Zn
+zool
+zr
+zried
+zv
+záhr
+zák
+zákl
+zám
+záp
+západoeur
+zázn
+územ
+účt
+čast
+čes
+Čes
+čl
+čísl
+živ
+pr
+fak
+Kr
+p.n.l
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.sl b/external/nonbreaking_prefixes/nonbreaking_prefix.sl
new file mode 100644
index 0000000..230062c
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.sl
@@ -0,0 +1,78 @@
+dr
+Dr
+itd
+itn
+št #NUMERIC_ONLY#
+Št #NUMERIC_ONLY#
+d
+jan
+Jan
+feb
+Feb
+mar
+Mar
+apr
+Apr
+jun
+Jun
+jul
+Jul
+avg
+Avg
+sept
+Sept
+sep
+Sep
+okt
+Okt
+nov
+Nov
+dec
+Dec
+tj
+Tj
+npr
+Npr
+sl
+Sl
+op
+Op
+gl
+Gl
+oz
+Oz
+prev
+dipl
+ing
+prim
+Prim
+cf
+Cf
+gl
+Gl
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.sv b/external/nonbreaking_prefixes/nonbreaking_prefix.sv
new file mode 100644
index 0000000..df5ef29
--- /dev/null
+++ b/external/nonbreaking_prefixes/nonbreaking_prefix.sv
@@ -0,0 +1,46 @@
+#single upper-case letters are usually initials
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+#misc abbreviations
+AB
+G
+VG
+dvs
+etc
+from
+iaf
+jfr
+kl
+kr
+mao
+mfl
+mm
+osv
+pga
+tex
+tom
+vs
diff --git a/external/tokenizer-no-escape.perl b/external/tokenizer-no-escape.perl
new file mode 100755
index 0000000..4397360
--- /dev/null
+++ b/external/tokenizer-no-escape.perl
@@ -0,0 +1,348 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+# (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+# (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+#use Thread;
+
+my $mydir = "$RealBin/nonbreaking_prefixes";
+
+my %NONBREAKING_PREFIX = ();
+my $language = "en";
+my $QUIET = 0;
+my $HELP = 0;
+my $AGGRESSIVE = 0;
+my $SKIP_XML = 0;
+my $TIMING = 0;
+my $NUM_THREADS = 1;
+my $NUM_SENTENCES_PER_THREAD = 2000;
+
+# Consume the command line one argument at a time.  Each recognised switch
+# sets its file-level option variable; switches that take a value pull the
+# next argument off @ARGV.  Unrecognised arguments are silently ignored.
+while (@ARGV)
+{
+    $_ = shift;
+    if    (/^-b$/)       { $| = 1; }
+    elsif (/^-l$/)       { $language = shift; }
+    elsif (/^-q$/)       { $QUIET = 1; }
+    elsif (/^-h$/)       { $HELP = 1; }
+    elsif (/^-x$/)       { $SKIP_XML = 1; }
+    elsif (/^-a$/)       { $AGGRESSIVE = 1; }
+    elsif (/^-time$/)    { $TIMING = 1; }
+    elsif (/^-threads$/) { $NUM_THREADS = int(shift); }
+    elsif (/^-lines$/)   { $NUM_SENTENCES_PER_THREAD = int(shift); }
+}
+
+# for time calculation: the start timestamp is only taken when -time was given
+my $start_time;
+$start_time = [ Time::HiRes::gettimeofday( ) ] if $TIMING;
+
+# print help message and stop without processing any input
+if ($HELP)
+{
+    print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n",
+          "Options:\n",
+          " -q ... quiet.\n",
+          " -a ... aggressive hyphen splitting.\n",
+          " -b ... disable Perl buffering.\n",
+          " -time ... enable processing time calculation.\n";
+    exit;
+}
+
+# banner on STDERR unless -q was given
+unless ($QUIET)
+{
+    print STDERR "Tokenizer Version 1.1\n",
+                 "Language: $language\n",
+                 "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+# warn when the prefix table came back empty
+print STDERR "Warning: No known abbreviations for language '$language'\n"
+    if (scalar(%NONBREAKING_PREFIX) eq 0);
+
+my @batch_sentences = ();
+my @thread_list = ();
+my $count_sentences = 0;
+
+# Main loop: read STDIN line by line, tokenize, and print the result to STDOUT.
+# $count_sentences counts every input line in both modes, because the -time
+# report divides the elapsed time by it.
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+    # "use Thread" is commented out at the top of this script, so the module
+    # must be loaded here before Thread->new can be called.
+    require Thread;
+
+    # Hand the buffered lines to at most $NUM_THREADS threads, each getting a
+    # slice of up to $NUM_SENTENCES_PER_THREAD lines, then print the results
+    # and empty the buffers.  Handles full batches and the final partial one.
+    my $run_batch = sub
+    {
+        for (my $i=0; $i<$NUM_THREADS; $i++)
+        {
+            my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+            last if ($start_index >= scalar(@batch_sentences));
+            my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+            if ($end_index >= scalar(@batch_sentences))
+            {
+                $end_index = scalar(@batch_sentences)-1;
+            }
+            my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+            my $new_thread = Thread->new(\&tokenize_batch, @subbatch_sentences);
+            push(@thread_list, $new_thread);
+        }
+        # join in creation order so the output keeps the input order
+        foreach my $thread (@thread_list)
+        {
+            my $tokenized_list = $thread->join;
+            foreach my $tokenized (@$tokenized_list)
+            {
+                print $tokenized;
+            }
+        }
+        # reset for the new run
+        @thread_list = ();
+        @batch_sentences = ();
+    };
+
+    while(<STDIN>)
+    {
+        $count_sentences = $count_sentences + 1;
+        push(@batch_sentences, $_);
+        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+        {
+            $run_batch->();
+        }
+    }
+    # the last batch
+    $run_batch->() if (scalar(@batch_sentences)>0);
+}
+else
+{# single thread only
+    while(<STDIN>)
+    {
+        # count here as well; otherwise the -time report divides by zero
+        $count_sentences = $count_sentences + 1;
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            print $_;
+        }
+        else
+        {
+            print tokenize($_);
+        }
+    }
+}
+
+# With -time: report the total run time and the average time per input line on STDERR.
+if ($TIMING)
+{
+    my $duration = Time::HiRes::tv_interval( $start_time );
+    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+    if ($count_sentences > 0)
+    {
+        print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." milliseconds/line\n");
+    }
+    else
+    {
+        # no lines were counted, so an average would be a division by zero
+        print STDERR ("TOKENIZATION SPEED: n/a (no lines processed)\n");
+    }
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# input: an array containing a batch of texts (one line per element)
+# return: a reference to a new array containing the tokenized texts, in input order;
+#         whitespace-only lines and (with -x) lines that are a single XML/HTML tag
+#         are copied through unchanged
+sub tokenize_batch
+{
+    my @tokenized_list = ();
+    foreach my $line (@_)
+    {
+        #don't try to tokenize XML/HTML tag lines
+        my $passthrough = ($SKIP_XML && $line =~ /^<.+>$/) || $line =~ /^\s*$/;
+        push(@tokenized_list, $passthrough ? $line : tokenize($line));
+    }
+    return \@tokenized_list;
+}
+
+# tokenize one line of text: pads punctuation with spaces and splits sentence-final periods from words
+# input: one string (a trailing newline, if any, is removed first)
+# return: the tokenized string, tokens separated by single spaces and terminated by "\n"
+sub tokenize
+{
+ my($text) = @_;
+ chomp($text);
+ $text = " $text ";
+
+ # remove ASCII junk: collapse whitespace runs to one space, then delete control characters \000-\037
+ $text =~ s/\s+/ /g;
+ $text =~ s/[\000-\037]//g;
+
+ # separate out all "other" special characters: anything but letters/digits, whitespace and . ' ` , - gets a space on both sides
+ $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+ # aggressive hyphen splitting (-a): a hyphen between two alphanumerics becomes the separate token @-@
+ if ($AGGRESSIVE)
+ {
+ $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+ }
+
+ #multi-dots stay together: runs of two or more periods are encoded as DOTMULTI/DOTDOTMULTI placeholders until the end
+ $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+ while($text =~ /DOTMULTI\./)
+ {
+ $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+ $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+ }
+
+ # separate out "," except if within numbers (5,300)
+ $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ # separate , pre and post number
+ $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+ $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+
+ # turn ` into '
+ $text =~ s/\`/\'/g;
+
+ #turn '' into "
+ $text =~ s/\'\'/ \" /g;
+
+ if ($language eq "en")
+ {
+ #split contractions right: the apostrophe stays attached to the following letters (x'y -> x 'y)
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+ #special case for "1990's"
+ $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+ }
+ elsif (($language eq "fr") or ($language eq "it"))
+ {
+ #split contractions left: the apostrophe stays attached to the preceding letters (x'y -> x' y)
+ $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+ $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+ }
+ else
+ {
+ $text =~ s/\'/ \' /g;
+ }
+
+ #word token method: decide for every token ending in "." whether the period is split off
+ my @words = split(/\s/,$text);
+ $text = "";
+ for (my $i=0;$i<(scalar(@words));$i++)
+ {
+ my $word = $words[$i];
+ if ( $word =~ /^(\S+)\.$/)
+ {
+ my $pre = $1;
+ if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/)))
+ {
+ #no change: token has an inner period plus a letter, is a known prefix (value 1), or the next token starts lowercase
+ }
+ elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/)))
+ {
+ #no change: NUMERIC_ONLY prefix (value 2) directly followed by a token starting with a digit
+ }
+ else
+ {
+ $word = $pre." .";
+ }
+ }
+ $text .= $word." ";
+ }
+
+ # clean up extraneous spaces
+ $text =~ s/ +/ /g;
+ $text =~ s/^ //g;
+ $text =~ s/ $//g;
+
+ #restore multi-dots: turn the placeholders back into literal periods
+ while($text =~ /DOTDOTMULTI/)
+ {
+ $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+ }
+ $text =~ s/DOTMULTI/./g;
+
+ #escape special chars (all of the substitutions below are commented out in this file)
+ #$text =~ s/\&/\&amp;/g; # escape escape
+ #$text =~ s/\|/\&#124;/g; # factor separator
+ #$text =~ s/\</\&lt;/g; # xml
+ #$text =~ s/\>/\&gt;/g; # xml
+ #$text =~ s/\'/\&apos;/g; # xml
+ #$text =~ s/\"/\&quot;/g; # xml
+ #$text =~ s/\[/\&#91;/g; # syntax non-terminal
+ #$text =~ s/\]/\&#93;/g; # syntax non-terminal
+
+ #ensure final line break
+ $text .= "\n" unless $text =~ /\n$/;
+
+ return $text;
+}
+
+# Load the non-breaking prefix list for one language into a hash.
+# input: $language   - language code, used as the suffix of the prefix file name
+#        $PREFIX_REF - reference to the hash that receives the entries
+# effect: every listed prefix is stored with value 1; a prefix followed by
+#         "#NUMERIC_ONLY#" is stored with value 2.  Empty lines and lines
+#         starting with "#" are skipped.
+# Falls back to the English list (with a warning on STDERR) when the language
+# has no file; dies when no list exists or the file cannot be opened.
+sub load_prefixes
+{
+    my ($language, $PREFIX_REF) = @_;
+
+    my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+    #default back to English if we don't have a language-specific prefix file
+    if (!(-e $prefixfile))
+    {
+        $prefixfile = "$mydir/nonbreaking_prefix.en";
+        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
+
+    # a file that exists but cannot be read used to yield a silently empty table
+    open(my $prefix_fh, "<:utf8", $prefixfile)
+        or die ("ERROR: Cannot open abbreviations file '$prefixfile': $!\n");
+    while (my $item = <$prefix_fh>)
+    {
+        chomp($item);
+        next if (!$item || substr($item,0,1) eq "#");
+        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+        {
+            $PREFIX_REF->{$1} = 2;
+        }
+        else
+        {
+            $PREFIX_REF->{$item} = 1;
+        }
+    }
+    close($prefix_fh);
+    return;
+}
+
diff --git a/external/truecase.perl b/external/truecase.perl
new file mode 100755
index 0000000..0a4d366
--- /dev/null
+++ b/external/truecase.perl
@@ -0,0 +1,104 @@
+#!/usr/bin/perl -w
+
+# $Id: train-recaser.perl 1326 2007-03-26 05:44:27Z bojar $
+use strict;
+use Getopt::Long "GetOptions";
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# apply switches: --model is mandatory, -b/--unbuffered turns on autoflush
+my ($MODEL, $UNBUFFERED);
+die("truecase.perl --model MODEL [-b] < in > out")
+    unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
+    && defined($MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
+
+# Read the truecasing model.
+#   %BEST  maps a lowercased word to the first field of its model line
+#   %KNOWN marks every casing variant found on a model line
+my (%BEST,%KNOWN);
+# three-argument open: a model path starting with ">" or ending in "|" must not be read as a mode
+open(my $model_fh, "<:utf8", $MODEL) || die("ERROR: could not open '$MODEL': $!");
+while(my $entry = <$model_fh>) {
+    my ($word,@OPTIONS) = split(' ', $entry);
+    next unless defined($word);    # blank line: nothing to record
+    $BEST{ lc($word) } = $word;
+    $KNOWN{ $word } = 1;
+    # only the odd positions of @OPTIONS are stored as variants
+    for(my $i=1;$i<$#OPTIONS;$i+=2) {
+        $KNOWN{ $OPTIONS[$i] } = 1;
+    }
+}
+close($model_fh);
+
+# tokens after which the next word counts as a sentence start
+my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
+# tokens that leave the current sentence-start state unchanged
+my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&apos;"=>1,"&quot;"=>1,"&#91;"=>1,"&#93;"=>1);
+
+# Truecase STDIN line by line.  XML markup is passed through untouched, and
+# for factored tokens ("word|factor|...") only the part before the first "|"
+# is recased.
+while(<STDIN>) {
+    # chomp, not chop: a last line without a newline must keep its final character
+    chomp;
+    my ($WORD,$MARKUP) = split_xml($_);
+    my $sentence_start = 1;
+    for(my $i=0;$i<=$#$WORD;$i++) {
+        print " " if $i && $$MARKUP[$i] eq '';
+        print $$MARKUP[$i];
+
+        my ($word,$otherfactors);
+        if ($$WORD[$i] =~ /^([^\|]+)(.*)/)
+        {
+            $word = $1;
+            $otherfactors = $2;
+        }
+        else
+        {
+            $word = $$WORD[$i];
+            $otherfactors = "";
+        }
+
+        my $best = $BEST{lc($word)};
+        if ($sentence_start && defined($best)) {
+            print $best; # truecase sentence start
+        }
+        elsif (defined($KNOWN{$word})) {
+            print $word; # don't change known words
+        }
+        elsif (defined($best)) {
+            print $best; # truecase otherwise unknown words
+        }
+        else {
+            print $word; # unknown, nothing to do
+        }
+        print $otherfactors;
+
+        if ( defined($SENTENCE_END{ $word })) { $sentence_start = 1; }
+        elsif (!defined($DELAYED_SENTENCE_START{ $word })) { $sentence_start = 0; }
+    }
+    print $$MARKUP[$#$MARKUP];
+    print "\n";
+}
+
+# store away xml markup
+# input: one line of text
+# return: (\@WORD, \@MARKUP) where @WORD holds the whitespace-separated
+#         non-tag tokens and $MARKUP[$k] holds the XML tags found in front of
+#         word $k, each followed by one space; the final @MARKUP element holds
+#         the tags after the last word, with its last character chopped off
+sub split_xml {
+    my ($line) = @_;
+    my @WORD;
+    my @MARKUP = ("");
+    while($line =~ /\S/) {
+        # XML tag: collected in the markup slot of the upcoming word
+        if (my ($tag, $after_tag) = $line =~ /^\s*(<\S[^>]*>)(.*)$/) {
+            $MARKUP[$#MARKUP] .= $tag." ";
+            $line = $after_tag;
+            next;
+        }
+        # non-XML text
+        my ($token, $rest) = $line =~ /^\s*([^\s<>]+)(.*)$/;
+        # '<' or '>' occurs in word, but it's not an XML tag
+        ($token, $rest) = $line =~ /^\s*(\S+)(.*)$/ unless defined($token);
+        die("ERROR: huh? $line\n") unless defined($token);
+        push(@WORD, $token);
+        push(@MARKUP, "");
+        $line = $rest;
+    }
+    chop($MARKUP[$#MARKUP]);
+    return (\@WORD,\@MARKUP);
+}