From 60f93614186ebd6150602cae140b7a96dc4bca8a Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Wed, 24 Jun 2015 17:47:32 +0200 Subject: better wrapper script --- external/README | 3 + external/detokenizer.perl | 363 ++++++++++++++++ external/lowercase.perl | 10 + external/nonbreaking_prefixes/README.txt | 5 + .../nonbreaking_prefixes/nonbreaking_prefix.ca | 75 ++++ .../nonbreaking_prefixes/nonbreaking_prefix.cs | 390 +++++++++++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.de | 325 ++++++++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.el | 2 + .../nonbreaking_prefixes/nonbreaking_prefix.en | 107 +++++ .../nonbreaking_prefixes/nonbreaking_prefix.es | 118 +++++ .../nonbreaking_prefixes/nonbreaking_prefix.fr | 153 +++++++ .../nonbreaking_prefixes/nonbreaking_prefix.is | 251 +++++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.it | 180 ++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.nl | 115 +++++ .../nonbreaking_prefixes/nonbreaking_prefix.pl | 283 ++++++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.pt | 210 +++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.ro | 38 ++ .../nonbreaking_prefixes/nonbreaking_prefix.ru | 259 +++++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.sk | 474 +++++++++++++++++++++ .../nonbreaking_prefixes/nonbreaking_prefix.sl | 78 ++++ .../nonbreaking_prefixes/nonbreaking_prefix.sv | 46 ++ external/tokenizer-no-escape.perl | 348 +++++++++++++++ external/truecase.perl | 104 +++++ 23 files changed, 3937 insertions(+) create mode 100644 external/README create mode 100755 external/detokenizer.perl create mode 100755 external/lowercase.perl create mode 100644 external/nonbreaking_prefixes/README.txt create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.ca create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.cs create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.de create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.el create mode 100644 
external/nonbreaking_prefixes/nonbreaking_prefix.en create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.es create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.fr create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.is create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.it create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.nl create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.pl create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.pt create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.ro create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.ru create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.sk create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.sl create mode 100644 external/nonbreaking_prefixes/nonbreaking_prefix.sv create mode 100755 external/tokenizer-no-escape.perl create mode 100755 external/truecase.perl (limited to 'external') diff --git a/external/README b/external/README new file mode 100644 index 0000000..a2e40bd --- /dev/null +++ b/external/README @@ -0,0 +1,3 @@ +github.com/pks/scripts @e137509a77e3a8c12af32852ebba893dacb53f85 +moses @7b02017da1e2a09486627b543446ec78e51541a7 + diff --git a/external/detokenizer.perl b/external/detokenizer.perl new file mode 100755 index 0000000..a8de7e8 --- /dev/null +++ b/external/detokenizer.perl @@ -0,0 +1,363 @@ +#!/usr/bin/perl -w + +# $Id: detokenizer.perl 4134 2011-08-08 15:30:54Z bgottesman $ +# Sample De-Tokenizer +# written by Josh Schroeder, based on code by Philipp Koehn +# further modifications by Ondrej Bojar + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); +use strict; +use utf8; # tell perl this script file is in UTF-8 (see all funny punct below) + +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $UPPERCASE_SENT = 0; +my $PENN = 0; + +while (@ARGV) { + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = 
shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-u$/ && ($UPPERCASE_SENT = 1, next); + /^-penn$/ && ($PENN = 1, next); +} + +if ($HELP) { + print "Usage ./detokenizer.perl (-l [en|fr|it|cs|...]) < tokenizedfile > detokenizedfile\n"; + print "Options:\n"; + print " -u ... uppercase the first char in the final sentence.\n"; + print " -q ... don't report detokenizer revision.\n"; + print " -b ... disable Perl buffering.\n"; + print " -penn ... assume input is tokenized as per tokenizer.perl's -penn option.\n"; + exit; +} + +if ($language !~ /^(cs|en|fr|it)$/) { + print STDERR "Warning: No built-in rules for language $language.\n" +} + +if ($PENN && $language ne "en") { + print STDERR "Error: -penn option only supported for English text.\n"; + exit; +} + +if (!$QUIET) { + print STDERR "Detokenizer Version ".'$Revision: 4134 $'."\n"; + print STDERR "Language: $language\n"; +} + +while(<STDIN>) { + if (/^<.+>$/ || /^\s*$/) { + #don't try to detokenize XML/HTML tag lines + print $_; + } elsif ($PENN) { + print &detokenize_penn($_); + } else { + print &detokenize($_); + } +} + + +sub ucsecondarg { + # uppercase the second argument + my $arg1 = shift; + my $arg2 = shift; + return $arg1.uc($arg2); +} + +sub deescape { + # de-escape special chars + my ($text) = @_; + $text =~ s/\&bar;/\|/g; # factor separator (legacy) + $text =~ s/\&#124;/\|/g; # factor separator + $text =~ s/\&lt;/\</g; # xml + $text =~ s/\&gt;/\>/g; # xml + $text =~ s/\&bra;/\[/g; # syntax non-terminal (legacy) + $text =~ s/\&ket;/\]/g; # syntax non-terminal (legacy) + $text =~ s/\&quot;/\"/g; # xml + $text =~ s/\&apos;/\'/g; # xml + $text =~ s/\&#91;/\[/g; # syntax non-terminal + $text =~ s/\&#93;/\]/g; # syntax non-terminal + $text =~ s/\&amp;/\&/g; # escape escape + return $text; +} + +sub detokenize { + my($text) = @_; + chomp($text); + $text = " $text "; + $text =~ s/ \@\-\@ /-/g; + $text = &deescape($text); + + my $word; + my $i; + my @words = split(/ /,$text); + $text = ""; + my %quoteCount = ("\'"=>0,"\""=>0); + my $prependSpace = " 
"; + for ($i=0;$i<(scalar(@words));$i++) { + if (&startsWithCJKChar($words[$i])) { + if ($i > 0 && &endsWithCJKChar($words[$i-1])) { + # perform left shift if this is a second consecutive CJK (Chinese/Japanese/Korean) word + $text=$text.$words[$i]; + } else { + # ... but do nothing special if this is a CJK word that doesn't follow a CJK word + $text=$text.$prependSpace.$words[$i]; + } + $prependSpace = " "; + } elsif ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { + #perform right shift on currency and other random punctuation items + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ + if (($language eq "fr") && ($words[$i] =~ /^[\?\!\:\;\\\%]$/)) { + #these punctuations are prefixed with a non-breakable space in french + $text .= " "; } + #perform left shift on punctuation items + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($language eq "en") && ($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { + #left-shift the contraction for English + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($language eq "cs") && ($i>1) && ($words[$i-2] =~ /^[0-9]+$/) && ($words[$i-1] =~ /^[.,]$/) && ($words[$i] =~ /^[0-9]+$/)) { + #left-shift floats in Czech + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif ((($language eq "fr") ||($language eq "it")) && ($i<=(scalar(@words)-2)) && ($words[$i] =~ /[\p{IsAlpha}][\']$/) && ($words[$i+1] =~ /^[\p{IsAlpha}]/)) { + #right-shift the contraction for French and Italian + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif (($language eq "cs") && ($i<(scalar(@words)-3)) + && ($words[$i] =~ /[\p{IsAlpha}]$/) + && ($words[$i+1] =~ /^[-–]$/) + && ($words[$i+2] =~ /^li$|^mail.*/i) + ) { + #right-shift "-li" in Czech and a few Czech dashed words (e-mail) + $text = $text.$prependSpace.$words[$i].$words[$i+1]; + $i++; # advance over the dash + $prependSpace = ""; + } elsif ($words[$i] =~ 
/^[\'\"„“`]+$/) { + #combine punctuation smartly + my $normalized_quo = $words[$i]; + $normalized_quo = '"' if $words[$i] =~ /^[„“”]+$/; + $quoteCount{$normalized_quo} = 0 + if !defined $quoteCount{$normalized_quo}; + if ($language eq "cs" && $words[$i] eq "„") { + # this is always the starting quote in Czech + $quoteCount{$normalized_quo} = 0; + } + if ($language eq "cs" && $words[$i] eq "“") { + # this is usually the ending quote in Czech + $quoteCount{$normalized_quo} = 1; + } + if (($quoteCount{$normalized_quo} % 2) eq 0) { + if(($language eq "en") && ($words[$i] eq "'") && ($i > 0) && ($words[$i-1] =~ /[s]$/)) { + #single quote for posesssives ending in s... "The Jones' house" + #left shift + $text=$text.$words[$i]; + $prependSpace = " "; + } else { + #right shift + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + $quoteCount{$normalized_quo} ++; + + } + } else { + #left shift + $text=$text.$words[$i]; + $prependSpace = " "; + $quoteCount{$normalized_quo} ++; + + } + + } else { + $text=$text.$prependSpace.$words[$i]; + $prependSpace = " "; + } + } + + # clean up spaces at head and tail of each line as well as any double-spacing + $text =~ s/ +/ /g; + $text =~ s/\n /\n/g; + $text =~ s/ \n/\n/g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + #add trailing break + $text .= "\n" unless $text =~ /\n$/; + + $text =~ s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; + + return $text; +} + +sub detokenize_penn { + my($text) = @_; + + chomp($text); + $text = " $text "; + $text =~ s/ \@\-\@ /-/g; + $text =~ s/ \@\/\@ /\//g; + $text = &deescape($text); + + # merge de-contracted forms except where the second word begins with an + # apostrophe (those are handled later) + $text =~ s/ n't /n't /g; + $text =~ s/ N'T /N'T /g; + $text =~ s/ ([Cc])an not / $1annot /g; + $text =~ s/ ([Dd])' ye / $1'ye /g; + $text =~ s/ ([Gg])im me / $1imme /g; + $text =~ s/ ([Gg])on na / $1onna /g; + $text =~ s/ ([Gg])ot ta / $1otta /g; + $text =~ s/ ([Ll])em 
me / $1emme /g; + $text =~ s/ '([Tt]) is / '$1is /g; + $text =~ s/ '([Tt]) was / '$1was /g; + $text =~ s/ ([Ww])an na / $1anna /g; + + # restore brackets + $text =~ s/-LRB-/\(/g; + $text =~ s/-RRB-/\)/g; + $text =~ s/-LSB-/\[/g; + $text =~ s/-RSB-/\]/g; + $text =~ s/-LCB-/{/g; + $text =~ s/-RCB-/}/g; + + my $i; + my @words = split(/ /,$text); + $text = ""; + my $prependSpace = " "; + for ($i=0;$i<(scalar(@words));$i++) { + if ($words[$i] =~ /^[\p{IsSc}\(\[\{\¿\¡]+$/) { + # perform right shift on currency and other random punctuation items + $text = $text.$prependSpace.$words[$i]; + $prependSpace = ""; + } elsif ($words[$i] =~ /^[\,\.\?\!\:\;\\\%\}\]\)]+$/){ + # perform left shift on punctuation items + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif (($i>0) && ($words[$i] =~ /^[\'][\p{IsAlpha}]/) && ($words[$i-1] =~ /[\p{IsAlnum}]$/)) { + # left-shift the contraction + $text=$text.$words[$i]; + $prependSpace = " "; + } elsif ($words[$i] eq "`") { # Assume that punctuation has been normalized and is one of `, ``, ', '' only + # opening single quote: convert to straight quote and right-shift + $text = $text.$prependSpace."\'"; + $prependSpace = ""; + } elsif ($words[$i] eq "``") { + # opening double quote: convert to straight quote and right-shift + $text = $text.$prependSpace."\""; + $prependSpace = ""; + } elsif ($words[$i] eq "\'") { + # closing single quote: convert to straight quote and left shift + $text = $text."\'"; + $prependSpace = " "; + } elsif ($words[$i] eq "\'\'") { + # closing double quote: convert to straight quote and left shift + $text = $text."\""; + $prependSpace = " "; + } else { + $text = $text.$prependSpace.$words[$i]; + $prependSpace = " "; + } + } + + # clean up spaces at head and tail of each line as well as any double-spacing + $text =~ s/ +/ /g; + $text =~ s/\n /\n/g; + $text =~ s/ \n/\n/g; + $text =~ s/^ //g; + $text =~ s/ $//g; + + # add trailing break + $text .= "\n" unless $text =~ /\n$/; + + $text =~ 
s/^([[:punct:]\s]*)([[:alpha:]])/ucsecondarg($1, $2)/e if $UPPERCASE_SENT; + + return $text; +} + +sub startsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $firstChar = substr($str, 0, 1); + return &charIsCJK($firstChar); +} + +sub endsWithCJKChar { + my ($str) = @_; + return 0 if length($str) == 0; + my $lastChar = substr($str, length($str)-1, 1); + return &charIsCJK($lastChar); +} + +# Given a string consisting of one character, returns true iff the character +# is a CJK (Chinese/Japanese/Korean) character +sub charIsCJK { + my ($char) = @_; + # $char should be a string of length 1 + my $codepoint = &codepoint_dec($char); + + # The following is based on http://en.wikipedia.org/wiki/Basic_Multilingual_Plane#Basic_Multilingual_Plane + + # Hangul Jamo (1100–11FF) + return 1 if (&between_hexes($codepoint, '1100', '11FF')); + + # CJK Radicals Supplement (2E80–2EFF) + # Kangxi Radicals (2F00–2FDF) + # Ideographic Description Characters (2FF0–2FFF) + # CJK Symbols and Punctuation (3000–303F) + # Hiragana (3040–309F) + # Katakana (30A0–30FF) + # Bopomofo (3100–312F) + # Hangul Compatibility Jamo (3130–318F) + # Kanbun (3190–319F) + # Bopomofo Extended (31A0–31BF) + # CJK Strokes (31C0–31EF) + # Katakana Phonetic Extensions (31F0–31FF) + # Enclosed CJK Letters and Months (3200–32FF) + # CJK Compatibility (3300–33FF) + # CJK Unified Ideographs Extension A (3400–4DBF) + # Yijing Hexagram Symbols (4DC0–4DFF) + # CJK Unified Ideographs (4E00–9FFF) + # Yi Syllables (A000–A48F) + # Yi Radicals (A490–A4CF) + return 1 if (&between_hexes($codepoint, '2E80', 'A4CF')); + + # Phags-pa (A840–A87F) + return 1 if (&between_hexes($codepoint, 'A840', 'A87F')); + + # Hangul Syllables (AC00–D7AF) + return 1 if (&between_hexes($codepoint, 'AC00', 'D7AF')); + + # CJK Compatibility Ideographs (F900–FAFF) + return 1 if (&between_hexes($codepoint, 'F900', 'FAFF')); + + # CJK Compatibility Forms (FE30–FE4F) + return 1 if (&between_hexes($codepoint, 'FE30', 'FE4F')); + + # 
Range U+FF65–FFDC encodes halfwidth forms, of Katakana and Hangul characters + return 1 if (&between_hexes($codepoint, 'FF65', 'FFDC')); + + # Supplementary Ideographic Plane 20000–2FFFF + return 1 if (&between_hexes($codepoint, '20000', '2FFFF')); + + return 0; +} + +# Returns the code point of a Unicode char, represented as a decimal number +sub codepoint_dec { + if (my $char = shift) { + return unpack('U0U*', $char); + } +} + +sub between_hexes { + my ($num, $left, $right) = @_; + return $num >= hex($left) && $num <= hex($right); +} diff --git a/external/lowercase.perl b/external/lowercase.perl new file mode 100755 index 0000000..c30e029 --- /dev/null +++ b/external/lowercase.perl @@ -0,0 +1,10 @@ +#!/usr/bin/perl -w + +use strict; + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +while(<STDIN>) { + print lc($_); +} diff --git a/external/nonbreaking_prefixes/README.txt b/external/nonbreaking_prefixes/README.txt new file mode 100644 index 0000000..02cdfcc --- /dev/null +++ b/external/nonbreaking_prefixes/README.txt @@ -0,0 +1,5 @@ +The language suffix can be found here: + +http://www.loc.gov/standards/iso639-2/php/code_list.php + + diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.ca b/external/nonbreaking_prefixes/nonbreaking_prefix.ca new file mode 100644 index 0000000..2f4fdfc --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.ca @@ -0,0 +1,75 @@ +Dr +Dra +pàg +p +c +av +Sr +Sra +adm +esq +Prof +S.A +S.L +p.e +ptes +Sta +St +pl +màx +cast +dir +nre +fra +admdora +Emm +Excma +espf +dc +admdor +tel +angl +aprox +ca +dept +dj +dl +dt +ds +dg +dv +ed +entl +al +i.e +maj +smin +n +núm +pta +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.cs b/external/nonbreaking_prefixes/nonbreaking_prefix.cs new file mode 100644 index 0000000..dce6167 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.cs @@ -0,0 +1,390 @@ +Bc +BcA +Ing 
+Ing.arch +MUDr +MVDr +MgA +Mgr +JUDr +PhDr +RNDr +PharmDr +ThLic +ThDr +Ph.D +Th.D +prof +doc +CSc +DrSc +dr. h. c +PaedDr +Dr +PhMr +DiS +abt +ad +a.i +aj +angl +anon +apod +atd +atp +aut +bd +biogr +b.m +b.p +b.r +cca +cit +cizojaz +c.k +col +čes +čín +čj +ed +facs +fasc +fol +fot +franc +h.c +hist +hl +hrsg +ibid +il +ind +inv.č +jap +jhdt +jv +koed +kol +korej +kl +krit +lat +lit +m.a +maď +mj +mp +násl +např +nepubl +něm +no +nr +n.s +okr +odd +odp +obr +opr +orig +phil +pl +pokrač +pol +port +pozn +př.kr +př.n.l +přel +přeprac +příl +pseud +pt +red +repr +resp +revid +rkp +roč +roz +rozš +samost +sect +sest +seš +sign +sl +srv +stol +sv +šk +šk.ro +špan +tab +t.č +tis +tj +tř +tzv +univ +uspoř +vol +vl.jm +vs +vyd +vyobr +zal +zejm +zkr +zprac +zvl +n.p +např +než +MUDr +abl +absol +adj +adv +ak +ak. sl +akt +alch +amer +anat +angl +anglosas +arab +arch +archit +arg +astr +astrol +att +bás +belg +bibl +biol +boh +bot +bulh +círk +csl +č +čas +čes +dat +děj +dep +dět +dial +dór +dopr +dosl +ekon +epic +etnonym +eufem +f +fam +fem +fil +film +form +fot +fr +fut +fyz +gen +geogr +geol +geom +germ +gram +hebr +herald +hist +hl +hovor +hud +hut +chcsl +chem +ie +imp +impf +ind +indoevr +inf +instr +interj +ión +iron +it +kanad +katalán +klas +kniž +komp +konj + +konkr +kř +kuch +lat +lék +les +lid +lit +liturg +lok +log +m +mat +meteor +metr +mod +ms +mysl +n +náb +námoř +neklas +něm +nesklon +nom +ob +obch +obyč +ojed +opt +part +pas +pejor +pers +pf +pl +plpf + +práv +prep +předl +přivl +r +rcsl +refl +reg +rkp +ř +řec +s +samohl +sg +sl +souhl +spec +srov +stfr +střv +stsl +subj +subst +superl +sv +sz +táz +tech +telev +teol +trans +typogr +var +vedl +verb +vl. jm +voj +vok +vůb +vulg +výtv +vztaž +zahr +zájm +zast +zejm + +zeměd +zkr +zř +mj +dl +atp +sport +Mgr +horn +MVDr +JUDr +RSDr +Bc +PhDr +ThDr +Ing +aj +apod +PharmDr +pomn +ev +slang +nprap +odp +dop +pol +st +stol +p. n. l +před n. l +n. l +př. Kr +po Kr +př. n. 
l +odd +RNDr +tzv +atd +tzn +resp +tj +p +br +č. j +čj +č. p +čp +a. s +s. r. o +spol. s r. o +p. o +s. p +v. o. s +k. s +o. p. s +o. s +v. r +v z +ml +vč +kr +mld +hod +popř +ap +event +rus +slov +rum +švýc +P. T +zvl +hor +dol +S.O.S \ No newline at end of file diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.de b/external/nonbreaking_prefixes/nonbreaking_prefix.de new file mode 100644 index 0000000..35fdf5e --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.de @@ -0,0 +1,325 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +#no german words end in single lower-case letters, so we throw those in too. +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. A dot after one of these is not a sentence break in German. 
+I +II +III +IV +V +VI +VII +VIII +IX +X +XI +XII +XIII +XIV +XV +XVI +XVII +XVIII +XIX +XX +i +ii +iii +iv +v +vi +vii +viii +ix +x +xi +xii +xiii +xiv +xv +xvi +xvii +xviii +xix +xx + +#Titles and Honorifics +Adj +Adm +Adv +Asst +Bart +Bldg +Brig +Bros +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +Dr +Ens +Gen +Gov +Hon +Hosp +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mr +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +St +Supt +Surg + +#Misc symbols +Mio +Mrd +bzw +v +vs +usw +d.h +z.B +u.a +etc +Mrd +MwSt +ggf +d.J +D.h +m.E +vgl +I.F +z.T +sogen +ff +u.E +g.U +g.g.A +c.-à-d +Buchst +u.s.w +sog +u.ä +Std +evtl +Zt +Chr +u.U +o.ä +Ltd +b.A +z.Zt +spp +sen +SA +k.o +jun +i.H.v +dgl +dergl +Co +zzt +usf +s.p.a +Dkr +Corp +bzgl +BSE + +#Number indicators +# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it +No +Nos +Art +Nr +pp +ca +Ca + +#Ordinals are done with . in German - "1." = "1st" in English +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.el b/external/nonbreaking_prefixes/nonbreaking_prefix.el new file mode 100644 index 0000000..0470f91 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.el @@ -0,0 +1,2 @@ +# for now, just include the Greek equivalent of "Mr." 
+κ diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.en b/external/nonbreaking_prefixes/nonbreaking_prefix.en new file mode 100644 index 0000000..e1a3733 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.en @@ -0,0 +1,107 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Asst +Bart +Bldg +Brig +Bros +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +Dr +Drs +Ens +Gen +Gov +Hon +Hr +Hosp +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mr +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +St +Supt +Surg + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +pp #NUMERIC_ONLY# diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.es b/external/nonbreaking_prefixes/nonbreaking_prefix.es new file mode 100644 index 0000000..d8b2755 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.es @@ -0,0 +1,118 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. 
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm + +A.C +Apdo +Av +Bco +CC.AA +Da +Dep +Dn +Dr +Dra +EE.UU +Excmo +FF.CC +Fil +Gral +J.C +Let +Lic +N.B +P.D +P.V.P +Prof +Pts +Rte +S.A +S.A.R +S.E +S.L +S.R.C +Sr +Sra +Srta +Sta +Sto +T.V.E +Tel +Ud +Uds +V.B +V.E +Vd +Vds +a/c +adj +admón +afmo +apdo +av +c +c.f +c.g +cap +cm +cta +dcha +doc +ej +entlo +esq +etc +f.c +gr +grs +izq +kg +km +mg +mm +núm +núm +p +p.a +p.ej +ptas +pág +págs +pág +págs +q.e.g.e +q.e.s.m +s +s.s.s +vid +vol diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.fr b/external/nonbreaking_prefixes/nonbreaking_prefix.fr new file mode 100644 index 0000000..28126fa --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.fr @@ -0,0 +1,153 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. +# +#any single upper case letter followed by a period is not a sentence ender +#usually upper case letters are initials in a name +#no French words end in single lower-case letters, so we throw those in too? 
+A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + +# Period-final abbreviation list for French +A.C.N +A.M +art +ann +apr +av +auj +lib +B.P +boul +ca +c.-à-d +cf +ch.-l +chap +contr +C.P.I +C.Q.F.D +C.N +C.N.S +C.S +dir +éd +e.g +env +al +etc +E.V +ex +fasc +fém +fig +fr +hab +ibid +id +i.e +inf +LL.AA +LL.AA.II +LL.AA.RR +LL.AA.SS +L.D +LL.EE +LL.MM +LL.MM.II.RR +loc.cit +masc +MM +ms +N.B +N.D.A +N.D.L.R +N.D.T +n/réf +NN.SS +N.S +N.D +N.P.A.I +p.c.c +pl +pp +p.ex +p.j +P.S +R.A.S +R.-V +R.P +R.I.P +SS +S.S +S.A +S.A.I +S.A.R +S.A.S +S.E +sec +sect +sing +S.M +S.M.I.R +sq +sqq +suiv +sup +suppl +tél +T.S.V.P +vb +vol +vs +X.O +Z.I diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.is b/external/nonbreaking_prefixes/nonbreaking_prefix.is new file mode 100644 index 0000000..5b8a710 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.is @@ -0,0 +1,251 @@ +no #NUMERIC_ONLY# +No #NUMERIC_ONLY# +nr #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# +nR #NUMERIC_ONLY# +NR #NUMERIC_ONLY# +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +^ +í +á +ó +æ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +ab.fn +a.fn +afs +al +alm +alg +andh +ath +aths +atr +ao +au +aukaf +áfn +áhrl.s +áhrs +ákv.gr +ákv +bh +bls +dr +e.Kr +et +ef +efn +ennfr +eink +end +e.st +erl +fél +fskj +fh +f.hl +físl +fl +fn +fo +forl +frb +frl +frh +frt +fsl +fsh +fs +fsk +fst +f.Kr +ft +fv +fyrrn +fyrrv +germ +gm +gr +hdl +hdr +hf +hl +hlsk +hljsk +hljv +hljóðv +hr +hv +hvk +holl +Hos +höf +hk +hrl +ísl +kaf +kap +Khöfn +kk +kg +kk +km +kl +klst +kr +kt +kgúrsk +kvk +leturbr +lh +lh.nt +lh.þt +lo +ltr +mlja +mljó +millj +mm +mms +m.fl +miðm +mgr +mst +mín +nf +nh +nhm +nl +nk +nmgr +no +núv +nt +o.áfr +o.m.fl +ohf +o.fl +o.s.frv +ófn +ób +óákv.gr +óákv +pfn +PR +pr +Ritstj +Rvík +Rvk +samb +samhlj +samn +samn +sbr +sek +sérn 
+sf +sfn +sh +sfn +sh +s.hl +sk +skv +sl +sn +so +ss.us +s.st +samþ +sbr +shlj +sign +skál +st +st.s +stk +sþ +teg +tbl +tfn +tl +tvíhlj +tvt +till +to +umr +uh +us +uppl +útg +vb +Vf +vh +vkf +Vl +vl +vlf +vmf +8vo +vsk +vth +þt +þf +þjs +þgf +þlt +þolm +þm +þml +þýð diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.it b/external/nonbreaking_prefixes/nonbreaking_prefix.it new file mode 100644 index 0000000..992b9ec --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.it @@ -0,0 +1,180 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Amn +Arch +Asst +Avv +Bart +Bcc +Bldg +Brig +Bros +C.A.P +C.P +Capt +Cc +Cmdr +Co +Col +Comdr +Con +Corp +Cpl +DR +Dott +Dr +Drs +Egr +Ens +Gen +Geom +Gov +Hon +Hosp +Hr +Id +Ing +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mo +Mons +Mr +Mrs +Ms +Msgr +N.B +Op +Ord +P.S +P.T +Pfc +Ph +Prof +Pvt +RP +RSVP +Rag +Rep +Reps +Res +Rev +Rif +Rt +S.A +S.B.F +S.P.M +S.p.A +S.r.l +Sen +Sens +Sfc +Sgt +Sig +Sigg +Soc +Spett +Sr +St +Supt +Surg +V.P + +# other +a.c +acc +all +banc +c.a +c.c.p +c.m +c.p +c.s +c.v +corr +dott +e.p.c +ecc +es +fatt +gg +int +lett +ogg +on +p.c +p.c.c +p.es +p.f +p.r +p.v +post +pp +racc +ric +s.n.c +seg +sgg +ss +tel +u.s +v.r +v.s + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. 
These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +pp #NUMERIC_ONLY# diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.nl b/external/nonbreaking_prefixes/nonbreaking_prefix.nl new file mode 100644 index 0000000..c80c417 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.nl @@ -0,0 +1,115 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. +#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen +# http://nl.wikipedia.org/wiki/Aanspreekvorm +# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +bacc +bc +bgen +c.i +dhr +dr +dr.h.c +drs +drs +ds +eint +fa +Fa +fam +gen +genm +ing +ir +jhr +jkvr +jr +kand +kol +lgen +lkol +Lt +maj +Mej +mevr +Mme +mr +mr +Mw +o.b.s +plv +prof +ritm +tint +Vz +Z.D +Z.D.H +Z.E +Z.Em +Z.H +Z.K.H +Z.K.M +Z.M +z.v + +#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence) +#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence +a.g.v +bijv +bijz +bv +d.w.z +e.c +e.g +e.k +ev +i.p.v +i.s.m +i.t.t +i.v.m +m.a.w +m.b.t +m.b.v +m.h.o +m.i +m.i.v +v.w.t + +#Numbers only. 
These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +Nr #NUMERIC_ONLY# +Nrs +nrs +nr #NUMERIC_ONLY# diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.pl b/external/nonbreaking_prefixes/nonbreaking_prefix.pl new file mode 100644 index 0000000..6b7c106 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.pl @@ -0,0 +1,283 @@ +adw +afr +akad +al +Al +am +amer +arch +art +Art +artyst +astr +austr +bałt +bdb +bł +bm +br +bryg +bryt +centr +ces +chem +chiń +chir +c.k +c.o +cyg +cyw +cyt +czes +czw +cd +Cd +czyt +ćw +ćwicz +daw +dcn +dekl +demokr +det +diec +dł +dn +dot +dol +dop +dost +dosł +h.c +ds +dst +duszp +dypl +egz +ekol +ekon +elektr +em +ew +fab +farm +fot +fr +gat +gastr +geogr +geol +gimn +głęb +gm +godz +górn +gosp +gr +gram +hist +hiszp +hr +Hr +hot +id +in +im +iron +jn +kard +kat +katol +k.k +kk +kol +kl +k.p.a +kpc +k.p.c +kpt +kr +k.r +krak +k.r.o +kryt +kult +laic +łac +niem +woj +nb +np +Nb +Np +pol +pow +m.in +pt +ps +Pt +Ps +cdn +jw +ryc +rys +Ryc +Rys +tj +tzw +Tzw +tzn +zob +ang +ub +ul +pw +pn +pl +al +k +n +nr #NUMERIC_ONLY# +Nr #NUMERIC_ONLY# +ww +wł +ur +zm +żyd +żarg +żyw +wył +bp +bp +wyst +tow +Tow +o +sp +Sp +st +spółdz +Spółdz +społ +spółgł +stoł +stow +Stoł +Stow +zn +zew +zewn +zdr +zazw +zast +zaw +zał +zal +zam +zak +zakł +zagr +zach +adw +Adw +lek +Lek +med +mec +Mec +doc +Doc +dyw +dyr +Dyw +Dyr +inż +Inż +mgr +Mgr +dh +dr +Dh +Dr +p +P +red +Red +prof +prok +Prof +Prok +hab +płk +Płk +nadkom +Nadkom +podkom +Podkom +ks +Ks +gen +Gen +por +Por +reż +Reż +przyp +Przyp +śp +św +śW +Śp +Św +ŚW +szer +Szer +pkt #NUMERIC_ONLY# +str #NUMERIC_ONLY# +tab #NUMERIC_ONLY# +Tab #NUMERIC_ONLY# +tel +ust #NUMERIC_ONLY# +par #NUMERIC_ONLY# +poz +pok +oo +oO +Oo +OO +r #NUMERIC_ONLY# +l #NUMERIC_ONLY# +s 
#NUMERIC_ONLY# +najśw +Najśw +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +Ś +Ć +Ż +Ź +Dz diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.pt b/external/nonbreaking_prefixes/nonbreaking_prefix.pt new file mode 100644 index 0000000..5d65bf2 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.pt @@ -0,0 +1,210 @@ +#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009. +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. A dot after one of these is not a sentence break in Portuguese. +I +II +III +IV +V +VI +VII +VIII +IX +X +XI +XII +XIII +XIV +XV +XVI +XVII +XVIII +XIX +XX +i +ii +iii +iv +v +vi +vii +viii +ix +x +xi +xii +xiii +xiv +xv +xvi +xvii +xviii +xix +xx + +#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks +Adj +Adm +Adv +Art +Ca +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +DRA +Dr +Dra +Dras +Drs +Eng +Enga +Engas +Engos +Ex +Exo +Exmo +Fig +Gen +Hosp +Insp +Lda +MM +MR +MRS +MS +Maj +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +Sra +Sras +Srs +Sto +Supt +Surg +adj +adm +adv +art +cit +col +con +corp +cpl +dr +dra +dras +drs +eng +enga +engas +engos +ex +exo +exmo +fig +op +prof +sr +sra +sras +srs +sto + +#misc - odd period-ending items that NEVER indicate breaks (p.m. 
does NOT fall into this category - it sometimes ends a sentence) +v +vs +i.e +rev +e.g + +#Numbers only. These should only induce breaks when followed by a numeric sequence +# add NUMERIC_ONLY after the word for this function +#This case is mostly for the english "No." which can either be a sentence of its own, or +#if followed by a number, a non-breaking prefix +No #NUMERIC_ONLY# +Nos +Art #NUMERIC_ONLY# +Nr +p #NUMERIC_ONLY# +pp #NUMERIC_ONLY# + diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.ro b/external/nonbreaking_prefixes/nonbreaking_prefix.ro new file mode 100644 index 0000000..d489f46 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.ro @@ -0,0 +1,38 @@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +dpdv +etc +șamd +M.Ap.N +dl +Dl +d-na +D-na +dvs +Dvs +pt +Pt diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.ru b/external/nonbreaking_prefixes/nonbreaking_prefix.ru new file mode 100644 index 0000000..444465b --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.ru @@ -0,0 +1,259 @@ +TBD: Russian uppercase alphabet [А-Я] +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +0гг +1гг +2гг +3гг +4гг +5гг +6гг +7гг +8гг +9гг +0г +1г +2г +3г +4г +5г +6г +7г +8г +9г +Xвв +Vвв +Iвв +Lвв +Mвв +Cвв +Xв +Vв +Iв +Lв +Mв +Cв +0м +1м +2м +3м +4м +5м +6м +7м +8м +9м +0мм +1мм +2мм +3мм +4мм +5мм +6мм +7мм +8мм +9мм +0см +1см +2см +3см +4см +5см +6см +7см +8см +9см +0дм +1дм +2дм +3дм +4дм +5дм +6дм +7дм +8дм +9дм +0л +1л +2л +3л +4л +5л +6л +7л +8л +9л +0км +1км +2км +3км +4км +5км +6км +7км +8км +9км +0га +1га +2га +3га +4га +5га +6га +7га +8га +9га +0кг +1кг +2кг +3кг +4кг +5кг +6кг +7кг +8кг +9кг +0т +1т +2т +3т +4т +5т +6т +7т +8т +9т +0г +1г +2г +3г +4г +5г +6г +7г +8г +9г +0мг +1мг +2мг +3мг +4мг +5мг +6мг +7мг +8мг +9мг +бульв +в +вв +г +га +гг +гл +гос +д +дм +доп +др +е +ед +ед +зам +и +инд +исп +Исп +к +кап +кг +кв +кл +км +кол +комн +коп +куб 
+л +лиц +лл +м +макс +мг +мин +мл +млн +млрд +мм +н +наб +нач +неуд +ном +о +обл +обр +общ +ок +ост +отл +п +пер +перераб +пл +пос +пр +просп +проф +р +ред +руб +с +сб +св +см +соч +ср +ст +стр +т +тел +Тел +тех +тт +туп +тыс +уд +ул +уч +физ +х +хор +ч +чел +шт +экз +э diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.sk b/external/nonbreaking_prefixes/nonbreaking_prefix.sk new file mode 100644 index 0000000..1198d48 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.sk @@ -0,0 +1,474 @@ +Bc +Mgr +RNDr +PharmDr +PhDr +JUDr +PaedDr +ThDr +Ing +MUDr +MDDr +MVDr +Dr +ThLic +PhD +ArtD +ThDr +Dr +DrSc +CSs +prof +obr +Obr +Č +č +absol +adj +admin +adr +Adr +adv +advok +afr +ak +akad +akc +akuz +et +al +alch +amer +anat +angl +Angl +anglosas +anorg +ap +apod +arch +archeol +archit +arg +art +astr +astrol +astron +atp +atď +austr +Austr +aut +belg +Belg +bibl +Bibl +biol +bot +bud +bás +býv +cest +chem +cirk +csl +čs +Čs +dat +dep +det +dial +diaľ +dipl +distrib +dokl +dosl +dopr +dram +duš +dv +dvojčl +dór +ekol +ekon +el +elektr +elektrotech +energet +epic +est +etc +etonym +eufem +európ +Európ +ev +evid +expr +fa +fam +farm +fem +feud +fil +filat +filoz +fi +fon +form +fot +fr +Fr +franc +Franc +fraz +fut +fyz +fyziol +garb +gen +genet +genpor +geod +geogr +geol +geom +germ +gr +Gr +gréc +Gréc +gréckokat +hebr +herald +hist +hlav +hosp +hromad +hud +hypok +ident +i.e +ident +imp +impf +indoeur +inf +inform +instr +int +interj +inšt +inštr +iron +jap +Jap +jaz +jedn +juhoamer +juhových +juhozáp +juž +kanad +Kanad +kanc +kapit +kpt +kart +katastr +knih +kniž +komp +konj +konkr +kozmet +krajč +kresť +kt +kuch +lat +latinskoamer +lek +lex +lingv +lit +litur +log +lok +max +Max +maď +Maď +medzinár +mest +metr +mil +Mil +min +Min +miner +ml +mld +mn +mod +mytol +napr +nar +Nar +nasl +nedok +neg +negat +neklas +nem +Nem +neodb +neos +neskl +nesklon +nespis +nespráv +neved +než +niekt +niž +nom +náb +nákl +námor +nár +obch +obj +obv +obyč +obč 
+občian +odb +odd +ods +ojed +okr +Okr +opt +opyt +org +os +osob +ot +ovoc +par +part +pejor +pers +pf +Pf +P.f +p.f +pl +Plk +pod +podst +pokl +polit +politol +polygr +pomn +popl +por +porad +porov +posch +potrav +použ +poz +pozit +poľ +poľno +poľnohosp +poľov +pošt +pož +prac +predl +pren +prep +preuk +priezv +Priezv +privl +prof +práv +príd +príj +prík +príp +prír +prísl +príslov +príč +psych +publ +pís +písm +pôv +refl +reg +rep +resp +rozk +rozlič +rozpráv +roč +Roč +ryb +rádiotech +rím +samohl +semest +sev +severoamer +severových +severozáp +sg +skr +skup +sl +Sloven +soc +soch +sociol +sp +spol +Spol +spoloč +spoluhl +správ +spôs +st +star +starogréc +starorím +s.r.o +stol +stor +str +stredoamer +stredoškol +subj +subst +superl +sv +sz +súkr +súp +súvzť +tal +Tal +tech +tel +Tel +telef +teles +telev +teol +trans +turist +tuzem +typogr +tzn +tzv +ukaz +ul +Ul +umel +univ +ust +ved +vedľ +verb +veter +vin +viď +vl +vod +vodohosp +pnl +vulg +vyj +vys +vysokoškol +vzťaž +vôb +vých +výd +výrob +výsk +výsl +výtv +výtvar +význ +včel +vš +všeob +zahr +zar +zariad +zast +zastar +zastaráv +zb +zdravot +združ +zjemn +zlat +zn +Zn +zool +zr +zried +zv +záhr +zák +zákl +zám +záp +západoeur +zázn +územ +účt +čast +čes +Čes +čl +čísl +živ +pr +fak +Kr +p.n.l +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git a/external/nonbreaking_prefixes/nonbreaking_prefix.sl b/external/nonbreaking_prefixes/nonbreaking_prefix.sl new file mode 100644 index 0000000..230062c --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.sl @@ -0,0 +1,78 @@ +dr +Dr +itd +itn +št #NUMERIC_ONLY# +Št #NUMERIC_ONLY# +d +jan +Jan +feb +Feb +mar +Mar +apr +Apr +jun +Jun +jul +Jul +avg +Avg +sept +Sept +sep +Sep +okt +Okt +nov +Nov +dec +Dec +tj +Tj +npr +Npr +sl +Sl +op +Op +gl +Gl +oz +Oz +prev +dipl +ing +prim +Prim +cf +Cf +gl +Gl +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z diff --git 
a/external/nonbreaking_prefixes/nonbreaking_prefix.sv b/external/nonbreaking_prefixes/nonbreaking_prefix.sv new file mode 100644 index 0000000..df5ef29 --- /dev/null +++ b/external/nonbreaking_prefixes/nonbreaking_prefix.sv @@ -0,0 +1,46 @@ +#single upper case letter are usually initials +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +#misc abbreviations +AB +G +VG +dvs +etc +from +iaf +jfr +kl +kr +mao +mfl +mm +osv +pga +tex +tom +vs diff --git a/external/tokenizer-no-escape.perl b/external/tokenizer-no-escape.perl new file mode 100755 index 0000000..4397360 --- /dev/null +++ b/external/tokenizer-no-escape.perl @@ -0,0 +1,348 @@ +#!/usr/bin/perl -w + +# Sample Tokenizer +### Version 1.1 +# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn +# Version 1.1 updates: +# (1) add multithreading option "-threads NUM_THREADS" (default is 1); +# (2) add a timing option "-time" to calculate the average speed of this tokenizer; +# (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed); +### Version 1.0 +# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $ +# written by Josh Schroeder, based on code by Philipp Koehn + +binmode(STDIN, ":utf8"); +binmode(STDOUT, ":utf8"); + +use FindBin qw($RealBin); +use strict; +use Time::HiRes; +#use Thread; + +my $mydir = "$RealBin/nonbreaking_prefixes"; + +my %NONBREAKING_PREFIX = (); +my $language = "en"; +my $QUIET = 0; +my $HELP = 0; +my $AGGRESSIVE = 0; +my $SKIP_XML = 0; +my $TIMING = 0; +my $NUM_THREADS = 1; +my $NUM_SENTENCES_PER_THREAD = 2000; + +while (@ARGV) +{ + $_ = shift; + /^-b$/ && ($| = 1, next); + /^-l$/ && ($language = shift, next); + /^-q$/ && ($QUIET = 1, next); + /^-h$/ && ($HELP = 1, next); + /^-x$/ && ($SKIP_XML = 1, next); + /^-a$/ && ($AGGRESSIVE = 
1, next); + /^-time$/ && ($TIMING = 1, next); + /^-threads$/ && ($NUM_THREADS = int(shift), next); + /^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next); +} + +# for time calculation +my $start_time; +if ($TIMING) +{ + $start_time = [ Time::HiRes::gettimeofday( ) ]; +} + +# print help message +if ($HELP) +{ + print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n"; + print "Options:\n"; + print " -q ... quiet.\n"; + print " -a ... aggressive hyphen splitting.\n"; + print " -b ... disable Perl buffering.\n"; + print " -time ... enable processing time calculation.\n"; + exit; +} + +if (!$QUIET) +{ + print STDERR "Tokenizer Version 1.1\n"; + print STDERR "Language: $language\n"; + print STDERR "Number of threads: $NUM_THREADS\n"; +} + +# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes +load_prefixes($language,\%NONBREAKING_PREFIX); + +if (scalar(%NONBREAKING_PREFIX) eq 0) +{ + print STDERR "Warning: No known abbreviations for language '$language'\n"; +} + +my @batch_sentences = (); +my @thread_list = (); +my $count_sentences = 0; + +if ($NUM_THREADS > 1) +{# multi-threading tokenization + while() + { + $count_sentences = $count_sentences + 1; + push(@batch_sentences, $_); + if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS)) + { + # assign each thread work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + # reset for the new run + @thread_list = (); + @batch_sentences = (); + } + } + # the last batch + if (scalar(@batch_sentences)>0) + { + # assign each thread 
work + for (my $i=0; $i<$NUM_THREADS; $i++) + { + my $start_index = $i*$NUM_SENTENCES_PER_THREAD; + if ($start_index >= scalar(@batch_sentences)) + { + last; + } + my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1; + if ($end_index >= scalar(@batch_sentences)) + { + $end_index = scalar(@batch_sentences)-1; + } + my @subbatch_sentences = @batch_sentences[$start_index..$end_index]; + my $new_thread = new Thread \&tokenize_batch, @subbatch_sentences; + push(@thread_list, $new_thread); + } + foreach (@thread_list) + { + my $tokenized_list = $_->join; + foreach (@$tokenized_list) + { + print $_; + } + } + } +} +else +{# single thread only + while() + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + print $_; + } + else + { + print &tokenize($_); + } + } +} + +if ($TIMING) +{ + my $duration = Time::HiRes::tv_interval( $start_time ); + print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n"); + print STDERR ("TOKENIZATION SPEED: ".($duration/$count_sentences*1000)." 
milliseconds/line\n"); +} + +##################################################################################### +# subroutines afterward + +# tokenize a batch of texts saved in an array +# input: an array containing a batch of texts +# return: another array cotaining a batch of tokenized texts for the input array +sub tokenize_batch +{ + my(@text_list) = @_; + my(@tokenized_list) = (); + foreach (@text_list) + { + if (($SKIP_XML && /^<.+>$/) || /^\s*$/) + { + #don't try to tokenize XML/HTML tag lines + push(@tokenized_list, $_); + } + else + { + push(@tokenized_list, &tokenize($_)); + } + } + return \@tokenized_list; +} + +# the actual tokenize function which tokenizes one input string +# input: one string +# return: the tokenized string for the input string +sub tokenize +{ + my($text) = @_; + chomp($text); + $text = " $text "; + + # remove ASCII junk + $text =~ s/\s+/ /g; + $text =~ s/[\000-\037]//g; + + # seperate out all "other" special characters + $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g; + + # aggressive hyphen splitting + if ($AGGRESSIVE) + { + $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g; + } + + #multi-dots stay together + $text =~ s/\.([\.]+)/ DOTMULTI$1/g; + while($text =~ /DOTMULTI\./) + { + $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g; + $text =~ s/DOTMULTI\./DOTDOTMULTI/g; + } + + # seperate out "," except if within numbers (5,300) + $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + # separate , pre and post number + $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g; + $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g; + + # turn `into ' + $text =~ s/\`/\'/g; + + #turn '' into " + $text =~ s/\'\'/ \" /g; + + if ($language eq "en") + { + #split contractions right + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g; + #special case for "1990's" + 
$text =~ s/([\p{IsN}])[']([s])/$1 '$2/g; + } + elsif (($language eq "fr") or ($language eq "it")) + { + #split contractions left + $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g; + $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g; + } + else + { + $text =~ s/\'/ \' /g; + } + + #word token method + my @words = split(/\s/,$text); + $text = ""; + for (my $i=0;$i<(scalar(@words));$i++) + { + my $word = $words[$i]; + if ( $word =~ /^(\S+)\.$/) + { + my $pre = $1; + if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i/\>/g; # xml + #$text =~ s/\'/\'/g; # xml + #$text =~ s/\"/\"/g; # xml + #$text =~ s/\[/\[/g; # syntax non-terminal + #$text =~ s/\]/\]/g; # syntax non-terminal + + #ensure final line break + $text .= "\n" unless $text =~ /\n$/; + + return $text; +} + +sub load_prefixes +{ + my ($language, $PREFIX_REF) = @_; + + my $prefixfile = "$mydir/nonbreaking_prefix.$language"; + + #default back to English if we don't have a language-specific prefix file + if (!(-e $prefixfile)) + { + $prefixfile = "$mydir/nonbreaking_prefix.en"; + print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n"; + die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile); + } + + if (-e "$prefixfile") + { + open(PREFIX, "<:utf8", "$prefixfile"); + while () + { + my $item = $_; + chomp($item); + if (($item) && (substr($item,0,1) ne "#")) + { + if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/) + { + $PREFIX_REF->{$1} = 2; + } + else + { + $PREFIX_REF->{$item} = 1; + } + } + } + close(PREFIX); + } +} + diff --git a/external/truecase.perl b/external/truecase.perl new file mode 100755 index 0000000..0a4d366 --- /dev/null +++ b/external/truecase.perl @@ -0,0 +1,104 @@ +#!/usr/bin/perl -w + +# $Id: train-recaser.perl 1326 2007-03-26 
# 05:44:27Z bojar $
# (The line above is the wrapped tail of the "$Id: train-recaser.perl" keyword
# comment that precedes this script.)
#
# truecase.perl -- restore the most likely casing of each token read from
# STDIN, using a truecasing model file, and write the result to STDOUT.
#
# Usage: truecase.perl --model MODEL [-b] < in > out
#   --model MODEL      path to the truecasing model (required)
#   -b, --unbuffered   disable output buffering
use strict;
use warnings;
use Getopt::Long "GetOptions";

# Tokens that end a sentence: the token that follows is treated as a
# sentence start.
my %SENTENCE_END = ( "." => 1, ":" => 1, "?" => 1, "!" => 1 );

# Tokens that do not cancel a pending sentence start (opening brackets and
# quotes), so that the first real word after them is still treated as the
# start of the sentence.
# NOTE(review): the last four keys are the entity-escaped spellings of
# ' " [ ] as produced by the escaping tokenizer; the scraped text showed them
# already decoded (leaving an unparseable """ literal), so they are restored
# here -- confirm against the upstream script.
my %DELAYED_SENTENCE_START = (
    "("      => 1,
    "["      => 1,
    "\""     => 1,
    "'"      => 1,
    "&apos;" => 1,
    "&quot;" => 1,
    "&#91;"  => 1,
    "&#93;"  => 1,
);

# Run the command-line program only when executed as a script, so that the
# subroutines below can be loaded (require/do) without touching STDIN.
main() unless caller;

# Entry point: parse switches, load the model, truecase STDIN line by line.
sub main {
    binmode( STDIN,  ":utf8" );
    binmode( STDOUT, ":utf8" );

    # apply switches
    my ( $MODEL, $UNBUFFERED );
    die("truecase.perl --model MODEL [-b] < in > out")
        unless GetOptions( 'model=s' => \$MODEL, 'b|unbuffered' => \$UNBUFFERED )
        && defined($MODEL);
    $| = 1 if $UNBUFFERED;

    my ( $best, $known ) = load_model($MODEL);

    while ( my $line = <STDIN> ) {

        # chomp, not chop: a final line without a trailing newline must not
        # lose its last character.
        chomp $line;
        print truecase_line( $line, $best, $known ), "\n";
    }
    return;
}

# Read the truecasing model.
#   input:  path of the model file (UTF-8 text)
#   return: (\%best, \%known) where
#           $best{ lc $word } is the casing stored for that lowercased form
#           (the first field of its model line), and
#           $known{ $form } is 1 for every casing listed in the model.
#   dies when the file cannot be opened.
sub load_model {
    my ($path) = @_;
    my ( %best, %known );

    open( my $fh, "<:utf8", $path )
        or die("ERROR: could not open '$path': $!");
    while ( my $entry = <$fh> ) {
        my ( $word, @options ) = split( ' ', $entry );
        next unless defined $word;    # skip blank lines
        $best{ lc($word) } = $word;
        $known{$word} = 1;

        # Only the odd-indexed fields are recorded as known casings.
        # Presumably the even-indexed ones are counts such as "(12/20)" --
        # TODO confirm against the model writer.
        for ( my $i = 1 ; $i < $#options ; $i += 2 ) {
            $known{ $options[$i] } = 1;
        }
    }
    close($fh);

    return ( \%best, \%known );
}

# Truecase one input line.
#   input:  the line (without trailing newline), \%best and \%known as
#           returned by load_model
#   return: the truecased line (without trailing newline); XML markup and
#           any "|factor" suffixes of a token are passed through unchanged.
sub truecase_line {
    my ( $line, $best, $known ) = @_;

    my ( $WORD, $MARKUP ) = split_xml($line);
    my $out            = "";
    my $sentence_start = 1;

    for my $i ( 0 .. $#$WORD ) {

        # Markup strings carry their own trailing space; add a separator
        # only when there is no markup in front of this word.
        $out .= " " if $i && $$MARKUP[$i] eq '';
        $out .= $$MARKUP[$i];

        # Split "word|factor|..." into the surface word and the rest.
        my ( $word, $otherfactors );
        if ( $$WORD[$i] =~ /^([^\|]+)(.*)/ ) {
            $word         = $1;
            $otherfactors = $2;
        }
        else {
            $word         = $$WORD[$i];
            $otherfactors = "";
        }

        if ( $sentence_start && defined( $best->{ lc($word) } ) ) {
            $out .= $best->{ lc($word) };    # truecase sentence start
        }
        elsif ( defined( $known->{$word} ) ) {
            $out .= $word;                   # don't change known words
        }
        elsif ( defined( $best->{ lc($word) } ) ) {
            $out .= $best->{ lc($word) };    # truecase otherwise unknown words
        }
        else {
            $out .= $word;                   # unknown, nothing to do
        }
        $out .= $otherfactors;

        if    ( defined( $SENTENCE_END{$word} ) )            { $sentence_start = 1; }
        elsif ( !defined( $DELAYED_SENTENCE_START{$word} ) ) { $sentence_start = 0; }
    }

    # Markup that follows the last word (always present, possibly empty).
    $out .= $$MARKUP[$#$MARKUP];
    return $out;
}

# store away xml markup
#   input:  one line of text
#   return: (\@WORD, \@MARKUP); $MARKUP[$i] holds the tags that precede
#           $WORD[$i] (each tag followed by one space), and the final
#           element of @MARKUP holds the tags after the last word, with the
#           trailing space removed.
sub split_xml {
    my ($line) = @_;
    my ( @WORD, @MARKUP );
    my $i = 0;
    $MARKUP[0] = "";
    while ( $line =~ /\S/ ) {

        # XML tag
        if ( $line =~ /^\s*(<\S[^>]*>)(.*)$/ ) {
            $MARKUP[$i] .= $1 . " ";
            $line = $2;
        }

        # non-XML text
        elsif ( $line =~ /^\s*([^\s<>]+)(.*)$/ ) {
            $WORD[ $i++ ] = $1;
            $MARKUP[$i] = "";
            $line = $2;
        }

        # '<' or '>' occurs in word, but it's not an XML tag
        elsif ( $line =~ /^\s*(\S+)(.*)$/ ) {
            $WORD[ $i++ ] = $1;
            $MARKUP[$i] = "";
            $line = $2;
        }
        else {
            die("ERROR: huh? $line\n");
        }
    }

    # A non-empty final markup string ends with the space appended above;
    # drop it (chop on an empty string is a no-op).
    chop( $MARKUP[$#MARKUP] );
    return ( \@WORD, \@MARKUP );
}

1;

# -- cgit v1.2.3