From 2783f837303ae07c4a1d676302bca779abbb1296 Mon Sep 17 00:00:00 2001 From: Patrick Simianer Date: Sat, 14 Jun 2014 14:43:14 +0200 Subject: steal tokenizer from moses' scripts --- nonbreaking_prefixes/nonbreaking_prefix.de | 325 +++++++++++++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.de (limited to 'nonbreaking_prefixes/nonbreaking_prefix.de') diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking_prefixes/nonbreaking_prefix.de new file mode 100644 index 0000000..35fdf5e --- /dev/null +++ b/nonbreaking_prefixes/nonbreaking_prefix.de @@ -0,0 +1,325 @@ +#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker. +#Special cases are included for prefixes that ONLY appear before 0-9 numbers. + +#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in) +#usually upper case letters are initials in a name +#no german words end in single lower-case letters, so we throw those in too. +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z + + +#Roman Numerals. A dot after one of these is not a sentence break in German. +I +II +III +IV +V +VI +VII +VIII +IX +X +XI +XII +XIII +XIV +XV +XVI +XVII +XVIII +XIX +XX +i +ii +iii +iv +v +vi +vii +viii +ix +x +xi +xii +xiii +xiv +xv +xvi +xvii +xviii +xix +xx + +#Titles and Honorifics +Adj +Adm +Adv +Asst +Bart +Bldg +Brig +Bros +Capt +Cmdr +Col +Comdr +Con +Corp +Cpl +DR +Dr +Ens +Gen +Gov +Hon +Hosp +Insp +Lt +MM +MR +MRS +MS +Maj +Messrs +Mlle +Mme +Mr +Mrs +Ms +Msgr +Op +Ord +Pfc +Ph +Prof +Pvt +Rep +Reps +Res +Rev +Rt +Sen +Sens +Sfc +Sgt +Sr +St +Supt +Surg + +#Misc symbols +Mio +Mrd +bzw +v +vs +usw +d.h +z.B +u.a +etc +Mrd +MwSt +ggf +d.J +D.h +m.E +vgl +I.F +z.T +sogen +ff +u.E +g.U +g.g.A +c.-à-d +Buchst +u.s.w +sog +u.ä +Std +evtl +Zt +Chr +u.U +o.ä +Ltd +b.A +z.Zt +spp +sen +SA +k.o +jun +i.H.v +dgl +dergl +Co +zzt +usf +s.p.a +Dkr +Corp +bzgl +BSE + +#Number indicators +# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it +No +Nos +Art +Nr +pp +ca +Ca + +#Ordinals are done with . in German - "1." = "1st" in English +1 +2 +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +22 +23 +24 +25 +26 +27 +28 +29 +30 +31 +32 +33 +34 +35 +36 +37 +38 +39 +40 +41 +42 +43 +44 +45 +46 +47 +48 +49 +50 +51 +52 +53 +54 +55 +56 +57 +58 +59 +60 +61 +62 +63 +64 +65 +66 +67 +68 +69 +70 +71 +72 +73 +74 +75 +76 +77 +78 +79 +80 +81 +82 +83 +84 +85 +86 +87 +88 +89 +90 +91 +92 +93 +94 +95 +96 +97 +98 +99 -- cgit v1.2.3