summaryrefslogtreecommitdiff
path: root/nonbreaking_prefixes/nonbreaking_prefix.de
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-06-14 14:43:14 +0200
committerPatrick Simianer <p@simianer.de>2014-06-14 14:43:14 +0200
commit2783f837303ae07c4a1d676302bca779abbb1296 (patch)
treee388dda12d6d31285b32663b937a8d55ecc909c5 /nonbreaking_prefixes/nonbreaking_prefix.de
parent85ea0fc5e3ae7ea646cc6e843d01939b4d8e4dbf (diff)
steal tokenizer from moses' scripts
Diffstat (limited to 'nonbreaking_prefixes/nonbreaking_prefix.de')
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.de325
1 files changed, 325 insertions, 0 deletions
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking_prefixes/nonbreaking_prefix.de
new file mode 100644
index 0000000..35fdf5e
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99