summaryrefslogtreecommitdiff
path: root/nonbreaking_prefixes
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-06-14 14:43:14 +0200
committerPatrick Simianer <p@simianer.de>2014-06-14 14:43:14 +0200
commit2783f837303ae07c4a1d676302bca779abbb1296 (patch)
treee388dda12d6d31285b32663b937a8d55ecc909c5 /nonbreaking_prefixes
parent85ea0fc5e3ae7ea646cc6e843d01939b4d8e4dbf (diff)
steal tokenizer from moses' scripts
Diffstat (limited to 'nonbreaking_prefixes')
-rw-r--r--nonbreaking_prefixes/README.txt5
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.ca75
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.cs390
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.de325
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.el2
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.en107
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.es118
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.fr153
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.is251
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.it180
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.nl115
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.pl283
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.pt210
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.ro38
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.ru259
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.sk474
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.sl78
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.sv46
18 files changed, 3109 insertions, 0 deletions
diff --git a/nonbreaking_prefixes/README.txt b/nonbreaking_prefixes/README.txt
new file mode 100644
index 0000000..02cdfcc
--- /dev/null
+++ b/nonbreaking_prefixes/README.txt
@@ -0,0 +1,5 @@
+The language suffix can be found here:
+
+http://www.loc.gov/standards/iso639-2/php/code_list.php
+
+
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ca b/nonbreaking_prefixes/nonbreaking_prefix.ca
new file mode 100644
index 0000000..2f4fdfc
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.ca
@@ -0,0 +1,75 @@
+Dr
+Dra
+pàg
+p
+c
+av
+Sr
+Sra
+adm
+esq
+Prof
+S.A
+S.L
+p.e
+ptes
+Sta
+St
+pl
+màx
+cast
+dir
+nre
+fra
+admdora
+Emm
+Excma
+espf
+dc
+admdor
+tel
+angl
+aprox
+ca
+dept
+dj
+dl
+dt
+ds
+dg
+dv
+ed
+entl
+al
+i.e
+maj
+smin
+n
+núm
+pta
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.cs b/nonbreaking_prefixes/nonbreaking_prefix.cs
new file mode 100644
index 0000000..dce6167
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.cs
@@ -0,0 +1,390 @@
+Bc
+BcA
+Ing
+Ing.arch
+MUDr
+MVDr
+MgA
+Mgr
+JUDr
+PhDr
+RNDr
+PharmDr
+ThLic
+ThDr
+Ph.D
+Th.D
+prof
+doc
+CSc
+DrSc
+dr. h. c
+PaedDr
+Dr
+PhMr
+DiS
+abt
+ad
+a.i
+aj
+angl
+anon
+apod
+atd
+atp
+aut
+bd
+biogr
+b.m
+b.p
+b.r
+cca
+cit
+cizojaz
+c.k
+col
+čes
+čín
+čj
+ed
+facs
+fasc
+fol
+fot
+franc
+h.c
+hist
+hl
+hrsg
+ibid
+il
+ind
+inv.č
+jap
+jhdt
+jv
+koed
+kol
+korej
+kl
+krit
+lat
+lit
+m.a
+maď
+mj
+mp
+násl
+např
+nepubl
+něm
+no
+nr
+n.s
+okr
+odd
+odp
+obr
+opr
+orig
+phil
+pl
+pokrač
+pol
+port
+pozn
+př.kr
+př.n.l
+přel
+přeprac
+příl
+pseud
+pt
+red
+repr
+resp
+revid
+rkp
+roč
+roz
+rozš
+samost
+sect
+sest
+seš
+sign
+sl
+srv
+stol
+sv
+šk
+šk.ro
+špan
+tab
+t.č
+tis
+tj
+tř
+tzv
+univ
+uspoř
+vol
+vl.jm
+vs
+vyd
+vyobr
+zal
+zejm
+zkr
+zprac
+zvl
+n.p
+např
+než
+MUDr
+abl
+absol
+adj
+adv
+ak
+ak. sl
+akt
+alch
+amer
+anat
+angl
+anglosas
+arab
+arch
+archit
+arg
+astr
+astrol
+att
+bás
+belg
+bibl
+biol
+boh
+bot
+bulh
+círk
+csl
+čas
+čes
+dat
+děj
+dep
+dět
+dial
+dór
+dopr
+dosl
+ekon
+epic
+etnonym
+eufem
+f
+fam
+fem
+fil
+film
+form
+fot
+fr
+fut
+fyz
+gen
+geogr
+geol
+geom
+germ
+gram
+hebr
+herald
+hist
+hl
+hovor
+hud
+hut
+chcsl
+chem
+ie
+imp
+impf
+ind
+indoevr
+inf
+instr
+interj
+ión
+iron
+it
+kanad
+katalán
+klas
+kniž
+komp
+konj
+
+konkr
+kř
+kuch
+lat
+lék
+les
+lid
+lit
+liturg
+lok
+log
+m
+mat
+meteor
+metr
+mod
+ms
+mysl
+n
+náb
+námoř
+neklas
+něm
+nesklon
+nom
+ob
+obch
+obyč
+ojed
+opt
+part
+pas
+pejor
+pers
+pf
+pl
+plpf
+
+práv
+prep
+předl
+přivl
+r
+rcsl
+refl
+reg
+rkp
+řec
+s
+samohl
+sg
+sl
+souhl
+spec
+srov
+stfr
+střv
+stsl
+subj
+subst
+superl
+sv
+sz
+táz
+tech
+telev
+teol
+trans
+typogr
+var
+vedl
+verb
+vl. jm
+voj
+vok
+vůb
+vulg
+výtv
+vztaž
+zahr
+zájm
+zast
+zejm
+
+zeměd
+zkr
+zř
+mj
+dl
+atp
+sport
+Mgr
+horn
+MVDr
+JUDr
+RSDr
+Bc
+PhDr
+ThDr
+Ing
+aj
+apod
+PharmDr
+pomn
+ev
+slang
+nprap
+odp
+dop
+pol
+st
+stol
+p. n. l
+před n. l
+n. l
+př. Kr
+po Kr
+př. n. l
+odd
+RNDr
+tzv
+atd
+tzn
+resp
+tj
+p
+br
+č. j
+čj
+č. p
+čp
+a. s
+s. r. o
+spol. s r. o
+p. o
+s. p
+v. o. s
+k. s
+o. p. s
+o. s
+v. r
+v z
+ml
+vč
+kr
+mld
+hod
+popř
+ap
+event
+rus
+slov
+rum
+švýc
+P. T
+zvl
+hor
+dol
+S.O.S
\ No newline at end of file
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking_prefixes/nonbreaking_prefix.de
new file mode 100644
index 0000000..35fdf5e
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no German words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.el b/nonbreaking_prefixes/nonbreaking_prefix.el
new file mode 100644
index 0000000..0470f91
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.el
@@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.en b/nonbreaking_prefixes/nonbreaking_prefix.en
new file mode 100644
index 0000000..e1a3733
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.en
@@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are only treated as non-breaking when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.es b/nonbreaking_prefixes/nonbreaking_prefix.es
new file mode 100644
index 0000000..d8b2755
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.es
@@ -0,0 +1,118 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
+
+A.C
+Apdo
+Av
+Bco
+CC.AA
+Da
+Dep
+Dn
+Dr
+Dra
+EE.UU
+Excmo
+FF.CC
+Fil
+Gral
+J.C
+Let
+Lic
+N.B
+P.D
+P.V.P
+Prof
+Pts
+Rte
+S.A
+S.A.R
+S.E
+S.L
+S.R.C
+Sr
+Sra
+Srta
+Sta
+Sto
+T.V.E
+Tel
+Ud
+Uds
+V.B
+V.E
+Vd
+Vds
+a/c
+adj
+admón
+afmo
+apdo
+av
+c
+c.f
+c.g
+cap
+cm
+cta
+dcha
+doc
+ej
+entlo
+esq
+etc
+f.c
+gr
+grs
+izq
+kg
+km
+mg
+mm
+núm
+núm
+p
+p.a
+p.ej
+ptas
+pág
+págs
+pág
+págs
+q.e.g.e
+q.e.s.m
+s
+s.s.s
+vid
+vol
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.fr b/nonbreaking_prefixes/nonbreaking_prefix.fr
new file mode 100644
index 0000000..28126fa
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.fr
@@ -0,0 +1,153 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#
+#any single upper case letter followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+#no French words end in single lower-case letters, so we throw those in too?
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+# Period-final abbreviation list for French
+A.C.N
+A.M
+art
+ann
+apr
+av
+auj
+lib
+B.P
+boul
+ca
+c.-à-d
+cf
+ch.-l
+chap
+contr
+C.P.I
+C.Q.F.D
+C.N
+C.N.S
+C.S
+dir
+éd
+e.g
+env
+al
+etc
+E.V
+ex
+fasc
+fém
+fig
+fr
+hab
+ibid
+id
+i.e
+inf
+LL.AA
+LL.AA.II
+LL.AA.RR
+LL.AA.SS
+L.D
+LL.EE
+LL.MM
+LL.MM.II.RR
+loc.cit
+masc
+MM
+ms
+N.B
+N.D.A
+N.D.L.R
+N.D.T
+n/réf
+NN.SS
+N.S
+N.D
+N.P.A.I
+p.c.c
+pl
+pp
+p.ex
+p.j
+P.S
+R.A.S
+R.-V
+R.P
+R.I.P
+SS
+S.S
+S.A
+S.A.I
+S.A.R
+S.A.S
+S.E
+sec
+sect
+sing
+S.M
+S.M.I.R
+sq
+sqq
+suiv
+sup
+suppl
+tél
+T.S.V.P
+vb
+vol
+vs
+X.O
+Z.I
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.is b/nonbreaking_prefixes/nonbreaking_prefix.is
new file mode 100644
index 0000000..5b8a710
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.is
@@ -0,0 +1,251 @@
+no #NUMERIC_ONLY#
+No #NUMERIC_ONLY#
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+nR #NUMERIC_ONLY#
+NR #NUMERIC_ONLY#
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+^
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+ab.fn
+a.fn
+afs
+al
+alm
+alg
+andh
+ath
+aths
+atr
+ao
+au
+aukaf
+áfn
+áhrl.s
+áhrs
+ákv.gr
+ákv
+bh
+bls
+dr
+e.Kr
+et
+ef
+efn
+ennfr
+eink
+end
+e.st
+erl
+fél
+fskj
+fh
+f.hl
+físl
+fl
+fn
+fo
+forl
+frb
+frl
+frh
+frt
+fsl
+fsh
+fs
+fsk
+fst
+f.Kr
+ft
+fv
+fyrrn
+fyrrv
+germ
+gm
+gr
+hdl
+hdr
+hf
+hl
+hlsk
+hljsk
+hljv
+hljóðv
+hr
+hv
+hvk
+holl
+Hos
+höf
+hk
+hrl
+ísl
+kaf
+kap
+Khöfn
+kk
+kg
+kk
+km
+kl
+klst
+kr
+kt
+kgúrsk
+kvk
+leturbr
+lh
+lh.nt
+lh.þt
+lo
+ltr
+mlja
+mljó
+millj
+mm
+mms
+m.fl
+miðm
+mgr
+mst
+mín
+nf
+nh
+nhm
+nl
+nk
+nmgr
+no
+núv
+nt
+o.áfr
+o.m.fl
+ohf
+o.fl
+o.s.frv
+ófn
+ób
+óákv.gr
+óákv
+pfn
+PR
+pr
+Ritstj
+Rvík
+Rvk
+samb
+samhlj
+samn
+samn
+sbr
+sek
+sérn
+sf
+sfn
+sh
+sfn
+sh
+s.hl
+sk
+skv
+sl
+sn
+so
+ss.us
+s.st
+samþ
+sbr
+shlj
+sign
+skál
+st
+st.s
+stk
+sþ
+teg
+tbl
+tfn
+tl
+tvíhlj
+tvt
+till
+to
+umr
+uh
+us
+uppl
+útg
+vb
+Vf
+vh
+vkf
+Vl
+vl
+vlf
+vmf
+8vo
+vsk
+vth
+þt
+þf
+þjs
+þgf
+þlt
+þolm
+þm
+þml
+þýð
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.it b/nonbreaking_prefixes/nonbreaking_prefix.it
new file mode 100644
index 0000000..992b9ec
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.it
@@ -0,0 +1,180 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Amn
+Arch
+Asst
+Avv
+Bart
+Bcc
+Bldg
+Brig
+Bros
+C.A.P
+C.P
+Capt
+Cc
+Cmdr
+Co
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dott
+Dr
+Drs
+Egr
+Ens
+Gen
+Geom
+Gov
+Hon
+Hosp
+Hr
+Id
+Ing
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mo
+Mons
+Mr
+Mrs
+Ms
+Msgr
+N.B
+Op
+Ord
+P.S
+P.T
+Pfc
+Ph
+Prof
+Pvt
+RP
+RSVP
+Rag
+Rep
+Reps
+Res
+Rev
+Rif
+Rt
+S.A
+S.B.F
+S.P.M
+S.p.A
+S.r.l
+Sen
+Sens
+Sfc
+Sgt
+Sig
+Sigg
+Soc
+Spett
+Sr
+St
+Supt
+Surg
+V.P
+
+# other
+a.c
+acc
+all
+banc
+c.a
+c.c.p
+c.m
+c.p
+c.s
+c.v
+corr
+dott
+e.p.c
+ecc
+es
+fatt
+gg
+int
+lett
+ogg
+on
+p.c
+p.c.c
+p.es
+p.f
+p.r
+p.v
+post
+pp
+racc
+ric
+s.n.c
+seg
+sgg
+ss
+tel
+u.s
+v.r
+v.s
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are only treated as non-breaking when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.nl b/nonbreaking_prefixes/nonbreaking_prefix.nl
new file mode 100644
index 0000000..c80c417
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.nl
@@ -0,0 +1,115 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen
+# http://nl.wikipedia.org/wiki/Aanspreekvorm
+# http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+bacc
+bc
+bgen
+c.i
+dhr
+dr
+dr.h.c
+drs
+drs
+ds
+eint
+fa
+Fa
+fam
+gen
+genm
+ing
+ir
+jhr
+jkvr
+jr
+kand
+kol
+lgen
+lkol
+Lt
+maj
+Mej
+mevr
+Mme
+mr
+mr
+Mw
+o.b.s
+plv
+prof
+ritm
+tint
+Vz
+Z.D
+Z.D.H
+Z.E
+Z.Em
+Z.H
+Z.K.H
+Z.K.M
+Z.M
+z.v
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
+a.g.v
+bijv
+bijz
+bv
+d.w.z
+e.c
+e.g
+e.k
+ev
+i.p.v
+i.s.m
+i.t.t
+i.v.m
+m.a.w
+m.b.t
+m.b.v
+m.h.o
+m.i
+m.i.v
+v.w.t
+
+#Numbers only. These are only treated as non-breaking when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY#
+Nrs
+nrs
+nr #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pl b/nonbreaking_prefixes/nonbreaking_prefix.pl
new file mode 100644
index 0000000..6b7c106
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.pl
@@ -0,0 +1,283 @@
+adw
+afr
+akad
+al
+Al
+am
+amer
+arch
+art
+Art
+artyst
+astr
+austr
+bałt
+bdb
+bł
+bm
+br
+bryg
+bryt
+centr
+ces
+chem
+chiń
+chir
+c.k
+c.o
+cyg
+cyw
+cyt
+czes
+czw
+cd
+Cd
+czyt
+ćw
+ćwicz
+daw
+dcn
+dekl
+demokr
+det
+diec
+dł
+dn
+dot
+dol
+dop
+dost
+dosł
+h.c
+ds
+dst
+duszp
+dypl
+egz
+ekol
+ekon
+elektr
+em
+ew
+fab
+farm
+fot
+fr
+gat
+gastr
+geogr
+geol
+gimn
+głęb
+gm
+godz
+górn
+gosp
+gr
+gram
+hist
+hiszp
+hr
+Hr
+hot
+id
+in
+im
+iron
+jn
+kard
+kat
+katol
+k.k
+kk
+kol
+kl
+k.p.a
+kpc
+k.p.c
+kpt
+kr
+k.r
+krak
+k.r.o
+kryt
+kult
+laic
+łac
+niem
+woj
+nb
+np
+Nb
+Np
+pol
+pow
+m.in
+pt
+ps
+Pt
+Ps
+cdn
+jw
+ryc
+rys
+Ryc
+Rys
+tj
+tzw
+Tzw
+tzn
+zob
+ang
+ub
+ul
+pw
+pn
+pl
+al
+k
+n
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+ww
+wł
+ur
+zm
+żyd
+żarg
+żyw
+wył
+bp
+bp
+wyst
+tow
+Tow
+o
+sp
+Sp
+st
+spółdz
+Spółdz
+społ
+spółgł
+stoł
+stow
+Stoł
+Stow
+zn
+zew
+zewn
+zdr
+zazw
+zast
+zaw
+zał
+zal
+zam
+zak
+zakł
+zagr
+zach
+adw
+Adw
+lek
+Lek
+med
+mec
+Mec
+doc
+Doc
+dyw
+dyr
+Dyw
+Dyr
+inż
+Inż
+mgr
+Mgr
+dh
+dr
+Dh
+Dr
+p
+P
+red
+Red
+prof
+prok
+Prof
+Prok
+hab
+płk
+Płk
+nadkom
+Nadkom
+podkom
+Podkom
+ks
+Ks
+gen
+Gen
+por
+Por
+reż
+Reż
+przyp
+Przyp
+śp
+św
+śW
+Śp
+Św
+ŚW
+szer
+Szer
+pkt #NUMERIC_ONLY#
+str #NUMERIC_ONLY#
+tab #NUMERIC_ONLY#
+Tab #NUMERIC_ONLY#
+tel
+ust #NUMERIC_ONLY#
+par #NUMERIC_ONLY#
+poz
+pok
+oo
+oO
+Oo
+OO
+r #NUMERIC_ONLY#
+l #NUMERIC_ONLY#
+s #NUMERIC_ONLY#
+najśw
+Najśw
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Dz
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pt b/nonbreaking_prefixes/nonbreaking_prefix.pt
new file mode 100644
index 0000000..5d65bf2
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.pt
@@ -0,0 +1,210 @@
+#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Art
+Ca
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+DRA
+Dr
+Dra
+Dras
+Drs
+Eng
+Enga
+Engas
+Engos
+Ex
+Exo
+Exmo
+Fig
+Gen
+Hosp
+Insp
+Lda
+MM
+MR
+MRS
+MS
+Maj
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+Sra
+Sras
+Srs
+Sto
+Supt
+Surg
+adj
+adm
+adv
+art
+cit
+col
+con
+corp
+cpl
+dr
+dra
+dras
+drs
+eng
+enga
+engas
+engos
+ex
+exo
+exmo
+fig
+op
+prof
+sr
+sra
+sras
+srs
+sto
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are only treated as non-breaking when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY#
+Nos
+Art #NUMERIC_ONLY#
+Nr
+p #NUMERIC_ONLY#
+pp #NUMERIC_ONLY#
+
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ro b/nonbreaking_prefixes/nonbreaking_prefix.ro
new file mode 100644
index 0000000..d489f46
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.ro
@@ -0,0 +1,38 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+dpdv
+etc
+șamd
+M.Ap.N
+dl
+Dl
+d-na
+D-na
+dvs
+Dvs
+pt
+Pt
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ru b/nonbreaking_prefixes/nonbreaking_prefix.ru
new file mode 100644
index 0000000..444465b
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.ru
@@ -0,0 +1,259 @@
+#TBD: Russian uppercase alphabet [А-Я]
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+0гг
+1гг
+2гг
+3гг
+4гг
+5гг
+6гг
+7гг
+8гг
+9гг
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+Xвв
+Vвв
+Iвв
+Lвв
+Mвв
+Cвв
+Xв
+Vв
+Iв
+Lв
+Mв
+Cв
+0м
+1м
+2м
+3м
+4м
+5м
+6м
+7м
+8м
+9м
+0мм
+1мм
+2мм
+3мм
+4мм
+5мм
+6мм
+7мм
+8мм
+9мм
+0см
+1см
+2см
+3см
+4см
+5см
+6см
+7см
+8см
+9см
+0дм
+1дм
+2дм
+3дм
+4дм
+5дм
+6дм
+7дм
+8дм
+9дм
+0л
+1л
+2л
+3л
+4л
+5л
+6л
+7л
+8л
+9л
+0км
+1км
+2км
+3км
+4км
+5км
+6км
+7км
+8км
+9км
+0га
+1га
+2га
+3га
+4га
+5га
+6га
+7га
+8га
+9га
+0кг
+1кг
+2кг
+3кг
+4кг
+5кг
+6кг
+7кг
+8кг
+9кг
+0т
+1т
+2т
+3т
+4т
+5т
+6т
+7т
+8т
+9т
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+0мг
+1мг
+2мг
+3мг
+4мг
+5мг
+6мг
+7мг
+8мг
+9мг
+бульв
+вв
+га
+гг
+гл
+гос
+дм
+доп
+др
+ед
+ед
+зам
+инд
+исп
+Исп
+кап
+кг
+кв
+кл
+км
+кол
+комн
+коп
+куб
+лиц
+лл
+макс
+мг
+мин
+мл
+млн
+млрд
+мм
+наб
+нач
+неуд
+ном
+обл
+обр
+общ
+ок
+ост
+отл
+п
+пер
+перераб
+пл
+пос
+пр
+просп
+проф
+ред
+руб
+сб
+св
+см
+соч
+ср
+ст
+стр
+тел
+Тел
+тех
+тт
+туп
+тыс
+уд
+ул
+уч
+физ
+хор
+чел
+шт
+экз
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sk b/nonbreaking_prefixes/nonbreaking_prefix.sk
new file mode 100644
index 0000000..1198d48
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.sk
@@ -0,0 +1,474 @@
+Bc
+Mgr
+RNDr
+PharmDr
+PhDr
+JUDr
+PaedDr
+ThDr
+Ing
+MUDr
+MDDr
+MVDr
+Dr
+ThLic
+PhD
+ArtD
+ThDr
+Dr
+DrSc
+CSs
+prof
+obr
+Obr
+absol
+adj
+admin
+adr
+Adr
+adv
+advok
+afr
+ak
+akad
+akc
+akuz
+et
+al
+alch
+amer
+anat
+angl
+Angl
+anglosas
+anorg
+ap
+apod
+arch
+archeol
+archit
+arg
+art
+astr
+astrol
+astron
+atp
+atď
+austr
+Austr
+aut
+belg
+Belg
+bibl
+Bibl
+biol
+bot
+bud
+bás
+býv
+cest
+chem
+cirk
+csl
+čs
+Čs
+dat
+dep
+det
+dial
+diaľ
+dipl
+distrib
+dokl
+dosl
+dopr
+dram
+duš
+dv
+dvojčl
+dór
+ekol
+ekon
+el
+elektr
+elektrotech
+energet
+epic
+est
+etc
+etonym
+eufem
+európ
+Európ
+ev
+evid
+expr
+fa
+fam
+farm
+fem
+feud
+fil
+filat
+filoz
+fi
+fon
+form
+fot
+fr
+Fr
+franc
+Franc
+fraz
+fut
+fyz
+fyziol
+garb
+gen
+genet
+genpor
+geod
+geogr
+geol
+geom
+germ
+gr
+Gr
+gréc
+Gréc
+gréckokat
+hebr
+herald
+hist
+hlav
+hosp
+hromad
+hud
+hypok
+ident
+i.e
+ident
+imp
+impf
+indoeur
+inf
+inform
+instr
+int
+interj
+inšt
+inštr
+iron
+jap
+Jap
+jaz
+jedn
+juhoamer
+juhových
+juhozáp
+juž
+kanad
+Kanad
+kanc
+kapit
+kpt
+kart
+katastr
+knih
+kniž
+komp
+konj
+konkr
+kozmet
+krajč
+kresť
+kt
+kuch
+lat
+latinskoamer
+lek
+lex
+lingv
+lit
+litur
+log
+lok
+max
+Max
+maď
+Maď
+medzinár
+mest
+metr
+mil
+Mil
+min
+Min
+miner
+ml
+mld
+mn
+mod
+mytol
+napr
+nar
+Nar
+nasl
+nedok
+neg
+negat
+neklas
+nem
+Nem
+neodb
+neos
+neskl
+nesklon
+nespis
+nespráv
+neved
+než
+niekt
+niž
+nom
+náb
+nákl
+námor
+nár
+obch
+obj
+obv
+obyč
+obč
+občian
+odb
+odd
+ods
+ojed
+okr
+Okr
+opt
+opyt
+org
+os
+osob
+ot
+ovoc
+par
+part
+pejor
+pers
+pf
+Pf
+P.f
+p.f
+pl
+Plk
+pod
+podst
+pokl
+polit
+politol
+polygr
+pomn
+popl
+por
+porad
+porov
+posch
+potrav
+použ
+poz
+pozit
+poľ
+poľno
+poľnohosp
+poľov
+pošt
+pož
+prac
+predl
+pren
+prep
+preuk
+priezv
+Priezv
+privl
+prof
+práv
+príd
+príj
+prík
+príp
+prír
+prísl
+príslov
+príč
+psych
+publ
+pís
+písm
+pôv
+refl
+reg
+rep
+resp
+rozk
+rozlič
+rozpráv
+roč
+Roč
+ryb
+rádiotech
+rím
+samohl
+semest
+sev
+severoamer
+severových
+severozáp
+sg
+skr
+skup
+sl
+Sloven
+soc
+soch
+sociol
+sp
+spol
+Spol
+spoloč
+spoluhl
+správ
+spôs
+st
+star
+starogréc
+starorím
+s.r.o
+stol
+stor
+str
+stredoamer
+stredoškol
+subj
+subst
+superl
+sv
+sz
+súkr
+súp
+súvzť
+tal
+Tal
+tech
+tel
+Tel
+telef
+teles
+telev
+teol
+trans
+turist
+tuzem
+typogr
+tzn
+tzv
+ukaz
+ul
+Ul
+umel
+univ
+ust
+ved
+vedľ
+verb
+veter
+vin
+viď
+vl
+vod
+vodohosp
+pnl
+vulg
+vyj
+vys
+vysokoškol
+vzťaž
+vôb
+vých
+výd
+výrob
+výsk
+výsl
+výtv
+výtvar
+význ
+včel
+vš
+všeob
+zahr
+zar
+zariad
+zast
+zastar
+zastaráv
+zb
+zdravot
+združ
+zjemn
+zlat
+zn
+Zn
+zool
+zr
+zried
+zv
+záhr
+zák
+zákl
+zám
+záp
+západoeur
+zázn
+územ
+účt
+čast
+čes
+Čes
+čl
+čísl
+živ
+pr
+fak
+Kr
+p.n.l
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sl b/nonbreaking_prefixes/nonbreaking_prefix.sl
new file mode 100644
index 0000000..230062c
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.sl
@@ -0,0 +1,78 @@
+dr
+Dr
+itd
+itn
+št #NUMERIC_ONLY#
+Št #NUMERIC_ONLY#
+d
+jan
+Jan
+feb
+Feb
+mar
+Mar
+apr
+Apr
+jun
+Jun
+jul
+Jul
+avg
+Avg
+sept
+Sept
+sep
+Sep
+okt
+Okt
+nov
+Nov
+dec
+Dec
+tj
+Tj
+npr
+Npr
+sl
+Sl
+op
+Op
+gl
+Gl
+oz
+Oz
+prev
+dipl
+ing
+prim
+Prim
+cf
+Cf
+gl
+Gl
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sv b/nonbreaking_prefixes/nonbreaking_prefix.sv
new file mode 100644
index 0000000..df5ef29
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.sv
@@ -0,0 +1,46 @@
+#single upper-case letters are usually initials
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+#misc abbreviations
+AB
+G
+VG
+dvs
+etc
+from
+iaf
+jfr
+kl
+kr
+mao
+mfl
+mm
+osv
+pga
+tex
+tom
+vs