From 2783f837303ae07c4a1d676302bca779abbb1296 Mon Sep 17 00:00:00 2001
From: Patrick Simianer <p@simianer.de>
Date: Sat, 14 Jun 2014 14:43:14 +0200
Subject: steal tokenizer from moses' scripts

---
 lowercase.perl                             |  10 +
 nonbreaking_prefixes/README.txt            |   5 +
 nonbreaking_prefixes/nonbreaking_prefix.ca |  75 +++++
 nonbreaking_prefixes/nonbreaking_prefix.cs | 390 ++++++++++++++++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.de | 325 ++++++++++++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.el |   2 +
 nonbreaking_prefixes/nonbreaking_prefix.en | 107 +++++++
 nonbreaking_prefixes/nonbreaking_prefix.es | 118 +++++++
 nonbreaking_prefixes/nonbreaking_prefix.fr | 153 ++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.is | 251 +++++++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.it | 180 +++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.nl | 115 +++++++
 nonbreaking_prefixes/nonbreaking_prefix.pl | 283 +++++++++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.pt | 210 +++++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.ro |  38 +++
 nonbreaking_prefixes/nonbreaking_prefix.ru | 259 ++++++++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.sk | 474 +++++++++++++++++++++++++++++
 nonbreaking_prefixes/nonbreaking_prefix.sl |  78 +++++
 nonbreaking_prefixes/nonbreaking_prefix.sv |  46 +++
 preprocess                                 |   6 +-
 tokenizer.no-escape.perl                   | 348 +++++++++++++++++++++
 21 files changed, 3472 insertions(+), 1 deletion(-)
 create mode 100755 lowercase.perl
 create mode 100644 nonbreaking_prefixes/README.txt
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ca
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.cs
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.de
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.el
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.en
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.es
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.fr
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.is
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.it
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.nl
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.pl
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.pt
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ro
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.ru
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sk
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sl
 create mode 100644 nonbreaking_prefixes/nonbreaking_prefix.sv
 create mode 100755 tokenizer.no-escape.perl
diff --git a/lowercase.perl b/lowercase.perl
new file mode 100755
index 0000000..c30e029
--- /dev/null
+++ b/lowercase.perl
@@ -0,0 +1,10 @@
+#!/usr/bin/perl -w
+# Lowercases text: every line read from STDIN is written to STDOUT through lc().
+use strict;
+
+# Put the UTF-8 layer on both streams so lc() works on characters, not raw bytes.
+binmode($_, ":utf8") for (\*STDIN, \*STDOUT);
+
+while (defined(my $line = <STDIN>)) {
+  print lc($line);
+}
diff --git a/nonbreaking_prefixes/README.txt b/nonbreaking_prefixes/README.txt
new file mode 100644
index 0000000..02cdfcc
--- /dev/null
+++ b/nonbreaking_prefixes/README.txt
@@ -0,0 +1,5 @@
+The language suffix of each nonbreaking_prefix.* file is the language's two-letter ISO 639-1 code; the codes are listed here:
+
+http://www.loc.gov/standards/iso639-2/php/code_list.php
+
+
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ca b/nonbreaking_prefixes/nonbreaking_prefix.ca
new file mode 100644
index 0000000..2f4fdfc
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.ca
@@ -0,0 +1,75 @@
+Dr
+Dra
+pàg
+p
+c
+av
+Sr
+Sra
+adm
+esq
+Prof
+S.A
+S.L
+p.e
+ptes
+Sta
+St
+pl
+màx
+cast
+dir
+nre
+fra
+admdora
+Emm
+Excma
+espf
+dc
+admdor
+tel
+angl
+aprox
+ca
+dept
+dj
+dl
+dt
+ds
+dg
+dv
+ed
+entl
+al
+i.e
+maj
+smin
+n
+núm
+pta
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.cs b/nonbreaking_prefixes/nonbreaking_prefix.cs
new file mode 100644
index 0000000..dce6167
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.cs
@@ -0,0 +1,390 @@
+Bc
+BcA
+Ing
+Ing.arch
+MUDr
+MVDr
+MgA
+Mgr
+JUDr
+PhDr
+RNDr
+PharmDr
+ThLic
+ThDr
+Ph.D
+Th.D
+prof
+doc
+CSc
+DrSc
+dr. h. c
+PaedDr
+Dr
+PhMr
+DiS
+abt
+ad
+a.i
+aj
+angl
+anon
+apod
+atd
+atp
+aut
+bd
+biogr
+b.m
+b.p
+b.r
+cca
+cit
+cizojaz
+c.k
+col
+čes
+čín
+čj
+ed
+facs
+fasc
+fol
+fot
+franc
+h.c
+hist
+hl
+hrsg
+ibid
+il
+ind
+inv.č
+jap
+jhdt
+jv
+koed
+kol
+korej
+kl
+krit
+lat
+lit
+m.a
+maď
+mj
+mp
+násl
+např
+nepubl
+něm
+no
+nr
+n.s
+okr
+odd
+odp
+obr
+opr
+orig
+phil
+pl
+pokrač
+pol
+port
+pozn
+př.kr
+př.n.l
+přel
+přeprac
+příl
+pseud
+pt
+red
+repr
+resp
+revid
+rkp
+roč
+roz
+rozš
+samost
+sect
+sest
+seš
+sign
+sl
+srv
+stol
+sv
+šk
+šk.ro
+špan
+tab
+t.č
+tis
+tj
+tř
+tzv
+univ
+uspoř
+vol
+vl.jm
+vs
+vyd
+vyobr
+zal
+zejm
+zkr
+zprac
+zvl
+n.p
+např
+než
+MUDr
+abl
+absol
+adj
+adv
+ak
+ak. sl
+akt
+alch
+amer
+anat
+angl
+anglosas
+arab
+arch
+archit
+arg
+astr
+astrol
+att
+bás
+belg
+bibl
+biol
+boh
+bot
+bulh
+círk
+csl
+č
+čas
+čes
+dat
+děj
+dep
+dět
+dial
+dór
+dopr
+dosl
+ekon
+epic
+etnonym
+eufem
+f
+fam
+fem
+fil
+film
+form
+fot
+fr
+fut
+fyz
+gen
+geogr
+geol
+geom
+germ
+gram
+hebr
+herald
+hist
+hl
+hovor
+hud
+hut
+chcsl
+chem
+ie
+imp
+impf
+ind
+indoevr
+inf
+instr
+interj
+ión
+iron
+it
+kanad
+katalán
+klas
+kniž
+komp
+konj
+ 
+konkr
+kř
+kuch
+lat
+lék
+les
+lid
+lit
+liturg
+lok
+log
+m
+mat
+meteor
+metr
+mod
+ms
+mysl
+n
+náb
+námoř
+neklas
+něm
+nesklon
+nom
+ob
+obch
+obyč
+ojed
+opt
+part
+pas
+pejor
+pers
+pf
+pl
+plpf
+ 
+práv
+prep
+předl
+přivl
+r
+rcsl
+refl
+reg
+rkp
+ř
+řec
+s
+samohl
+sg
+sl
+souhl
+spec
+srov
+stfr
+střv
+stsl
+subj
+subst
+superl
+sv
+sz
+táz
+tech
+telev
+teol
+trans
+typogr
+var
+vedl
+verb
+vl. jm
+voj
+vok
+vůb
+vulg
+výtv
+vztaž
+zahr
+zájm
+zast
+zejm
+ 
+zeměd
+zkr
+zř
+mj
+dl
+atp
+sport
+Mgr
+horn
+MVDr
+JUDr
+RSDr
+Bc
+PhDr
+ThDr
+Ing
+aj
+apod
+PharmDr
+pomn
+ev
+slang
+nprap
+odp
+dop
+pol
+st
+stol
+p. n. l
+před n. l
+n. l
+př. Kr
+po Kr
+př. n. l
+odd
+RNDr
+tzv
+atd
+tzn
+resp
+tj
+p
+br
+č. j
+čj
+č. p
+čp
+a. s
+s. r. o
+spol. s r. o
+p. o
+s. p
+v. o. s
+k. s
+o. p. s
+o. s
+v. r
+v z
+ml
+vč
+kr
+mld
+hod
+popř
+ap
+event
+rus
+slov
+rum
+švýc
+P. T
+zvl
+hor
+dol
+S.O.S
\ No newline at end of file
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.de b/nonbreaking_prefixes/nonbreaking_prefix.de
new file mode 100644
index 0000000..35fdf5e
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.de
@@ -0,0 +1,325 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+#no german words end in single lower-case letters, so we throw those in too.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in German.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#Titles and Honorifics
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Ens
+Gen
+Gov
+Hon
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#Misc symbols
+Mio
+Mrd
+bzw
+v
+vs
+usw
+d.h
+z.B
+u.a
+etc
+Mrd
+MwSt
+ggf
+d.J
+D.h
+m.E
+vgl
+I.F
+z.T
+sogen
+ff
+u.E
+g.U
+g.g.A
+c.-à-d
+Buchst
+u.s.w
+sog
+u.ä
+Std
+evtl
+Zt
+Chr
+u.U
+o.ä
+Ltd
+b.A
+z.Zt
+spp
+sen
+SA
+k.o
+jun
+i.H.v
+dgl
+dergl
+Co
+zzt
+usf
+s.p.a
+Dkr
+Corp
+bzgl
+BSE
+
+#Number indicators
+# add #NUMERIC_ONLY# after the word if it should ONLY be non-breaking when a 0-9 digit follows it
+No
+Nos
+Art
+Nr
+pp
+ca
+Ca
+
+#Ordinals are done with . in German - "1." = "1st" in English
+1
+2
+3
+4
+5
+6
+7
+8
+9
+10
+11
+12
+13
+14
+15
+16
+17
+18
+19
+20
+21
+22
+23
+24
+25
+26
+27
+28
+29
+30
+31
+32
+33
+34
+35
+36
+37
+38
+39
+40
+41
+42
+43
+44
+45
+46
+47
+48
+49
+50
+51
+52
+53
+54
+55
+56
+57
+58
+59
+60
+61
+62
+63
+64
+65
+66
+67
+68
+69
+70
+71
+72
+73
+74
+75
+76
+77
+78
+79
+80
+81
+82
+83
+84
+85
+86
+87
+88
+89
+90
+91
+92
+93
+94
+95
+96
+97
+98
+99
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.el b/nonbreaking_prefixes/nonbreaking_prefix.el
new file mode 100644
index 0000000..0470f91
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.el
@@ -0,0 +1,2 @@
+# for now, just include the Greek equivalent of "Mr."
+κ
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.en b/nonbreaking_prefixes/nonbreaking_prefix.en
new file mode 100644
index 0000000..e1a3733
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.en
@@ -0,0 +1,107 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Asst
+Bart
+Bldg
+Brig
+Bros
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dr
+Drs
+Ens
+Gen
+Gov
+Hon
+Hr
+Hosp
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mr
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+St
+Supt
+Surg
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking ONLY when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.es b/nonbreaking_prefixes/nonbreaking_prefix.es
new file mode 100644
index 0000000..d8b2755
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.es
@@ -0,0 +1,118 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
+
+A.C
+Apdo
+Av
+Bco
+CC.AA
+Da
+Dep
+Dn
+Dr
+Dra
+EE.UU
+Excmo
+FF.CC
+Fil 
+Gral
+J.C
+Let
+Lic
+N.B
+P.D
+P.V.P
+Prof
+Pts
+Rte
+S.A
+S.A.R
+S.E
+S.L
+S.R.C
+Sr
+Sra
+Srta
+Sta
+Sto
+T.V.E
+Tel
+Ud
+Uds
+V.B
+V.E
+Vd
+Vds
+a/c
+adj
+admón
+afmo
+apdo
+av
+c
+c.f
+c.g
+cap
+cm
+cta
+dcha
+doc
+ej
+entlo
+esq
+etc
+f.c
+gr 
+grs
+izq
+kg
+km
+mg
+mm
+nÃºm
+núm
+p
+p.a
+p.ej
+ptas
+pÃ¡g 
+pÃ¡gs
+pág
+págs
+q.e.g.e
+q.e.s.m
+s
+s.s.s
+vid
+vol
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.fr b/nonbreaking_prefixes/nonbreaking_prefix.fr
new file mode 100644
index 0000000..28126fa
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.fr
@@ -0,0 +1,153 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#
+#any single upper case letter  followed by a period is not a sentence ender
+#usually upper case letters are initials in a name
+#no French words end in single lower-case letters, so we throw those in too?
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+# Period-final abbreviation list for French
+A.C.N
+A.M
+art
+ann
+apr
+av
+auj
+lib
+B.P
+boul
+ca
+c.-à-d
+cf
+ch.-l
+chap
+contr
+C.P.I
+C.Q.F.D
+C.N
+C.N.S
+C.S
+dir
+éd
+e.g
+env
+al
+etc
+E.V
+ex
+fasc
+fém
+fig
+fr
+hab
+ibid
+id
+i.e
+inf
+LL.AA
+LL.AA.II
+LL.AA.RR
+LL.AA.SS
+L.D
+LL.EE
+LL.MM
+LL.MM.II.RR
+loc.cit
+masc
+MM
+ms
+N.B
+N.D.A
+N.D.L.R
+N.D.T
+n/réf
+NN.SS
+N.S
+N.D
+N.P.A.I
+p.c.c
+pl
+pp
+p.ex
+p.j
+P.S
+R.A.S
+R.-V
+R.P
+R.I.P
+SS
+S.S
+S.A
+S.A.I
+S.A.R
+S.A.S
+S.E
+sec
+sect
+sing
+S.M
+S.M.I.R
+sq
+sqq
+suiv
+sup
+suppl
+tél
+T.S.V.P
+vb
+vol
+vs
+X.O
+Z.I
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.is b/nonbreaking_prefixes/nonbreaking_prefix.is
new file mode 100644
index 0000000..5b8a710
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.is
@@ -0,0 +1,251 @@
+no #NUMERIC_ONLY#
+No #NUMERIC_ONLY#
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+nR #NUMERIC_ONLY#
+NR #NUMERIC_ONLY#
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+^
+í
+á
+ó
+æ
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+ab.fn
+a.fn
+afs
+al
+alm
+alg
+andh
+ath
+aths
+atr
+ao
+au
+aukaf
+áfn
+áhrl.s
+áhrs
+ákv.gr
+ákv
+bh
+bls
+dr
+e.Kr
+et
+ef
+efn
+ennfr
+eink
+end
+e.st
+erl
+fél
+fskj
+fh
+f.hl
+físl
+fl
+fn
+fo
+forl
+frb
+frl
+frh
+frt
+fsl
+fsh
+fs
+fsk
+fst
+f.Kr
+ft
+fv
+fyrrn
+fyrrv
+germ
+gm
+gr
+hdl
+hdr
+hf
+hl
+hlsk
+hljsk
+hljv
+hljóðv
+hr
+hv
+hvk
+holl
+Hos
+höf
+hk
+hrl
+ísl
+kaf
+kap
+Khöfn
+kk
+kg
+kk
+km
+kl
+klst
+kr
+kt
+kgúrsk
+kvk
+leturbr
+lh
+lh.nt
+lh.þt
+lo
+ltr
+mlja
+mljó
+millj
+mm
+mms
+m.fl
+miðm
+mgr
+mst
+mín
+nf
+nh
+nhm
+nl
+nk
+nmgr
+no
+núv
+nt
+o.áfr
+o.m.fl
+ohf
+o.fl
+o.s.frv
+ófn
+ób
+óákv.gr
+óákv
+pfn
+PR
+pr
+Ritstj
+Rvík
+Rvk
+samb
+samhlj
+samn
+samn
+sbr
+sek
+sérn
+sf
+sfn
+sh
+sfn
+sh
+s.hl
+sk
+skv
+sl
+sn
+so
+ss.us
+s.st
+samþ
+sbr
+shlj
+sign
+skál
+st
+st.s
+stk
+sþ
+teg
+tbl
+tfn
+tl
+tvíhlj
+tvt
+till
+to
+umr
+uh
+us
+uppl
+útg
+vb
+Vf
+vh
+vkf
+Vl
+vl
+vlf
+vmf
+8vo
+vsk
+vth
+þt
+þf
+þjs
+þgf
+þlt
+þolm
+þm
+þml
+þýð
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.it b/nonbreaking_prefixes/nonbreaking_prefix.it
new file mode 100644
index 0000000..992b9ec
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.it
@@ -0,0 +1,180 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Amn 
+Arch 
+Asst
+Avv
+Bart
+Bcc
+Bldg
+Brig
+Bros
+C.A.P
+C.P
+Capt
+Cc
+Cmdr
+Co
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+Dott
+Dr
+Drs
+Egr
+Ens
+Gen
+Geom
+Gov
+Hon
+Hosp
+Hr
+Id
+Ing
+Insp
+Lt
+MM
+MR
+MRS
+MS
+Maj
+Messrs
+Mlle
+Mme
+Mo
+Mons
+Mr
+Mrs
+Ms
+Msgr
+N.B
+Op
+Ord
+P.S
+P.T
+Pfc
+Ph
+Prof
+Pvt
+RP
+RSVP
+Rag
+Rep
+Reps
+Res
+Rev
+Rif
+Rt
+S.A
+S.B.F
+S.P.M
+S.p.A
+S.r.l
+Sen
+Sens
+Sfc
+Sgt
+Sig
+Sigg
+Soc
+Spett
+Sr
+St
+Supt
+Surg
+V.P
+
+# other
+a.c 
+acc
+all 
+banc
+c.a
+c.c.p
+c.m
+c.p
+c.s
+c.v
+corr
+dott
+e.p.c
+ecc
+es 
+fatt
+gg
+int
+lett
+ogg
+on
+p.c
+p.c.c
+p.es
+p.f
+p.r
+p.v
+post
+pp
+racc
+ric
+s.n.c
+seg
+sgg
+ss
+tel
+u.s
+v.r
+v.s
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking ONLY when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+pp #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.nl b/nonbreaking_prefixes/nonbreaking_prefix.nl
new file mode 100644
index 0000000..c80c417
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.nl
@@ -0,0 +1,115 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+#Sources: http://nl.wikipedia.org/wiki/Lijst_van_afkortingen 
+#         http://nl.wikipedia.org/wiki/Aanspreekvorm
+#         http://nl.wikipedia.org/wiki/Titulatuur_in_het_Nederlands_hoger_onderwijs
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+bacc
+bc
+bgen
+c.i
+dhr
+dr
+dr.h.c
+drs
+drs
+ds
+eint
+fa
+Fa
+fam
+gen
+genm
+ing
+ir
+jhr
+jkvr
+jr
+kand
+kol
+lgen
+lkol
+Lt
+maj
+Mej
+mevr
+Mme
+mr
+mr
+Mw
+o.b.s
+plv
+prof
+ritm
+tint
+Vz
+Z.D
+Z.D.H
+Z.E
+Z.Em
+Z.H
+Z.K.H
+Z.K.M
+Z.M
+z.v
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+#we seem to have a lot of these in dutch i.e.: i.p.v - in plaats van (in stead of) never ends a sentence
+a.g.v
+bijv
+bijz
+bv
+d.w.z
+e.c
+e.g
+e.k
+ev
+i.p.v
+i.s.m
+i.t.t
+i.v.m
+m.a.w
+m.b.t
+m.b.v
+m.h.o
+m.i
+m.i.v
+v.w.t
+
+#Numbers only. These are non-breaking ONLY when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY# 
+Nrs 
+nrs
+nr #NUMERIC_ONLY#
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pl b/nonbreaking_prefixes/nonbreaking_prefix.pl
new file mode 100644
index 0000000..6b7c106
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.pl
@@ -0,0 +1,283 @@
+adw
+afr
+akad
+al
+Al
+am
+amer
+arch
+art
+Art
+artyst
+astr
+austr
+bałt
+bdb
+bł
+bm
+br
+bryg
+bryt
+centr
+ces
+chem
+chiń
+chir
+c.k
+c.o
+cyg
+cyw
+cyt
+czes
+czw
+cd
+Cd
+czyt
+ćw
+ćwicz
+daw
+dcn
+dekl
+demokr
+det
+diec
+dł
+dn
+dot
+dol
+dop
+dost
+dosł
+h.c
+ds
+dst
+duszp
+dypl
+egz
+ekol
+ekon
+elektr
+em
+ew
+fab
+farm
+fot
+fr
+gat
+gastr
+geogr
+geol
+gimn
+głęb
+gm
+godz
+górn
+gosp
+gr
+gram
+hist
+hiszp
+hr
+Hr
+hot
+id
+in
+im
+iron
+jn
+kard
+kat
+katol
+k.k
+kk
+kol
+kl
+k.p.a
+kpc
+k.p.c
+kpt
+kr
+k.r
+krak
+k.r.o
+kryt
+kult
+laic
+łac
+niem
+woj
+nb
+np
+Nb
+Np
+pol
+pow
+m.in
+pt
+ps
+Pt
+Ps
+cdn
+jw
+ryc
+rys
+Ryc
+Rys
+tj
+tzw
+Tzw
+tzn
+zob
+ang
+ub
+ul
+pw
+pn
+pl
+al
+k
+n
+nr #NUMERIC_ONLY#
+Nr #NUMERIC_ONLY#
+ww
+wł
+ur
+zm
+żyd
+żarg
+żyw
+wył
+bp
+bp
+wyst
+tow
+Tow
+o
+sp
+Sp
+st
+spółdz
+Spółdz
+społ
+spółgł
+stoł
+stow
+Stoł
+Stow
+zn
+zew
+zewn
+zdr
+zazw
+zast
+zaw
+zał
+zal
+zam
+zak
+zakł
+zagr
+zach
+adw
+Adw
+lek
+Lek
+med
+mec
+Mec
+doc
+Doc
+dyw
+dyr
+Dyw
+Dyr
+inż
+Inż
+mgr
+Mgr
+dh
+dr
+Dh
+Dr
+p
+P
+red
+Red
+prof
+prok
+Prof
+Prok
+hab
+płk
+Płk
+nadkom
+Nadkom
+podkom
+Podkom
+ks
+Ks
+gen
+Gen
+por
+Por
+reż
+Reż
+przyp
+Przyp
+śp
+św
+śW
+Śp
+Św
+ŚW
+szer
+Szer
+pkt #NUMERIC_ONLY#
+str #NUMERIC_ONLY#
+tab #NUMERIC_ONLY#
+Tab #NUMERIC_ONLY#
+tel
+ust #NUMERIC_ONLY#
+par #NUMERIC_ONLY#
+poz
+pok
+oo
+oO
+Oo
+OO
+r #NUMERIC_ONLY#
+l #NUMERIC_ONLY#
+s #NUMERIC_ONLY#
+najśw
+Najśw
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Ś
+Ć
+Ż
+Ź
+Dz
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.pt b/nonbreaking_prefixes/nonbreaking_prefix.pt
new file mode 100644
index 0000000..5d65bf2
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.pt
@@ -0,0 +1,210 @@
+#File adapted for PT by H. Leal Fontes from the EN & DE versions published with moses-2009-04-13. Last update: 10.11.2009.
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter  followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+a
+b
+c
+d
+e
+f
+g
+h
+i
+j
+k
+l
+m
+n
+o
+p
+q
+r
+s
+t
+u
+v
+w
+x
+y
+z
+
+
+#Roman Numerals. A dot after one of these is not a sentence break in Portuguese.
+I
+II
+III
+IV
+V
+VI
+VII
+VIII
+IX
+X
+XI
+XII
+XIII
+XIV
+XV
+XVI
+XVII
+XVIII
+XIX
+XX
+i
+ii
+iii
+iv
+v
+vi
+vii
+viii
+ix
+x
+xi
+xii
+xiii
+xiv
+xv
+xvi
+xvii
+xviii
+xix
+xx
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Adj
+Adm
+Adv
+Art
+Ca
+Capt
+Cmdr
+Col
+Comdr
+Con
+Corp
+Cpl
+DR
+DRA
+Dr
+Dra
+Dras
+Drs
+Eng
+Enga
+Engas
+Engos
+Ex
+Exo
+Exmo
+Fig
+Gen
+Hosp
+Insp
+Lda
+MM
+MR
+MRS
+MS
+Maj
+Mrs
+Ms
+Msgr
+Op
+Ord
+Pfc
+Ph
+Prof
+Pvt
+Rep
+Reps
+Res
+Rev
+Rt
+Sen
+Sens
+Sfc
+Sgt
+Sr
+Sra
+Sras
+Srs
+Sto
+Supt
+Surg
+adj
+adm
+adv
+art
+cit
+col
+con
+corp
+cpl
+dr
+dra
+dras
+drs
+eng
+enga
+engas
+engos
+ex
+exo
+exmo
+fig
+op
+prof
+sr
+sra
+sras
+srs
+sto
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+v
+vs
+i.e
+rev
+e.g
+
+#Numbers only. These are non-breaking ONLY when followed by a numeric sequence
+# add #NUMERIC_ONLY# after the word for this function
+#This case is mostly for the english "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+No #NUMERIC_ONLY# 
+Nos
+Art #NUMERIC_ONLY#
+Nr
+p #NUMERIC_ONLY#
+pp #NUMERIC_ONLY#
+
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ro b/nonbreaking_prefixes/nonbreaking_prefix.ro
new file mode 100644
index 0000000..d489f46
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.ro
@@ -0,0 +1,38 @@
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+dpdv
+etc
+șamd
+M.Ap.N
+dl
+Dl
+d-na
+D-na
+dvs
+Dvs
+pt
+Pt
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.ru b/nonbreaking_prefixes/nonbreaking_prefix.ru
new file mode 100644
index 0000000..444465b
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.ru
@@ -0,0 +1,259 @@
+#TBD: Russian uppercase alphabet [А-Я]
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+0гг
+1гг
+2гг
+3гг
+4гг
+5гг
+6гг
+7гг
+8гг
+9гг
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+Xвв
+Vвв
+Iвв
+Lвв
+Mвв
+Cвв
+Xв
+Vв
+Iв
+Lв
+Mв
+Cв
+0м
+1м
+2м
+3м
+4м
+5м
+6м
+7м
+8м
+9м
+0мм
+1мм
+2мм
+3мм
+4мм
+5мм
+6мм
+7мм
+8мм
+9мм
+0см
+1см
+2см
+3см
+4см
+5см
+6см
+7см
+8см
+9см
+0дм
+1дм
+2дм
+3дм
+4дм
+5дм
+6дм
+7дм
+8дм
+9дм
+0л
+1л
+2л
+3л
+4л
+5л
+6л
+7л
+8л
+9л
+0км
+1км
+2км
+3км
+4км
+5км
+6км
+7км
+8км
+9км
+0га
+1га
+2га
+3га
+4га
+5га
+6га
+7га
+8га
+9га
+0кг
+1кг
+2кг
+3кг
+4кг
+5кг
+6кг
+7кг
+8кг
+9кг
+0т
+1т
+2т
+3т
+4т
+5т
+6т
+7т
+8т
+9т
+0г
+1г
+2г
+3г
+4г
+5г
+6г
+7г
+8г
+9г
+0мг
+1мг
+2мг
+3мг
+4мг
+5мг
+6мг
+7мг
+8мг
+9мг
+бульв
+в
+вв
+г
+га
+гг
+гл
+гос
+д
+дм
+доп
+др
+е
+ед
+ед
+зам
+и
+инд
+исп
+Исп
+к
+кап
+кг
+кв
+кл
+км
+кол
+комн
+коп
+куб
+л
+лиц
+лл
+м
+макс
+мг
+мин
+мл
+млн
+млрд
+мм
+н
+наб
+нач
+неуд
+ном
+о
+обл
+обр
+общ
+ок
+ост
+отл
+п
+пер
+перераб
+пл
+пос
+пр
+просп
+проф
+р
+ред
+руб
+с
+сб
+св
+см
+соч
+ср
+ст
+стр
+т
+тел
+Тел
+тех
+тт
+туп
+тыс
+уд
+ул
+уч
+физ
+х
+хор
+ч
+чел
+шт
+экз
+э
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sk b/nonbreaking_prefixes/nonbreaking_prefix.sk
new file mode 100644
index 0000000..1198d48
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.sk
@@ -0,0 +1,474 @@
+Bc
+Mgr
+RNDr
+PharmDr
+PhDr
+JUDr
+PaedDr
+ThDr
+Ing
+MUDr
+MDDr
+MVDr
+Dr
+ThLic
+PhD
+ArtD
+ThDr
+Dr
+DrSc
+CSs
+prof
+obr
+Obr
+Č
+č
+absol
+adj
+admin
+adr
+Adr
+adv
+advok
+afr
+ak
+akad
+akc
+akuz
+et
+al
+alch
+amer
+anat
+angl
+Angl
+anglosas
+anorg
+ap
+apod
+arch
+archeol
+archit
+arg
+art
+astr
+astrol
+astron
+atp
+atď
+austr
+Austr
+aut
+belg
+Belg
+bibl
+Bibl
+biol
+bot
+bud
+bás
+býv
+cest
+chem
+cirk
+csl
+čs
+Čs
+dat
+dep
+det
+dial
+diaľ
+dipl
+distrib
+dokl
+dosl
+dopr
+dram
+duš
+dv
+dvojčl
+dór
+ekol
+ekon
+el
+elektr
+elektrotech
+energet
+epic
+est
+etc
+etonym
+eufem
+európ
+Európ
+ev
+evid
+expr
+fa
+fam
+farm
+fem
+feud
+fil
+filat
+filoz
+fi
+fon
+form
+fot
+fr
+Fr
+franc
+Franc
+fraz
+fut
+fyz
+fyziol
+garb
+gen
+genet
+genpor
+geod
+geogr
+geol
+geom
+germ
+gr
+Gr
+gréc
+Gréc
+gréckokat
+hebr
+herald
+hist
+hlav
+hosp
+hromad
+hud
+hypok
+ident
+i.e
+ident
+imp
+impf
+indoeur
+inf
+inform
+instr
+int
+interj
+inšt
+inštr
+iron
+jap
+Jap
+jaz
+jedn
+juhoamer
+juhových
+juhozáp
+juž
+kanad
+Kanad
+kanc
+kapit
+kpt
+kart
+katastr
+knih
+kniž
+komp
+konj
+konkr
+kozmet
+krajč
+kresť
+kt
+kuch
+lat
+latinskoamer
+lek
+lex
+lingv
+lit
+litur
+log
+lok
+max
+Max
+maď
+Maď
+medzinár
+mest
+metr
+mil
+Mil
+min
+Min
+miner
+ml
+mld
+mn
+mod
+mytol
+napr
+nar
+Nar
+nasl
+nedok
+neg
+negat
+neklas
+nem
+Nem
+neodb
+neos
+neskl
+nesklon
+nespis
+nespráv
+neved
+než
+niekt
+niž
+nom
+náb
+nákl
+námor
+nár
+obch
+obj
+obv
+obyč
+obč
+občian
+odb
+odd
+ods
+ojed
+okr
+Okr
+opt
+opyt
+org
+os
+osob
+ot
+ovoc
+par
+part
+pejor
+pers
+pf
+Pf 
+P.f
+p.f
+pl
+Plk
+pod
+podst
+pokl
+polit
+politol
+polygr
+pomn
+popl
+por
+porad
+porov
+posch
+potrav
+použ
+poz
+pozit
+poľ
+poľno
+poľnohosp
+poľov
+pošt
+pož
+prac
+predl
+pren
+prep
+preuk
+priezv
+Priezv
+privl
+prof
+práv
+príd
+príj
+prík
+príp
+prír
+prísl
+príslov
+príč
+psych
+publ
+pís
+písm
+pôv
+refl
+reg
+rep
+resp
+rozk
+rozlič
+rozpráv
+roč
+Roč
+ryb
+rádiotech
+rím
+samohl
+semest
+sev
+severoamer
+severových
+severozáp
+sg
+skr
+skup
+sl
+Sloven
+soc
+soch
+sociol
+sp
+spol
+Spol
+spoloč
+spoluhl
+správ
+spôs
+st
+star
+starogréc
+starorím
+s.r.o
+stol
+stor
+str
+stredoamer
+stredoškol
+subj
+subst
+superl
+sv
+sz
+súkr
+súp
+súvzť
+tal
+Tal
+tech
+tel
+Tel
+telef
+teles
+telev
+teol
+trans
+turist
+tuzem
+typogr
+tzn
+tzv
+ukaz
+ul
+Ul
+umel
+univ
+ust
+ved
+vedľ
+verb
+veter
+vin
+viď
+vl
+vod
+vodohosp
+pnl
+vulg
+vyj
+vys
+vysokoškol
+vzťaž
+vôb
+vých
+výd
+výrob
+výsk
+výsl
+výtv
+výtvar
+význ
+včel
+vš
+všeob
+zahr
+zar
+zariad
+zast
+zastar
+zastaráv
+zb
+zdravot
+združ
+zjemn
+zlat
+zn
+Zn
+zool
+zr
+zried
+zv
+záhr
+zák
+zákl
+zám
+záp
+západoeur
+zázn
+územ
+účt
+čast
+čes
+Čes
+čl
+čísl
+živ
+pr
+fak
+Kr
+p.n.l
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sl b/nonbreaking_prefixes/nonbreaking_prefix.sl
new file mode 100644
index 0000000..230062c
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.sl
@@ -0,0 +1,78 @@
+dr
+Dr
+itd
+itn
+št #NUMERIC_ONLY#
+Št #NUMERIC_ONLY#
+d
+jan
+Jan
+feb
+Feb
+mar
+Mar
+apr
+Apr
+jun
+Jun
+jul
+Jul
+avg
+Avg
+sept
+Sept
+sep
+Sep
+okt
+Okt
+nov
+Nov
+dec
+Dec
+tj
+Tj
+npr
+Npr
+sl
+Sl
+op
+Op
+gl
+Gl
+oz
+Oz
+prev
+dipl
+ing
+prim
+Prim
+cf
+Cf
+gl
+Gl
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.sv b/nonbreaking_prefixes/nonbreaking_prefix.sv
new file mode 100644
index 0000000..df5ef29
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.sv
@@ -0,0 +1,46 @@
+#single upper case letter are usually initials
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+#misc abbreviations
+AB
+G
+VG
+dvs
+etc
+from
+iaf
+jfr
+kl
+kr
+mao
+mfl
+mm
+osv
+pga
+tex
+tom
+vs
diff --git a/preprocess b/preprocess
index b034e48..c4eeb39 100755
--- a/preprocess
+++ b/preprocess
@@ -1,5 +1,9 @@
 #!/bin/bash
 
+pushd "$(dirname "$0")" > /dev/null || exit 1  # quoted: the script path may contain spaces; stop if it cannot be entered
+P="$(pwd -P)"  # absolute, symlink-resolved directory containing this script
+popd > /dev/null
+
 LANG=$1
-/toolbox/scripts/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | /toolbox/scripts/htmlentities 2>htmlentities.$LANG.err | /toolbox/scripts/normalize_punctuation 2>normalize-punctuation.$LANG.err | /toolbox/moses/scripts/tokenizer/tokenizer.no-escape.perl -a -b -threads 1 -l $LANG 2>tokenizer.$LANG.err | /toolbox/moses/scripts/tokenizer/lowercase.perl 2>lowercase.$LANG.err
+"$P"/no_non_printables | sed "s|[-,\.]\{4,\}|...|g" | "$P"/htmlentities 2>"htmlentities.$LANG.err" | "$P"/normalize_punctuation 2>"normalize-punctuation.$LANG.err" | "$P"/tokenizer.no-escape.perl -a -b -threads 1 -l "$LANG" 2>"tokenizer.$LANG.err" | "$P"/lowercase.perl 2>"lowercase.$LANG.err"  # $P and $LANG are quoted so whitespace in them cannot split words or break the redirects
 
diff --git a/tokenizer.no-escape.perl b/tokenizer.no-escape.perl
new file mode 100755
index 0000000..4397360
--- /dev/null
+++ b/tokenizer.no-escape.perl
@@ -0,0 +1,348 @@
+#!/usr/bin/perl -w
+
+# Sample Tokenizer
+### Version 1.1
+# written by Pidong Wang, based on the code written by Josh Schroeder and Philipp Koehn
+# Version 1.1 updates:
+#       (1) add multithreading option "-threads NUM_THREADS" (default is 1);
+#       (2) add a timing option "-time" to calculate the average speed of this tokenizer;
+#       (3) add an option "-lines NUM_SENTENCES_PER_THREAD" to set the number of lines for each thread (default is 2000), and this option controls the memory amount needed: the larger this number is, the larger memory is required (the higher tokenization speed);
+### Version 1.0
+# $Id: tokenizer.perl 915 2009-08-10 08:15:49Z philipp $
+# written by Josh Schroeder, based on code by Philipp Koehn
+
+binmode(STDIN, ":utf8");   # read STDIN through the :utf8 layer
+binmode(STDOUT, ":utf8");  # write STDOUT through the :utf8 layer
+
+use FindBin qw($RealBin);
+use strict;
+use Time::HiRes;
+#use Thread;
+
+my $mydir = "$RealBin/nonbreaking_prefixes";  # prefix lists are looked up next to this script
+
+my %NONBREAKING_PREFIX = ();            # prefix => 1 or 2 (2 = #NUMERIC_ONLY#), filled by load_prefixes()
+my $language = "en";                    # -l: language code
+my $QUIET = 0;                          # -q: suppress the start-up banner on STDERR
+my $HELP = 0;                           # -h: print usage and exit
+my $AGGRESSIVE = 0;                     # -a: split hyphenated words into "a @-@ b"
+my $SKIP_XML = 0;                       # -x: pass lines of the form <...> through untokenized
+my $TIMING = 0;                         # -time: report run time on STDERR
+my $NUM_THREADS = 1;                    # -threads: number of worker threads
+my $NUM_SENTENCES_PER_THREAD = 2000;    # -lines: lines handed to each thread per batch
+
+while (@ARGV) 
+{	# each recognised switch runs its action and then "next"; any other argument is silently dropped
+	$_ = shift;
+	/^-b$/ && ($| = 1, next);	# -b: turn on autoflush ($|) for output
+	/^-l$/ && ($language = shift, next);	# -l LANG: the following argument is the language code
+	/^-q$/ && ($QUIET = 1, next);	# -q: no banner
+	/^-h$/ && ($HELP = 1, next);	# -h: usage
+	/^-x$/ && ($SKIP_XML = 1, next);	# -x: leave <...> lines alone
+	/^-a$/ && ($AGGRESSIVE = 1, next);	# -a: aggressive hyphen splitting
+	/^-time$/ && ($TIMING = 1, next);	# -time: timing report
+	/^-threads$/ && ($NUM_THREADS = int(shift), next);	# -threads N: the following argument, as an integer
+	/^-lines$/ && ($NUM_SENTENCES_PER_THREAD = int(shift), next);	# -lines N: the following argument, as an integer
+}
+
+# for time calculation: remember when processing started (only with -time)
+my $start_time;
+if ($TIMING)
+{
+    $start_time = [ Time::HiRes::gettimeofday( ) ];
+}
+
+# print help message to STDOUT and exit before any input is read
+if ($HELP) 
+{
+	print "Usage ./tokenizer.perl (-l [en|de|...]) (-threads 4) < textfile > tokenizedfile\n";
+        print "Options:\n";
+        print "  -q     ... quiet.\n";
+        print "  -a     ... aggressive hyphen splitting.\n";
+        print "  -b     ... disable Perl buffering.\n";
+        print "  -time  ... enable processing time calculation.\n";
+	exit;
+}
+
+if (!$QUIET)   # the banner goes to STDERR, so STDOUT carries tokenized text only
+{
+	print STDERR "Tokenizer Version 1.1\n";
+	print STDERR "Language: $language\n";
+	print STDERR "Number of threads: $NUM_THREADS\n";
+}
+
+# load the language-specific non-breaking prefix info from files in the directory nonbreaking_prefixes
+load_prefixes($language,\%NONBREAKING_PREFIX);
+
+if (scalar(%NONBREAKING_PREFIX) eq 0)  # a hash with no entries evaluates to 0 in scalar context
+{
+	print STDERR "Warning: No known abbreviations for language '$language'\n";
+}
+
+# Main loop: read text from STDIN line by line and print its tokenization to
+# STDOUT. With -threads N (N > 1) the lines are gathered into batches of
+# N * $NUM_SENTENCES_PER_THREAD lines that worker threads tokenize in parallel;
+# otherwise each line is tokenized as soon as it has been read.
+my @batch_sentences = ();   # lines collected for the current multi-threaded batch
+my @thread_list = ();       # worker threads of the batch being processed
+my $count_sentences = 0;    # lines read by the multi-threaded reader
+
+# Tokenize the collected batch with up to $NUM_THREADS worker threads and print
+# the results in input order. Worker $i gets the $i-th slice of
+# $NUM_SENTENCES_PER_THREAD consecutive lines; a short (final) batch simply
+# starts fewer workers and clips the last slice.
+my $run_batch = sub
+{
+    for (my $i=0; $i<$NUM_THREADS; $i++)
+    {
+        my $start_index = $i*$NUM_SENTENCES_PER_THREAD;
+        last if ($start_index >= scalar(@batch_sentences));
+        my $end_index = $start_index+$NUM_SENTENCES_PER_THREAD-1;
+        if ($end_index >= scalar(@batch_sentences))
+        {
+            $end_index = scalar(@batch_sentences)-1;
+        }
+        my @subbatch_sentences = @batch_sentences[$start_index..$end_index];
+        # scalar assignment: the thread returns its array reference as a scalar
+        my $new_thread = threads->create(\&tokenize_batch, @subbatch_sentences);
+        push(@thread_list, $new_thread);
+    }
+    foreach my $worker (@thread_list)
+    {
+        my $tokenized_list = $worker->join;
+        foreach my $tokenized (@$tokenized_list)
+        {
+            print $tokenized;
+        }
+    }
+    # reset for the next batch
+    @thread_list = ();
+    @batch_sentences = ();
+};
+
+if ($NUM_THREADS > 1)
+{# multi-threading tokenization
+    # "use Thread" is disabled at the top of this script, so the thread library
+    # is loaded here, only when it is needed; single-threaded runs therefore
+    # still work on a Perl that was built without thread support.
+    eval { require threads; 1 }
+        or die ("ERROR: -threads $NUM_THREADS requires a Perl with thread support: $@");
+
+    while(<STDIN>)
+    {
+        $count_sentences = $count_sentences + 1;
+        push(@batch_sentences, $_);
+        if (scalar(@batch_sentences)>=($NUM_SENTENCES_PER_THREAD*$NUM_THREADS))
+        {
+            $run_batch->();
+        }
+    }
+    # the last batch
+    if (scalar(@batch_sentences)>0)
+    {
+        $run_batch->();
+    }
+}
+else
+{# single thread only
+    while(<STDIN>)
+    {
+        if (($SKIP_XML && /^<.+>$/) || /^\s*$/)
+        {
+            #don't try to tokenize XML/HTML tag lines
+            print $_;
+        }
+        else
+        {
+            print &tokenize($_);
+        }
+    }
+}
+
+if ($TIMING)
+{   # $count_sentences is only advanced by the multi-threaded reader; fall back to $. (line count of the last-read handle, STDIN) so single-threaded runs do not divide by zero
+    my ($duration, $num_lines) = (Time::HiRes::tv_interval( $start_time ), $count_sentences || $. || 0);
+    print STDERR ("TOTAL EXECUTION TIME: ".$duration."\n");
+    print STDERR ("TOKENIZATION SPEED: ".($duration/$num_lines*1000)." milliseconds/line\n") if $num_lines;
+}
+
+#####################################################################################
+# subroutines afterward
+
+# tokenize a batch of texts saved in an array
+# (this is the routine handed to each worker thread in multi-threaded mode)
+# input: an array containing a batch of texts
+# return: a reference to another array containing, in the same order, the
+#         tokenized text for each element of the input array
+# Whitespace-only texts are passed through unchanged, and with -x ($SKIP_XML)
+# so are texts that start with "<" and end with ">".
+sub tokenize_batch
+{
+    my @pending = @_;
+    my @finished = ();
+
+    for my $line (@pending)
+    {
+        #don't try to tokenize XML/HTML tag lines
+        my $pass_through = ($SKIP_XML && $line =~ /^<.+>$/) || $line =~ /^\s*$/;
+        push(@finished, $pass_through ? $line : &tokenize($line));
+    }
+
+    return \@finished;
+}
+
+# the actual tokenize function which tokenizes one input string (reads $AGGRESSIVE, $language and %NONBREAKING_PREFIX)
+# input: one string (a trailing newline is removed before processing)
+# return: the tokenized string for the input string: tokens separated by single spaces, ending in "\n"
+sub tokenize 
+{
+    my($text) = @_;
+    chomp($text);
+    $text = " $text ";  # pad with spaces so the two-sided patterns below can also match at the ends
+    
+    # remove ASCII junk: squeeze whitespace runs to one space, then drop control characters
+    $text =~ s/\s+/ /g;
+    $text =~ s/[\000-\037]//g;
+
+    # separate out all "other" special characters (anything but alphanumerics, whitespace and . ' ` , -)
+    $text =~ s/([^\p{IsAlnum}\s\.\'\`\,\-])/ $1 /g;
+
+    # aggressive hyphen splitting (-a): a hyphen between two alphanumerics becomes the token @-@
+    if ($AGGRESSIVE) 
+    {
+        $text =~ s/([\p{IsAlnum}])\-([\p{IsAlnum}])/$1 \@-\@ $2/g;
+    }
+
+    #multi-dots stay together: runs of dots are rewritten to DOTMULTI/DOTDOTMULTI placeholders until "restore multi-dots" below
+    $text =~ s/\.([\.]+)/ DOTMULTI$1/g;
+    while($text =~ /DOTMULTI\./) 
+    {
+        $text =~ s/DOTMULTI\.([^\.])/DOTDOTMULTI $1/g;
+        $text =~ s/DOTMULTI\./DOTDOTMULTI/g;
+    }
+
+    # separate out "," except if within numbers (5,300)
+    $text =~ s/([^\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    # separate , pre and post number
+    $text =~ s/([\p{IsN}])[,]([^\p{IsN}])/$1 , $2/g;
+    $text =~ s/([^\p{IsN}])[,]([\p{IsN}])/$1 , $2/g;
+	      
+    # turn ` into '
+    $text =~ s/\`/\'/g;
+	
+    #turn '' into "
+    $text =~ s/\'\'/ \" /g;
+
+    if ($language eq "en") 
+    {
+        #split contractions right: between two letters the apostrophe stays with the right-hand part
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}\p{IsN}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1 '$2/g;
+        #special case for "1990's"
+        $text =~ s/([\p{IsN}])[']([s])/$1 '$2/g;
+    } 
+    elsif (($language eq "fr") or ($language eq "it")) 
+    {
+        #split contractions left: between two letters the apostrophe stays with the left-hand part
+        $text =~ s/([^\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([^\p{IsAlpha}])[']([\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([^\p{IsAlpha}])/$1 ' $2/g;
+        $text =~ s/([\p{IsAlpha}])[']([\p{IsAlpha}])/$1' $2/g;
+    } 
+    else 
+    {
+        $text =~ s/\'/ \' /g;  # every other language: the apostrophe always becomes a token of its own
+    }
+	
+    #word token method: decide for each token ending in "." whether the period is split off
+    my @words = split(/\s/,$text);
+    $text = "";
+    for (my $i=0;$i<(scalar(@words));$i++) 
+    {
+        my $word = $words[$i];
+        if ( $word =~ /^(\S+)\.$/) 
+        {
+            my $pre = $1;  # the token without its final period
+            if (($pre =~ /\./ && $pre =~ /\p{IsAlpha}/) || ($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==1) || ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[\p{IsLower}]/))) 
+            {
+                #no change: inner period plus a letter, a known prefix, or the next token starts in lower case
+			} 
+            elsif (($NONBREAKING_PREFIX{$pre} && $NONBREAKING_PREFIX{$pre}==2) && ($i<scalar(@words)-1 && ($words[$i+1] =~ /^[0-9]+/))) 
+            {
+                #no change: a #NUMERIC_ONLY# prefix followed by a token that starts with a digit
+            } 
+            else 
+            {
+                $word = $pre." .";  # otherwise the period becomes a token of its own
+            }
+        }
+        $text .= $word." ";
+    }		
+
+    # clean up extraneous spaces
+    $text =~ s/ +/ /g;
+    $text =~ s/^ //g;
+    $text =~ s/ $//g;
+
+    #restore multi-dots: each DOTDOTMULTI contributes one more dot, then DOTMULTI itself becomes a dot
+    while($text =~ /DOTDOTMULTI/) 
+    {
+        $text =~ s/DOTDOTMULTI/DOTMULTI./g;
+    }
+    $text =~ s/DOTMULTI/./g;
+
+    #escape special chars (all substitutions are commented out here, so no escaping is applied)
+    #$text =~ s/\&/\&amp;/g;   # escape escape
+    #$text =~ s/\|/\&#124;/g;  # factor separator
+    #$text =~ s/\</\&lt;/g;    # xml
+    #$text =~ s/\>/\&gt;/g;    # xml
+    #$text =~ s/\'/\&apos;/g;  # xml
+    #$text =~ s/\"/\&quot;/g;  # xml
+    #$text =~ s/\[/\&#91;/g;   # syntax non-terminal
+    #$text =~ s/\]/\&#93;/g;   # syntax non-terminal
+
+    #ensure final line break
+    $text .= "\n" unless $text =~ /\n$/;
+
+    return $text;
+}
+
+# Fill %$PREFIX_REF from "$mydir/nonbreaking_prefix.$language": value 2 for entries
+# tagged #NUMERIC_ONLY#, value 1 for all others. Falls back to the English list if
+# the language has no file; dies if that is missing too or the file cannot be opened.
+sub load_prefixes
+{
+    my ($language, $PREFIX_REF) = @_;
+
+    my $prefixfile = "$mydir/nonbreaking_prefix.$language";
+
+    #default back to English if we don't have a language-specific prefix file
+    if (!(-e $prefixfile))
+    {
+        $prefixfile = "$mydir/nonbreaking_prefix.en";
+        print STDERR "WARNING: No known abbreviations for language '$language', attempting fall-back to English version...\n";
+        die ("ERROR: No abbreviations files found in $mydir\n") unless (-e $prefixfile);
+    }
+
+    # a list that exists but cannot be read must not silently load as empty
+    open(my $prefix_fh, "<:utf8", $prefixfile)
+        or die ("ERROR: Cannot open $prefixfile: $!\n");
+    while (my $item = <$prefix_fh>)
+    {
+        chomp($item);
+        # skip empty lines and comment lines
+        next if (!($item) || (substr($item,0,1) eq "#"));
+        if ($item =~ /(.*)[\s]+(\#NUMERIC_ONLY\#)/)
+        {
+            $PREFIX_REF->{$1} = 2;
+        }
+        else
+        {
+            $PREFIX_REF->{$item} = 1;
+        }
+    }
+    close($prefix_fh);
+}
+
-- 
cgit v1.2.3