summaryrefslogtreecommitdiff
path: root/nonbreaking_prefixes/nonbreaking_prefix.es
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-06-14 14:43:14 +0200
committerPatrick Simianer <p@simianer.de>2014-06-14 14:43:14 +0200
commit2783f837303ae07c4a1d676302bca779abbb1296 (patch)
treee388dda12d6d31285b32663b937a8d55ecc909c5 /nonbreaking_prefixes/nonbreaking_prefix.es
parent85ea0fc5e3ae7ea646cc6e843d01939b4d8e4dbf (diff)
steal tokenizer from moses' scripts
Diffstat (limited to 'nonbreaking_prefixes/nonbreaking_prefix.es')
-rw-r--r--nonbreaking_prefixes/nonbreaking_prefix.es118
1 files changed, 118 insertions, 0 deletions
diff --git a/nonbreaking_prefixes/nonbreaking_prefix.es b/nonbreaking_prefixes/nonbreaking_prefix.es
new file mode 100644
index 0000000..d8b2755
--- /dev/null
+++ b/nonbreaking_prefixes/nonbreaking_prefix.es
@@ -0,0 +1,118 @@
+#Any entry in this file, when followed by a period (and then an upper-case word), does NOT mark the end of a sentence.
+#Special cases (marked #NUMERIC_ONLY#) may be added for prefixes that are non-breaking ONLY before numbers (0-9); this file currently defines none.
+
+#Any single upper-case letter followed by a period is not a sentence ender:
+#such upper-case letters are usually initials in a name.
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+
+# Period-final abbreviation list from http://www.ctspanish.com/words/abbreviations.htm
+
+A.C
+Apdo
+Av
+Bco
+CC.AA
+Da
+Dep
+Dn
+Dr
+Dra
+EE.UU
+Excmo
+FF.CC
+Fil
+Gral
+J.C
+Let
+Lic
+N.B
+P.D
+P.V.P
+Prof
+Pts
+Rte
+S.A
+S.A.R
+S.E
+S.L
+S.R.C
+Sr
+Sra
+Srta
+Sta
+Sto
+T.V.E
+Tel
+Ud
+Uds
+V.B
+V.E
+Vd
+Vds
+a/c
+adj
+admón
+afmo
+apdo
+av
+c
+c.f
+c.g
+cap
+cm
+cta
+dcha
+doc
+ej
+entlo
+esq
+etc
+f.c
+gr
+grs
+izq
+kg
+km
+mg
+mm
+núm
+núm
+p
+p.a
+p.ej
+ptas
+pág
+págs
+pág
+págs
+q.e.g.e
+q.e.s.m
+s
+s.s.s
+vid
+vol