From 55a1914e8998b2dc613d0f1e452a714b51169953 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 21 Jan 2013 16:53:05 -0500 Subject: a little bit of cleanup --- corpus/support/tokenizer.pl | 1 - 1 file changed, 1 deletion(-) (limited to 'corpus/support') diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index e9c3a37d..b5190858 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -147,7 +147,6 @@ while(){ print STDOUT " $new_line\n"; } -print STDERR "\n"; ######################################################################## -- cgit v1.2.3 From fba0f3db9aa896807e3932f9d3323767cedd9338 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 21 Jan 2013 17:28:19 -0500 Subject: tokenizer support for utf8 patterns --- corpus/support/token_patterns | 1 + corpus/support/tokenizer.pl | 2 ++ 2 files changed, 3 insertions(+) (limited to 'corpus/support') diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index 8e69432b..b25ac6de 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,3 +1,4 @@ /^(al|el|ul|e)\-[a-z]+$/ +/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ /^(\d|\d\d|\d\d\d)\.$/ diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index b5190858..0350a894 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -73,6 +73,7 @@ my $dict_file = "$workdir/token_list"; my $word_patt_file = "$workdir/token_patterns"; open(my $dict_fp, "$dict_file") or die; +binmode($dict_fp, ":utf8"); # read in the list of words that should not be segmented, ## e.g.,"I.B.M.", co-operation. @@ -89,6 +90,7 @@ while(<$dict_fp>){ } open(my $patt_fp, "$word_patt_file") or die; +binmode($patt_fp, ":utf8"); my @word_patts = (); my $word_patt_num = 0; while(<$patt_fp>){ -- cgit v1.2.3 From d30e63f84f836fa3223cd01ea3168f282c280be9 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 22 Jan 2013 20:08:28 -0500 Subject: russian abbrevs --- corpus/support/quote-norm.pl | 14 ++++++++ corpus/support/token_list | 82 +++++++++++++++++++++++++++++++++++++++++++ corpus/support/token_patterns | 1 + 3 files changed, 97 insertions(+) (limited to 'corpus/support') diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 72b0064d..e4e5055e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -27,6 +27,20 @@ while() { s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + + s/ п. п. / п.п. /g; + s/ ст. л. / ст.л. /g; + s/ т. е. / т.е. /g; + s/ т. к. / т.к. /g; + s/ т. ч. / т.ч. /g; + s/ т. д. / т.д. /g; + s/ т. п. / т.п. /g; + s/ и. о. / и.о. /g; + s/ с. г. / с.г. /g; + s/ г. р. / г.р. /g; + s/ т. н. / т.н. /g; + s/ т. ч. / т.ч. /g; + s/ н. э. / н.э. /g; # â s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; diff --git a/corpus/support/token_list b/corpus/support/token_list index 28eb4396..d470cb22 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -210,3 +210,85 @@ W. X. Y. Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index b25ac6de..de64fb2a 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,4 +1,5 @@ /^(al|el|ul|e)\-[a-z]+$/ /^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^\p{Cyrillic}\.\p{Cyrillic}\.$/ /^(\d|\d\d|\d\d\d)\.$/ -- cgit v1.2.3