diff options
author | Paul Baltescu <pauldb89@gmail.com> | 2013-02-21 14:13:55 +0000 |
---|---|---|
committer | Paul Baltescu <pauldb89@gmail.com> | 2013-02-21 14:13:55 +0000 |
commit | b5491898549c61bd799d199aa9178a8394a1ef69 (patch) | |
tree | fb2686a2aae03ff07bcdf4cd47e8c3191eff8d1e /corpus/support | |
parent | 0187447a643c3ea262b13b3052cb1531990eafe6 (diff) | |
parent | c17d9c23d023a5c08656376944f636180f0a437b (diff) |
Merge branch 'master' of https://github.com/pauldb89/cdec
Diffstat (limited to 'corpus/support')
-rwxr-xr-x | corpus/support/quote-norm.pl | 14 | ||||
-rw-r--r-- | corpus/support/token_list | 82 | ||||
-rw-r--r-- | corpus/support/token_patterns | 2 | ||||
-rwxr-xr-x | corpus/support/tokenizer.pl | 3 |
4 files changed, 100 insertions, 1 deletion
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl index 72b0064d..e4e5055e 100755 --- a/corpus/support/quote-norm.pl +++ b/corpus/support/quote-norm.pl @@ -27,6 +27,20 @@ while(<STDIN>) { s/¨/"/g; s/¡/ ¡ /g; s/¿/ ¿ /g; + + s/ п. п. / п.п. /g; + s/ ст. л. / ст.л. /g; + s/ т. е. / т.е. /g; + s/ т. к. / т.к. /g; + s/ т. ч. / т.ч. /g; + s/ т. д. / т.д. /g; + s/ т. п. / т.п. /g; + s/ и. о. / и.о. /g; + s/ с. г. / с.г. /g; + s/ г. р. / г.р. /g; + s/ т. н. / т.н. /g; + s/ т. ч. / т.ч. /g; + s/ н. э. / н.э. /g; # â<U+0080><U+0099> s/â(\x{80}\x{99}|\x{80}\x{98})/'/g; s/â(\x{80}\x{9c}|\x{80}\x{9d})/"/g; diff --git a/corpus/support/token_list b/corpus/support/token_list index 28eb4396..d470cb22 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -210,3 +210,85 @@ W. X. Y. Z. +А. +Б. +В. +Г. +Д. +Е. +Ё. +Ж. +З. +И. +Й. +К. +Л. +М. +Н. +О. +П. +Р. +С. +Т. +У. +Ф. +Х. +Ц. +Ч. +Ш. +Щ. +Ъ. +Ы. +Ь. +Э. +Ю. +Я. +л. +г. +обл. +гг. +в. +вв. +мин. +ч. +тыс. +млн. +млрд. +трлн. +кв. +куб. +руб. +коп. +долл. +Прим. +прим. +чел. +грн. +мин. +им. +проф. +акад. +ред. +авт. +корр. +соб. +спец. +см. +тж. +др. +пр. +букв. +# Two-letter abbreviations - can be written with space +п.п. +ст.л. +т.е. +т.к. +т.ч. +т.д. +т.п. +и.о. +с.г. +г.р. +т.н. +т.ч. +н.э. 
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index 8e69432b..de64fb2a 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,3 +1,5 @@ /^(al|el|ul|e)\-[a-z]+$/ +/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^\p{Cyrillic}\.\p{Cyrillic}\.$/ /^(\d|\d\d|\d\d\d)\.$/ diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index e9c3a37d..0350a894 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -73,6 +73,7 @@ my $dict_file = "$workdir/token_list"; my $word_patt_file = "$workdir/token_patterns"; open(my $dict_fp, "$dict_file") or die; +binmode($dict_fp, ":utf8"); # read in the list of words that should not be segmented, ## e.g.,"I.B.M.", co-operation. @@ -89,6 +90,7 @@ while(<$dict_fp>){ } open(my $patt_fp, "$word_patt_file") or die; +binmode($patt_fp, ":utf8"); my @word_patts = (); my $word_patt_num = 0; while(<$patt_fp>){ @@ -147,7 +149,6 @@ while(<STDIN>){ print STDOUT " $new_line\n"; } -print STDERR "\n"; ######################################################################## |