summaryrefslogtreecommitdiff
path: root/corpus
diff options
context:
space:
mode:
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/conll2cdec.pl           42
-rwxr-xr-x corpus/sample-dev-sets.py      74
-rw-r--r-- corpus/support/token_list      49
-rw-r--r-- corpus/support/token_patterns   2
-rwxr-xr-x corpus/support/tokenizer.pl     8
-rwxr-xr-x corpus/utf8-normalize.sh        2
6 files changed, 172 insertions, 5 deletions
diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl
new file mode 100755
index 00000000..ee4e07db
--- /dev/null
+++ b/corpus/conll2cdec.pl
@@ -0,0 +1,42 @@
#!/usr/bin/perl -w
use strict;

# Converts a CoNLL formatted labeled sequence into cdec's <seg> format:
#   <seg id="N" feat1="..." featK="..."> tok1 ... ||| lab1 ... </seg>
# Column 1 of each CoNLL row is the token, the last column is the label, and
# any columns in between become the per-position feature strings feat1..featK.

die "Usage: $0 file.conll\n\n Converts a CoNLL formatted labeled sequence into cdec's format.\n\n" unless scalar @ARGV == 1;

# Three-arg open with a lexical filehandle (the original two-arg bareword
# open misbehaves if the filename begins with '<', '>', or '|').
open my $in, '<', $ARGV[0] or die "Can't read $ARGV[0]: $!\n";

my @xx;          # tokens of the current sentence
my @yy;          # labels of the current sentence
my @os;          # per-token arrayrefs holding the middle (feature) columns
my $sec = undef; # number of feature columns, fixed by the first data row
my $i = 0;       # running segment id

# Emit the buffered sentence as one <seg> line and reset the buffers.
# Does nothing when no tokens are buffered, so consecutive blank lines no
# longer produce empty <seg> entries (which also read $sec uninitialized).
sub flush_sentence {
  return unless @xx;
  print "<seg id=\"$i\"";
  $i++;
  for (my $j = 0; $j < $sec; $j++) {
    my @oo = ();
    for (my $k = 0; $k < scalar @xx; $k++) {
      my $sym = $os[$k]->[$j];
      $sym =~ s/"/'/g;   # keep the XML attribute value well-formed
      push @oo, $sym;
    }
    my $zz = $j + 1;
    print " feat$zz=\"@oo\"";
  }
  print "> @xx ||| @yy </seg>\n";
  @xx = ();
  @yy = ();
  @os = ();
}

while (<$in>) {
  chomp;
  if (/^\s*$/) {
    flush_sentence();
  } else {
    my ($x, @fs) = split /\s+/;
    my $y = pop @fs;
    if (!defined $sec) { $sec = scalar @fs; }
    # Give the fatal check a diagnostic instead of a bare `die`.
    die "Inconsistent number of columns at $ARGV[0] line $.\n" unless $sec == scalar @fs;
    push @xx, $x;
    push @yy, $y;
    push @os, \@fs;
  }
}

# Bug fix: the original dropped a final sentence that was not followed by a
# trailing blank line; flush whatever is still buffered at EOF.
flush_sentence();
close $in;
+
diff --git a/corpus/sample-dev-sets.py b/corpus/sample-dev-sets.py
new file mode 100755
index 00000000..3c969bbe
--- /dev/null
+++ b/corpus/sample-dev-sets.py
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+import gzip
+import os
+import sys
+
+HELP = '''Process an input corpus by dividing it into pseudo-documents and uniformly
+sampling train and dev sets (simulate uniform sampling at the document level
+when document boundaries are unknown)
+
+usage: {} in_file out_prefix doc_size docs_per_dev_set dev_sets [-lc]
+recommended: doc_size=20, docs_per_dev_set=100, dev_sets=2 (dev and test)
+'''
+
def gzopen(f):
    """Open a plain or gzip-compressed file for reading text.

    Uses mode 'rt' for gzip so both branches yield str lines: the original
    'rb' returned bytes under Python 3, which broke the str .format() calls
    and text-mode writes downstream.  Python 2's gzip treats 'rt' as 'rb',
    so the change is backward compatible.
    """
    return gzip.open(f, 'rt') if f.endswith('.gz') else open(f, 'r')
+
def wc(f):
    """Return the number of lines in f (transparently handles .gz files)."""
    count = 0
    for _ in gzopen(f):
        count += 1
    return count
+
def main(argv):
    """Split a corpus into a train set plus `dev_sets` uniformly sampled dev sets.

    argv: [prog, in_file, out_prefix, doc_size, docs_per_dev_set, dev_sets, [-lc]]
    Writes out_prefix.train and out_prefix.dev.1 .. out_prefix.dev.N.  With
    -lc, each output line is prefixed with its 1-based corpus line number.
    """
    if len(argv[1:]) < 5:
        sys.stderr.write(HELP.format(os.path.basename(argv[0])))
        sys.exit(2)

    # Args
    in_file = os.path.abspath(argv[1])
    out_prefix = os.path.abspath(argv[2])
    doc_size = int(argv[3])
    docs_per_dev_set = int(argv[4])
    dev_sets = int(argv[5])
    lc = (len(argv[1:]) == 6 and argv[6] == '-lc')

    # Compute sizes.  Bug fix: use floor division (//) so the script also
    # runs under Python 3, where `/` yields a float and range() would raise.
    # Under Python 2 the results are identical.
    corpus_size = wc(in_file)
    total_docs = corpus_size // doc_size
    leftover = corpus_size % doc_size
    train_docs = total_docs - (dev_sets * docs_per_dev_set)
    train_batch_size = train_docs // docs_per_dev_set

    # Report
    sys.stderr.write('Splitting {} lines ({} documents)\n'.format(corpus_size, total_docs + (1 if leftover else 0)))
    sys.stderr.write('Train: {} ({})\n'.format((train_docs * doc_size) + leftover, train_docs + (1 if leftover else 0)))
    sys.stderr.write('Dev: {} x {} ({})\n'.format(dev_sets, docs_per_dev_set * doc_size, docs_per_dev_set))

    inp = gzopen(in_file)
    train_out = open('{}.train'.format(out_prefix), 'w')
    dev_out = [open('{}.dev.{}'.format(out_prefix, i + 1), 'w') for i in range(dev_sets)]
    i = 0  # 1-based corpus line counter, used for the -lc prefix

    # For each set of documents: interleave a batch of train documents with
    # one document per dev set, so dev lines are spread across the corpus.
    for _ in range(docs_per_dev_set):
        # Write several documents to train
        for _ in range(train_batch_size):
            for _ in range(doc_size):
                i += 1
                train_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline())
        # Write a document to each dev
        for out in dev_out:
            for _ in range(doc_size):
                i += 1
                out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline())
    # Write leftover lines (the final partial document) to train
    for line in inp:
        i += 1
        train_out.write('{} ||| {}'.format(i, line) if lc else line)

    inp.close()
    train_out.close()
    for out in dev_out:
        out.close()

if __name__ == '__main__':
    main(sys.argv)
diff --git a/corpus/support/token_list b/corpus/support/token_list
index d38638cf..00daa82b 100644
--- a/corpus/support/token_list
+++ b/corpus/support/token_list
@@ -1,6 +1,55 @@
##################### hyphenated words added by Fei since 3/7/05
##X-ray
+# Finnish
+eaa.
+ap.
+arv.
+ay.
+eKr.
+em.
+engl.
+esim.
+fil.
+lis.
+fil.
+maist.
+fil.toht.
+harv.
+ilt.
+jatk.
+jKr.
+jms.
+jne.
+joht.
+klo
+ko.
+ks.
+leht.
+lv.
+lyh.
+mm.
+mon.
+nim.
+nro.
+ns.
+nti.
+os.
+oy.
+pj.
+pnä.
+puh.
+pvm.
+rva.
+tms.
+ts.
+vars.
+vrt.
+ym.
+yms.
+yo.
+# NOTE(review): removed a stray merge-conflict marker (">>>>>>> 8646b68e...") that was committed here
+
# hindi abbreviation patterns
जन.
फर.
diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns
index de64fb2a..12558cdd 100644
--- a/corpus/support/token_patterns
+++ b/corpus/support/token_patterns
@@ -1,4 +1,6 @@
/^(al|el|ul|e)\-[a-z]+$/
+/\.(fi|fr|es|co\.uk|de)$/
+/:[a-zä]+$/
/^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/
/^\p{Cyrillic}\.\p{Cyrillic}\.$/
/^(\d|\d\d|\d\d\d)\.$/
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index aa285be4..718d78cc 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -415,7 +415,7 @@ sub deep_proc_token {
}
## remove the ending periods that follow number etc.
- if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){
+ if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|€|\-|\_|\/|\\|\$|\'))(\.+)$/){
## 12~13. => 12~13 .
my $t1 = $1;
my $t3 = $3;
@@ -600,12 +600,12 @@ sub deep_proc_token {
## deal with "%"
- if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){
+ if(($line =~ /\%|€/) && ($Split_On_PercentSign > 0)){
my $suc = 0;
if($Split_On_PercentSign >= 2){
- $suc += ($line =~ s/(\D)(\%+)/$1 $2/g);
+ $suc += ($line =~ s/(\D)(\%+|€+)/$1 $2/g);
}else{
- $suc += ($line =~ s/(\%+)/ $1 /g);
+ $suc += ($line =~ s/(\%+|€+)/ $1 /g);
}
if($suc){
diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh
index dcf8bc59..7c0db611 100755
--- a/corpus/utf8-normalize.sh
+++ b/corpus/utf8-normalize.sh
@@ -7,7 +7,7 @@
if which uconv > /dev/null
then
- CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip"
+ CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip --remove-signature"
else
echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2
CMD="iconv -f utf8 -t utf8 -c"