From 0968a3e56e5fb8d3afee2b1c4904d76160b0fb17 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Fri, 24 Oct 2014 15:31:16 -0400 Subject: conll2cdec conversion --- corpus/conll2cdec.pl | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) create mode 100755 corpus/conll2cdec.pl (limited to 'corpus') diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl new file mode 100755 index 00000000..f65b86f8 --- /dev/null +++ b/corpus/conll2cdec.pl @@ -0,0 +1,39 @@ +#!/usr/bin/perl -w +use strict; + +my @xx; +my @yy; +my @os; +my $sec = undef; +my $i = 0; +while(<>) { + chomp; + if (/^\s*$/) { + print "[$j]; + $sym =~ s/"/'/g; + push @oo, $sym; + } + my $zz = $j + 1; + print " feat$zz=\"@oo\""; + } + + print "> @xx ||| @yy \n"; + @xx = (); + @yy = (); + @os = (); + } else { + my ($x, @fs) = split /\s+/; + my $y = pop @fs; + if (!defined $sec) { $sec = scalar @fs; } + die unless $sec == scalar @fs; + push @xx, $x; + push @yy, $y; + push @os, \@fs; + } +} + -- cgit v1.2.3 From 328f3808a62975d5bdc2d1bc56f0e4867364fe7a Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Sat, 25 Oct 2014 15:01:49 -0400 Subject: bit more info --- corpus/conll2cdec.pl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'corpus') diff --git a/corpus/conll2cdec.pl b/corpus/conll2cdec.pl index f65b86f8..ee4e07db 100755 --- a/corpus/conll2cdec.pl +++ b/corpus/conll2cdec.pl @@ -1,12 +1,15 @@ #!/usr/bin/perl -w use strict; +die "Usage: $0 file.conll\n\n Converts a CoNLL formatted labeled sequence into cdec's format.\n\n" unless scalar @ARGV == 1; +open F, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!\n"; + my @xx; my @yy; my @os; my $sec = undef; my $i = 0; -while(<>) { +while() { chomp; if (/^\s*$/) { print " Date: Fri, 19 Dec 2014 02:46:54 -0500 Subject: Sample dev and test sets with pseudo-documents --- corpus/sample-dev-test.py | 65 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100755 corpus/sample-dev-test.py (limited to 'corpus') diff --git a/corpus/sample-dev-test.py b/corpus/sample-dev-test.py new file mode 100755 index 00000000..0c0514ee --- /dev/null +++ b/corpus/sample-dev-test.py @@ -0,0 +1,65 @@ +#!/usr/bin/env python + +import gzip +import os +import sys + +HELP = '''Process an input corpus by dividing it into pseudo-documents and uniformly +sampling train, dev, and test sets (simulate uniform sampling at the document +level when document boundaries are unknown) + +usage: {} in_file out_prefix doc_size dev_test_docs [-lc] +recommended: doc_size=20, dev_test_docs=100 +''' + +def gzopen(f): + return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r') + +def wc(f): + return sum(1 for _ in gzopen(f)) + +def main(argv): + + if len(argv[1:]) < 4: + sys.stderr.write(HELP.format(os.path.basename(argv[0]))) + sys.exit(2) + + in_file = os.path.abspath(argv[1]) + out_prefix = os.path.abspath(argv[2]) + doc_size = int(argv[3]) + dev_test_docs = int(argv[4]) + lc = (len(argv[1:]) == 5 and argv[5] == '-lc') + + corpus_size = wc(in_file) + total_docs = corpus_size / doc_size + leftover = corpus_size % doc_size + train_docs = total_docs - (2 * dev_test_docs) + train_batch_size = (train_docs / dev_test_docs) - 2 + + sys.stderr.write('Splitting {} lines ({} documents)\n'.format(corpus_size, total_docs + (1 if leftover else 0))) + sys.stderr.write('Train: {} ({})\n'.format((train_docs * doc_size) + leftover, train_docs + (1 if leftover else 0))) + sys.stderr.write('Dev: {} ({})\n'.format(dev_test_docs * doc_size, dev_test_docs)) + sys.stderr.write('Test: {} ({})\n'.format(dev_test_docs * doc_size, dev_test_docs)) + + with gzopen(in_file) as inp, \ + open('{}.train'.format(out_prefix), 'w') as train_out, \ + open('{}.dev'.format(out_prefix), 'w') as dev_out, \ + open('{}.test'.format(out_prefix), 'w') as test_out: + i = 0 + for _ in range(dev_test_docs): + for _ in range(train_batch_size): + for _ in range(doc_size): + i += 1 + train_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) + for _ in range(doc_size): + i += 1 + dev_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) + for _ in range(doc_size): + i += 1 + test_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) + for line in inp: + i += 1 + train_out.write('{} ||| {}'.format(i, line) if lc else line) + +if __name__ == '__main__': + main(sys.argv) -- cgit v1.2.3 From d8e9f8c4df61a8162dcb7ac0e53c416eeeb36d26 Mon Sep 17 00:00:00 2001 From: mjdenkowski Date: Sat, 20 Dec 2014 03:01:42 -0500 Subject: Generalize to sample any number of dev sets --- corpus/sample-dev-sets.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++ corpus/sample-dev-test.py | 65 ----------------------------------------- 2 files changed, 74 insertions(+), 65 deletions(-) create mode 100755 corpus/sample-dev-sets.py delete mode 100755 corpus/sample-dev-test.py (limited to 'corpus') diff --git a/corpus/sample-dev-sets.py b/corpus/sample-dev-sets.py new file mode 100755 index 00000000..3c969bbe --- /dev/null +++ b/corpus/sample-dev-sets.py @@ -0,0 +1,74 @@ +#!/usr/bin/env python + +import gzip +import os +import sys + +HELP = '''Process an input corpus by dividing it into pseudo-documents and uniformly +sampling train and dev sets (simulate uniform sampling at the document level +when document boundaries are unknown) + +usage: {} in_file out_prefix doc_size docs_per_dev_set dev_sets [-lc] +recommended: doc_size=20, docs_per_dev_set=100, dev_sets=2 (dev and test) +''' + +def gzopen(f): + return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r') + +def wc(f): + return sum(1 for _ in gzopen(f)) + +def main(argv): + + if len(argv[1:]) < 5: + sys.stderr.write(HELP.format(os.path.basename(argv[0]))) + sys.exit(2) + + # Args + in_file = os.path.abspath(argv[1]) + out_prefix = os.path.abspath(argv[2]) + doc_size = int(argv[3]) + docs_per_dev_set = int(argv[4]) + dev_sets = int(argv[5]) + lc = (len(argv[1:]) == 6 and argv[6] == '-lc') + + # Compute sizes + corpus_size = wc(in_file) + total_docs = corpus_size / doc_size + leftover = corpus_size % doc_size + train_docs = total_docs - (dev_sets * docs_per_dev_set) + train_batch_size = (train_docs / docs_per_dev_set) + + # Report + sys.stderr.write('Splitting {} lines ({} documents)\n'.format(corpus_size, total_docs + (1 if leftover else 0))) + sys.stderr.write('Train: {} ({})\n'.format((train_docs * doc_size) + leftover, train_docs + (1 if leftover else 0))) + sys.stderr.write('Dev: {} x {} ({})\n'.format(dev_sets, docs_per_dev_set * doc_size, docs_per_dev_set)) + + inp = gzopen(in_file) + train_out = open('{}.train'.format(out_prefix), 'w') + dev_out = [open('{}.dev.{}'.format(out_prefix, i + 1), 'w') for i in range(dev_sets)] + i = 0 + + # For each set of documents + for _ in range(docs_per_dev_set): + # Write several documents to train + for _ in range(train_batch_size): + for _ in range(doc_size): + i += 1 + train_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) + # Write a document to each dev + for out in dev_out: + for _ in range(doc_size): + i += 1 + out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) + # Write leftover lines to train + for line in inp: + i += 1 + train_out.write('{} ||| {}'.format(i, line) if lc else line) + + train_out.close() + for out in dev_out: + out.close() + +if __name__ == '__main__': + main(sys.argv) diff --git a/corpus/sample-dev-test.py b/corpus/sample-dev-test.py deleted file mode 100755 index 0c0514ee..00000000 --- a/corpus/sample-dev-test.py +++ /dev/null @@ -1,65 +0,0 @@ -#!/usr/bin/env python - -import gzip -import os -import sys - -HELP = '''Process an input corpus by dividing it into pseudo-documents and uniformly -sampling train, dev, and test sets (simulate uniform sampling at the document -level when document boundaries are unknown) - -usage: {} in_file out_prefix doc_size dev_test_docs [-lc] -recommended: doc_size=20, dev_test_docs=100 -''' - -def gzopen(f): - return gzip.open(f, 'rb') if f.endswith('.gz') else open(f, 'r') - -def wc(f): - return sum(1 for _ in gzopen(f)) - -def main(argv): - - if len(argv[1:]) < 4: - sys.stderr.write(HELP.format(os.path.basename(argv[0]))) - sys.exit(2) - - in_file = os.path.abspath(argv[1]) - out_prefix = os.path.abspath(argv[2]) - doc_size = int(argv[3]) - dev_test_docs = int(argv[4]) - lc = (len(argv[1:]) == 5 and argv[5] == '-lc') - - corpus_size = wc(in_file) - total_docs = corpus_size / doc_size - leftover = corpus_size % doc_size - train_docs = total_docs - (2 * dev_test_docs) - train_batch_size = (train_docs / dev_test_docs) - 2 - - sys.stderr.write('Splitting {} lines ({} documents)\n'.format(corpus_size, total_docs + (1 if leftover else 0))) - sys.stderr.write('Train: {} ({})\n'.format((train_docs * doc_size) + leftover, train_docs + (1 if leftover else 0))) - sys.stderr.write('Dev: {} ({})\n'.format(dev_test_docs * doc_size, dev_test_docs)) - sys.stderr.write('Test: {} ({})\n'.format(dev_test_docs * doc_size, dev_test_docs)) - - with gzopen(in_file) as inp, \ - open('{}.train'.format(out_prefix), 'w') as train_out, \ - open('{}.dev'.format(out_prefix), 'w') as dev_out, \ - open('{}.test'.format(out_prefix), 'w') as test_out: - i = 0 - for _ in range(dev_test_docs): - for _ in range(train_batch_size): - for _ in range(doc_size): - i += 1 - train_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) - for _ in range(doc_size): - i += 1 - dev_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) - for _ in range(doc_size): - i += 1 - test_out.write('{} ||| {}'.format(i, inp.readline()) if lc else inp.readline()) - for line in inp: - i += 1 - train_out.write('{} ||| {}'.format(i, line) if lc else line) - -if __name__ == '__main__': - main(sys.argv) -- cgit v1.2.3 From 8646b68e5b124f612fd65b51ea40624f65a2f3d6 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 29 Dec 2014 01:59:41 -0500 Subject: finnish abbrevs --- corpus/support/token_list | 48 +++++++++++++++++++++++++++++++++++++++++++ corpus/support/token_patterns | 1 + 2 files changed, 49 insertions(+) (limited to 'corpus') diff --git a/corpus/support/token_list b/corpus/support/token_list index d38638cf..69dbfdc2 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -1,6 +1,54 @@ ##################### hyphenated words added by Fei since 3/7/05 ##X-ray +# Finnish +eaa. +ap. +arv. +ay. +eKr. +em. +engl. +esim. +fil. +lis. +fil. +maist. +fil.toht. +harv. +ilt. +jatk. +jKr. +jms. +jne. +joht. +klo +ko. +ks. +leht. +lv. +lyh. +mm. +mon. +nim. +nro. +ns. +nti. +os. +oy. +pj. +pnä. +puh. +pvm. +rva. +tms. +ts. +vars. +vrt. +ym. +yms. +yo. + # hindi abbreviation patterns जन. फर. diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index de64fb2a..e51e5e72 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,5 +1,6 @@ /^(al|el|ul|e)\-[a-z]+$/ /^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ +/^(юл)-\p{Cyrillic}+$/ /^\p{Cyrillic}\.\p{Cyrillic}\.$/ /^(\d|\d\d|\d\d\d)\.$/ -- cgit v1.2.3 From 5bc712f5bfa3ec7924a52ded4a7d6b3bab9539f0 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 29 Dec 2014 02:00:56 -0500 Subject: foo --- corpus/support/token_list | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'corpus') diff --git a/corpus/support/token_list b/corpus/support/token_list index d38638cf..c857b363 100644 --- a/corpus/support/token_list +++ b/corpus/support/token_list @@ -1,6 +1,10 @@ ##################### hyphenated words added by Fei since 3/7/05 ##X-ray +# Finnish abbreviation patterns +eaa. +v. + # hindi abbreviation patterns जन. फर. -- cgit v1.2.3 From a66b19342f105546adfb9cd9189f862e8a9c0b15 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 29 Dec 2014 02:02:27 -0500 Subject: finnish case markings --- corpus/support/token_patterns | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'corpus') diff --git a/corpus/support/token_patterns b/corpus/support/token_patterns index e51e5e72..12558cdd 100644 --- a/corpus/support/token_patterns +++ b/corpus/support/token_patterns @@ -1,6 +1,7 @@ /^(al|el|ul|e)\-[a-z]+$/ +/\.(fi|fr|es|co\.uk|de)$/ +/:[a-zä]+$/ /^((а|А)(ль|ш)|уль)-\p{Cyrillic}+$/ -/^(юл)-\p{Cyrillic}+$/ /^\p{Cyrillic}\.\p{Cyrillic}\.$/ /^(\d|\d\d|\d\d\d)\.$/ -- cgit v1.2.3 From 992556cc0931b255b9e299c0f489c3b449b22ab4 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Mon, 29 Dec 2014 19:20:49 -0500 Subject: deal with eur symbol --- corpus/support/tokenizer.pl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'corpus') diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl index aa285be4..718d78cc 100755 --- a/corpus/support/tokenizer.pl +++ b/corpus/support/tokenizer.pl @@ -415,7 +415,7 @@ sub deep_proc_token { } ## remove the ending periods that follow number etc. - if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|\-|\_|\/|\\|\$|\'))(\.+)$/){ + if($line =~ /^(.*(\d|\~|\^|\&|\:|\,|\#|\*|\%|€|\-|\_|\/|\\|\$|\'))(\.+)$/){ ## 12~13. => 12~13 . my $t1 = $1; my $t3 = $3; @@ -600,12 +600,12 @@ sub deep_proc_token { ## deal with "%" - if(($line =~ /\%/) && ($Split_On_PercentSign > 0)){ + if(($line =~ /\%|€/) && ($Split_On_PercentSign > 0)){ my $suc = 0; if($Split_On_PercentSign >= 2){ - $suc += ($line =~ s/(\D)(\%+)/$1 $2/g); + $suc += ($line =~ s/(\D)(\%+|€+)/$1 $2/g); }else{ - $suc += ($line =~ s/(\%+)/ $1 /g); + $suc += ($line =~ s/(\%+|€+)/ $1 /g); } if($suc){ -- cgit v1.2.3 From 1bce604809399a0adc581fb0102bff11decf3436 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 8 Jan 2015 21:35:29 -0500 Subject: Stop BOMbs before they decrease quality --- corpus/utf8-normalize.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'corpus') diff --git a/corpus/utf8-normalize.sh b/corpus/utf8-normalize.sh index dcf8bc59..7c0db611 100755 --- a/corpus/utf8-normalize.sh +++ b/corpus/utf8-normalize.sh @@ -7,7 +7,7 @@ if which uconv > /dev/null then - CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip" + CMD="uconv -f utf8 -t utf8 -x Any-NFKC --callback skip --remove-signature" else echo "Cannot find ICU uconv (http://site.icu-project.org/) ... falling back to iconv. Normalization NOT taking place." 1>&2 CMD="iconv -f utf8 -t utf8 -c" -- cgit v1.2.3