From 9d7167751a3712a79ad356764d803106a71ce5e3 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 15 Jan 2013 01:20:00 -0500 Subject: corpus files --- Makefile.am | 1 + configure.ac | 2 +- corpus/add-self-translations.pl | 2 +- corpus/filter-length.pl | 6 ++++-- corpus/paste-files.pl | 12 +++++++++++- 5 files changed, 18 insertions(+), 5 deletions(-) diff --git a/Makefile.am b/Makefile.am index dbf604a1..1d898156 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,6 +15,7 @@ SUBDIRS = \ #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava +EXTRA_DIST = python/pkg python/src python/tests python/examples AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 AM_CPPFLAGS = -D_GLIBCXX_PARALLEL diff --git a/configure.ac b/configure.ac index dcd0a0d8..69971dc3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[1.0]) +AC_INIT([cdec],[2013-01-15]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl index 153bc454..d707ce29 100755 --- a/corpus/add-self-translations.pl +++ b/corpus/add-self-translations.pl @@ -6,7 +6,7 @@ use strict; my %df; my %def; while(<>) { - print; +# print; chomp; my ($sf, $se) = split / \|\|\| /; die "Format error: $_\n" unless defined $sf && defined $se; diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 70032ca7..3cfa40cc 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,8 +3,8 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 99; # discard a sentence if it is longer than this -my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? ############################################################################## @@ -128,6 +128,8 @@ while() { next; } print; + } else { + print; } $to++; } diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 24c70599..0b788386 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -17,6 +17,7 @@ for my $file (@ARGV) { binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $bad = 0; my $lc = 0; my $done = 0; my $fl = 0; @@ -34,7 +35,15 @@ while(1) { last; } chomp $r; - die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + if ($r =~ /\|\|\|/) { + $r = ''; + $bad++; + } + warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + $r =~ s/\|\|\|/ /g; + $r =~ s/ +//g; + $r =~ s/^ //; + $r =~ s/ $//; $anum++; push @line, $r; } @@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) { my $r = <$fh>; die "Mismatched number of lines.\n" if defined $r; } +print STDERR "Bad lines containing ||| were $bad\n"; -- cgit v1.2.3