summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-01-15 01:20:00 -0500
committer Chris Dyer <cdyer@allegro.clab.cs.cmu.edu> 2013-01-15 01:20:00 -0500
commit 9d7167751a3712a79ad356764d803106a71ce5e3 (patch)
tree 4eb2feff7b0c6bb66ca28f70f45a933fc2aed600
parent a52035d1bce0566bfd9603d93e4fb601bf25a73f (diff)
corpus files
-rw-r--r-- Makefile.am 1
-rw-r--r-- configure.ac 2
-rwxr-xr-x corpus/add-self-translations.pl 2
-rwxr-xr-x corpus/filter-length.pl 6
-rwxr-xr-x corpus/paste-files.pl 12
5 files changed, 18 insertions, 5 deletions
diff --git a/Makefile.am b/Makefile.am
index dbf604a1..1d898156 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -15,6 +15,7 @@ SUBDIRS = \
#gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava
+EXTRA_DIST = python/pkg python/src python/tests python/examples
AUTOMAKE_OPTIONS = foreign
ACLOCAL_AMFLAGS = -I m4
AM_CPPFLAGS = -D_GLIBCXX_PARALLEL
diff --git a/configure.ac b/configure.ac
index dcd0a0d8..69971dc3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1,4 +1,4 @@
-AC_INIT([cdec],[1.0])
+AC_INIT([cdec],[2013-01-15])
AC_CONFIG_SRCDIR([decoder/cdec.cc])
AM_INIT_AUTOMAKE
AC_CONFIG_HEADERS(config.h)
diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl
index 153bc454..d707ce29 100755
--- a/corpus/add-self-translations.pl
+++ b/corpus/add-self-translations.pl
@@ -6,7 +6,7 @@ use strict;
my %df;
my %def;
while(<>) {
- print;
+# print;
chomp;
my ($sf, $se) = split / \|\|\| /;
die "Format error: $_\n" unless defined $sf && defined $se;
diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl
index 70032ca7..3cfa40cc 100755
--- a/corpus/filter-length.pl
+++ b/corpus/filter-length.pl
@@ -3,8 +3,8 @@ use strict;
use utf8;
##### EDIT THESE SETTINGS ####################################################
-my $MAX_LENGTH = 99; # discard a sentence if it is longer than this
-my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include
+my $MAX_LENGTH = 150; # discard a sentence if it is longer than this
+my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include
my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be?
##############################################################################
@@ -128,6 +128,8 @@ while(<F>) {
next;
}
print;
+ } else {
+ print;
}
$to++;
}
diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl
index 24c70599..0b788386 100755
--- a/corpus/paste-files.pl
+++ b/corpus/paste-files.pl
@@ -17,6 +17,7 @@ for my $file (@ARGV) {
binmode(STDOUT,":utf8");
binmode(STDERR,":utf8");
+my $bad = 0;
my $lc = 0;
my $done = 0;
my $fl = 0;
@@ -34,7 +35,15 @@ while(1) {
last;
}
chomp $r;
- die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+ if ($r =~ /\|\|\|/) {
+ $r = '';
+ $bad++;
+ }
+ warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/;
+ $r =~ s/\|\|\|/ /g;
+ $r =~ s/ +//g;
+ $r =~ s/^ //;
+ $r =~ s/ $//;
$anum++;
push @line, $r;
}
@@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) {
my $r = <$fh>;
die "Mismatched number of lines.\n" if defined $r;
}
+print STDERR "Bad lines containing ||| were $bad\n";