From df5445c3651fa1cc99ed4bdb682dcf57092dd4e2 Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Thu, 25 Oct 2012 16:05:56 -0400 Subject: add self translation --- corpus/add-self-translations.pl | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100755 corpus/add-self-translations.pl (limited to 'corpus/add-self-translations.pl') diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl new file mode 100755 index 00000000..153bc454 --- /dev/null +++ b/corpus/add-self-translations.pl @@ -0,0 +1,29 @@ +#!/usr/bin/perl -w +use strict; + +# ADDS SELF-TRANSLATIONS OF POORLY ATTESTED WORDS TO THE PARALLEL DATA + +my %df; +my %def; +while(<>) { + print; + chomp; + my ($sf, $se) = split / \|\|\| /; + die "Format error: $_\n" unless defined $sf && defined $se; + my @fs = split /\s+/, $sf; + my @es = split /\s+/, $se; + for my $f (@fs) { + $df{$f}++; + for my $e (@es) { + if ($f eq $e) { $def{$f}++; } + } + } +} + +for my $k (sort keys %def) { + next if $df{$k} > 4; + print "$k ||| $k\n"; + print "$k ||| $k\n"; + print "$k ||| $k\n"; +} + -- cgit v1.2.3 From bae5fe99037ae7e101953ad0df118127191c711c Mon Sep 17 00:00:00 2001 From: Chris Dyer Date: Tue, 15 Jan 2013 01:20:00 -0500 Subject: corpus files --- Makefile.am | 1 + configure.ac | 2 +- corpus/add-self-translations.pl | 2 +- corpus/filter-length.pl | 6 ++++-- corpus/paste-files.pl | 12 +++++++++++- 5 files changed, 18 insertions(+), 5 deletions(-) (limited to 'corpus/add-self-translations.pl') diff --git a/Makefile.am b/Makefile.am index dbf604a1..1d898156 100644 --- a/Makefile.am +++ b/Makefile.am @@ -15,6 +15,7 @@ SUBDIRS = \ #gi/pyp-topics/src gi/clda/src gi/posterior-regularisation/prjava +EXTRA_DIST = python/pkg python/src python/tests python/examples AUTOMAKE_OPTIONS = foreign ACLOCAL_AMFLAGS = -I m4 AM_CPPFLAGS = -D_GLIBCXX_PARALLEL diff --git a/configure.ac b/configure.ac index dcd0a0d8..69971dc3 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cdec],[1.0]) +AC_INIT([cdec],[2013-01-15]) AC_CONFIG_SRCDIR([decoder/cdec.cc]) AM_INIT_AUTOMAKE AC_CONFIG_HEADERS(config.h) diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl index 153bc454..d707ce29 100755 --- a/corpus/add-self-translations.pl +++ b/corpus/add-self-translations.pl @@ -6,7 +6,7 @@ use strict; my %df; my %def; while(<>) { - print; +# print; chomp; my ($sf, $se) = split / \|\|\| /; die "Format error: $_\n" unless defined $sf && defined $se; diff --git a/corpus/filter-length.pl b/corpus/filter-length.pl index 70032ca7..3cfa40cc 100755 --- a/corpus/filter-length.pl +++ b/corpus/filter-length.pl @@ -3,8 +3,8 @@ use strict; use utf8; ##### EDIT THESE SETTINGS #################################################### -my $MAX_LENGTH = 99; # discard a sentence if it is longer than this -my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 6; # if both are shorter, include +my $MAX_LENGTH = 150; # discard a sentence if it is longer than this +my $AUTOMATIC_INCLUDE_IF_SHORTER_THAN = 7; # if both are shorter, include my $MAX_ZSCORE = 1.8; # how far from the mean can the (log)ratio be? ############################################################################## @@ -128,6 +128,8 @@ while() { next; } print; + } else { + print; } $to++; } diff --git a/corpus/paste-files.pl b/corpus/paste-files.pl index 24c70599..0b788386 100755 --- a/corpus/paste-files.pl +++ b/corpus/paste-files.pl @@ -17,6 +17,7 @@ for my $file (@ARGV) { binmode(STDOUT,":utf8"); binmode(STDERR,":utf8"); +my $bad = 0; my $lc = 0; my $done = 0; my $fl = 0; @@ -34,7 +35,15 @@ while(1) { last; } chomp $r; - die "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + if ($r =~ /\|\|\|/) { + $r = ''; + $bad++; + } + warn "$ARGV[$anum]:$lc contains a ||| symbol - please remove.\n" if $r =~ /\|\|\|/; + $r =~ s/\|\|\|/ /g; + $r =~ s/ +//g; + $r =~ s/^ //; + $r =~ s/ $//; $anum++; push @line, $r; } @@ -47,4 +56,5 @@ for (my $i = 1; $i < scalar @fhs; $i++) { my $r = <$fh>; die "Mismatched number of lines.\n" if defined $r; } +print STDERR "Bad lines containing ||| were $bad\n"; -- cgit v1.2.3