summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
author Chris Dyer <cdyer@cs.cmu.edu> 2012-10-25 16:05:56 -0400
committer Chris Dyer <cdyer@cs.cmu.edu> 2012-10-25 16:05:56 -0400
commit df5445c3651fa1cc99ed4bdb682dcf57092dd4e2 (patch)
tree 172386e00c5354af1e1e2b70301ebfd92f3e86a4 /corpus
parent 172ac12e0fa224e038f04faa60984e321f73430a (diff)
add self translation
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/add-self-translations.pl 29
1 files changed, 29 insertions, 0 deletions
diff --git a/corpus/add-self-translations.pl b/corpus/add-self-translations.pl
new file mode 100755
index 00000000..153bc454
--- /dev/null
+++ b/corpus/add-self-translations.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl -w
+use strict;
+
+# Adds self-translations of poorly attested words to the parallel data.
+# Input ("src ||| tgt" lines, stdin or files) is echoed unchanged; then each
+# source word seen at most 4 times that also appeared verbatim on the target
+# side of one of its own lines is emitted three times as "w ||| w".
+my %df;   # source-side token count per word
+my %def;  # source words observed verbatim on the target side of the same line
+while (my $line = <>) {
+  print $line;
+  chomp $line;
+  my ($sf, $se) = split / \|\|\| /, $line;
+  die "Format error: $line\n" unless defined $sf && defined $se;
+  # split ' ' skips leading whitespace, so no empty-string pseudo-word is
+  # counted (split /\s+/ yields a leading '' field for " a b").
+  my %in_target = map { $_ => 1 } split ' ', $se;
+  for my $f (split ' ', $sf) {
+    $df{$f}++;
+    $def{$f}++ if $in_target{$f};
+  }
+}
+
+# Emit each rare self-translating word three times, in sorted order.
+for my $k (sort keys %def) {
+  next if $df{$k} > 4;
+  print "$k ||| $k\n" x 3;
+}
+