summaryrefslogtreecommitdiff
path: root/extools/test_data
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
commit0172721855098ca02b207231a654dffa5e4eb1c9 (patch)
tree8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /extools/test_data
parent37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'extools/test_data')
-rw-r--r--extools/test_data/README10
-rw-r--r--extools/test_data/corpus.aligned5
-rw-r--r--extools/test_data/corpus.en5
-rw-r--r--extools/test_data/corpus.fr5
-rw-r--r--extools/test_data/corpus.len_cats5
-rw-r--r--extools/test_data/fr-en.al.len5
-rwxr-xr-xextools/test_data/make_len_cats.pl23
7 files changed, 58 insertions, 0 deletions
diff --git a/extools/test_data/README b/extools/test_data/README
new file mode 100644
index 00000000..e368cffc
--- /dev/null
+++ b/extools/test_data/README
@@ -0,0 +1,10 @@
+The following was used to create the test data. The real inputs
+were corpus.fr, corpus.en, and corpus.aligned. The generated files
+were corpus.len_cats and fr-en.al.len.
+
+
+ ./make_len_cats.pl corpus.en > corpus.len_cats
+
+ ../merge_lines.pl corpus.fr corpus.en corpus.aligned corpus.len_cats > fr-en.al.len
+
+
diff --git a/extools/test_data/corpus.aligned b/extools/test_data/corpus.aligned
new file mode 100644
index 00000000..aa09e9ab
--- /dev/null
+++ b/extools/test_data/corpus.aligned
@@ -0,0 +1,5 @@
+0-0 1-2 2-1
+0-0 1-1
+0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3
+0-0
+0-0 1-1
diff --git a/extools/test_data/corpus.en b/extools/test_data/corpus.en
new file mode 100644
index 00000000..2d4751bf
--- /dev/null
+++ b/extools/test_data/corpus.en
@@ -0,0 +1,5 @@
+the blue house
+the hat
+there is a hat
+cap
+the cat
diff --git a/extools/test_data/corpus.fr b/extools/test_data/corpus.fr
new file mode 100644
index 00000000..75b5e127
--- /dev/null
+++ b/extools/test_data/corpus.fr
@@ -0,0 +1,5 @@
+la maison bleue
+le chapeau
+il y a un chapeau
+chapeau
+le chat
diff --git a/extools/test_data/corpus.len_cats b/extools/test_data/corpus.len_cats
new file mode 100644
index 00000000..18d321de
--- /dev/null
+++ b/extools/test_data/corpus.len_cats
@@ -0,0 +1,5 @@
+0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
+0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+0-1:SHORT
+0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/fr-en.al.len b/extools/test_data/fr-en.al.len
new file mode 100644
index 00000000..7ee6b85d
--- /dev/null
+++ b/extools/test_data/fr-en.al.len
@@ -0,0 +1,5 @@
+la maison bleue ||| the blue house ||| 0-0 1-2 2-1 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 1-2:SHORT 1-3:SHORT 2-3:SHORT
+le chapeau ||| the hat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
+il y a un chapeau ||| there is a hat ||| 0-0 0-1 1-0 1-1 2-0 2-1 3-2 4-3 ||| 0-1:SHORT 0-2:SHORT 0-3:MID 0-4:MID 1-2:SHORT 1-3:SHORT 1-4:MID 2-3:SHORT 2-4:SHORT 3-4:SHORT
+chapeau ||| cap ||| 0-0 ||| 0-1:SHORT
+le chat ||| the cat ||| 0-0 1-1 ||| 0-1:SHORT 0-2:SHORT 1-2:SHORT
diff --git a/extools/test_data/make_len_cats.pl b/extools/test_data/make_len_cats.pl
new file mode 100755
index 00000000..25ef75fa
--- /dev/null
+++ b/extools/test_data/make_len_cats.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/perl -w
+# For each input line, print every word span as "i-j:CAT": i = 0-based start, j = exclusive end, CAT = category for a span of j-i words.
+use strict;
+
+my $max_len = 15;    # longest span, in words, that is emitted
+# Category name indexed by span length; slot 0 (NULL) is never looked up because lengths start at 1.
+my @cat_names = qw( NULL SHORT SHORT MID MID MID LONG LONG LONG LONG LONG VLONG VLONG VLONG VLONG VLONG );
+
+while (<>) {
+    chomp;
+    my @words = split ' ';    # ' ' skips leading whitespace; /\s+/ would yield an empty first word
+    my $len = scalar @words;
+    my @spans;
+    for (my $i = 0; $i < $len; $i++) {
+        for (my $k = 1; $k <= $max_len; $k++) {
+            my $j = $i + $k;
+            last if ($j > $len);    # every longer span from $i overruns the line too
+            my $cat = $cat_names[$k] or die "no category name for span length $k";
+            push @spans, "$i-$j:$cat";
+        }
+    }
+    print "@spans\n";
+}