summaryrefslogtreecommitdiff
path: root/sa-extract/escape-testset.pl
diff options
context:
space:
mode:
author: Chris Dyer <cdyer@cs.cmu.edu> 2012-02-03 18:03:49 -0500
committer: Chris Dyer <cdyer@cs.cmu.edu> 2012-02-03 18:03:49 -0500
commit: f08ff03664ee7c9601c9daaa217cb032160f386f (patch)
tree: 5e93393df8bddb128a778f29ea86a0ea81ce7ebf /sa-extract/escape-testset.pl
parent: 16d08eefddbecfefced16a0dd5a13d4c64c139b0 (diff)
escaping tool for grammar extractor
Diffstat (limited to 'sa-extract/escape-testset.pl')
-rwxr-xr-x sa-extract/escape-testset.pl 35
1 files changed, 35 insertions, 0 deletions
diff --git a/sa-extract/escape-testset.pl b/sa-extract/escape-testset.pl
new file mode 100755
index 00000000..02fd7445
--- /dev/null
+++ b/sa-extract/escape-testset.pl
@@ -0,0 +1,35 @@
+#!/usr/bin/perl
+# Wrap each input line in <seg id="N"> ... </seg>, escaping &, < and > as XML
+# entities. Input comes from the files named in @ARGV ("-" means STDIN), or
+# from STDIN when no arguments are given. Ids count from 0 across all inputs.
+use utf8;
+use strict;
+use warnings;
+
+binmode(STDIN, ":utf8");
+binmode(STDOUT, ":utf8");
+
+# Open every input before producing output, so a bad file name fails early.
+my @fh = ();
+for my $file (@ARGV) {
+  if ($file eq '-') { push @fh, \*STDIN; next; }
+  # 3-arg open: name is literal (2-arg trimmed blanks, parsed a leading '&').
+  open my $f, '<:utf8', $file or die "Can't read $file: $!\n";
+  push @fh, $f;
+}
+push @fh, \*STDIN unless @ARGV;
+
+my $id = 0;
+for my $f (@fh) {
+  while (my $line = <$f>) {
+    chomp $line;
+    die "Empty line in test set" if $line =~ /^\s*$/;
+    die "Please remove <seg> tags from input:\n$line" if $line =~ /^\s*<seg/i;
+    # Escape '&' first so the entities produced below are not escaped again.
+    $line =~ s/&/&amp;/g;
+    $line =~ s/</&lt;/g;
+    $line =~ s/>/&gt;/g;
+    print "<seg id=\"$id\"> $line </seg>\n";
+    $id++;
+  }
+}