summary | refs | log | tree | commit | diff
path: root/corpus
diff options
context:
space:
mode:
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/moses-xml.pl 36
1 files changed, 36 insertions, 0 deletions
diff --git a/corpus/moses-xml.pl b/corpus/moses-xml.pl
new file mode 100755
index 00000000..fca63aa8
--- /dev/null
+++ b/corpus/moses-xml.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl -w
+
+# Filter STDIN to STDOUT, escaping or unescaping the characters that are
+# special to Moses: XML markup, the factor separator and square brackets.
+
+use strict;
+use warnings;
+$|++;    # autoflush STDOUT
+
+my $msg = "Usage: $0 (escape|unescape)\n\n Escapes or unescapes XML entities and other special characters for use with Moses.\n\n";
+
+# Single source of truth for both directions: special character => entity.
+my %entity_for = (
+    '&' => '&amp;',                      # escape escape
+    '|' => '&#124;',                     # factor separator
+    '<' => '&lt;',   '>' => '&gt;',      # xml
+    "'" => '&apos;', '"' => '&quot;',    # xml
+    '[' => '&#91;',  ']' => '&#93;',     # syntax non-terminal
+);
+my %char_for = reverse %entity_for;
+
+# Exactly one argument, naming the direction, is required.
+die $msg unless @ARGV == 1;
+my $mode = $ARGV[0];
+die $msg unless $mode eq 'escape' or $mode eq 'unescape';
+
+# Each line is rewritten in one pass and replacement text is never rescanned,
+# so '&' cannot be double-escaped and '&amp;lt;' unescapes to '&lt;', not '<'.
+my %map     = $mode eq 'escape' ? %entity_for : %char_for;
+my $pattern = join '|', map { quotemeta($_) } sort keys %map;
+my $find    = qr/($pattern)/;
+
+while (my $line = <STDIN>) {
+    $line =~ s/$find/$map{$1}/g;
+    print $line;
+}