summaryrefslogtreecommitdiff
path: root/corpus
diff options
context:
space:
mode:
author Michael Denkowski <mdenkows@cs.cmu.edu> 2015-04-13 07:18:34 -0700
committer Michael Denkowski <mdenkows@cs.cmu.edu> 2015-04-13 07:18:34 -0700
commit 52f1cae58b6a092b253b71ff8a85612a1b844ba6 (patch)
tree b0d988522a17b79fff3370932b45072c1bc7584b /corpus
parent 3afa8eaa99740f89e0dadda21cf7a9e3546d6047 (diff)
Moses compatibility for tokenizer
Diffstat (limited to 'corpus')
-rwxr-xr-x corpus/moses-xml.pl 36
1 files changed, 36 insertions, 0 deletions
diff --git a/corpus/moses-xml.pl b/corpus/moses-xml.pl
new file mode 100755
index 00000000..fca63aa8
--- /dev/null
+++ b/corpus/moses-xml.pl
@@ -0,0 +1,36 @@
+#!/usr/bin/perl -w
+# Copy STDIN to STDOUT, escaping or unescaping the characters special to Moses.
+use strict;
+$|++;    # autoflush the selected output handle (STDOUT)
+
+my $msg = "Usage: $0 (escape|unescape)\n\n Escapes XML entities and other special characters for use with Moses.\n\n";
+
+die $msg unless @ARGV == 1;    # exactly one argument, the mode, is required
+
+if ($ARGV[0] eq "escape") {
+    while (<STDIN>) {
+        s/&/&amp;/g;      # ampersand first, so entities added below are not re-escaped
+        s/\|/&#124;/g;    # factor separator
+        s/</&lt;/g;       # xml
+        s/>/&gt;/g;       # xml
+        s/'/&apos;/g;     # xml
+        s/"/&quot;/g;     # xml
+        s/\[/&#91;/g;     # syntax non-terminal
+        s/\]/&#93;/g;     # syntax non-terminal
+        print;
+    }
+} elsif ($ARGV[0] eq "unescape") {
+    while (<STDIN>) {
+        s/&#124;/|/g;     # factor separator
+        s/&lt;/</g;       # xml
+        s/&gt;/>/g;       # xml
+        s/&apos;/'/g;     # xml
+        s/&quot;/"/g;     # xml
+        s/&#91;/[/g;      # syntax non-terminal
+        s/&#93;/]/g;      # syntax non-terminal
+        s/&amp;/&/g;      # ampersand last, so "&amp;lt;" yields the text "&lt;", not "<"
+        print;
+    }
+} else {
+    die $msg;
+}