diff options
-rwxr-xr-x | corpus/moses-xml.pl | 36 |
1 files changed, 36 insertions, 0 deletions
#!/usr/bin/perl

# moses-xml.pl (corpus/moses-xml.pl)
#
# Line-oriented filter: reads text on STDIN and writes it to STDOUT with
# XML entities and other Moses special characters either escaped or
# unescaped, depending on the single command-line argument.
#
#   moses-xml.pl escape   < raw.txt     > escaped.txt
#   moses-xml.pl unescape < escaped.txt > raw.txt
#
# Any other argument, or a wrong number of arguments, prints the usage
# message and exits with a non-zero status.

use strict;
use warnings;

# A non-zero $| flushes STDOUT after every write.
$| = 1;

my $msg = "Usage: $0 (escape|unescape)\n\n Escapes XMl entities and other special characters for use with Moses.\n\n";

# Special character => entity that stands in for it.
my %ENTITY_FOR = (
    '&'  => '&amp;',     # escape escape
    '|'  => '&#124;',    # factor separator
    '<'  => '&lt;',      # xml
    '>'  => '&gt;',      # xml
    q{'} => '&apos;',    # xml
    '"'  => '&quot;',    # xml
    '['  => '&#91;',     # syntax non-terminal
    ']'  => '&#93;',     # syntax non-terminal
);

# Inverse table: entity => special character.  Entities are unique, so the
# reversed hash has one key per entry.
my %CHAR_FOR = reverse %ENTITY_FOR;

# Matches any one character that has to be escaped.
my $SPECIAL_CHAR_RE = do {
    my $class = join q{}, map { quotemeta } sort keys %ENTITY_FOR;
    qr/([$class])/;
};

# Matches any one entity produced by escape_line().
my $ENTITY_RE = do {
    my $alternatives = join q{|}, map { quotemeta } sort keys %CHAR_FOR;
    qr/($alternatives)/;
};

# Maps the command-line mode to the per-line conversion.
my %FILTER_FOR = (
    escape   => \&escape_line,
    unescape => \&unescape_line,
);

# escape_line($text) -> $escaped
#
# Returns a copy of $text with every special character replaced by its
# entity.  The replacement is done in a single pass, so the '&' that starts
# each inserted entity is never escaped a second time.
sub escape_line {
    my ($text) = @_;
    $text =~ s/$SPECIAL_CHAR_RE/$ENTITY_FOR{$1}/g;
    return $text;
}

# unescape_line($text) -> $unescaped
#
# Returns a copy of $text with every known entity replaced by the character
# it stands for.  The single pass never rescans its own output, so
# "&amp;lt;" becomes the literal text "&lt;" rather than "<".
sub unescape_line {
    my ($text) = @_;
    $text =~ s/$ENTITY_RE/$CHAR_FOR{$1}/g;
    return $text;
}

# main(@args)
#
# Validates the command line, then filters STDIN to STDOUT line by line.
# Dies with the usage message unless exactly one argument is given and it is
# "escape" or "unescape".
sub main {
    my @args = @_;

    die $msg unless @args == 1;

    my $filter = $FILTER_FOR{ $args[0] }
        or die $msg;

    while ( my $line = <STDIN> ) {
        print $filter->($line);
    }

    return;
}

# Run the filter only when executed as a script; loading the file with
# require/do just defines the subs above.
main(@ARGV) unless caller;

1;