diff options
author | Michael Denkowski <mdenkows@cs.cmu.edu> | 2015-04-13 07:18:34 -0700 |
---|---|---|
committer | Michael Denkowski <mdenkows@cs.cmu.edu> | 2015-04-13 07:18:34 -0700 |
commit | 52f1cae58b6a092b253b71ff8a85612a1b844ba6 (patch) | |
tree | b0d988522a17b79fff3370932b45072c1bc7584b | |
parent | 3afa8eaa99740f89e0dadda21cf7a9e3546d6047 (diff) |
Moses compatibility for tokenizer
-rwxr-xr-x | corpus/moses-xml.pl | 36 |
1 files changed, 36 insertions, 0 deletions
#!/usr/bin/perl
#
# corpus/moses-xml.pl -- Moses-compatible escaping filter for the tokenizer.
#
# Reads text from STDIN line by line and writes the converted line to STDOUT.
#   escape   : replace characters that are special to Moses with entities
#   unescape : turn those entities back into the literal characters
#
# Usage: moses-xml.pl (escape|unescape) < input > output
#
# The file only runs its command-line driver when executed as a script, so
# escape_line() and unescape_line() can also be loaded and called directly.

use strict;
use warnings;

my $msg = "Usage: $0 (escape|unescape)\n\n Escapes XMl entities and other special characters for use with Moses.\n\n";

# Literal character => entity, in the order escaping must be applied.
# '&' has to come first when escaping (otherwise the '&' of every entity we
# just produced would be escaped again) and therefore last when unescaping.
# NOTE(review): the entity spellings follow the Moses escape-special-chars
# convention -- confirm against the Moses version in use.
my @ESCAPES = (
    [ '&' => '&amp;'  ],    # escape escape
    [ '|' => '&#124;' ],    # factor separator
    [ '<' => '&lt;'   ],    # xml
    [ '>' => '&gt;'   ],    # xml
    [ "'" => '&apos;' ],    # xml
    [ '"' => '&quot;' ],    # xml
    [ '[' => '&#91;'  ],    # syntax non-terminal
    [ ']' => '&#93;'  ],    # syntax non-terminal
);

# Return a copy of $text with every special character replaced by its entity.
sub escape_line {
    my ($text) = @_;
    for my $pair (@ESCAPES) {
        my ($char, $entity) = @{$pair};
        $text =~ s/\Q$char\E/$entity/g;
    }
    return $text;
}

# Return a copy of $text with every known entity replaced by its character.
# The table is walked in reverse so that '&amp;' is decoded last; this keeps
# an escaped literal such as '&amp;lt;' from being decoded twice.
sub unescape_line {
    my ($text) = @_;
    for my $pair (reverse @ESCAPES) {
        my ($char, $entity) = @{$pair};
        $text =~ s/\Q$entity\E/$char/g;
    }
    return $text;
}

# Command-line driver: validate the single mode argument, then filter
# STDIN to STDOUT one line at a time. Dies with the usage message when the
# argument count is wrong or the mode is unknown.
sub main {
    $| = 1;    # unbuffered output so the filter can sit inside a pipeline

    die $msg unless @ARGV == 1;

    my %filter_for = (
        escape   => \&escape_line,
        unescape => \&unescape_line,
    );
    my $filter = $filter_for{ $ARGV[0] } or die $msg;

    while ( my $line = <STDIN> ) {
        print $filter->($line);
    }
    return;
}

# Run only when executed directly, not when loaded with require/do.
main() unless caller;

1;