#!/usr/bin/perl -w
#
# moses-xml.pl -- escape or unescape text for use with Moses.
#
# Provenance: corpus/moses-xml.pl, added by commit
# 52f1cae58b6a092b253b71ff8a85612a1b844ba6 (Michael Denkowski,
# Mon, 13 Apr 2015), "Moses compatibility for tokenizer".
#
# Usage: moses-xml.pl (escape|unescape) < input > output
#
# Reads lines from STDIN, rewrites the characters that are special to
# Moses (XML metacharacters, the factor separator '|' and the syntax
# non-terminal brackets '[' and ']') to or from entity form, and prints
# each line to STDOUT.
#
# NOTE(review): this copy was recovered from an HTML rendering of the
# patch in which the entity strings had been decoded and the <STDIN>
# reads stripped. The entities below were restored to the conventional
# Moses set -- confirm against the upstream commit.

use strict;
use warnings;
$|++;    # unbuffered output so the script can sit in an interactive pipeline

my $msg = "Usage: $0 (escape|unescape)\n\n Escapes XMl entities and other special characters for use with Moses.\n\n";

# escape_moses($text) -> $escaped
# Returns a copy of $text with Moses-special characters replaced by
# entities. The caller's string is not modified.
sub escape_moses {
    my ($text) = @_;
    $text =~ s/&/&amp;/g;      # escape escape; must run first so the '&' of
                               # the entities produced below is not re-escaped
    $text =~ s/\|/&#124;/g;    # factor separator
    $text =~ s/</&lt;/g;       # xml
    $text =~ s/>/&gt;/g;       # xml
    $text =~ s/'/&apos;/g;     # xml
    $text =~ s/"/&quot;/g;     # xml
    $text =~ s/\[/&#91;/g;     # syntax non-terminal
    $text =~ s/\]/&#93;/g;     # syntax non-terminal
    return $text;
}

# unescape_moses($text) -> $unescaped
# Inverse of escape_moses: returns a copy of $text with the entities
# turned back into the literal characters.
sub unescape_moses {
    my ($text) = @_;
    $text =~ s/&#124;/|/g;     # factor separator
    $text =~ s/&lt;/</g;       # xml
    $text =~ s/&gt;/>/g;       # xml
    $text =~ s/&apos;/'/g;     # xml
    $text =~ s/&quot;/"/g;     # xml
    $text =~ s/&#91;/[/g;      # syntax non-terminal
    $text =~ s/&#93;/]/g;      # syntax non-terminal
    $text =~ s/&amp;/&/g;      # escape escape; must run last so "&amp;lt;"
                               # becomes "&lt;" rather than "<"
    return $text;
}

# main()
# Command-line entry point. Dies with the usage message unless exactly
# one argument, "escape" or "unescape", is given; otherwise filters
# STDIN to STDOUT line by line.
sub main {
    die $msg unless scalar @ARGV == 1;

    my %filter_for = (
        escape   => \&escape_moses,
        unescape => \&unescape_moses,
    );
    my $filter = $filter_for{ $ARGV[0] } or die $msg;

    while (my $line = <STDIN>) {
        print $filter->($line);
    }
    return;
}

# Run only when executed as a script, so the filters above can be loaded
# (require/do) without consuming STDIN.
main() unless caller;

1;