summaryrefslogtreecommitdiff
path: root/corpus
diff options
context:
space:
mode:
authorPatrick Simianer <p@simianer.de>2014-06-12 13:56:42 +0200
committerPatrick Simianer <p@simianer.de>2014-06-12 13:56:42 +0200
commita39aa79b18347e22ef36ebc0da5a7eb220bcb23f (patch)
tree2c0f3009f8e381002bfeb82c0ea3bd0c41125761 /corpus
parent62bd9a4bdcea606d6ff2031fa4b207ef20caac31 (diff)
parent0e2f8d3d049f06afb08b4639c6a28aa5461cdc78 (diff)
Merge remote-tracking branch 'upstream/master'
Diffstat (limited to 'corpus')
-rwxr-xr-xcorpus/moses-scfg-to-cdec.pl69
-rwxr-xr-xcorpus/support/quote-norm.pl1
-rwxr-xr-xcorpus/support/tokenizer.pl4
3 files changed, 74 insertions, 0 deletions
diff --git a/corpus/moses-scfg-to-cdec.pl b/corpus/moses-scfg-to-cdec.pl
new file mode 100755
index 00000000..9b8e3617
--- /dev/null
+++ b/corpus/moses-scfg-to-cdec.pl
@@ -0,0 +1,69 @@
+#!/usr/bin/perl -w
+use strict;
+
+while(<>) {
+ my ($src, $trg, $feats, $al) = split / \|\|\| /;
+ # [X][NP] von [X][NP] [X] ||| [X][NP] &apos;s [X][NP] [S] ||| 0.00110169 0.0073223 2.84566e-06 0.0027702 0.0121867 2.718 0.606531 ||| 0-0 1-1 2-2 ||| 635 245838 2
+
+ my @srcs = split /\s+/, $src;
+ my @trgs = split /\s+/, $trg;
+ my $lhs = pop @trgs;
+ $lhs =~ s/&amp;apos;/'/g;
+ $lhs =~ s/&apos;/'/g;
+ $lhs =~ s/,/COMMA/g;
+ my $ntc = 0;
+ my $sc = 0;
+ my @of = ();
+ my $x = pop @srcs;
+ my %d = (); # src index to nonterminal count
+ die "Expected [X]" unless $x eq '[X]';
+ my %amap = ();
+ my @als = split / /, $al;
+ for my $st (@als) {
+ my ($s, $t) = split /-/, $st;
+ $amap{$t} = $s;
+ }
+ for my $f (@srcs) {
+ if ($f =~ /^\[X\]\[([^]]+)\]$/) {
+ $ntc++;
+ my $nt = $1;
+ $nt =~ s/&amp;apos;/'/g;
+ $nt =~ s/&apos;/'/g;
+ $nt =~ s/,/COMMA/g;
+ push @of, "[$nt]";
+ $d{$sc} = $ntc;
+ } elsif ($f =~ /^\[[^]]+\]$/) {
+ die "Unexpected $f";
+ } else {
+ push @of, $f;
+ }
+ $sc++;
+ }
+ my @oe = ();
+ my $ind = 0;
+ for my $e (@trgs) {
+ if ($e =~ /^\[X\]\[([^]]+)\]$/) {
+ my $imap = $d{$amap{$ind}};
+ push @oe, "[$imap]";
+ } else {
+ push @oe, $e;
+ }
+ $ind++;
+ }
+ my ($fe, $ef, $j, $lfe, $lef, $dummy, $of) = split / /, $feats;
+ next if $lef eq '0';
+ next if $lfe eq '0';
+ next if $ef eq '0';
+ next if $fe eq '0';
+ next if $j eq '0';
+ next if $of eq '0';
+ $ef = sprintf('%.6g', log($ef));
+ $fe = sprintf('%.6g',log($fe));
+ $j = sprintf('%.6g',log($j));
+ $lef = sprintf('%.6g',log($lef));
+ $lfe = sprintf('%.6g',log($lfe));
+ $of = sprintf('%.6g',log($of));
+ print "$lhs ||| @of ||| @oe ||| RuleCount=1 FgivenE=$fe EgivenF=$ef Joint=$j LexEgivenF=$lef LexFgivenE=$lfe Other=$of\n";
+}
+
+# [X][ADVP] angestiegen [X] ||| rose [X][ADVP] [VP] ||| 0.0538131 0.0097508 0.00744224 0.0249653 0.000698602 2.718 0.606531 ||| 0-1 1-0 ||| 13 94 2
diff --git a/corpus/support/quote-norm.pl b/corpus/support/quote-norm.pl
index 0366fad5..3eee0666 100755
--- a/corpus/support/quote-norm.pl
+++ b/corpus/support/quote-norm.pl
@@ -40,6 +40,7 @@ while(<STDIN>) {
# Regularlize spaces:
s/\x{ad}//g; # soft hyphen
+ s/\x{200C}//g; # zero-width non-joiner
s/\x{a0}/ /g; # non-breaking space
s/\x{2009}/ /g; # thin space
s/\x{2028}/ /g; # "line separator"
diff --git a/corpus/support/tokenizer.pl b/corpus/support/tokenizer.pl
index 7771201f..f57bc87a 100755
--- a/corpus/support/tokenizer.pl
+++ b/corpus/support/tokenizer.pl
@@ -240,6 +240,10 @@ sub proc_token {
return $token;
}
+ if($token =~ /^\d+(.\d+)+(亿|百万|万|千)?$/){
+ return $token;
+ }
+
## 1,234,345.34
if($token =~ /^\d+(\.\d{3})*,\d+$/){
## number