diff options
author | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
---|---|---|
committer | redpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f> | 2010-06-22 05:12:27 +0000 |
commit | 0172721855098ca02b207231a654dffa5e4eb1c9 (patch) | |
tree | 8069c3a62e2d72bd64a2cdeee9724b2679c8a56b /training/dep-reorder/scripts | |
parent | 37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff) |
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'training/dep-reorder/scripts')
-rwxr-xr-x | training/dep-reorder/scripts/conll2simplecfg.pl | 57 |
1 files changed, 57 insertions, 0 deletions
#!/usr/bin/perl -w
use strict;

# conll2simplecfg.pl (training/dep-reorder/scripts)
#
# Reads dependency parses, one token per line with whitespace-separated
# columns, from the files named on the command line (or from STDIN) and
# prints a simple grammar, one rule per line:
#
#   [C<cat>_<pos>] ||| <word> ||| <word> ||| Word=1       (one per token)
#   [head] ||| [dep,1] [head,2] ||| [1] [2] ||| Cdep_Chead=1
#                                    (dependent positioned before its head)
#   [head] ||| [head,1] [dep,2] ||| [1] [2] ||| Chead_Cdep=1
#                                    (dependent positioned after its head)
#   [S] ||| [root,1] ||| [1] ||| TOP=1                    (one per sentence)
#
# Columns used: 1 = token position, 2 = word, 4 = category, 7 = head position
# (0 marks the root).  Sentences are separated by blank lines.  Example:
#
# 1 在 _ 10 _ _ 4 X _ _
# 2 门厅 _ 3 _ _ 1 X _ _
# 3 下面 _ 23 _ _ 4 X _ _
# 4 。 _ 45 _ _ 0 X _ _

# Per-sentence state, shared between the reader and write_cfg().
# @cats/@noposcats are filled in input order but indexed by the position
# column, so positions must count 1, 2, 3, ... within a sentence; slot 0
# holds the start symbol 'S'.
my @ldeps;        # $ldeps[$h]: positions of dependents of head $h with pos < $h
my @rdeps;        # $rdeps[$h]: positions of dependents of head $h with pos >= $h
my @cats;         # category labels carrying the position suffix, e.g. C10_1
my @noposcats;    # category labels without the position suffix, e.g. C10
my $rootcat;      # position of the token whose head position is 0
my $ntokens;      # number of tokens read so far in the current sentence

# Clear all per-sentence state.  The dependent lists start empty and grow on
# demand, so there is no fixed limit on sentence length.
sub reset_sentence {
    @ldeps     = ();
    @rdeps     = ();
    @cats      = ('S');
    @noposcats = ('S');
    $rootcat   = 0;
    $ntokens   = 0;
}

# Record one token line and print its lexical rule.
#   $line - a non-blank input line, already chomped
sub add_token {
    my ($line) = @_;

    # split ' ' ignores leading whitespace, so an indented line does not
    # shift every column by one the way split /\s+/ would.
    my ($pos, $word, $xcat, $headpos) = (split ' ', $line)[0, 1, 3, 6];

    my $cat    = "C$xcat";
    my $catpos = $cat . "_$pos";
    $ntokens++;
    push @cats,      $catpos;
    push @noposcats, $cat;
    print "[$catpos] ||| $word ||| $word ||| Word=1\n";

    if ($headpos == 0) {
        # The root has no head, so it is not attached to any dependent list.
        $rootcat = $pos;
    }
    elsif ($pos < $headpos) {
        push @{ $ldeps[$headpos] }, $pos;
    }
    else {
        push @{ $rdeps[$headpos] }, $pos;
    }
}

# Print the attachment rules and the TOP rule for the current sentence.
#   $len - number of tokens in the sentence; heads 1..$len are visited
sub write_cfg {
    my $len = shift;
    for my $head (1 .. $len) {
        # A head without dependents has no list yet; fall back to an empty one
        # rather than dereferencing undef under strict.
        for my $dep (@{ $ldeps[$head] || [] }) {
            print "[$cats[$head]] ||| [$cats[$dep],1] [$cats[$head],2] ||| [1] [2] ||| $noposcats[$dep]_$noposcats[$head]=1\n";
        }
        for my $dep (@{ $rdeps[$head] || [] }) {
            print "[$cats[$head]] ||| [$cats[$head],1] [$cats[$dep],2] ||| [1] [2] ||| $noposcats[$head]_$noposcats[$dep]=1\n";
        }
    }
    print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n";
}

# Emit the rules for the sentence collected so far (if any) and start afresh.
# Empty sentences (runs of blank lines) produce no output.
sub flush_sentence {
    write_cfg($ntokens) if $ntokens > 0;
    reset_sentence();
}

# Convert every sentence readable from a filehandle, printing rules to the
# currently selected output handle.
#   $in - filehandle (or glob reference) to read token lines from
sub convert_stream {
    my ($in) = @_;
    reset_sentence();
    while (my $line = <$in>) {
        chomp $line;
        if ($line =~ /^\s*$/) {
            flush_sentence();
            next;
        }
        add_token($line);
    }
    # The last sentence may not be followed by a blank line.
    flush_sentence();
}

# Run only when executed as a script, so the subs above can be loaded
# without consuming any input.  ARGV gives the usual <> behaviour: files
# named on the command line, or STDIN when there are none.
convert_stream(\*ARGV) unless caller;