summaryrefslogtreecommitdiff
path: root/training/dep-reorder/scripts/conll2simplecfg.pl
diff options
context:
space:
mode:
authorredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
committerredpony <redpony@ec762483-ff6d-05da-a07a-a48fb63a330f>2010-06-22 05:12:27 +0000
commit7cc92b65a3185aa242088d830e166e495674efc9 (patch)
tree681fe5237612a4e96ce36fb9fabef00042c8ee61 /training/dep-reorder/scripts/conll2simplecfg.pl
parent37728b8be4d0b3df9da81fdda2198ff55b4b2d91 (diff)
initial checkin
git-svn-id: https://ws10smt.googlecode.com/svn/trunk@2 ec762483-ff6d-05da-a07a-a48fb63a330f
Diffstat (limited to 'training/dep-reorder/scripts/conll2simplecfg.pl')
-rwxr-xr-xtraining/dep-reorder/scripts/conll2simplecfg.pl57
1 files changed, 57 insertions, 0 deletions
diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl
new file mode 100755
index 00000000..b101347a
--- /dev/null
+++ b/training/dep-reorder/scripts/conll2simplecfg.pl
@@ -0,0 +1,57 @@
+#!/usr/bin/perl -w
+use strict;
+
+# 1 在 _ 10 _ _ 4 X _ _
+# 2 门厅 _ 3 _ _ 1 X _ _
+# 3 下面 _ 23 _ _ 4 X _ _
+# 4 。 _ 45 _ _ 0 X _ _
+
+my @ldeps;
+my @rdeps;
+@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
+@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
+my $rootcat = 0;
+my @cats = ('S');
+my $len = 0;
+my @noposcats = ('S');
+while(<>) {
+ chomp;
+ if (/^\s*$/) {
+ write_cfg($len);
+ $len = 0;
+ @cats=('S');
+ @noposcats = ('S');
+ @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; }
+ @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; }
+ next;
+ }
+ $len++;
+ my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/;
+ my $cat = "C$xcat";
+ my $catpos = $cat . "_$pos";
+ push @cats, $catpos;
+ push @noposcats, $cat;
+ print "[$catpos] ||| $word ||| $word ||| Word=1\n";
+ if ($headpos == 0) { $rootcat = $pos; }
+ if ($pos < $headpos) {
+ push @{$ldeps[$headpos]}, $pos;
+ } else {
+ push @{$rdeps[$headpos]}, $pos;
+ }
+}
+
+sub write_cfg {
+ my $len = shift;
+ for (my $i = 1; $i <= $len; $i++) {
+ my @lds = @{$ldeps[$i]};
+ for my $ld (@lds) {
+ print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n";
+ }
+ my @rds = @{$rdeps[$i]};
+ for my $rd (@rds) {
+ print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n";
+ }
+ }
+ print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n";
+}
+