diff options
Diffstat (limited to 'training/dep-reorder')
-rwxr-xr-x | training/dep-reorder/conll2reordering-forest.pl | 65 | ||||
-rw-r--r-- | training/dep-reorder/george.conll | 4 | ||||
-rwxr-xr-x | training/dep-reorder/scripts/conll2simplecfg.pl | 57 |
3 files changed, 126 insertions, 0 deletions
diff --git a/training/dep-reorder/conll2reordering-forest.pl b/training/dep-reorder/conll2reordering-forest.pl new file mode 100755 index 00000000..3cd226be --- /dev/null +++ b/training/dep-reorder/conll2reordering-forest.pl @@ -0,0 +1,65 @@ +#!/usr/bin/perl -w +use strict; + +my $script_dir; BEGIN { use Cwd qw/ abs_path cwd /; use File::Basename; $script_dir = dirname(abs_path($0)); push @INC, $script_dir; } +my $FIRST_CONV = "$script_dir/scripts/conll2simplecfg.pl"; +my $CDEC = "$script_dir/../../decoder/cdec"; + +our $tfile1 = "grammar1.$$"; +our $tfile2 = "text.$$"; + +die "Usage: $0 parses.conll\n" unless scalar @ARGV == 1; +open C, "<$ARGV[0]" or die "Can't read $ARGV[0]: $!"; + +END { unlink $tfile1; unlink "$tfile1.cfg"; unlink $tfile2; } + +my $first = 1; +open T, ">$tfile1" or die "Can't write $tfile1: $!"; +my $lc = 0; +my $flag = 0; +my @words = (); +while(<C>) { + print T; + chomp; + if (/^$/) { + if ($first) { $first = undef; } else { if ($flag) { print "\n"; $flag = 0; } } + $first = undef; + close T; + open SO, ">$tfile2" or die "Can't write $tfile2: $!"; + print SO "@words\n"; + close SO; + @words=(); + `$FIRST_CONV < $tfile1 > $tfile1.cfg`; + if ($? != 0) { + die "Error code: $?"; + } + my $cfg = `$CDEC -n -S 10000 -f scfg -g $tfile1.cfg -i $tfile2 --show_cfg_search_space 2>/dev/null`; + if ($? != 0) { + die "Error code: $?"; + } + my @rules = split /\n/, $cfg; + shift @rules; # get rid of output + for my $rule (@rules) { + my ($lhs, $f, $e, $feats) = split / \|\|\| /, $rule; + $f =~ s/,\d\]/\]/g; + $feats = 'TOP=1' unless $feats; + if ($lhs =~ /\[Goal_\d+\]/) { $lhs = '[S]'; } + print "$lhs ||| $f ||| $feats\n"; + if ($e eq '[1] [2]') { + my ($a, $b) = split /\s+/, $f; + $feats =~ s/=1$//; + my ($x, $y) = split /_/, $feats; + print "$lhs ||| $b $a ||| ${y}_$x=1\n"; + } + $flag = 1; + } + open T, ">$tfile1" or die "Can't write $tfile1: $!"; + $lc = -1; + } else { + my ($ind, $word, @dmmy) = split /\s+/; + push @words, $word; + } + $lc++; +} +close T; + diff --git a/training/dep-reorder/george.conll b/training/dep-reorder/george.conll new file mode 100644 index 00000000..7eebb360 --- /dev/null +++ b/training/dep-reorder/george.conll @@ -0,0 +1,4 @@ +1 George _ GEORGE _ _ 2 X _ _ +2 hates _ HATES _ _ 0 X _ _ +3 broccoli _ BROC _ _ 2 X _ _ + diff --git a/training/dep-reorder/scripts/conll2simplecfg.pl b/training/dep-reorder/scripts/conll2simplecfg.pl new file mode 100755 index 00000000..b101347a --- /dev/null +++ b/training/dep-reorder/scripts/conll2simplecfg.pl @@ -0,0 +1,57 @@ +#!/usr/bin/perl -w +use strict; + +# 1 在 _ 10 _ _ 4 X _ _ +# 2 门厅 _ 3 _ _ 1 X _ _ +# 3 下面 _ 23 _ _ 4 X _ _ +# 4 。 _ 45 _ _ 0 X _ _ + +my @ldeps; +my @rdeps; +@ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } +@rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } +my $rootcat = 0; +my @cats = ('S'); +my $len = 0; +my @noposcats = ('S'); +while(<>) { + chomp; + if (/^\s*$/) { + write_cfg($len); + $len = 0; + @cats=('S'); + @noposcats = ('S'); + @ldeps=(); for (my $i =0; $i <1000; $i++) { push @ldeps, []; } + @rdeps=(); for (my $i =0; $i <1000; $i++) { push @rdeps, []; } + next; + } + $len++; + my ($pos, $word, $d1, $xcat, $d2, $d3, $headpos, $deptype) = split /\s+/; + my $cat = "C$xcat"; + my $catpos = $cat . "_$pos"; + push @cats, $catpos; + push @noposcats, $cat; + print "[$catpos] ||| $word ||| $word ||| Word=1\n"; + if ($headpos == 0) { $rootcat = $pos; } + if ($pos < $headpos) { + push @{$ldeps[$headpos]}, $pos; + } else { + push @{$rdeps[$headpos]}, $pos; + } +} + +sub write_cfg { + my $len = shift; + for (my $i = 1; $i <= $len; $i++) { + my @lds = @{$ldeps[$i]}; + for my $ld (@lds) { + print "[$cats[$i]] ||| [$cats[$ld],1] [$cats[$i],2] ||| [1] [2] ||| $noposcats[$ld]_$noposcats[$i]=1\n"; + } + my @rds = @{$rdeps[$i]}; + for my $rd (@rds) { + print "[$cats[$i]] ||| [$cats[$i],1] [$cats[$rd],2] ||| [1] [2] ||| $noposcats[$i]_$noposcats[$rd]=1\n"; + } + } + print "[S] ||| [$cats[$rootcat],1] ||| [1] ||| TOP=1\n"; +} + |